hammer2 - Fix lost flush
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/param.h>
44 #include <sys/systm.h>
45 #include <sys/types.h>
46 #include <sys/lock.h>
47 #include <sys/uuid.h>
48
49 #include "hammer2.h"
50
51 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
52
53 static hammer2_chain_t *hammer2_chain_create_indirect(
54                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
55                         hammer2_key_t key, int keybits);
56
57 /*
58  * Splay tree
59  */
60 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
61
62 int
63 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
64 {
65         return(chain2->index - chain1->index);
66 }
67
68 /*
69  * Recursively mark the parent chain elements so flushes can find
70  * modified elements.  Stop when we hit a chain already flagged
71  * SUBMODIFIED, but ignore the SUBMODIFIED bit that might be set
72  * in chain itself.
73  *
74  * SUBMODIFIED is not set on the chain passed in.
75  *
76  * XXX rename of parent can create a SMP race
77  */
78 static void
79 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
80 {
81         hammer2_chain_t *parent;
82
83         parent = chain->parent;
84         while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
85                 atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
86                 parent = parent->parent;
87         }
88 }
89
90 /*
91  * Allocate a new disconnected chain element representing the specified
92  * bref.  The chain element is locked exclusively and refs is set to 1.
93  *
94  * This essentially allocates a system memory structure representing one
95  * of the media structure types, including inodes.
96  */
97 hammer2_chain_t *
98 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
99 {
100         hammer2_chain_t *chain;
101         hammer2_inode_t *ip;
102         hammer2_indblock_t *np;
103         hammer2_data_t *dp;
104         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
105
106         /*
107          * Construct the appropriate system structure.
108          */
109         switch(bref->type) {
110         case HAMMER2_BREF_TYPE_INODE:
111                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
112                 chain = &ip->chain;
113                 chain->u.ip = ip;
114                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
115                 ip->hmp = hmp;
116                 break;
117         case HAMMER2_BREF_TYPE_INDIRECT:
118                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
119                 chain = &np->chain;
120                 chain->u.np = np;
121                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
122                 break;
123         case HAMMER2_BREF_TYPE_DATA:
124                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
125                 chain = &dp->chain;
126                 chain->u.dp = dp;
127                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
128                 break;
129         case HAMMER2_BREF_TYPE_VOLUME:
130                 chain = NULL;
131                 panic("hammer2_chain_alloc volume type illegal for op");
132         default:
133                 chain = NULL;
134                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
135                       bref->type);
136         }
137
138         /*
139          * Only set bref_flush if the bref has a real media offset, otherwise
140          * the caller has to wait for the chain to be modified/block-allocated
141          * before a blockref can be synchronized with its (future) parent.
142          */
143         chain->bref = *bref;
144         if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
145                 chain->bref_flush = *bref;
146         chain->index = -1;              /* not yet assigned */
147         chain->refs = 1;
148         chain->bytes = bytes;
149         lockmgr(&chain->lk, LK_EXCLUSIVE);
150
151         return (chain);
152 }
153
154 /*
155  * Free a disconnected chain element
156  */
157 void
158 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
159 {
160         void *mem;
161
162         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
163             chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
164                 chain->data = NULL;
165         }
166
167         KKASSERT(chain->bp == NULL);
168         KKASSERT(chain->data == NULL);
169         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
170                  chain->u.ip->vp == NULL);
171
172         if ((mem = chain->u.mem) != NULL) {
173                 chain->u.mem = NULL;
174                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
175                         kfree(mem, hmp->minode);
176                 else
177                         kfree(mem, hmp->mchain);
178         }
179 }
180
181 /*
182  * Add a reference to a chain element (for shared access).  The chain
183  * element must already have at least 1 ref controlled by the caller.
184  */
185 void
186 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
187 {
188         KKASSERT(chain->refs > 0);
189         atomic_add_int(&chain->refs, 1);
190 }
191
192 /*
193  * Drop the callers reference to the chain element.  If the ref count
194  * reaches zero the chain element and its related structure (typically an
195  * inode or indirect block) will be freed and the parent will be
196  * recursively dropped.
197  *
198  * MOVED and MODIFIED elements hold additional references so it should not
199  * be possible for the count on a modified element to drop to 0.
200  *
201  * The chain element must NOT be locked by the caller.
202  *
203  * The parent might or might not be locked by the caller but if so it
204  * will also be referenced so we shouldn't recurse upward.
205  */
206 void
207 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
208 {
209         hammer2_chain_t *parent;
210         u_int refs;
211
212         while (chain) {
213                 refs = chain->refs;
214                 cpu_ccfence();
215                 KKASSERT(refs > 0);
216                 if (refs == 1) {
217                         KKASSERT(chain != &hmp->vchain);
218                         parent = chain->parent;
219                         if (parent)
220                                 lockmgr(&parent->lk, LK_EXCLUSIVE);
221                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
222                                 /*
223                                  * Succeeded, recurse and drop parent.
224                                  * These chain elements should be synchronized
225                                  * so no delta data or inode count updates
226                                  * should be needed.
227                                  */
228                                 KKASSERT((chain->flags &
229                                           (HAMMER2_CHAIN_MOVED |
230                                            HAMMER2_CHAIN_MODIFIED)) == 0);
231                                 if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
232                                         SPLAY_REMOVE(hammer2_chain_splay,
233                                                      &parent->shead, chain);
234                                         atomic_set_int(&chain->flags,
235                                                        HAMMER2_CHAIN_DELETED);
236                                         /* parent refs dropped via recursion */
237                                 }
238                                 chain->parent = NULL;
239                                 if (parent)
240                                         lockmgr(&parent->lk, LK_RELEASE);
241                                 hammer2_chain_free(hmp, chain);
242                                 chain = parent;
243                                 /* recurse on parent */
244                         } else {
245                                 if (parent)
246                                         lockmgr(&parent->lk, LK_RELEASE);
247                                 /* retry the same chain */
248                         }
249                 } else {
250                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
251                                 /*
252                                  * Succeeded, count did not reach zero so
253                                  * cut out of the loop.
254                                  */
255                                 break;
256                         }
257                         /* retry the same chain */
258                 }
259         }
260 }
261
262 /*
263  * Ref and lock a chain element, acquiring its data with I/O if necessary,
264  * and specify how you would like the data to be resolved.
265  *
266  * Returns 0 on success or an error code if the data could not be acquired.
267  * The chain element is locked either way.
268  *
269  * The lock is allowed to recurse, multiple locking ops will aggregate
270  * the requested resolve types.  Once data is assigned it will not be
271  * removed until the last unlock.
272  *
273  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
274  *                         (typically used to avoid device/logical buffer
275  *                          aliasing for data)
276  *
277  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
278  *                         the INITIAL-create state (indirect blocks only).
279  *
280  *                         Do not resolve data elements for DATA chains.
281  *                         (typically used to avoid device/logical buffer
282  *                          aliasing for data)
283  *
284  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
285  *
286  *
287  * NOTE: Embedded elements (volume header, inodes) are always resolved
288  *       regardless.
289  *
290  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
291  *       element will instantiate and zero its buffer, and flush it on
292  *       release.
293  *
294  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
295  *       so as not to instantiate a device buffer, which could alias against
296  *       a logical file buffer.  However, if ALWAYS is specified the
297  *       device buffer will be instantiated anyway.
298  */
299 int
300 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
301 {
302         hammer2_blockref_t *bref;
303         hammer2_off_t pbase;
304         hammer2_off_t peof;
305         size_t boff;
306         size_t bbytes;
307         int error;
308         char *bdata;
309
310         /*
311          * Lock the element.  Under certain conditions this might end up
312          * being a recursive lock.
313          */
314         KKASSERT(chain->refs > 0);
315         atomic_add_int(&chain->refs, 1);
316         lockmgr(&chain->lk, LK_EXCLUSIVE);
317
318         /*
319          * If we already have a valid data pointer no further action is
320          * necessary.
321          */
322         if (chain->data)
323                 return (0);
324
325         /*
326          * Do we have to resolve the data?
327          */
328         switch(how) {
329         case HAMMER2_RESOLVE_NEVER:
330                 return(0);
331         case HAMMER2_RESOLVE_MAYBE:
332                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
333                         return(0);
334                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
335                         return(0);
336                 /* fall through */
337         case HAMMER2_RESOLVE_ALWAYS:
338                 break;
339         }
340
341         /*
342          * We must resolve to a device buffer, either by issuing I/O or
343          * by creating a zero-fill element.  We do not mark the buffer
344          * dirty when creating a zero-fill element (the hammer2_chain_modify()
345          * API must still be used to do that).
346          *
347          * The device buffer is variable-sized in powers of 2 down
348          * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
349          * chunk always contains buffers of the same size. (XXX)
350          *
351          * The minimum physical IO size may be larger than the variable
352          * block size.
353          */
354         bref = &chain->bref;
355
356         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
357                 bbytes = HAMMER2_MINIOSIZE;
358         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
359         peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
360         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
361         KKASSERT(pbase != 0);
362
363         /*
364          * The getblk() optimization can only be used on newly created
365          * elements if the physical block size matches the request.
366          */
367         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
368             chain->bytes == bbytes) {
369                 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
370                 error = 0;
371         } else if (hammer2_cluster_enable) {
372                 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
373                                      HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
374                                      &chain->bp);
375         } else {
376                 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
377         }
378
379         if (error) {
380                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
381                         (intmax_t)pbase, error);
382                 bqrelse(chain->bp);
383                 chain->bp = NULL;
384                 return (error);
385         }
386
387         /*
388          * Zero the data area if the chain is in the INITIAL-create state.
389          * Mark the buffer for bdwrite().
390          */
391         bdata = (char *)chain->bp->b_data + boff;
392         if (chain->flags & HAMMER2_CHAIN_INITIAL) {
393                 bzero(bdata, chain->bytes);
394                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
395         }
396
397         /*
398          * Setup the data pointer, either pointing it to an embedded data
399          * structure and copying the data from the buffer, or pointing it
400          * into the buffer.
401          *
402          * The buffer is not retained when copying to an embedded data
403          * structure in order to avoid potential deadlocks or recursions
404          * on the same physical buffer.
405          */
406         switch (bref->type) {
407         case HAMMER2_BREF_TYPE_VOLUME:
408                 /*
409                  * Copy data from bp to embedded buffer
410                  */
411                 panic("hammer2_chain_lock: called on unresolved volume header");
412 #if 0
413                 /* NOT YET */
414                 KKASSERT(pbase == 0);
415                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
416                 bcopy(bdata, &hmp->voldata, chain->bytes);
417                 chain->data = (void *)&hmp->voldata;
418                 bqrelse(chain->bp);
419                 chain->bp = NULL;
420 #endif
421                 break;
422         case HAMMER2_BREF_TYPE_INODE:
423                 /*
424                  * Copy data from bp to embedded buffer, do not retain the
425                  * device buffer.
426                  */
427                 bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
428                 chain->data = (void *)&chain->u.ip->ip_data;
429                 bqrelse(chain->bp);
430                 chain->bp = NULL;
431                 break;
432         case HAMMER2_BREF_TYPE_INDIRECT:
433         case HAMMER2_BREF_TYPE_DATA:
434         default:
435                 /*
436                  * Point data at the device buffer and leave bp intact.
437                  */
438                 chain->data = (void *)bdata;
439                 break;
440         }
441         return (0);
442 }
443
444 /*
445  * Unlock and deref a chain element.
446  *
447  * On the last lock release any non-embedded data (chain->bp) will be
448  * retired.
449  */
450 void
451 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
452 {
453         long *counterp;
454
455         /*
456          * Undo a recursive lock
457          *
458          * XXX shared locks not handled properly
459          */
460         if (lockcountnb(&chain->lk) > 1) {
461                 KKASSERT(chain->refs > 1);
462                 atomic_add_int(&chain->refs, -1);
463                 lockmgr(&chain->lk, LK_RELEASE);
464                 return;
465         }
466
467         /*
468          * Shortcut the case if the data is embedded or not resolved.
469          *
470          * Do NOT null-out pointers to embedded data (e.g. inode).
471          *
472          * The DIRTYBP flag is non-applicable in this situation and can
473          * be cleared to keep the flags state clean.
474          */
475         if (chain->bp == NULL) {
476                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
477                 lockmgr(&chain->lk, LK_RELEASE);
478                 hammer2_chain_drop(hmp, chain);
479                 return;
480         }
481
482         /*
483          * Statistics
484          */
485         if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
486                 ;
487         } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
488                 switch(chain->bref.type) {
489                 case HAMMER2_BREF_TYPE_DATA:
490                         counterp = &hammer2_ioa_file_write;
491                         break;
492                 case HAMMER2_BREF_TYPE_INODE:
493                         counterp = &hammer2_ioa_meta_write;
494                         break;
495                 case HAMMER2_BREF_TYPE_INDIRECT:
496                         counterp = &hammer2_ioa_indr_write;
497                         break;
498                 default:
499                         counterp = &hammer2_ioa_volu_write;
500                         break;
501                 }
502                 ++*counterp;
503         } else {
504                 switch(chain->bref.type) {
505                 case HAMMER2_BREF_TYPE_DATA:
506                         counterp = &hammer2_iod_file_write;
507                         break;
508                 case HAMMER2_BREF_TYPE_INODE:
509                         counterp = &hammer2_iod_meta_write;
510                         break;
511                 case HAMMER2_BREF_TYPE_INDIRECT:
512                         counterp = &hammer2_iod_indr_write;
513                         break;
514                 default:
515                         counterp = &hammer2_iod_volu_write;
516                         break;
517                 }
518                 ++*counterp;
519         }
520
521         /*
522          * Clean out the bp.
523          *
524          * If a device buffer was used for data be sure to destroy the
525          * buffer when we are done to avoid aliases (XXX what about the
526          * underlying VM pages?).
527          */
528         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
529                 chain->bp->b_flags |= B_RELBUF;
530
531         /*
532          * The DIRTYBP flag tracks whether we have to bdwrite() the buffer
533          * or not.  The flag will get re-set when chain_modify() is called,
534          * even if MODIFIED is already set, allowing the OS to retire the
535          * buffer independent of a hammer2 flus.
536          */
537         chain->data = NULL;
538         if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
539                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
540                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
541                         atomic_clear_int(&chain->flags,
542                                          HAMMER2_CHAIN_IOFLUSH);
543                         chain->bp->b_flags |= B_RELBUF;
544                         cluster_awrite(chain->bp);
545                 } else {
546                         chain->bp->b_flags |= B_CLUSTEROK;
547                         bdwrite(chain->bp);
548                 }
549         } else {
550                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
551                         atomic_clear_int(&chain->flags,
552                                          HAMMER2_CHAIN_IOFLUSH);
553                         chain->bp->b_flags |= B_RELBUF;
554                         brelse(chain->bp);
555                 } else {
556                         /* bp might still be dirty */
557                         bqrelse(chain->bp);
558                 }
559         }
560         chain->bp = NULL;
561         lockmgr(&chain->lk, LK_RELEASE);
562         hammer2_chain_drop(hmp, chain);
563 }
564
565 /*
566  * Resize the chain's physical storage allocation.  Chains can be resized
567  * smaller without reallocating the storage.  Resizing larger will reallocate
568  * the storage.
569  *
570  * Must be passed a locked chain.
571  *
572  * If you want the resize code to copy the data to the new block then the
573  * caller should lock the chain RESOLVE_MAYBE or RESOLVE_ALWAYS.
574  *
575  * If the caller already holds a logical buffer containing the data and
576  * intends to bdwrite() that buffer resolve with RESOLVE_NEVER.  The resize
577  * operation will then not copy the data.
578  *
579  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
580  * to avoid instantiating a device buffer that conflicts with the vnode
581  * data buffer.
582  *
583  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
584  */
585 void
586 hammer2_chain_resize(hammer2_inode_t *ip, hammer2_chain_t *chain,
587                      int nradix, int flags)
588 {
589         hammer2_mount_t *hmp = ip->hmp;
590         struct buf *nbp;
591         hammer2_off_t pbase;
592         size_t obytes;
593         size_t nbytes;
594         size_t bbytes;
595         int boff;
596         char *bdata;
597         int error;
598
599         /*
600          * Only data and indirect blocks can be resized for now
601          */
602         KKASSERT(chain != &hmp->vchain);
603         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
604                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
605
606         /*
607          * Nothing to do if the element is already the proper size
608          */
609         obytes = chain->bytes;
610         nbytes = 1U << nradix;
611         if (obytes == nbytes)
612                 return;
613
614         /*
615          * Set MODIFIED and add a chain ref to prevent destruction.  Both
616          * modified flags share the same ref.
617          *
618          * If the chain is already marked MODIFIED then we can safely
619          * return the previous allocation to the pool without having to
620          * worry about snapshots.
621          */
622         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
623                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
624                                               HAMMER2_CHAIN_MODIFY_TID);
625                 hammer2_chain_ref(hmp, chain);
626         } else {
627                 hammer2_freemap_free(hmp, chain->bref.data_off,
628                                      chain->bref.type);
629         }
630
631         /*
632          * Relocate the block, even if making it smaller (because different
633          * block sizes may be in different regions).
634          */
635         chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
636                                                      nbytes);
637         chain->bytes = nbytes;
638         ip->delta_dcount += (ssize_t)(nbytes - obytes); /* XXX atomic */
639
640         /*
641          * The device buffer may be larger than the allocation size.
642          */
643         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
644                 bbytes = HAMMER2_MINIOSIZE;
645         pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
646         boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
647
648         /*
649          * Only copy the data if resolved, otherwise the caller is
650          * responsible.
651          */
652         if (chain->bp) {
653                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
654                          chain->bref.type == HAMMER2_BREF_TYPE_DATA);
655                 KKASSERT(chain != &hmp->vchain);        /* safety */
656
657                 /*
658                  * The getblk() optimization can only be used if the
659                  * physical block size matches the request.
660                  */
661                 if (nbytes == bbytes) {
662                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
663                         error = 0;
664                 } else {
665                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
666                         KKASSERT(error == 0);
667                 }
668                 bdata = (char *)nbp->b_data + boff;
669
670                 if (nbytes < obytes) {
671                         bcopy(chain->data, bdata, nbytes);
672                 } else {
673                         bcopy(chain->data, bdata, obytes);
674                         bzero(bdata + obytes, nbytes - obytes);
675                 }
676
677                 /*
678                  * NOTE: The INITIAL state of the chain is left intact.
679                  *       We depend on hammer2_chain_modify() to do the
680                  *       right thing.
681                  *
682                  * NOTE: We set B_NOCACHE to throw away the previous bp and
683                  *       any VM backing store, even if it was dirty.
684                  *       Otherwise we run the risk of a logical/device
685                  *       conflict on reallocation.
686                  */
687                 chain->bp->b_flags |= B_RELBUF | B_NOCACHE;
688                 brelse(chain->bp);
689                 chain->bp = nbp;
690                 chain->data = (void *)bdata;
691                 hammer2_chain_modify(hmp, chain, 0);
692         }
693
694         /*
695          * Make sure the chain is marked MOVED and SUBMOD is set in the
696          * parent(s) so the adjustments are picked up by flush.
697          */
698         if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
699                 hammer2_chain_ref(hmp, chain);
700                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
701         }
702         hammer2_chain_parent_setsubmod(hmp, chain);
703 }
704
705 /*
706  * Convert a locked chain that was retrieved read-only to read-write.
707  *
708  * If not already marked modified a new physical block will be allocated
709  * and assigned to the bref.
710  *
711  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
712  *                   level or the COW operation will not work.
713  *
714  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
715  *                   run the data through the device buffers.
716  */
717 void
718 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
719 {
720         struct buf *nbp;
721         int error;
722         hammer2_off_t pbase;
723         size_t bbytes;
724         size_t boff;
725         void *bdata;
726
727         /*
728          * Tells flush that modify_tid must be updated, otherwise only
729          * mirror_tid is updated.  This is the default.
730          */
731         if ((flags & HAMMER2_MODIFY_NO_MODIFY_TID) == 0)
732                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFY_TID);
733
734         /*
735          * If the chain is already marked MODIFIED we can just return.
736          *
737          * However, it is possible that a prior lock/modify sequence
738          * retired the buffer.  During this lock/modify sequence MODIFIED
739          * may still be set but the buffer could wind up clean.  Since
740          * the caller is going to modify the buffer further we have to
741          * be sure that DIRTYBP is set again.
742          */
743         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
744                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
745                     chain->bp == NULL) {
746                         goto skip1;
747                 }
748                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
749                 return;
750         }
751
752         /*
753          * Set MODIFIED and add a chain ref to prevent destruction.  Both
754          * modified flags share the same ref.
755          */
756         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
757         hammer2_chain_ref(hmp, chain);
758
759         /*
760          * We must allocate the copy-on-write block.
761          *
762          * If the data is embedded no other action is required.
763          *
764          * If the data is not embedded we acquire and clear the
765          * new block.  If chain->data is not NULL we then do the
766          * copy-on-write.  chain->data will then be repointed to the new
767          * buffer and the old buffer will be released.
768          *
769          * For newly created elements with no prior allocation we go
770          * through the copy-on-write steps except without the copying part.
771          */
772         if (chain != &hmp->vchain) {
773                 if ((hammer2_debug & 0x0001) &&
774                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
775                         kprintf("Replace %d\n", chain->bytes);
776                 }
777                 chain->bref.data_off =
778                         hammer2_freemap_alloc(hmp, chain->bref.type,
779                                               chain->bytes);
780                 /* XXX failed allocation */
781         }
782
783         /*
784          * If data instantiation is optional and the chain has no current
785          * data association (typical for DATA and newly-created INDIRECT
786          * elements), don't instantiate the buffer now.
787          */
788         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
789                 goto skip2;
790
791 skip1:
792         /*
793          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
794          * written-out on unlock.  This bit is independent of the MODIFIED
795          * bit because the chain may still need meta-data adjustments done
796          * by virtue of MODIFIED for its parent, and the buffer can be
797          * flushed out (possibly multiple times) by the OS before that.
798          *
799          * Clearing the INITIAL flag (for indirect blocks) indicates that
800          * a zero-fill buffer has been instantiated.
801          */
802         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
803         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
804
805         /*
806          * We currently should never instantiate a device buffer for a
807          * data chain.
808          */
809         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
810
811         /*
812          * Execute COW operation
813          */
814         switch(chain->bref.type) {
815         case HAMMER2_BREF_TYPE_VOLUME:
816         case HAMMER2_BREF_TYPE_INODE:
817                 /*
818                  * The data is embedded, no copy-on-write operation is
819                  * needed.
820                  */
821                 KKASSERT(chain->bp == NULL);
822                 break;
823         case HAMMER2_BREF_TYPE_DATA:
824         case HAMMER2_BREF_TYPE_INDIRECT:
825                 /*
826                  * Perform the copy-on-write operation
827                  */
828                 KKASSERT(chain != &hmp->vchain);        /* safety */
829                 /*
830                  * The device buffer may be larger than the allocation size.
831                  */
832                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
833                         bbytes = HAMMER2_MINIOSIZE;
834                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
835                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
836
837                 /*
838                  * The getblk() optimization can only be used if the
839                  * physical block size matches the request.
840                  */
841                 if (chain->bytes == bbytes) {
842                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
843                         error = 0;
844                 } else {
845                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
846                         KKASSERT(error == 0);
847                 }
848                 bdata = (char *)nbp->b_data + boff;
849
850                 /*
851                  * Copy or zero-fill on write depending on whether
852                  * chain->data exists or not.
853                  */
854                 if (chain->data) {
855                         bcopy(chain->data, bdata, chain->bytes);
856                         KKASSERT(chain->bp != NULL);
857                 } else {
858                         bzero(bdata, chain->bytes);
859                 }
860                 if (chain->bp) {
861                         chain->bp->b_flags |= B_RELBUF;
862                         brelse(chain->bp);
863                 }
864                 chain->bp = nbp;
865                 chain->data = bdata;
866                 break;
867         default:
868                 panic("hammer2_chain_modify: illegal non-embedded type %d",
869                       chain->bref.type);
870                 break;
871
872         }
873 skip2:
874         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
875                 hammer2_chain_parent_setsubmod(hmp, chain);
876 }
877
878 /*
879  * Mark the volume as having been modified.  This short-cut version
880  * does not have to lock the volume's chain, which allows the ioctl
881  * code to make adjustments to connections without deadlocking.
882  */
883 void
884 hammer2_modify_volume(hammer2_mount_t *hmp)
885 {
886         hammer2_voldata_lock(hmp);
887         atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
888         hammer2_voldata_unlock(hmp);
889 }
890
891 /*
892  * Locate an in-memory chain.  The parent must be locked.  The in-memory
893  * chain is returned or NULL if no in-memory chain is present.
894  *
895  * NOTE: A chain on-media might exist for this index when NULL is returned.
896  */
897 hammer2_chain_t *
898 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
899 {
900         hammer2_chain_t dummy;
901         hammer2_chain_t *chain;
902
903         dummy.index = index;
904         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
905         return (chain);
906 }
907
908 /*
909  * Return a locked chain structure with all associated data acquired.
910  *
911  * Caller must lock the parent on call, the returned child will be locked.
912  */
913 hammer2_chain_t *
914 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
915                   int index, int flags)
916 {
917         hammer2_blockref_t *bref;
918         hammer2_chain_t *chain;
919         hammer2_chain_t dummy;
920         int how;
921
922         /*
923          * Figure out how to lock.  MAYBE can be used to optimized
924          * the initial-create state for indirect blocks.
925          */
926         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
927                 how = HAMMER2_RESOLVE_NEVER;
928         else
929                 how = HAMMER2_RESOLVE_MAYBE;
930
931         /*
932          * First see if we have a (possibly modified) chain element cached
933          * for this (parent, index).  Acquire the data if necessary.
934          *
935          * If chain->data is non-NULL the chain should already be marked
936          * modified.
937          */
938         dummy.index = index;
939         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
940         if (chain) {
941                 if (flags & HAMMER2_LOOKUP_NOLOCK)
942                         hammer2_chain_ref(hmp, chain);
943                 else
944                         hammer2_chain_lock(hmp, chain, how);
945                 return(chain);
946         }
947
948         /*
949          * the get function must always succeed, panic if there's no
950          * data to index.
951          */
952         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
953                 panic("hammer2_chain_get: Missing bref(1)");
954                 /* NOT REACHED */
955         }
956
957         /*
958          * Otherwise lookup the bref and issue I/O (switch on the parent)
959          */
960         switch(parent->bref.type) {
961         case HAMMER2_BREF_TYPE_INODE:
962                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
963                 bref = &parent->data->ipdata.u.blockset.blockref[index];
964                 break;
965         case HAMMER2_BREF_TYPE_INDIRECT:
966                 KKASSERT(parent->data != NULL);
967                 KKASSERT(index >= 0 &&
968                          index < parent->bytes / sizeof(hammer2_blockref_t));
969                 bref = &parent->data->npdata.blockref[index];
970                 break;
971         case HAMMER2_BREF_TYPE_VOLUME:
972                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
973                 bref = &hmp->voldata.sroot_blockset.blockref[index];
974                 break;
975         default:
976                 bref = NULL;
977                 panic("hammer2_chain_get: unrecognized blockref type: %d",
978                       parent->bref.type);
979         }
980         if (bref->type == 0) {
981                 panic("hammer2_chain_get: Missing bref(2)");
982                 /* NOT REACHED */
983         }
984
985         /*
986          * Allocate a chain structure representing the existing media
987          * entry.
988          *
989          * The locking operation we do later will issue I/O to read it.
990          */
991         chain = hammer2_chain_alloc(hmp, bref);
992
993         /*
994          * Link the chain into its parent.  Caller is expected to hold an
995          * exclusive lock on the parent.
996          */
997         chain->parent = parent;
998         chain->index = index;
999         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1000                 panic("hammer2_chain_link: collision");
1001         KKASSERT(parent->refs > 0);
1002         atomic_add_int(&parent->refs, 1);       /* for splay entry */
1003
1004         /*
1005          * Additional linkage for inodes.  Reuse the parent pointer to
1006          * find the parent directory.
1007          */
1008         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
1009                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1010                         parent = parent->parent;
1011                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
1012                         chain->u.ip->pip = parent->u.ip;
1013                         chain->u.ip->pmp = parent->u.ip->pmp;
1014                         chain->u.ip->depth = parent->u.ip->depth + 1;
1015                 }
1016         }
1017
1018         /*
1019          * Our new chain structure has already been referenced and locked
1020          * but the lock code handles the I/O so call it to resolve the data.
1021          * Then release one of our two exclusive locks.
1022          *
1023          * If NOLOCK is set the release will release the one-and-only lock.
1024          */
1025         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
1026                 hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
1027                 hammer2_chain_drop(hmp, chain);         /* excess ref */
1028         }
1029         lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
1030
1031         return (chain);
1032 }
1033
1034 /*
1035  * Locate any key between key_beg and key_end inclusive.  (*parentp)
1036  * typically points to an inode but can also point to a related indirect
1037  * block and this function will recurse upwards and find the inode again.
1038  *
1039  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
1040  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
1041  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
1042  *
1043  * (*parentp) must be exclusively locked and referenced and can be an inode
1044  * or an existing indirect block within the inode.
1045  *
1046  * On return (*parentp) will be modified to point at the deepest parent chain
1047  * element encountered during the search, as a helper for an insertion or
1048  * deletion.   The new (*parentp) will be locked and referenced and the old
1049  * will be unlocked and dereferenced (no change if they are both the same).
1050  *
1051  * The matching chain will be returned exclusively locked and referenced.
1052  *
1053  * NULL is returned if no match was found, but (*parentp) will still
1054  * potentially be adjusted.
1055  *
1056  * This function will also recurse up the chain if the key is not within the
1057  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
1058  * can simply allow (*parentp) to float inside the loop.
1059  */
1060 hammer2_chain_t *
1061 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1062                      hammer2_key_t key_beg, hammer2_key_t key_end,
1063                      int flags)
1064 {
1065         hammer2_chain_t *parent;
1066         hammer2_chain_t *chain;
1067         hammer2_chain_t *tmp;
1068         hammer2_blockref_t *base;
1069         hammer2_blockref_t *bref;
1070         hammer2_key_t scan_beg;
1071         hammer2_key_t scan_end;
1072         int count = 0;
1073         int i;
1074
1075         /*
1076          * Recurse (*parentp) upward if necessary until the parent completely
1077          * encloses the key range or we hit the inode.
1078          */
1079         parent = *parentp;
1080         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1081                 scan_beg = parent->bref.key;
1082                 scan_end = scan_beg +
1083                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1084                 if (key_beg >= scan_beg && key_end <= scan_end)
1085                         break;
1086                 hammer2_chain_ref(hmp, parent);         /* ref old parent */
1087                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1088                 parent = parent->parent;
1089                                                         /* lock new parent */
1090                 hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1091                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
1092                 *parentp = parent;                      /* new parent */
1093         }
1094
1095 again:
1096         /*
1097          * Locate the blockref array.  Currently we do a fully associative
1098          * search through the array.
1099          */
1100         switch(parent->bref.type) {
1101         case HAMMER2_BREF_TYPE_INODE:
1102                 /*
1103                  * Special shortcut for embedded data returns the inode
1104                  * itself.  Callers must detect this condition and access
1105                  * the embedded data (the strategy code does this for us).
1106                  *
1107                  * This is only applicable to regular files and softlinks.
1108                  */
1109                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1110                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1111                                 hammer2_chain_ref(hmp, parent);
1112                         else
1113                                 hammer2_chain_lock(hmp, parent,
1114                                                    HAMMER2_RESOLVE_ALWAYS);
1115                         return (parent);
1116                 }
1117                 base = &parent->data->ipdata.u.blockset.blockref[0];
1118                 count = HAMMER2_SET_COUNT;
1119                 break;
1120         case HAMMER2_BREF_TYPE_INDIRECT:
1121                 /*
1122                  * Optimize indirect blocks in the INITIAL state to avoid
1123                  * I/O.
1124                  */
1125                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1126                         base = NULL;
1127                 } else {
1128                         if (parent->data == NULL)
1129                                 panic("parent->data is NULL");
1130                         base = &parent->data->npdata.blockref[0];
1131                 }
1132                 count = parent->bytes / sizeof(hammer2_blockref_t);
1133                 break;
1134         case HAMMER2_BREF_TYPE_VOLUME:
1135                 base = &hmp->voldata.sroot_blockset.blockref[0];
1136                 count = HAMMER2_SET_COUNT;
1137                 break;
1138         default:
1139                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1140                       parent->bref.type);
1141                 base = NULL;    /* safety */
1142                 count = 0;      /* safety */
1143         }
1144
1145         /*
1146          * If the element and key overlap we use the element.
1147          */
1148         bref = NULL;
1149         for (i = 0; i < count; ++i) {
1150                 tmp = hammer2_chain_find(hmp, parent, i);
1151                 if (tmp) {
1152                         bref = &tmp->bref;
1153                         KKASSERT(bref->type != 0);
1154                 } else if (base == NULL || base[i].type == 0) {
1155                         continue;
1156                 } else {
1157                         bref = &base[i];
1158                 }
1159                 scan_beg = bref->key;
1160                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1161                 if (key_beg <= scan_end && key_end >= scan_beg)
1162                         break;
1163         }
1164         if (i == count) {
1165                 if (key_beg == key_end)
1166                         return (NULL);
1167                 return (hammer2_chain_next(hmp, parentp, NULL,
1168                                            key_beg, key_end, flags));
1169         }
1170
1171         /*
1172          * Acquire the new chain element.  If the chain element is an
1173          * indirect block we must search recursively.
1174          */
1175         chain = hammer2_chain_get(hmp, parent, i, flags);
1176         if (chain == NULL)
1177                 return (NULL);
1178
1179         /*
1180          * If the chain element is an indirect block it becomes the new
1181          * parent and we loop on it.
1182          *
1183          * The parent always has to be locked with at least RESOLVE_MAYBE,
1184          * so it might need a fixup if the caller passed incompatible flags.
1185          */
1186         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1187                 hammer2_chain_unlock(hmp, parent);
1188                 *parentp = parent = chain;
1189                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1190                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1191                         hammer2_chain_drop(hmp, chain); /* excess ref */
1192                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1193                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1194                         hammer2_chain_unlock(hmp, chain);
1195                 }
1196                 goto again;
1197         }
1198
1199         /*
1200          * All done, return chain
1201          */
1202         return (chain);
1203 }
1204
1205 /*
1206  * After having issued a lookup we can iterate all matching keys.
1207  *
1208  * If chain is non-NULL we continue the iteration from just after it's index.
1209  *
1210  * If chain is NULL we assume the parent was exhausted and continue the
1211  * iteration at the next parent.
1212  *
1213  * parent must be locked on entry and remains locked throughout.  chain's
1214  * lock status must match flags.
1215  */
1216 hammer2_chain_t *
1217 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1218                    hammer2_chain_t *chain,
1219                    hammer2_key_t key_beg, hammer2_key_t key_end,
1220                    int flags)
1221 {
1222         hammer2_chain_t *parent;
1223         hammer2_chain_t *tmp;
1224         hammer2_blockref_t *base;
1225         hammer2_blockref_t *bref;
1226         hammer2_key_t scan_beg;
1227         hammer2_key_t scan_end;
1228         int i;
1229         int count;
1230
1231         parent = *parentp;
1232
1233 again:
1234         /*
1235          * Calculate the next index and recalculate the parent if necessary.
1236          */
1237         if (chain) {
1238                 /*
1239                  * Continue iteration within current parent.  If not NULL
1240                  * the passed-in chain may or may not be locked, based on
1241                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1242                  * or a prior next).
1243                  */
1244                 i = chain->index + 1;
1245                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1246                         hammer2_chain_drop(hmp, chain);
1247                 else
1248                         hammer2_chain_unlock(hmp, chain);
1249
1250                 /*
1251                  * Any scan where the lookup returned degenerate data embedded
1252                  * in the inode has an invalid index and must terminate.
1253                  */
1254                 if (chain == parent)
1255                         return(NULL);
1256                 chain = NULL;
1257         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
1258                 /*
1259                  * We reached the end of the iteration.
1260                  */
1261                 return (NULL);
1262         } else {
1263                 /*
1264                  * Continue iteration with next parent unless the current
1265                  * parent covers the range.
1266                  */
1267                 hammer2_chain_t *nparent;
1268
1269                 scan_beg = parent->bref.key;
1270                 scan_end = scan_beg +
1271                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1272                 if (key_beg >= scan_beg && key_end <= scan_end)
1273                         return (NULL);
1274
1275                 i = parent->index + 1;
1276                 nparent = parent->parent;
1277                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
1278                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1279                                                         /* lock new parent */
1280                 hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
1281                 hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
1282                 *parentp = parent = nparent;
1283         }
1284
1285 again2:
1286         /*
1287          * Locate the blockref array.  Currently we do a fully associative
1288          * search through the array.
1289          */
1290         switch(parent->bref.type) {
1291         case HAMMER2_BREF_TYPE_INODE:
1292                 base = &parent->data->ipdata.u.blockset.blockref[0];
1293                 count = HAMMER2_SET_COUNT;
1294                 break;
1295         case HAMMER2_BREF_TYPE_INDIRECT:
1296                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1297                         base = NULL;
1298                 } else {
1299                         KKASSERT(parent->data != NULL);
1300                         base = &parent->data->npdata.blockref[0];
1301                 }
1302                 count = parent->bytes / sizeof(hammer2_blockref_t);
1303                 break;
1304         case HAMMER2_BREF_TYPE_VOLUME:
1305                 base = &hmp->voldata.sroot_blockset.blockref[0];
1306                 count = HAMMER2_SET_COUNT;
1307                 break;
1308         default:
1309                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1310                       parent->bref.type);
1311                 base = NULL;    /* safety */
1312                 count = 0;      /* safety */
1313                 break;
1314         }
1315         KKASSERT(i <= count);
1316
1317         /*
1318          * Look for the key.  If we are unable to find a match and an exact
1319          * match was requested we return NULL.  If a range was requested we
1320          * run hammer2_chain_next() to iterate.
1321          */
1322         bref = NULL;
1323         while (i < count) {
1324                 tmp = hammer2_chain_find(hmp, parent, i);
1325                 if (tmp) {
1326                         bref = &tmp->bref;
1327                 } else if (base == NULL || base[i].type == 0) {
1328                         ++i;
1329                         continue;
1330                 } else {
1331                         bref = &base[i];
1332                 }
1333                 scan_beg = bref->key;
1334                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1335                 if (key_beg <= scan_end && key_end >= scan_beg)
1336                         break;
1337                 ++i;
1338         }
1339
1340         /*
1341          * If we couldn't find a match recurse up a parent to continue the
1342          * search.
1343          */
1344         if (i == count)
1345                 goto again;
1346
1347         /*
1348          * Acquire the new chain element.  If the chain element is an
1349          * indirect block we must search recursively.
1350          */
1351         chain = hammer2_chain_get(hmp, parent, i, flags);
1352         if (chain == NULL)
1353                 return (NULL);
1354
1355         /*
1356          * If the chain element is an indirect block it becomes the new
1357          * parent and we loop on it.
1358          *
1359          * The parent always has to be locked with at least RESOLVE_MAYBE,
1360          * so it might need a fixup if the caller passed incompatible flags.
1361          */
1362         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1363                 hammer2_chain_unlock(hmp, parent);
1364                 *parentp = parent = chain;
1365                 chain = NULL;
1366                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1367                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1368                         hammer2_chain_drop(hmp, parent);        /* excess ref */
1369                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1370                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1371                         hammer2_chain_unlock(hmp, parent);
1372                 }
1373                 i = 0;
1374                 goto again2;
1375         }
1376
1377         /*
1378          * All done, return chain
1379          */
1380         return (chain);
1381 }
1382
1383 /*
1384  * Create and return a new hammer2 system memory structure of the specified
1385  * key, type and size and insert it RELATIVE TO (PARENT).
1386  *
1387  * (parent) is typically either an inode or an indirect block, acquired
1388  * acquired as a side effect of issuing a prior failed lookup.  parent
1389  * must be locked and held.  Do not pass the inode chain to this function
1390  * unless that is the chain returned by the failed lookup.
1391  *
1392  * Non-indirect types will automatically allocate indirect blocks as required
1393  * if the new item does not fit in the current (parent).
1394  *
1395  * Indirect types will move a portion of the existing blockref array in
1396  * (parent) into the new indirect type and then use one of the free slots
1397  * to emplace the new indirect type.
1398  *
1399  * A new locked, referenced chain element is returned of the specified type.
1400  * The element may or may not have a data area associated with it:
1401  *
1402  *      VOLUME          not allowed here
1403  *      INODE           embedded data are will be set-up
1404  *      INDIRECT        not allowed here
1405  *      DATA            no data area will be set-up (caller is expected
1406  *                      to have logical buffers, we don't want to alias
1407  *                      the data onto device buffers!).
1408  */
1409 hammer2_chain_t *
1410 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1411                      hammer2_chain_t *chain,
1412                      hammer2_key_t key, int keybits, int type, size_t bytes)
1413 {
1414         hammer2_blockref_t dummy;
1415         hammer2_blockref_t *base;
1416         hammer2_chain_t dummy_chain;
1417         int unlock_parent = 0;
1418         int allocated = 0;
1419         int count;
1420         int i;
1421
1422         if (chain == NULL) {
1423                 /*
1424                  * First allocate media space and construct the dummy bref,
1425                  * then allocate the in-memory chain structure.
1426                  */
1427                 bzero(&dummy, sizeof(dummy));
1428                 dummy.type = type;
1429                 dummy.key = key;
1430                 dummy.keybits = keybits;
1431                 dummy.data_off = hammer2_bytes_to_radix(bytes);
1432                 chain = hammer2_chain_alloc(hmp, &dummy);
1433                 allocated = 1;
1434
1435                 /*
1436                  * We do NOT set INITIAL here (yet).  INITIAL is only
1437                  * used for indirect blocks.
1438                  *
1439                  * Recalculate bytes to reflect the actual media block
1440                  * allocation.
1441                  */
1442                 bytes = (hammer2_off_t)1 <<
1443                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1444                 chain->bytes = bytes;
1445
1446                 switch(type) {
1447                 case HAMMER2_BREF_TYPE_VOLUME:
1448                         panic("hammer2_chain_create: called with volume type");
1449                         break;
1450                 case HAMMER2_BREF_TYPE_INODE:
1451                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1452                         chain->data = (void *)&chain->u.ip->ip_data;
1453                         break;
1454                 case HAMMER2_BREF_TYPE_INDIRECT:
1455                         panic("hammer2_chain_create: cannot be used to"
1456                               "create indirect block");
1457                         break;
1458                 case HAMMER2_BREF_TYPE_DATA:
1459                 default:
1460                         /* leave chain->data NULL */
1461                         KKASSERT(chain->data == NULL);
1462                         break;
1463                 }
1464         } else {
1465                 /*
1466                  * Potentially update the chain's key/keybits.
1467                  */
1468                 chain->bref.key = key;
1469                 chain->bref.keybits = keybits;
1470         }
1471
1472 again:
1473         /*
1474          * Locate a free blockref in the parent's array
1475          */
1476         switch(parent->bref.type) {
1477         case HAMMER2_BREF_TYPE_INODE:
1478                 KKASSERT((parent->u.ip->ip_data.op_flags &
1479                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
1480                 KKASSERT(parent->data != NULL);
1481                 base = &parent->data->ipdata.u.blockset.blockref[0];
1482                 count = HAMMER2_SET_COUNT;
1483                 break;
1484         case HAMMER2_BREF_TYPE_INDIRECT:
1485                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1486                         base = NULL;
1487                 } else {
1488                         KKASSERT(parent->data != NULL);
1489                         base = &parent->data->npdata.blockref[0];
1490                 }
1491                 count = parent->bytes / sizeof(hammer2_blockref_t);
1492                 break;
1493         case HAMMER2_BREF_TYPE_VOLUME:
1494                 KKASSERT(parent->data != NULL);
1495                 base = &hmp->voldata.sroot_blockset.blockref[0];
1496                 count = HAMMER2_SET_COUNT;
1497                 break;
1498         default:
1499                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1500                       parent->bref.type);
1501                 count = 0;
1502                 break;
1503         }
1504
1505         /*
1506          * Scan for an unallocated bref, also skipping any slots occupied
1507          * by in-memory chain elements that may not yet have been updated
1508          * in the parent's bref array.
1509          */
1510         bzero(&dummy_chain, sizeof(dummy_chain));
1511         for (i = 0; i < count; ++i) {
1512                 if (base == NULL) {
1513                         dummy_chain.index = i;
1514                         if (SPLAY_FIND(hammer2_chain_splay,
1515                                        &parent->shead, &dummy_chain) == NULL) {
1516                                 break;
1517                         }
1518                 } else if (base[i].type == 0) {
1519                         dummy_chain.index = i;
1520                         if (SPLAY_FIND(hammer2_chain_splay,
1521                                        &parent->shead, &dummy_chain) == NULL) {
1522                                 break;
1523                         }
1524                 }
1525         }
1526
1527         /*
1528          * If no free blockref could be found we must create an indirect
1529          * block and move a number of blockrefs into it.  With the parent
1530          * locked we can safely lock each child in order to move it without
1531          * causing a deadlock.
1532          *
1533          * This may return the new indirect block or the old parent depending
1534          * on where the key falls.
1535          */
1536         if (i == count) {
1537                 hammer2_chain_t *nparent;
1538
1539                 nparent = hammer2_chain_create_indirect(hmp, parent,
1540                                                         key, keybits);
1541                 if (nparent == NULL) {
1542                         if (allocated)
1543                                 hammer2_chain_free(hmp, chain);
1544                         chain = NULL;
1545                         goto done;
1546                 }
1547                 if (parent != nparent) {
1548                         if (unlock_parent)
1549                                 hammer2_chain_unlock(hmp, parent);
1550                         parent = nparent;
1551                         unlock_parent = 1;
1552                 }
1553                 goto again;
1554         }
1555
1556         /*
1557          * Link the chain into its parent.  Later on we will have to set
1558          * the MOVED bit in situations where we don't mark the new chain
1559          * as being modified.
1560          */
1561         if (chain->parent != NULL)
1562                 panic("hammer2: hammer2_chain_create: chain already connected");
1563         KKASSERT(chain->parent == NULL);
1564         chain->parent = parent;
1565         chain->index = i;
1566         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1567                 panic("hammer2_chain_link: collision");
1568         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1569         KKASSERT(parent->refs > 0);
1570         atomic_add_int(&parent->refs, 1);
1571
1572         /*
1573          * Additional linkage for inodes.  Reuse the parent pointer to
1574          * find the parent directory.
1575          *
1576          * Cumulative adjustments are inherited on [re]attach and will
1577          * propagate up the tree on the next flush.
1578          */
1579         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1580                 hammer2_chain_t *scan = parent;
1581                 hammer2_inode_t *ip = chain->u.ip;
1582
1583                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1584                         scan = scan->parent;
1585                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
1586                         ip->pip = scan->u.ip;
1587                         ip->pmp = scan->u.ip->pmp;
1588                         ip->depth = scan->u.ip->depth + 1;
1589                         ip->pip->delta_icount += ip->ip_data.inode_count;
1590                         ip->pip->delta_dcount += ip->ip_data.data_count;
1591                         ++ip->pip->delta_icount;
1592                 }
1593         }
1594
1595         /*
1596          * (allocated) indicates that this is a newly-created chain element
1597          * rather than a renamed chain element.  In this situation we want
1598          * to place the chain element in the MODIFIED state.
1599          *
1600          * The data area will be set up as follows:
1601          *
1602          *      VOLUME          not allowed here.
1603          *
1604          *      INODE           embedded data are will be set-up.
1605          *
1606          *      INDIRECT        not allowed here.
1607          *
1608          *      DATA            no data area will be set-up (caller is expected
1609          *                      to have logical buffers, we don't want to alias
1610          *                      the data onto device buffers!).
1611          */
1612         if (allocated) {
1613                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1614                         hammer2_chain_modify(hmp, chain,
1615                                              HAMMER2_MODIFY_OPTDATA);
1616                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1617                         /* not supported in this function */
1618                         panic("hammer2_chain_create: bad type");
1619                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1620                         hammer2_chain_modify(hmp, chain,
1621                                              HAMMER2_MODIFY_OPTDATA);
1622                 } else {
1623                         hammer2_chain_modify(hmp, chain, 0);
1624                 }
1625         } else {
1626                 /*
1627                  * When reconnecting inodes we have to call setsubmod()
1628                  * to ensure that its state propagates up the newly
1629                  * connected parent.
1630                  *
1631                  * Make sure MOVED is set but do not update bref_flush.  If
1632                  * the chain is undergoing modification bref_flush will be
1633                  * updated when it gets flushed.  If it is not then the
1634                  * bref may not have been flushed yet and we do not want to
1635                  * set MODIFIED here as this could result in unnecessary
1636                  * reallocations.
1637                  */
1638                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1639                         hammer2_chain_ref(hmp, chain);
1640                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1641                 }
1642                 hammer2_chain_parent_setsubmod(hmp, chain);
1643         }
1644
1645 done:
1646         if (unlock_parent)
1647                 hammer2_chain_unlock(hmp, parent);
1648         return (chain);
1649 }
1650
1651 /*
1652  * Create an indirect block that covers one or more of the elements in the
1653  * current parent.  Either returns the existing parent with no locking or
1654  * ref changes or returns the new indirect block locked and referenced
1655  * and leaving the original parent lock/ref intact as well.
1656  *
1657  * The returned chain depends on where the specified key falls.
1658  *
1659  * The key/keybits for the indirect mode only needs to follow three rules:
1660  *
1661  * (1) That all elements underneath it fit within its key space and
1662  *
1663  * (2) That all elements outside it are outside its key space.
1664  *
1665  * (3) When creating the new indirect block any elements in the current
1666  *     parent that fit within the new indirect block's keyspace must be
1667  *     moved into the new indirect block.
1668  *
1669  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1670  *     keyspace the the current parent, but lookup/iteration rules will
1671  *     ensure (and must ensure) that rule (2) for all parents leading up
1672  *     to the nearest inode or the root volume header is adhered to.  This
1673  *     is accomplished by always recursing through matching keyspaces in
1674  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1675  *
1676  * The current implementation calculates the current worst-case keyspace by
1677  * iterating the current parent and then divides it into two halves, choosing
1678  * whichever half has the most elements (not necessarily the half containing
1679  * the requested key).
1680  *
1681  * We can also opt to use the half with the least number of elements.  This
1682  * causes lower-numbered keys (aka logical file offsets) to recurse through
1683  * fewer indirect blocks and higher-numbered keys to recurse through more.
1684  * This also has the risk of not moving enough elements to the new indirect
1685  * block and being forced to create several indirect blocks before the element
1686  * can be inserted.
1687  */
1688 static
1689 hammer2_chain_t *
1690 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1691                               hammer2_key_t create_key, int create_bits)
1692 {
1693         hammer2_blockref_t *base;
1694         hammer2_blockref_t *bref;
1695         hammer2_chain_t *chain;
1696         hammer2_chain_t *ichain;
1697         hammer2_chain_t dummy;
1698         hammer2_key_t key = create_key;
1699         int keybits = create_bits;
1700         int locount = 0;
1701         int hicount = 0;
1702         int count;
1703         int nbytes;
1704         int i;
1705
1706         /*
1707          * Calculate the base blockref pointer or NULL if the chain
1708          * is known to be empty.  We need to calculate the array count
1709          * for SPLAY lookups either way.
1710          */
1711         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1712         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1713                 base = NULL;
1714
1715                 switch(parent->bref.type) {
1716                 case HAMMER2_BREF_TYPE_INODE:
1717                         count = HAMMER2_SET_COUNT;
1718                         break;
1719                 case HAMMER2_BREF_TYPE_INDIRECT:
1720                         count = parent->bytes / sizeof(hammer2_blockref_t);
1721                         break;
1722                 case HAMMER2_BREF_TYPE_VOLUME:
1723                         count = HAMMER2_SET_COUNT;
1724                         break;
1725                 default:
1726                         panic("hammer2_chain_create_indirect: "
1727                               "unrecognized blockref type: %d",
1728                               parent->bref.type);
1729                         count = 0;
1730                         break;
1731                 }
1732         } else {
1733                 switch(parent->bref.type) {
1734                 case HAMMER2_BREF_TYPE_INODE:
1735                         base = &parent->data->ipdata.u.blockset.blockref[0];
1736                         count = HAMMER2_SET_COUNT;
1737                         break;
1738                 case HAMMER2_BREF_TYPE_INDIRECT:
1739                         base = &parent->data->npdata.blockref[0];
1740                         count = parent->bytes / sizeof(hammer2_blockref_t);
1741                         break;
1742                 case HAMMER2_BREF_TYPE_VOLUME:
1743                         base = &hmp->voldata.sroot_blockset.blockref[0];
1744                         count = HAMMER2_SET_COUNT;
1745                         break;
1746                 default:
1747                         panic("hammer2_chain_create_indirect: "
1748                               "unrecognized blockref type: %d",
1749                               parent->bref.type);
1750                         count = 0;
1751                         break;
1752                 }
1753         }
1754
1755         /*
1756          * Scan for an unallocated bref, also skipping any slots occupied
1757          * by in-memory chain elements which may not yet have been updated
1758          * in the parent's bref array.
1759          */
1760         bzero(&dummy, sizeof(dummy));
1761         for (i = 0; i < count; ++i) {
1762                 int nkeybits;
1763
1764                 dummy.index = i;
1765                 chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
1766                 if (chain) {
1767                         bref = &chain->bref;
1768                 } else if (base && base[i].type) {
1769                         bref = &base[i];
1770                 } else {
1771                         continue;
1772                 }
1773
1774                 /*
1775                  * Expand our calculated key range (key, keybits) to fit
1776                  * the scanned key.  nkeybits represents the full range
1777                  * that we will later cut in half (two halves @ nkeybits - 1).
1778                  */
1779                 nkeybits = keybits;
1780                 if (nkeybits < bref->keybits)
1781                         nkeybits = bref->keybits;
1782                 while (nkeybits < 64 &&
1783                        (~(((hammer2_key_t)1 << nkeybits) - 1) &
1784                         (key ^ bref->key)) != 0) {
1785                         ++nkeybits;
1786                 }
1787
1788                 /*
1789                  * If the new key range is larger we have to determine
1790                  * which side of the new key range the existing keys fall
1791                  * under by checking the high bit, then collapsing the
1792                  * locount into the hicount or vise-versa.
1793                  */
1794                 if (keybits != nkeybits) {
1795                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1796                                 hicount += locount;
1797                                 locount = 0;
1798                         } else {
1799                                 locount += hicount;
1800                                 hicount = 0;
1801                         }
1802                         keybits = nkeybits;
1803                 }
1804
1805                 /*
1806                  * The newly scanned key will be in the lower half or the
1807                  * higher half of the (new) key range.
1808                  */
1809                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1810                         ++hicount;
1811                 else
1812                         ++locount;
1813         }
1814
1815         /*
1816          * Adjust keybits to represent half of the full range calculated
1817          * above (radix 63 max)
1818          */
1819         --keybits;
1820
1821         /*
1822          * Select whichever half contains the most elements.  Theoretically
1823          * we can select either side as long as it contains at least one
1824          * element (in order to ensure that a free slot is present to hold
1825          * the indirect block).
1826          */
1827         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1828         if (hammer2_indirect_optimize) {
1829                 /*
1830                  * Insert node for least number of keys, this will arrange
1831                  * the first few blocks of a large file or the first few
1832                  * inodes in a directory with fewer indirect blocks when
1833                  * created linearly.
1834                  */
1835                 if (hicount < locount && hicount != 0)
1836                         key |= (hammer2_key_t)1 << keybits;
1837                 else
1838                         key &= ~(hammer2_key_t)1 << keybits;
1839         } else {
1840                 /*
1841                  * Insert node for most number of keys, best for heavily
1842                  * fragmented files.
1843                  */
1844                 if (hicount > locount)
1845                         key |= (hammer2_key_t)1 << keybits;
1846                 else
1847                         key &= ~(hammer2_key_t)1 << keybits;
1848         }
1849
1850         /*
1851          * How big should our new indirect block be?  It has to be at least
1852          * as large as its parent.
1853          */
1854         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
1855                 nbytes = HAMMER2_IND_BYTES_MIN;
1856         else
1857                 nbytes = HAMMER2_IND_BYTES_MAX;
1858         if (nbytes < count * sizeof(hammer2_blockref_t))
1859                 nbytes = count * sizeof(hammer2_blockref_t);
1860
1861         /*
1862          * Ok, create our new indirect block
1863          */
1864         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1865         dummy.bref.key = key;
1866         dummy.bref.keybits = keybits;
1867         dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
1868         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1869         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
1870
1871         /*
1872          * Iterate the original parent and move the matching brefs into
1873          * the new indirect block.
1874          */
1875         for (i = 0; i < count; ++i) {
1876                 /*
1877                  * For keying purposes access the bref from the media or
1878                  * from our in-memory cache.  In cases where the in-memory
1879                  * cache overrides the media the keyrefs will be the same
1880                  * anyway so we can avoid checking the cache when the media
1881                  * has a key.
1882                  */
1883                 dummy.index = i;
1884                 chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
1885                 if (chain) {
1886                         bref = &chain->bref;
1887                 } else if (base && base[i].type) {
1888                         bref = &base[i];
1889                 } else {
1890                         if (ichain->index < 0)
1891                                 ichain->index = i;
1892                         continue;
1893                 }
1894
1895                 /*
1896                  * Skip keys not in the chosen half (low or high), only bit
1897                  * (keybits - 1) needs to be compared but for safety we
1898                  * will compare all msb bits plus that bit again.
1899                  */
1900                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1901                     (key ^ bref->key)) != 0) {
1902                         continue;
1903                 }
1904
1905                 /*
1906                  * This element is being moved from the parent, its slot
1907                  * is available for our new indirect block.
1908                  */
1909                 if (ichain->index < 0)
1910                         ichain->index = i;
1911
1912                 /*
1913                  * Load the new indirect block by acquiring or allocating
1914                  * the related chain entries, then simply move them to the
1915                  * new parent (ichain).
1916                  *
1917                  * When adjusting the parent/child relationship we must
1918                  * set the MOVED bit but we do NOT update bref_flush
1919                  * because otherwise we might synchronize a bref that has
1920                  * not yet been flushed.  We depend on chain's bref_flush
1921                  * either being correct or the chain being in a MODIFIED
1922                  * state.
1923                  *
1924                  * We do not want to set MODIFIED here as this would result
1925                  * in unnecessary reallocations.
1926                  *
1927                  * We must still set SUBMODIFIED in the parent but we do
1928                  * that after the loop.
1929                  *
1930                  * XXX we really need a lock here but we don't need the
1931                  *     data.  NODATA feature needed.
1932                  */
1933                 chain = hammer2_chain_get(hmp, parent, i,
1934                                           HAMMER2_LOOKUP_NODATA);
1935                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1936                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1937                         panic("hammer2_chain_create_indirect: collision");
1938                 chain->parent = ichain;
1939                 if (base)
1940                         bzero(&base[i], sizeof(base[i]));
1941                 atomic_add_int(&parent->refs, -1);
1942                 atomic_add_int(&ichain->refs, 1);
1943                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1944                         hammer2_chain_ref(hmp, chain);
1945                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1946                 }
1947                 hammer2_chain_unlock(hmp, chain);
1948                 KKASSERT(parent->refs > 0);
1949                 chain = NULL;
1950         }
1951
1952         /*
1953          * Insert the new indirect block into the parent now that we've
1954          * cleared out some entries in the parent.  We calculated a good
1955          * insertion index in the loop above (ichain->index).
1956          *
1957          * We don't have to set MOVED here because we mark ichain modified
1958          * down below (so the normal modified -> flush -> set-moved sequence
1959          * applies).
1960          */
1961         KKASSERT(ichain->index >= 0);
1962         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1963                 panic("hammer2_chain_create_indirect: ichain insertion");
1964         ichain->parent = parent;
1965         atomic_add_int(&parent->refs, 1);
1966
1967         /*
1968          * Mark the new indirect block modified after insertion, which
1969          * will propagate up through parent all the way to the root and
1970          * also allocate the physical block in ichain for our caller,
1971          * and assign ichain->data to a pre-zero'd space (because there
1972          * is not prior data to copy into it).
1973          *
1974          * We have to set SUBMODIFIED in ichain's flags manually so the
1975          * flusher knows it has to recurse through it to get to all of
1976          * our moved blocks, then call setsubmod() to set the bit
1977          * recursively.
1978          */
1979         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
1980         hammer2_chain_parent_setsubmod(hmp, ichain);
1981         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1982
1983         /*
1984          * Figure out what to return.
1985          */
1986         if (create_bits > keybits) {
1987                 /*
1988                  * Key being created is way outside the key range,
1989                  * return the original parent.
1990                  */
1991                 hammer2_chain_unlock(hmp, ichain);
1992         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1993                    (create_key ^ key)) {
1994                 /*
1995                  * Key being created is outside the key range,
1996                  * return the original parent.
1997                  */
1998                 hammer2_chain_unlock(hmp, ichain);
1999         } else {
2000                 /*
2001                  * Otherwise its in the range, return the new parent.
2002                  * (leave both the new and old parent locked).
2003                  */
2004                 parent = ichain;
2005         }
2006
2007         return(parent);
2008 }
2009
2010 /*
2011  * Physically delete the specified chain element.  Note that inodes with
2012  * open descriptors should not be deleted (as with other filesystems) until
2013  * the last open descriptor is closed.
2014  *
2015  * This routine will remove the chain element from its parent and potentially
2016  * also recurse upward and delete indirect blocks which become empty as a
2017  * side effect.
2018  *
2019  * The caller must pass a pointer to the chain's parent, also locked and
2020  * referenced.  (*parentp) will be modified in a manner similar to a lookup
2021  * or iteration when indirect blocks are also deleted as a side effect.
2022  *
2023  * XXX This currently does not adhere to the MOVED flag protocol in that
2024  *     the removal is immediately indicated in the parent's blockref[]
2025  *     array.
2026  */
2027 void
2028 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
2029                      hammer2_chain_t *chain, int retain)
2030 {
2031         hammer2_blockref_t *base;
2032         hammer2_inode_t *ip;
2033         int count;
2034
2035         if (chain->parent != parent)
2036                 panic("hammer2_chain_delete: parent mismatch");
2037
2038         /*
2039          * Mark the parent modified so our base[] pointer remains valid
2040          * while we move entries.  For the optimized indirect block
2041          * case mark the parent moved instead.
2042          *
2043          * Calculate the blockref reference in the parent
2044          */
2045         switch(parent->bref.type) {
2046         case HAMMER2_BREF_TYPE_INODE:
2047                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2048                 base = &parent->data->ipdata.u.blockset.blockref[0];
2049                 count = HAMMER2_SET_COUNT;
2050                 break;
2051         case HAMMER2_BREF_TYPE_INDIRECT:
2052                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA |
2053                                                   HAMMER2_MODIFY_NO_MODIFY_TID);
2054                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
2055                         base = NULL;
2056                 else
2057                         base = &parent->data->npdata.blockref[0];
2058                 count = parent->bytes / sizeof(hammer2_blockref_t);
2059                 break;
2060         case HAMMER2_BREF_TYPE_VOLUME:
2061                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2062                 base = &hmp->voldata.sroot_blockset.blockref[0];
2063                 count = HAMMER2_SET_COUNT;
2064                 break;
2065         default:
2066                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
2067                       parent->bref.type);
2068                 count = 0;
2069                 break;
2070         }
2071
2072         /*
2073          * Disconnect the bref in the parent, remove the chain, and
2074          * disconnect in-memory fields from the parent.
2075          */
2076         KKASSERT(chain->index >= 0 && chain->index < count);
2077         if (base)
2078                 bzero(&base[chain->index], sizeof(*base));
2079
2080         SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
2081         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
2082         atomic_add_int(&parent->refs, -1);      /* for splay entry */
2083         chain->index = -1;
2084         chain->parent = NULL;
2085
2086         /*
2087          * Cumulative adjustments must be propagated to the parent inode
2088          * when deleting and synchronized to ip.
2089          *
2090          * NOTE:  We do not propagate ip->delta_*count to the parent because
2091          *        these represent adjustments that have not yet been
2092          *        propagated upward, so we don't need to remove them from
2093          *        the parent.
2094          *
2095          * Clear the pointer to the parent inode.
2096          */
2097         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2098                 ip = chain->u.ip;
2099                 if (ip->pip) {
2100                         ip->pip->delta_icount -= ip->ip_data.inode_count;
2101                         ip->pip->delta_dcount -= ip->ip_data.data_count;
2102                         ip->ip_data.inode_count += ip->delta_icount;
2103                         ip->ip_data.data_count += ip->delta_dcount;
2104                         ip->delta_icount = 0;
2105                         ip->delta_dcount = 0;
2106                         --ip->pip->delta_icount;
2107                         ip->pip = NULL;
2108                 }
2109                 chain->u.ip->depth = 0;
2110         }
2111
2112         /*
2113          * If retain is 0 the deletion is permanent.  Because the chain is
2114          * no longer connected to the topology a flush will have no
2115          * visibility into it.  We must dispose of the references related
2116          * to the MODIFIED and MOVED flags, otherwise the ref count will
2117          * never transition to 0.
2118          *
2119          * If retain is non-zero the deleted element is likely an inode
2120          * which the vnops frontend will mark DESTROYED and flush.  In that
2121          * situation we must retain the flags for any open file descriptors
2122          * on the (removed) inode.  The final close will destroy the
2123          * disconnected chain.
2124          */
2125         if (retain == 0) {
2126                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2127                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
2128                         hammer2_chain_drop(hmp, chain);
2129                 }
2130                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2131                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2132                         hammer2_chain_drop(hmp, chain);
2133                 }
2134         }
2135
2136         /*
2137          * The chain is still likely referenced, possibly even by a vnode
2138          * (if an inode), so defer further action until the chain gets
2139          * dropped.
2140          */
2141 }
2142
2143 /*
2144  * Recursively flush the specified chain.  The chain is locked and
2145  * referenced by the caller and will remain so on return.  The chain
2146  * will remain referenced throughout but can temporarily lose its
2147  * lock during the recursion to avoid unnecessarily stalling user
2148  * processes.
2149  *
2150  *
2151  */
2152 TAILQ_HEAD(flush_deferral_list, hammer2_chain);
2153
2154 struct hammer2_flush_info {
2155         struct flush_deferral_list flush_list;
2156         int             depth;
2157         hammer2_tid_t   modify_tid;
2158 };
2159
2160 typedef struct hammer2_flush_info hammer2_flush_info_t;
2161
2162 static void
2163 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2164                           hammer2_flush_info_t *info)
2165 {
2166         hammer2_blockref_t *bref;
2167         hammer2_off_t pbase;
2168         size_t bbytes;
2169         size_t boff;
2170         char *bdata;
2171         struct buf *bp;
2172         int error;
2173         int wasmodified;
2174
2175         /*
2176          * If we hit the stack recursion depth limit defer the operation.
2177          * The controller of the info structure will execute the deferral
2178          * list and then retry.
2179          *
2180          * This is only applicable if SUBMODIFIED is set.  After a reflush
2181          * SUBMODIFIED will probably be cleared and we want to drop through
2182          * to finish processing the current element so our direct parent
2183          * can process the results.
2184          */
2185         if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
2186             (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
2187                 if ((chain->flags & HAMMER2_CHAIN_DEFERRED) == 0) {
2188                         hammer2_chain_ref(hmp, chain);
2189                         TAILQ_INSERT_TAIL(&info->flush_list,
2190                                           chain, flush_node);
2191                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEFERRED);
2192                 }
2193                 return;
2194         }
2195
2196         if (hammer2_debug & 0x0008)
2197                 kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
2198                         info->depth, info->depth, "",
2199                         chain->bref.type, chain->bref.data_off,
2200                         chain, chain->refs, chain->flags);
2201
2202         /*
2203          * If SUBMODIFIED is set we recurse the flush and adjust the
2204          * blockrefs accordingly.
2205          *
2206          * NOTE: Looping on SUBMODIFIED can prevent a flush from ever
2207          *       finishing in the face of filesystem activity.
2208          */
2209         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
2210                 hammer2_chain_t *child;
2211                 hammer2_chain_t *next;
2212                 hammer2_blockref_t *base;
2213                 int count;
2214
2215                 /*
2216                  * Clear SUBMODIFIED to catch races.  Note that if any
2217                  * child has to be flushed SUBMODIFIED will wind up being
2218                  * set again (for next time), but this does not stop us from
2219                  * synchronizing block updates which occurred.
2220                  *
2221                  * We don't want to set our chain to MODIFIED gratuitously.
2222                  */
2223                 /* XXX SUBMODIFIED not interlocked, can race */
2224                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2225
2226                 /*
2227                  * Flush the children and update the blockrefs in the chain.
2228                  * Be careful of ripouts during the loop.
2229                  */
2230                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2231                 if (next)
2232                         hammer2_chain_ref(hmp, next);
2233                 while ((child = next) != NULL) {
2234                         next = SPLAY_NEXT(hammer2_chain_splay,
2235                                           &chain->shead, child);
2236                         if (next)
2237                                 hammer2_chain_ref(hmp, next);
2238                         /*
2239                          * We only recurse if SUBMODIFIED (internal node)
2240                          * or MODIFIED (internal node or leaf) is set.
2241                          * However, we must still track whether any MOVED
2242                          * entries are present to determine if the chain's
2243                          * blockref's need updating or not.
2244                          */
2245                         if ((child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2246                                              HAMMER2_CHAIN_MODIFIED |
2247                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2248                                 hammer2_chain_drop(hmp, child);
2249                                 continue;
2250                         }
2251                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_MAYBE);
2252                         hammer2_chain_drop(hmp, child);
2253                         if (child->parent != chain ||
2254                             (child->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2255                                              HAMMER2_CHAIN_MODIFIED |
2256                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2257                                 hammer2_chain_unlock(hmp, child);
2258                                 continue;
2259                         }
2260
2261                         /*
2262                          * Propagate the DESTROYED flag if found set, then
2263                          * recurse the flush.
2264                          */
2265                         if ((chain->flags & HAMMER2_CHAIN_DESTROYED) &&
2266                             (child->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
2267                                 atomic_set_int(&child->flags,
2268                                                HAMMER2_CHAIN_DESTROYED |
2269                                                HAMMER2_CHAIN_SUBMODIFIED);
2270                         }
2271                         ++info->depth;
2272                         hammer2_chain_flush_pass1(hmp, child, info);
2273                         --info->depth;
2274                         hammer2_chain_unlock(hmp, child);
2275                 }
2276
2277                 /*
2278                  * Now synchronize any block updates.
2279                  */
2280                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2281                 if (next)
2282                         hammer2_chain_ref(hmp, next);
2283                 while ((child = next) != NULL) {
2284                         next = SPLAY_NEXT(hammer2_chain_splay,
2285                                           &chain->shead, child);
2286                         if (next)
2287                                 hammer2_chain_ref(hmp, next);
2288                         if ((child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2289                                 hammer2_chain_drop(hmp, child);
2290                                 continue;
2291                         }
2292                         hammer2_chain_lock(hmp, child, HAMMER2_RESOLVE_NEVER);
2293                         hammer2_chain_drop(hmp, child);
2294                         if (child->parent != chain ||
2295                             (child->flags & HAMMER2_CHAIN_MOVED) == 0) {
2296                                 hammer2_chain_unlock(hmp, child);
2297                                 continue;
2298                         }
2299
2300                         hammer2_chain_modify(hmp, chain,
2301                                              HAMMER2_MODIFY_NO_MODIFY_TID);
2302
2303                         switch(chain->bref.type) {
2304                         case HAMMER2_BREF_TYPE_INODE:
2305                                 KKASSERT((chain->data->ipdata.op_flags &
2306                                           HAMMER2_OPFLAG_DIRECTDATA) == 0);
2307                                 base = &chain->data->ipdata.u.blockset.
2308                                         blockref[0];
2309                                 count = HAMMER2_SET_COUNT;
2310                                 break;
2311                         case HAMMER2_BREF_TYPE_INDIRECT:
2312                                 base = &chain->data->npdata.blockref[0];
2313                                 count = chain->bytes /
2314                                         sizeof(hammer2_blockref_t);
2315                                 break;
2316                         case HAMMER2_BREF_TYPE_VOLUME:
2317                                 base = &hmp->voldata.sroot_blockset.blockref[0];
2318                                 count = HAMMER2_SET_COUNT;
2319                                 break;
2320                         default:
2321                                 base = NULL;
2322                                 panic("hammer2_chain_get: "
2323                                       "unrecognized blockref type: %d",
2324                                       chain->bref.type);
2325                         }
2326
2327                         KKASSERT(child->index >= 0);
2328                         base[child->index] = child->bref_flush;
2329
2330                         if (chain->bref.mirror_tid <
2331                             child->bref_flush.mirror_tid) {
2332                                 chain->bref.mirror_tid =
2333                                         child->bref_flush.mirror_tid;
2334                         }
2335
2336                         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME &&
2337                             hmp->voldata.mirror_tid <
2338                             child->bref_flush.mirror_tid) {
2339                                 hmp->voldata.mirror_tid =
2340                                         child->bref_flush.mirror_tid;
2341                         }
2342                         atomic_clear_int(&child->flags, HAMMER2_CHAIN_MOVED);
2343                         hammer2_chain_drop(hmp, child); /* MOVED flag */
2344                         hammer2_chain_unlock(hmp, child);
2345                 }
2346         }
2347
2348         /*
2349          * If destroying the object we unconditonally clear the MODIFIED
2350          * and MOVED bits, and we destroy the buffer without writing it
2351          * out.
2352          *
2353          * We don't bother updating the hash/crc or the chain bref.
2354          *
2355          * NOTE: The destroy'd object's bref has already been updated.
2356          *       so we can clear MOVED without propagating mirror_tid
2357          *       or modify_tid upward.
2358          *
2359          * XXX allocations for unflushed data can be returned to the
2360          *     free pool.
2361          */
2362         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2363                 if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
2364                         if (chain->bp) {
2365                                 chain->bp->b_flags |= B_INVAL|B_RELBUF;
2366                         }
2367                         atomic_clear_int(&chain->flags,
2368                                          HAMMER2_CHAIN_MODIFIED |
2369                                          HAMMER2_CHAIN_MODIFY_TID);
2370                         hammer2_chain_drop(hmp, chain);
2371                 }
2372                 if (chain->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
2373                         atomic_clear_int(&chain->flags,
2374                                          HAMMER2_CHAIN_MODIFIED_AUX);
2375                 }
2376                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2377                         atomic_clear_int(&chain->flags,
2378                                          HAMMER2_CHAIN_MOVED);
2379                         hammer2_chain_drop(hmp, chain);
2380                 }
2381                 return;
2382         }
2383
2384         /*
2385          * Flush this chain entry only if it is marked modified.
2386          */
2387         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2388                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2389                 goto done;
2390         }
2391
2392         /*
2393          * Synchronize cumulative data and inode count adjustments to
2394          * the inode and propagate the deltas upward to the parent.
2395          */
2396         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
2397                 hammer2_inode_t *ip;
2398
2399                 ip = chain->u.ip;
2400                 ip->ip_data.inode_count += ip->delta_icount;
2401                 ip->ip_data.data_count += ip->delta_dcount;
2402                 if (ip->pip) {
2403                         ip->pip->delta_icount += ip->delta_icount;
2404                         ip->pip->delta_dcount += ip->delta_dcount;
2405                 }
2406                 ip->delta_icount = 0;
2407                 ip->delta_dcount = 0;
2408         }
2409
2410         /*
2411          * Flush if MODIFIED or MODIFIED_AUX is set.  MODIFIED_AUX is only
2412          * used by the volume header (&hmp->vchain).
2413          */
2414         if ((chain->flags & (HAMMER2_CHAIN_MODIFIED |
2415                              HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2416                 goto done;
2417         }
2418         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED_AUX);
2419
2420         /*
2421          * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
2422          * will re-test the MOVED bit.  We must also update the mirror_tid
2423          * and modify_tid fields as appropriate.
2424          *
2425          * bits own a single chain ref and the MOVED bit owns its own
2426          * chain ref.
2427          */
2428         chain->bref.mirror_tid = info->modify_tid;
2429         if (chain->flags & HAMMER2_CHAIN_MODIFY_TID)
2430                 chain->bref.modify_tid = info->modify_tid;
2431         wasmodified = (chain->flags & HAMMER2_CHAIN_MODIFIED) != 0;
2432         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED |
2433                                         HAMMER2_CHAIN_MODIFY_TID);
2434
2435         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2436                 /*
2437                  * Drop the ref from the MODIFIED bit we cleared.
2438                  */
2439                 if (wasmodified)
2440                         hammer2_chain_drop(hmp, chain);
2441         } else {
2442                 /*
2443                  * If we were MODIFIED we inherit the ref from clearing
2444                  * that bit, otherwise we need another ref.
2445                  */
2446                 if (wasmodified == 0)
2447                         hammer2_chain_ref(hmp, chain);
2448                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2449         }
2450         chain->bref_flush = chain->bref;
2451
2452         /*
2453          * If this is part of a recursive flush we can go ahead and write
2454          * out the buffer cache buffer and pass a new bref back up the chain.
2455          *
2456          * This will never be a volume header.
2457          */
2458         switch(chain->bref.type) {
2459         case HAMMER2_BREF_TYPE_VOLUME:
2460                 /*
2461                  * The volume header is flushed manually by the syncer, not
2462                  * here.
2463                  */
2464                 break;
2465         case HAMMER2_BREF_TYPE_DATA:
2466                 /*
2467                  * Data elements have already been flushed via the logical
2468                  * file buffer cache.  Their hash was set in the bref by
2469                  * the vop_write code.
2470                  *
2471                  * Make sure the buffer(s) have been flushed out here.
2472                  */
2473                 bbytes = chain->bytes;
2474                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
2475                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2476
2477                 bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
2478                 if (bp) {
2479                         if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
2480                             (B_CACHE | B_DIRTY)) {
2481                                 kprintf("x");
2482                                 cluster_awrite(bp);
2483                         } else {
2484                                 bp->b_flags |= B_RELBUF;
2485                                 brelse(bp);
2486                         }
2487                 }
2488                 break;
2489         case HAMMER2_BREF_TYPE_INDIRECT:
2490                 /*
2491                  * Indirect blocks may be in an INITIAL state.  Use the
2492                  * chain_lock() call to ensure that the buffer has been
2493                  * instantiated (even though it is already locked the buffer
2494                  * might not have been instantiated).
2495                  *
2496                  * Only write the buffer out if it is dirty, it is possible
2497                  * the operating system had already written out the buffer.
2498                  */
2499                 hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_ALWAYS);
2500                 KKASSERT(chain->bp != NULL);
2501
2502                 bp = chain->bp;
2503                 if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) ||
2504                     (bp->b_flags & B_DIRTY)) {
2505                         bawrite(chain->bp);
2506                 } else {
2507                         brelse(chain->bp);
2508                 }
2509                 chain->bp = NULL;
2510                 chain->data = NULL;
2511                 hammer2_chain_unlock(hmp, chain);
2512                 break;
2513         default:
2514                 /*
2515                  * Embedded elements have to be flushed out.
2516                  */
2517                 KKASSERT(chain->data != NULL);
2518                 KKASSERT(chain->bp == NULL);
2519                 bref = &chain->bref;
2520
2521                 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
2522
2523                 if (chain->bp == NULL) {
2524                         /*
2525                          * The data is embedded, we have to acquire the
2526                          * buffer cache buffer and copy the data into it.
2527                          */
2528                         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
2529                                 bbytes = HAMMER2_MINIOSIZE;
2530                         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
2531                         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2532
2533                         /*
2534                          * The getblk() optimization can only be used if the
2535                          * physical block size matches the request.
2536                          */
2537                         if (chain->bytes == bbytes) {
2538                                 bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
2539                                 error = 0;
2540                         } else {
2541                                 error = bread(hmp->devvp, pbase, bbytes, &bp);
2542                                 KKASSERT(error == 0);
2543                         }
2544                         bdata = (char *)bp->b_data + boff;
2545
2546                         /*
2547                          * Copy the data to the buffer, mark the buffer
2548                          * dirty, and convert the chain to unmodified.
2549                          *
2550                          * We expect we might have to make adjustments to
2551                          * non-data delayed-write buffers when doing an
2552                          * actual flush so use bawrite() instead of
2553                          * cluster_awrite() here.
2554                          */
2555                         bcopy(chain->data, bdata, chain->bytes);
2556                         bp->b_flags |= B_CLUSTEROK;
2557                         bawrite(bp);
2558                         bp = NULL;
2559                         chain->bref.check.iscsi32.value =
2560                                 hammer2_icrc32(chain->data, chain->bytes);
2561                         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
2562                                 ++hammer2_iod_meta_write;
2563                         else
2564                                 ++hammer2_iod_indr_write;
2565                 } else {
2566                         chain->bref.check.iscsi32.value =
2567                                 hammer2_icrc32(chain->data, chain->bytes);
2568                 }
2569         }
2570
2571         /*
2572          * Adjustments to the bref.  The caller will use this to adjust
2573          * our chain's pointer to this chain element.
2574          */
2575         bref = &chain->bref;
2576
2577         switch(bref->type) {
2578         case HAMMER2_BREF_TYPE_VOLUME:
2579                 KKASSERT(chain->data != NULL);
2580                 KKASSERT(chain->bp == NULL);
2581
2582                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
2583                         hammer2_icrc32(
2584                                 (char *)&hmp->voldata +
2585                                  HAMMER2_VOLUME_ICRC1_OFF,
2586                                 HAMMER2_VOLUME_ICRC1_SIZE);
2587                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
2588                         hammer2_icrc32(
2589                                 (char *)&hmp->voldata +
2590                                  HAMMER2_VOLUME_ICRC0_OFF,
2591                                 HAMMER2_VOLUME_ICRC0_SIZE);
2592                 hmp->voldata.icrc_volheader =
2593                         hammer2_icrc32(
2594                                 (char *)&hmp->voldata +
2595                                  HAMMER2_VOLUME_ICRCVH_OFF,
2596                                 HAMMER2_VOLUME_ICRCVH_SIZE);
2597                 break;
2598         default:
2599                 break;
2600
2601         }
2602 done:
2603         if (hammer2_debug & 0x0008) {
2604                 kprintf("%*.*s} %p/%d %04x ",
2605                         info->depth, info->depth, "",
2606                         chain, chain->refs, chain->flags);
2607         }
2608 }
2609
2610 #if 0
2611 /*
2612  * PASS2 - not yet implemented (should be called only with the root chain?)
2613  */
2614 static void
2615 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2616 {
2617 }
2618 #endif
2619
2620 /*
2621  * Stand-alone flush.  If the chain is unable to completely flush we have
2622  * to be sure that SUBMODIFIED propagates up the parent chain.  We must not
2623  * clear the MOVED bit after flushing in this situation or our desynchronized
2624  * bref will not properly update in the parent.
2625  *
2626  * This routine can be called from several places but the most important
2627  * is from the hammer2_vop_reclaim() function.  We want to try to completely
2628  * clean out the inode structure to prevent disconnected inodes from
2629  * building up and blowing out the kmalloc pool.
2630  *
2631  * If modify_tid is 0 (usual case), a new modify_tid is allocated and
2632  * applied to the flush.  The depth-limit handling code is the only
2633  * code which passes a non-zero modify_tid to hammer2_chain_flush().
2634  */
2635 void
2636 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
2637                     hammer2_tid_t modify_tid)
2638 {
2639         hammer2_chain_t *parent;
2640         hammer2_chain_t *scan;
2641         hammer2_blockref_t *base;
2642         hammer2_flush_info_t info;
2643         int count;
2644         int reflush;
2645
2646         /*
2647          * Execute the recursive flush and handle deferrals.
2648          *
2649          * Chains can be ridiculously long (thousands deep), so to
2650          * avoid blowing out the kernel stack the recursive flush has a
2651          * depth limit.  Elements at the limit are placed on a list
2652          * for re-execution after the stack has been popped.
2653          */
2654         bzero(&info, sizeof(info));
2655         TAILQ_INIT(&info.flush_list);
2656
2657         if (modify_tid == 0) {
2658                 hammer2_voldata_lock(hmp);
2659                 info.modify_tid = hmp->voldata.alloc_tid++;
2660                 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
2661                 hammer2_voldata_unlock(hmp);
2662         } else {
2663                 info.modify_tid = modify_tid;
2664         }
2665         reflush = 1;
2666
2667         while (reflush) {
2668                 /*
2669                  * Primary recursion
2670                  */
2671                 hammer2_chain_flush_pass1(hmp, chain, &info);
2672                 reflush = 0;
2673
2674                 while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
2675                         /*
2676                          * Secondary recursion.  Note that a reference is
2677                          * retained from the element's presence on the
2678                          * deferral list.
2679                          */
2680                         KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
2681                         TAILQ_REMOVE(&info.flush_list, scan, flush_node);
2682                         atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
2683
2684                         /*
2685                          * Now that we've popped back up we can do a secondary
2686                          * recursion on the deferred elements.
2687                          */
2688                         if (hammer2_debug & 0x0040)
2689                                 kprintf("defered flush %p\n", scan);
2690                         hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
2691                         hammer2_chain_flush(hmp, scan, info.modify_tid);
2692                         hammer2_chain_unlock(hmp, scan);
2693
2694                         /*
2695                          * Only flag a reflush if SUBMODIFIED is no longer
2696                          * set.  If SUBMODIFIED is set the element will just
2697                          * wind up on our flush_list again.
2698                          */
2699                         if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2700                                             HAMMER2_CHAIN_MODIFIED |
2701                                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
2702                                 reflush = 1;
2703                         }
2704                         hammer2_chain_drop(hmp, scan);
2705                 }
2706                 if ((hammer2_debug & 0x0040) && reflush)
2707                         kprintf("reflush %p\n", chain);
2708         }
2709
2710         /*
2711          * The SUBMODIFIED bit must propagate upward if the chain could not
2712          * be completely flushed.
2713          */
2714         if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2715                             HAMMER2_CHAIN_MODIFIED |
2716                             HAMMER2_CHAIN_MODIFIED_AUX |
2717                             HAMMER2_CHAIN_MOVED)) {
2718                 hammer2_chain_parent_setsubmod(hmp, chain);
2719         }
2720
2721         /*
2722          * If the only thing left is a simple bref update try to
2723          * pro-actively update the parent, otherwise return early.
2724          */
2725         parent = chain->parent;
2726         if (parent == NULL) {
2727                 return;
2728         }
2729         if (chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
2730             (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2731                              HAMMER2_CHAIN_MODIFIED |
2732                              HAMMER2_CHAIN_MODIFIED_AUX |
2733                              HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
2734                 return;
2735         }
2736
2737         /*
2738          * We are locking backwards so allow the lock to fail
2739          */
2740         if (lockmgr(&parent->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
2741                 return;
2742         }
2743
2744         /*
2745          * We are updating brefs but we have to call chain_modify()
2746          * because our caller is not being run from a recursive flush.
2747          *
2748          * This will also chain up the parent list and set the SUBMODIFIED
2749          * flag.
2750          *
2751          * We do not want to set HAMMER2_CHAIN_MODIFY_TID here because the
2752          * modification is only related to updating a bref in the parent.
2753          *
2754          * When updating the blockset embedded in the volume header we must
2755          * also update voldata.mirror_tid.
2756          */
2757         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
2758         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NO_MODIFY_TID);
2759
2760         switch(parent->bref.type) {
2761         case HAMMER2_BREF_TYPE_INODE:
2762                 base = &parent->data->ipdata.u.blockset.
2763                         blockref[0];
2764                 count = HAMMER2_SET_COUNT;
2765                 break;
2766         case HAMMER2_BREF_TYPE_INDIRECT:
2767                 base = &parent->data->npdata.blockref[0];
2768                 count = parent->bytes /
2769                         sizeof(hammer2_blockref_t);
2770                 break;
2771         case HAMMER2_BREF_TYPE_VOLUME:
2772                 base = &hmp->voldata.sroot_blockset.blockref[0];
2773                 count = HAMMER2_SET_COUNT;
2774                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2775                         if (hmp->voldata.mirror_tid < chain->bref.mirror_tid) {
2776                                 hmp->voldata.mirror_tid =
2777                                         chain->bref.mirror_tid;
2778                         }
2779                 }
2780                 break;
2781         default:
2782                 base = NULL;
2783                 panic("hammer2_chain_flush: "
2784                       "unrecognized blockref type: %d",
2785                       parent->bref.type);
2786         }
2787
2788         /*
2789          * Update the blockref in the parent.  We do not have to set
2790          * MOVED in the parent because the parent has been marked modified,
2791          * so the flush sequence will pick up the bref change.
2792          *
2793          * We do have to propagate mirror_tid upward.
2794          */
2795         KKASSERT(chain->index >= 0 &&
2796                  chain->index < count);
2797         KKASSERT(chain->parent == parent);
2798         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2799                 base[chain->index] = chain->bref_flush;
2800                 if (parent->bref.mirror_tid < chain->bref_flush.mirror_tid)
2801                         parent->bref.mirror_tid = chain->bref_flush.mirror_tid;
2802                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2803                 hammer2_chain_drop(hmp, chain);
2804         } else if (bcmp(&base[chain->index], &chain->bref_flush,
2805                    sizeof(chain->bref)) != 0) {
2806                 panic("hammer2: unflagged bref update(2)");
2807         }
2808
2809         lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
2810         hammer2_chain_unlock(hmp, parent);
2811 }