hammer2 - Major hammer2_chain_*() API cleanup
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/cdefs.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/types.h>
47 #include <sys/lock.h>
48 #include <sys/uuid.h>
49
50 #include "hammer2.h"
51
52 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
53
54 static hammer2_chain_t *hammer2_chain_create_indirect(
55                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
56                         hammer2_key_t key, int keybits);
57
58 /*
59  * Splay tree
60  */
61 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
62
63 int
64 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
65 {
66         return(chain2->index - chain1->index);
67 }
68
69 /*
70  * Recursively mark the parent chain elements so flushes can find
71  * modified elements.
72  *
73  * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
74  *       during the flush recursion after clearing the parent's
75  *       SUBMODIFIED bit.  We don't want to re-set the parent's
76  *       SUBMODIFIED bit in this case!
77  *
78  * XXX rename of parent can create a SMP race
79  */
80 static void
81 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
82 {
83         hammer2_chain_t *parent;
84
85         if ((chain->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
86                 parent = chain->parent;
87                 while (parent &&
88                        (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
89                         atomic_set_int(&parent->flags,
90                                        HAMMER2_CHAIN_SUBMODIFIED);
91                         parent = parent->parent;
92                 }
93         }
94 }
95
96 /*
97  * Allocate a new disconnected chain element representing the specified
98  * bref.  The chain element is locked exclusively and refs is set to 1.
99  *
100  * This essentially allocates a system memory structure representing one
101  * of the media structure types, including inodes.
102  */
103 hammer2_chain_t *
104 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
105 {
106         hammer2_chain_t *chain;
107         hammer2_inode_t *ip;
108         hammer2_indblock_t *np;
109         hammer2_data_t *dp;
110         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
111
112         /*
113          * Construct the appropriate system structure.
114          */
115         switch(bref->type) {
116         case HAMMER2_BREF_TYPE_INODE:
117                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
118                 chain = &ip->chain;
119                 chain->u.ip = ip;
120                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
121                 ip->hmp = hmp;
122                 break;
123         case HAMMER2_BREF_TYPE_INDIRECT:
124                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
125                 chain = &np->chain;
126                 chain->u.np = np;
127                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
128                 break;
129         case HAMMER2_BREF_TYPE_DATA:
130                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
131                 chain = &dp->chain;
132                 chain->u.dp = dp;
133                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
134                 break;
135         case HAMMER2_BREF_TYPE_VOLUME:
136                 chain = NULL;
137                 panic("hammer2_chain_alloc volume type illegal for op");
138         default:
139                 chain = NULL;
140                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
141                       bref->type);
142         }
143         chain->bref = *bref;
144         chain->index = -1;      /* not yet assigned */
145         chain->refs = 1;
146         chain->bytes = bytes;
147         lockmgr(&chain->lk, LK_EXCLUSIVE);
148
149         return (chain);
150 }
151
152 /*
153  * Free a disconnected chain element
154  */
155 void
156 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
157 {
158         void *mem;
159
160         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
161             chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
162                 chain->data = NULL;
163         }
164
165         KKASSERT(chain->bp == NULL);
166         KKASSERT(chain->data == NULL);
167         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
168                  chain->u.ip->vp == NULL);
169
170         if ((mem = chain->u.mem) != NULL) {
171                 chain->u.mem = NULL;
172                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
173                         kfree(mem, hmp->minode);
174                 else
175                         kfree(mem, hmp->mchain);
176         }
177 }
178
179 /*
180  * Add a reference to a chain element (for shared access).  The chain
181  * element must already have at least 1 ref controlled by the caller.
182  */
183 void
184 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
185 {
186         KKASSERT(chain->refs > 0);
187         atomic_add_int(&chain->refs, 1);
188 }
189
190 /*
191  * Drop the callers reference to the chain element.  If the ref count
192  * reaches zero the chain element and its related structure (typically an
193  * inode or indirect block) will be freed and the parent will be
194  * recursively dropped.
195  *
196  * Modified elements hold an additional reference so it should not be
197  * possible for the count on a modified element to drop to 0.
198  *
199  * The chain element must NOT be locked by the caller.
200  *
201  * The parent might or might not be locked by the caller but if so it
202  * will also be referenced so we shouldn't recurse upward.
203  */
204 void
205 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
206 {
207         hammer2_chain_t *parent;
208         u_int refs;
209
210         while (chain) {
211                 refs = chain->refs;
212                 cpu_ccfence();
213                 KKASSERT(refs > 0);
214                 if (refs == 1) {
215                         KKASSERT(chain != &hmp->vchain);
216                         parent = chain->parent;
217                         if (parent)
218                                 lockmgr(&parent->lk, LK_EXCLUSIVE);
219                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
220                                 /*
221                                  * Succeeded, recurse and drop parent
222                                  */
223                                 if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
224                                         SPLAY_REMOVE(hammer2_chain_splay,
225                                                      &parent->shead, chain);
226                                         atomic_set_int(&chain->flags,
227                                                        HAMMER2_CHAIN_DELETED);
228                                         /* parent refs dropped via recursion */
229                                 }
230                                 chain->parent = NULL;
231                                 if (parent)
232                                         lockmgr(&parent->lk, LK_RELEASE);
233                                 hammer2_chain_free(hmp, chain);
234                                 chain = parent;
235                                 /* recurse on parent */
236                         } else {
237                                 if (parent)
238                                         lockmgr(&parent->lk, LK_RELEASE);
239                                 /* retry the same chain */
240                         }
241                 } else {
242                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
243                                 /*
244                                  * Succeeded, count did not reach zero so
245                                  * cut out of the loop.
246                                  */
247                                 break;
248                         }
249                         /* retry the same chain */
250                 }
251         }
252 }
253
254 /*
255  * Ref and lock a chain element, acquiring its data with I/O if necessary,
256  * and specify how you would like the data to be resolved.
257  *
258  * Returns 0 on success or an error code if the data could not be acquired.
259  * The chain element is locked either way.
260  *
261  * The lock is allowed to recurse, multiple locking ops will aggregate
262  * the requested resolve types.  Once data is assigned it will not be
263  * removed until the last unlock.
264  *
265  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
266  *                         (typically used to avoid device/logical buffer
267  *                          aliasing for data)
268  *
269  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
270  *                         the INITIAL-create state (indirect blocks only).
271  *
272  *                         Do not resolve data elements for DATA chains.
273  *                         (typically used to avoid device/logical buffer
274  *                          aliasing for data)
275  *
276  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
277  *
278  *
279  * NOTE: Embedded elements (volume header, inodes) are always resolved
280  *       regardless.
281  *
282  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
283  *       element will instantiate and zero its buffer, and flush it on
284  *       release.
285  *
286  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
287  *       so as not to instantiate a device buffer, which could alias against
288  *       a logical file buffer.  However, if ALWAYS is specified the
289  *       device buffer will be instantiated anyway.
290  */
291 int
292 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
293 {
294         hammer2_blockref_t *bref;
295         hammer2_off_t pbase;
296         hammer2_off_t peof;
297         size_t boff;
298         size_t bbytes;
299         int error;
300         char *bdata;
301
302         /*
303          * Lock the element.  Under certain conditions this might end up
304          * being a recursive lock.
305          */
306         KKASSERT(chain->refs > 0);
307         atomic_add_int(&chain->refs, 1);
308         lockmgr(&chain->lk, LK_EXCLUSIVE);
309
310         /*
311          * If we already have a valid data pointer no further action is
312          * necessary.
313          */
314         if (chain->data)
315                 return (0);
316
317         /*
318          * Do we have to resolve the data?
319          */
320         switch(how) {
321         case HAMMER2_RESOLVE_NEVER:
322                 return(0);
323         case HAMMER2_RESOLVE_MAYBE:
324                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
325                         return(0);
326                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
327                         return(0);
328                 /* fall through */
329         case HAMMER2_RESOLVE_ALWAYS:
330                 break;
331         }
332
333         /*
334          * We must resolve to a device buffer, either by issuing I/O or
335          * by creating a zero-fill element.  We do not mark the buffer
336          * dirty when creating a zero-fill element (the hammer2_chain_modify()
337          * API must still be used to do that).
338          *
339          * The device buffer is variable-sized in powers of 2 down
340          * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
341          * chunk always contains buffers of the same size. (XXX)
342          *
343          * The minimum physical IO size may be larger than the variable
344          * block size.
345          */
346         bref = &chain->bref;
347
348         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
349                 bbytes = HAMMER2_MINIOSIZE;
350         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
351         peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
352         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
353         KKASSERT(pbase != 0);
354
355         /*
356          * The getblk() optimization can only be used on newly created
357          * elements if the physical block size matches the request.
358          */
359         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
360             chain->bytes == bbytes) {
361                 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
362                 error = 0;
363         } else if (hammer2_cluster_enable) {
364                 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
365                                      HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
366                                      &chain->bp);
367         } else {
368                 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
369         }
370
371         if (error) {
372                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
373                         (intmax_t)pbase, error);
374                 bqrelse(chain->bp);
375                 chain->bp = NULL;
376                 return (error);
377         }
378
379         /*
380          * Zero the data area if the chain is in the INITIAL-create state
381          */
382         bdata = (char *)chain->bp->b_data + boff;
383         if (chain->flags & HAMMER2_CHAIN_INITIAL)
384                 bzero(bdata, chain->bytes);
385
386         /*
387          * Setup the data pointer, either pointing it to an embedded data
388          * structure and copying the data from the buffer, or pointing it
389          * into the buffer.
390          *
391          * The buffer is not retained when copying to an embedded data
392          * structure in order to avoid potential deadlocks or recursions
393          * on the same physical buffer.
394          */
395         switch (bref->type) {
396         case HAMMER2_BREF_TYPE_VOLUME:
397                 /*
398                  * Copy data from bp to embedded buffer
399                  */
400                 panic("hammer2_chain_lock: called on unresolved volume header");
401 #if 0
402                 /* NOT YET */
403                 KKASSERT(pbase == 0);
404                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
405                 bcopy(bdata, &hmp->voldata, chain->bytes);
406                 chain->data = (void *)&hmp->voldata;
407                 bqrelse(chain->bp);
408                 chain->bp = NULL;
409 #endif
410                 break;
411         case HAMMER2_BREF_TYPE_INODE:
412                 /*
413                  * Copy data from bp to embedded buffer, do not retain the
414                  * device buffer.
415                  */
416                 bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
417                 chain->data = (void *)&chain->u.ip->ip_data;
418                 bqrelse(chain->bp);
419                 chain->bp = NULL;
420                 break;
421         case HAMMER2_BREF_TYPE_INDIRECT:
422         case HAMMER2_BREF_TYPE_DATA:
423         default:
424                 /*
425                  * Point data at the device buffer and leave bp intact.
426                  */
427                 chain->data = (void *)bdata;
428                 break;
429         }
430         return (0);
431 }
432
433 /*
434  * Unlock and deref a chain element.
435  *
436  * On the last lock release any non-embedded data (chain->bp) will be
437  * retired.
438  */
439 void
440 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
441 {
442         long *counterp;
443
444         /*
445          * Undo a recursive lock
446          */
447         if (lockcountnb(&chain->lk) > 1) {
448                 KKASSERT(chain->refs > 1);
449                 atomic_add_int(&chain->refs, -1);
450                 lockmgr(&chain->lk, LK_RELEASE);
451                 return;
452         }
453
454         /*
455          * Shortcut the case if the data is embedded or not resolved.
456          * Do NOT null-out pointers to embedded data (e.g. inode).
457          */
458         if (chain->bp == NULL) {
459                 lockmgr(&chain->lk, LK_RELEASE);
460                 hammer2_chain_drop(hmp, chain);
461                 return;
462         }
463
464         /*
465          * Statistics
466          */
467         if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
468                 ;
469         } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
470                 switch(chain->bref.type) {
471                 case HAMMER2_BREF_TYPE_DATA:
472                         counterp = &hammer2_ioa_file_write;
473                         break;
474                 case HAMMER2_BREF_TYPE_INODE:
475                         counterp = &hammer2_ioa_meta_write;
476                         break;
477                 case HAMMER2_BREF_TYPE_INDIRECT:
478                         counterp = &hammer2_ioa_indr_write;
479                         break;
480                 default:
481                         counterp = &hammer2_ioa_volu_write;
482                         break;
483                 }
484                 ++*counterp;
485         } else {
486                 switch(chain->bref.type) {
487                 case HAMMER2_BREF_TYPE_DATA:
488                         counterp = &hammer2_iod_file_write;
489                         break;
490                 case HAMMER2_BREF_TYPE_INODE:
491                         counterp = &hammer2_iod_meta_write;
492                         break;
493                 case HAMMER2_BREF_TYPE_INDIRECT:
494                         counterp = &hammer2_iod_indr_write;
495                         break;
496                 default:
497                         counterp = &hammer2_iod_volu_write;
498                         break;
499                 }
500                 ++*counterp;
501         }
502
503         /*
504          * Clean out the bp.
505          *
506          * If a device buffer was used for data be sure to destroy the
507          * buffer when we are done to avoid aliases (XXX what about the
508          * underlying VM pages?).
509          */
510         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
511                 chain->bp->b_flags |= B_RELBUF;
512
513         chain->data = NULL;
514         if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
515                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
516                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
517                         atomic_clear_int(&chain->flags,
518                                          HAMMER2_CHAIN_IOFLUSH);
519                         chain->bp->b_flags |= B_RELBUF;
520                         bawrite(chain->bp);
521                 } else {
522                         chain->bp->b_flags |= B_CLUSTEROK;
523                         bdwrite(chain->bp);
524                 }
525         } else {
526                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
527                         atomic_clear_int(&chain->flags,
528                                          HAMMER2_CHAIN_IOFLUSH);
529                         chain->bp->b_flags |= B_RELBUF;
530                         brelse(chain->bp);
531                 } else {
532                         /* bp might still be dirty */
533                         bqrelse(chain->bp);
534                 }
535         }
536         chain->bp = NULL;
537         lockmgr(&chain->lk, LK_RELEASE);
538         hammer2_chain_drop(hmp, chain);
539 }
540
541 /*
542  * Resize the chain's physical storage allocation.  Chains can be resized
543  * smaller without reallocating the storage.  Resizing larger will reallocate
544  * the storage.
545  *
546  * Must be passed a locked chain.  If you want the resize to copy the data
547  * you should lock the chain with RESOLVE_MAYBE or RESOLVE_ALWAYS, otherwise
548  * the resize operation will not copy the data.
549  *
550  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
551  * to avoid instantiating a device buffer that conflicts with the vnode
552  * data buffer.
553  *
554  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
555  */
556 void
557 hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain,
558                      int nradix, int flags)
559 {
560         struct buf *nbp;
561         hammer2_off_t pbase;
562         size_t obytes;
563         size_t nbytes;
564         size_t bbytes;
565         int boff;
566         char *bdata;
567         int error;
568
569         /*
570          * Only data and indirect blocks can be resized for now
571          */
572         KKASSERT(chain != &hmp->vchain);
573         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
574                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
575
576         /*
577          * Nothing to do if the element is already the proper size
578          */
579         obytes = chain->bytes;
580         nbytes = 1 << nradix;
581         if (obytes == nbytes)
582                 return;
583
584         /*
585          * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
586          * modified flags share the same ref.
587          */
588         if ((chain->flags & HAMMER2_CHAIN_MODIFIED1) == 0) {
589                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
590                 hammer2_chain_ref(hmp, chain);
591         }
592
593         /*
594          * Relocate the block, even if making it smaller (because different
595          * block sizes may be in different regions).
596          */
597         chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
598                                                      nbytes);
599         chain->bytes = nbytes;
600
601         /*
602          * The device buffer may be larger than the allocation size.
603          */
604         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
605                 bbytes = HAMMER2_MINIOSIZE;
606         pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
607         boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
608
609         /*
610          * Only copy the data if resolved, otherwise the caller is
611          * responsible.
612          */
613         if (chain->bp) {
614                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
615                          chain->bref.type == HAMMER2_BREF_TYPE_DATA);
616                 KKASSERT(chain != &hmp->vchain);        /* safety */
617
618                 /*
619                  * The getblk() optimization can only be used if the
620                  * physical block size matches the request.
621                  */
622                 if (nbytes == bbytes) {
623                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
624                         error = 0;
625                 } else {
626                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
627                         KKASSERT(error == 0);
628                 }
629                 bdata = (char *)nbp->b_data + boff;
630
631                 if (nbytes < obytes) {
632                         bcopy(chain->data, bdata, nbytes);
633                 } else {
634                         bcopy(chain->data, bdata, obytes);
635                         bzero(bdata + obytes, nbytes - obytes);
636                 }
637
638                 /*
639                  * NOTE: The INITIAL state of the chain is left intact.
640                  *
641                  * NOTE: Because of the reallocation we have to set DIRTYBP
642                  *       if INITIAL is not set.
643                  */
644                 chain->bp->b_flags |= B_RELBUF;
645                 brelse(chain->bp);
646                 chain->bp = nbp;
647                 chain->data = (void *)bdata;
648                 if ((chain->flags & HAMMER2_CHAIN_INITIAL) == 0)
649                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
650         }
651         hammer2_chain_parent_setsubmod(hmp, chain);
652 }
653
654 /*
655  * Convert a locked chain that was retrieved read-only to read-write.
656  *
657  * If not already marked modified a new physical block will be allocated
658  * and assigned to the bref.
659  *
660  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
661  *                   level or the COW operation will not work.
662  *
663  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
664  *                   run the data through the device buffers.
665  */
666 void
667 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
668 {
669         struct buf *nbp;
670         int error;
671         hammer2_off_t pbase;
672         size_t bbytes;
673         size_t boff;
674         void *bdata;
675
676         /*
677          * If the chain is already marked MODIFIED1 we can just return.
678          */
679         if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
680                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
681                     chain->bp == NULL) {
682                         goto skip1;
683                 }
684                 return;
685         }
686
687         /*
688          * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
689          * modified flags share the same ref.
690          */
691         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
692         hammer2_chain_ref(hmp, chain);
693
694         /*
695          * We must allocate the copy-on-write block.
696          *
697          * If the data is embedded no other action is required.
698          *
699          * If the data is not embedded we acquire and clear the
700          * new block.  If chain->data is not NULL we then do the
701          * copy-on-write.  chain->data will then be repointed to the new
702          * buffer and the old buffer will be released.
703          *
704          * For newly created elements with no prior allocation we go
705          * through the copy-on-write steps except without the copying part.
706          */
707         if (chain != &hmp->vchain) {
708                 if ((hammer2_debug & 0x0001) &&
709                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
710                         kprintf("Replace %d\n", chain->bytes);
711                 }
712                 chain->bref.data_off =
713                         hammer2_freemap_alloc(hmp, chain->bref.type,
714                                               chain->bytes);
715                 /* XXX failed allocation */
716         }
717
718         /*
719          * If data instantiation is optional and the chain has no current
720          * data association (typical for DATA and newly-created INDIRECT
721          * elements), don't instantiate the buffer now.
722          */
723         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
724                 goto skip2;
725
726 skip1:
727         /*
728          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
729          * written-out on unlock.  This bit is independent of the MODIFIED1
730          * bit because the chain may still need meta-data adjustments done
731          * by virtue of MODIFIED1 for its parent, and the buffer can be
732          * flushed out (possibly multiple times) by the OS before that.
733          *
734          * Clearing the INITIAL flag (for indirect blocks) indicates that
735          * a zero-fill buffer has been instantiated.
736          */
737         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
738         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
739
740         /*
741          * We currently should never instantiate a device buffer for a
742          * data chain.
743          */
744         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
745
746         /*
747          * Execute COW operation
748          */
749         switch(chain->bref.type) {
750         case HAMMER2_BREF_TYPE_VOLUME:
751         case HAMMER2_BREF_TYPE_INODE:
752                 /*
753                  * The data is embedded, no copy-on-write operation is
754                  * needed.
755                  */
756                 KKASSERT(chain->bp == NULL);
757                 break;
758         case HAMMER2_BREF_TYPE_DATA:
759         case HAMMER2_BREF_TYPE_INDIRECT:
760                 /*
761                  * Perform the copy-on-write operation
762                  */
763                 KKASSERT(chain != &hmp->vchain);        /* safety */
764                 /*
765                  * The device buffer may be larger than the allocation size.
766                  */
767                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
768                         bbytes = HAMMER2_MINIOSIZE;
769                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
770                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
771
772                 /*
773                  * The getblk() optimization can only be used if the
774                  * physical block size matches the request.
775                  */
776                 if (chain->bytes == bbytes) {
777                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
778                         error = 0;
779                 } else {
780                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
781                         KKASSERT(error == 0);
782                 }
783                 bdata = (char *)nbp->b_data + boff;
784
785                 /*
786                  * Copy or zero-fill on write depending on whether
787                  * chain->data exists or not.
788                  */
789                 if (chain->data) {
790                         bcopy(chain->data, bdata, chain->bytes);
791                         KKASSERT(chain->bp != NULL);
792                 } else {
793                         bzero(bdata, chain->bytes);
794                 }
795                 if (chain->bp) {
796                         chain->bp->b_flags |= B_RELBUF;
797                         brelse(chain->bp);
798                 }
799                 chain->bp = nbp;
800                 chain->data = bdata;
801                 break;
802         default:
803                 panic("hammer2_chain_modify: illegal non-embedded type %d",
804                       chain->bref.type);
805                 break;
806
807         }
808 skip2:
809         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
810                 hammer2_chain_parent_setsubmod(hmp, chain);
811 }
812
813 /*
814  * Locate an in-memory chain.  The parent must be locked.  The in-memory
815  * chain is returned or NULL if no in-memory chain is present.
816  *
817  * NOTE: A chain on-media might exist for this index when NULL is returned.
818  */
819 hammer2_chain_t *
820 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
821 {
822         hammer2_chain_t dummy;
823         hammer2_chain_t *chain;
824
825         dummy.index = index;
826         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
827         return (chain);
828 }
829
830 /*
831  * Return a locked chain structure with all associated data acquired.
832  *
833  * Caller must lock the parent on call, the returned child will be locked.
834  */
835 hammer2_chain_t *
836 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
837                   int index, int flags)
838 {
839         hammer2_blockref_t *bref;
840         hammer2_chain_t *chain;
841         hammer2_chain_t dummy;
842         int how;
843
844         /*
845          * Figure out how to lock.  MAYBE can be used to optimized
846          * the initial-create state for indirect blocks.
847          */
848         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
849                 how = HAMMER2_RESOLVE_NEVER;
850         else
851                 how = HAMMER2_RESOLVE_MAYBE;
852
853         /*
854          * First see if we have a (possibly modified) chain element cached
855          * for this (parent, index).  Acquire the data if necessary.
856          *
857          * If chain->data is non-NULL the chain should already be marked
858          * modified.
859          */
860         dummy.index = index;
861         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
862         if (chain) {
863                 if (flags & HAMMER2_LOOKUP_NOLOCK)
864                         hammer2_chain_ref(hmp, chain);
865                 else
866                         hammer2_chain_lock(hmp, chain, how);
867                 return(chain);
868         }
869
870         /*
871          * the get function must always succeed, panic if there's no
872          * data to index.
873          */
874         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
875                 panic("hammer2_chain_get: Missing bref(1)");
876                 /* NOT REACHED */
877         }
878
879         /*
880          * Otherwise lookup the bref and issue I/O (switch on the parent)
881          */
882         switch(parent->bref.type) {
883         case HAMMER2_BREF_TYPE_INODE:
884                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
885                 bref = &parent->data->ipdata.u.blockset.blockref[index];
886                 break;
887         case HAMMER2_BREF_TYPE_INDIRECT:
888                 KKASSERT(parent->data != NULL);
889                 KKASSERT(index >= 0 &&
890                          index < parent->bytes / sizeof(hammer2_blockref_t));
891                 bref = &parent->data->npdata.blockref[index];
892                 break;
893         case HAMMER2_BREF_TYPE_VOLUME:
894                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
895                 bref = &hmp->voldata.sroot_blockset.blockref[index];
896                 break;
897         default:
898                 bref = NULL;
899                 panic("hammer2_chain_get: unrecognized blockref type: %d",
900                       parent->bref.type);
901         }
902         if (bref->type == 0) {
903                 panic("hammer2_chain_get: Missing bref(2)");
904                 /* NOT REACHED */
905         }
906
907         /*
908          * Allocate a chain structure representing the existing media
909          * entry.
910          *
911          * The locking operation we do later will issue I/O to read it.
912          */
913         chain = hammer2_chain_alloc(hmp, bref);
914
915         /*
916          * Link the chain into its parent.  Caller is expected to hold an
917          * exclusive lock on the parent.
918          */
919         chain->parent = parent;
920         chain->index = index;
921         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
922                 panic("hammer2_chain_link: collision");
923         KKASSERT(parent->refs > 0);
924         atomic_add_int(&parent->refs, 1);       /* for splay entry */
925
926         /*
927          * Additional linkage for inodes.  Reuse the parent pointer to
928          * find the parent directory.
929          */
930         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
931                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
932                         parent = parent->parent;
933                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
934                         chain->u.ip->pip = parent->u.ip;
935         }
936
937         /*
938          * Our new chain structure has already been referenced and locked
939          * but the lock code handles the I/O so call it to resolve the data.
940          * Then release one of our two exclusive locks.
941          *
942          * If NOLOCK is set the release will release the one-and-only lock.
943          */
944         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
945                 hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
946                 hammer2_chain_drop(hmp, chain);         /* excess ref */
947         }
948         lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
949
950         return (chain);
951 }
952
953 /*
954  * Locate any key between key_beg and key_end inclusive.  (*parentp)
955  * typically points to an inode but can also point to a related indirect
956  * block and this function will recurse upwards and find the inode again.
957  *
958  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
959  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
960  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
961  *
962  * (*parentp) must be exclusively locked and referenced and can be an inode
963  * or an existing indirect block within the inode.
964  *
965  * On return (*parentp) will be modified to point at the deepest parent chain
966  * element encountered during the search, as a helper for an insertion or
967  * deletion.   The new (*parentp) will be locked and referenced and the old
968  * will be unlocked and dereferenced (no change if they are both the same).
969  *
970  * The matching chain will be returned exclusively locked and referenced.
971  *
972  * NULL is returned if no match was found, but (*parentp) will still
973  * potentially be adjusted.
974  *
975  * This function will also recurse up the chain if the key is not within the
976  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
977  * can simply allow (*parentp) to float inside the loop.
978  */
979 hammer2_chain_t *
980 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
981                      hammer2_key_t key_beg, hammer2_key_t key_end,
982                      int flags)
983 {
984         hammer2_chain_t *parent;
985         hammer2_chain_t *chain;
986         hammer2_chain_t *tmp;
987         hammer2_blockref_t *base;
988         hammer2_blockref_t *bref;
989         hammer2_key_t scan_beg;
990         hammer2_key_t scan_end;
991         int count = 0;
992         int i;
993
994         /*
995          * Recurse (*parentp) upward if necessary until the parent completely
996          * encloses the key range or we hit the inode.
997          */
998         parent = *parentp;
999         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1000                 scan_beg = parent->bref.key;
1001                 scan_end = scan_beg +
1002                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1003                 if (key_beg >= scan_beg && key_end <= scan_end)
1004                         break;
1005                 hammer2_chain_ref(hmp, parent);         /* ref old parent */
1006                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1007                 parent = parent->parent;
1008                                                         /* lock new parent */
1009                 hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1010                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
1011                 *parentp = parent;                      /* new parent */
1012         }
1013
1014 again:
1015         /*
1016          * Locate the blockref array.  Currently we do a fully associative
1017          * search through the array.
1018          */
1019         switch(parent->bref.type) {
1020         case HAMMER2_BREF_TYPE_INODE:
1021                 /*
1022                  * Special shortcut for embedded data returns the inode
1023                  * itself.  Callers must detect this condition and access
1024                  * the embedded data (the strategy code does this for us).
1025                  *
1026                  * This is only applicable to regular files and softlinks.
1027                  */
1028                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1029                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1030                                 hammer2_chain_ref(hmp, parent);
1031                         else
1032                                 hammer2_chain_lock(hmp, parent,
1033                                                    HAMMER2_RESOLVE_ALWAYS);
1034                         return (parent);
1035                 }
1036                 base = &parent->data->ipdata.u.blockset.blockref[0];
1037                 count = HAMMER2_SET_COUNT;
1038                 break;
1039         case HAMMER2_BREF_TYPE_INDIRECT:
1040                 /*
1041                  * Optimize indirect blocks in the INITIAL state to avoid
1042                  * I/O.
1043                  */
1044                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1045                         base = NULL;
1046                 } else {
1047                         if (parent->data == NULL)
1048                                 panic("parent->data is NULL");
1049                         base = &parent->data->npdata.blockref[0];
1050                 }
1051                 count = parent->bytes / sizeof(hammer2_blockref_t);
1052                 break;
1053         case HAMMER2_BREF_TYPE_VOLUME:
1054                 base = &hmp->voldata.sroot_blockset.blockref[0];
1055                 count = HAMMER2_SET_COUNT;
1056                 break;
1057         default:
1058                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1059                       parent->bref.type);
1060                 base = NULL;    /* safety */
1061                 count = 0;      /* safety */
1062         }
1063
1064         /*
1065          * If the element and key overlap we use the element.
1066          */
1067         bref = NULL;
1068         for (i = 0; i < count; ++i) {
1069                 tmp = hammer2_chain_find(hmp, parent, i);
1070                 if (tmp) {
1071                         bref = &tmp->bref;
1072                         KKASSERT(bref->type != 0);
1073                 } else if (base == NULL || base[i].type == 0) {
1074                         continue;
1075                 } else {
1076                         bref = &base[i];
1077                 }
1078                 scan_beg = bref->key;
1079                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1080                 if (key_beg <= scan_end && key_end >= scan_beg)
1081                         break;
1082         }
1083         if (i == count) {
1084                 if (key_beg == key_end)
1085                         return (NULL);
1086                 return (hammer2_chain_next(hmp, parentp, NULL,
1087                                            key_beg, key_end, flags));
1088         }
1089
1090         /*
1091          * Acquire the new chain element.  If the chain element is an
1092          * indirect block we must search recursively.
1093          */
1094         chain = hammer2_chain_get(hmp, parent, i, flags);
1095         if (chain == NULL)
1096                 return (NULL);
1097
1098         /*
1099          * If the chain element is an indirect block it becomes the new
1100          * parent and we loop on it.
1101          *
1102          * The parent always has to be locked with at least RESOLVE_MAYBE,
1103          * so it might need a fixup if the caller passed incompatible flags.
1104          */
1105         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1106                 hammer2_chain_unlock(hmp, parent);
1107                 *parentp = parent = chain;
1108                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1109                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1110                         hammer2_chain_drop(hmp, chain); /* excess ref */
1111                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1112                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1113                         hammer2_chain_unlock(hmp, chain);
1114                 }
1115                 goto again;
1116         }
1117
1118         /*
1119          * All done, return chain
1120          */
1121         return (chain);
1122 }
1123
1124 /*
1125  * After having issued a lookup we can iterate all matching keys.
1126  *
1127  * If chain is non-NULL we continue the iteration from just after it's index.
1128  *
1129  * If chain is NULL we assume the parent was exhausted and continue the
1130  * iteration at the next parent.
1131  */
1132 hammer2_chain_t *
1133 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1134                    hammer2_chain_t *chain,
1135                    hammer2_key_t key_beg, hammer2_key_t key_end,
1136                    int flags)
1137 {
1138         hammer2_chain_t *parent;
1139         hammer2_chain_t *tmp;
1140         hammer2_blockref_t *base;
1141         hammer2_blockref_t *bref;
1142         hammer2_key_t scan_beg;
1143         hammer2_key_t scan_end;
1144         int i;
1145         int count;
1146
1147         parent = *parentp;
1148
1149 again:
1150         /*
1151          * Calculate the next index and recalculate the parent if necessary.
1152          */
1153         if (chain) {
1154                 /*
1155                  * Continue iteration within current parent.  If not NULL
1156                  * the passed-in chain may or may not be locked, based on
1157                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1158                  * or a prior next).
1159                  */
1160                 i = chain->index + 1;
1161                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1162                         hammer2_chain_drop(hmp, chain);
1163                 else
1164                         hammer2_chain_unlock(hmp, chain);
1165
1166                 /*
1167                  * Any scan where the lookup returned degenerate data embedded
1168                  * in the inode has an invalid index and must terminate.
1169                  */
1170                 if (chain == parent)
1171                         return(NULL);
1172                 chain = NULL;
1173         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
1174                 /*
1175                  * We reached the end of the iteration.
1176                  */
1177                 return (NULL);
1178         } else {
1179                 /*
1180                  * Continue iteration with next parent unless the current
1181                  * parent covers the range.
1182                  */
1183                 hammer2_chain_t *nparent;
1184
1185                 scan_beg = parent->bref.key;
1186                 scan_end = scan_beg +
1187                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1188                 if (key_beg >= scan_beg && key_end <= scan_end)
1189                         return (NULL);
1190
1191                 i = parent->index + 1;
1192                 nparent = parent->parent;
1193                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
1194                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1195                                                         /* lock new parent */
1196                 hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
1197                 hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
1198                 *parentp = parent = nparent;
1199         }
1200
1201 again2:
1202         /*
1203          * Locate the blockref array.  Currently we do a fully associative
1204          * search through the array.
1205          */
1206         switch(parent->bref.type) {
1207         case HAMMER2_BREF_TYPE_INODE:
1208                 base = &parent->data->ipdata.u.blockset.blockref[0];
1209                 count = HAMMER2_SET_COUNT;
1210                 break;
1211         case HAMMER2_BREF_TYPE_INDIRECT:
1212                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1213                         base = NULL;
1214                 } else {
1215                         KKASSERT(parent->data != NULL);
1216                         base = &parent->data->npdata.blockref[0];
1217                 }
1218                 count = parent->bytes / sizeof(hammer2_blockref_t);
1219                 break;
1220         case HAMMER2_BREF_TYPE_VOLUME:
1221                 base = &hmp->voldata.sroot_blockset.blockref[0];
1222                 count = HAMMER2_SET_COUNT;
1223                 break;
1224         default:
1225                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1226                       parent->bref.type);
1227                 base = NULL;    /* safety */
1228                 count = 0;      /* safety */
1229                 break;
1230         }
1231         KKASSERT(i <= count);
1232
1233         /*
1234          * Look for the key.  If we are unable to find a match and an exact
1235          * match was requested we return NULL.  If a range was requested we
1236          * run hammer2_chain_next() to iterate.
1237          */
1238         bref = NULL;
1239         while (i < count) {
1240                 tmp = hammer2_chain_find(hmp, parent, i);
1241                 if (tmp) {
1242                         bref = &tmp->bref;
1243                 } else if (base == NULL || base[i].type == 0) {
1244                         ++i;
1245                         continue;
1246                 } else {
1247                         bref = &base[i];
1248                 }
1249                 scan_beg = bref->key;
1250                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1251                 if (key_beg <= scan_end && key_end >= scan_beg)
1252                         break;
1253                 ++i;
1254         }
1255
1256         /*
1257          * If we couldn't find a match recurse up a parent to continue the
1258          * search.
1259          */
1260         if (i == count)
1261                 goto again;
1262
1263         /*
1264          * Acquire the new chain element.  If the chain element is an
1265          * indirect block we must search recursively.
1266          */
1267         chain = hammer2_chain_get(hmp, parent, i, flags);
1268         if (chain == NULL)
1269                 return (NULL);
1270
1271         /*
1272          * If the chain element is an indirect block it becomes the new
1273          * parent and we loop on it.
1274          *
1275          * The parent always has to be locked with at least RESOLVE_MAYBE,
1276          * so it might need a fixup if the caller passed incompatible flags.
1277          */
1278         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1279                 hammer2_chain_unlock(hmp, parent);
1280                 *parentp = parent = chain;
1281                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1282                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1283                         hammer2_chain_drop(hmp, chain); /* excess ref */
1284                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1285                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1286                         hammer2_chain_unlock(hmp, chain);
1287                 }
1288                 i = 0;
1289                 goto again2;
1290         }
1291
1292         /*
1293          * All done, return chain
1294          */
1295         return (chain);
1296 }
1297
1298 /*
1299  * Create and return a new hammer2 system memory structure of the specified
1300  * key, type and size and insert it RELATIVE TO (PARENT).
1301  *
1302  * (parent) is typically either an inode or an indirect block, acquired
1303  * acquired as a side effect of issuing a prior failed lookup.  parent
1304  * must be locked and held.  Do not pass the inode chain to this function
1305  * unless that is the chain returned by the failed lookup.
1306  *
1307  * Non-indirect types will automatically allocate indirect blocks as required
1308  * if the new item does not fit in the current (parent).
1309  *
1310  * Indirect types will move a portion of the existing blockref array in
1311  * (parent) into the new indirect type and then use one of the free slots
1312  * to emplace the new indirect type.
1313  *
1314  * A new locked, referenced chain element is returned of the specified type.
1315  * The element may or may not have a data area associated with it:
1316  *
1317  *      VOLUME          not allowed here
1318  *      INODE           embedded data are will be set-up
1319  *      INDIRECT        not allowed here
1320  *      DATA            no data area will be set-up (caller is expected
1321  *                      to have logical buffers, we don't want to alias
1322  *                      the data onto device buffers!).
1323  */
1324 hammer2_chain_t *
1325 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1326                      hammer2_chain_t *chain,
1327                      hammer2_key_t key, int keybits, int type, size_t bytes)
1328 {
1329         hammer2_blockref_t dummy;
1330         hammer2_blockref_t *base;
1331         hammer2_chain_t dummy_chain;
1332         int unlock_parent = 0;
1333         int allocated = 0;
1334         int count;
1335         int i;
1336
1337         if (chain == NULL) {
1338                 /*
1339                  * First allocate media space and construct the dummy bref,
1340                  * then allocate the in-memory chain structure.
1341                  */
1342                 bzero(&dummy, sizeof(dummy));
1343                 dummy.type = type;
1344                 dummy.key = key;
1345                 dummy.keybits = keybits;
1346                 dummy.data_off = hammer2_bytes_to_radix(bytes);
1347                 chain = hammer2_chain_alloc(hmp, &dummy);
1348                 allocated = 1;
1349
1350                 /*
1351                  * We set the WAS_MODIFIED flag here so the chain gets
1352                  * marked as modified below.
1353                  *
1354                  * We do NOT set INITIAL here (yet).  INITIAL is only
1355                  * used for indirect blocks.
1356                  */
1357                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
1358
1359                 /*
1360                  * Recalculate bytes to reflect the actual media block
1361                  * allocation.
1362                  */
1363                 bytes = (hammer2_off_t)1 <<
1364                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1365                 chain->bytes = bytes;
1366
1367                 switch(type) {
1368                 case HAMMER2_BREF_TYPE_VOLUME:
1369                         panic("hammer2_chain_create: called with volume type");
1370                         break;
1371                 case HAMMER2_BREF_TYPE_INODE:
1372                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1373                         chain->data = (void *)&chain->u.ip->ip_data;
1374                         break;
1375                 case HAMMER2_BREF_TYPE_INDIRECT:
1376                         panic("hammer2_chain_create: cannot be used to"
1377                               "create indirect block");
1378                         break;
1379                 case HAMMER2_BREF_TYPE_DATA:
1380                 default:
1381                         /* leave chain->data NULL */
1382                         KKASSERT(chain->data == NULL);
1383                         break;
1384                 }
1385         } else {
1386                 /*
1387                  * Potentially update the chain's key/keybits, but it will
1388                  * only be marked modified if WAS_MODIFIED is set (if it
1389                  * was modified at the time of its removal during a rename).
1390                  */
1391                 chain->bref.key = key;
1392                 chain->bref.keybits = keybits;
1393         }
1394
1395 again:
1396         /*
1397          * Locate a free blockref in the parent's array
1398          */
1399         switch(parent->bref.type) {
1400         case HAMMER2_BREF_TYPE_INODE:
1401                 KKASSERT(parent->data != NULL);
1402                 base = &parent->data->ipdata.u.blockset.blockref[0];
1403                 count = HAMMER2_SET_COUNT;
1404                 break;
1405         case HAMMER2_BREF_TYPE_INDIRECT:
1406                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1407                         base = NULL;
1408                 } else {
1409                         KKASSERT(parent->data != NULL);
1410                         base = &parent->data->npdata.blockref[0];
1411                 }
1412                 count = parent->bytes / sizeof(hammer2_blockref_t);
1413                 break;
1414         case HAMMER2_BREF_TYPE_VOLUME:
1415                 KKASSERT(parent->data != NULL);
1416                 base = &hmp->voldata.sroot_blockset.blockref[0];
1417                 count = HAMMER2_SET_COUNT;
1418                 break;
1419         default:
1420                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1421                       parent->bref.type);
1422                 count = 0;
1423                 break;
1424         }
1425
1426         /*
1427          * Scan for an unallocated bref, also skipping any slots occupied
1428          * by in-memory chain elements that may not yet have been updated
1429          * in the parent's bref array.
1430          */
1431         bzero(&dummy_chain, sizeof(dummy_chain));
1432         for (i = 0; i < count; ++i) {
1433                 if (base == NULL) {
1434                         dummy_chain.index = i;
1435                         if (SPLAY_FIND(hammer2_chain_splay,
1436                                        &parent->shead, &dummy_chain) == NULL) {
1437                                 break;
1438                         }
1439                 } else if (base[i].type == 0) {
1440                         dummy_chain.index = i;
1441                         if (SPLAY_FIND(hammer2_chain_splay,
1442                                        &parent->shead, &dummy_chain) == NULL) {
1443                                 break;
1444                         }
1445                 }
1446         }
1447
1448         /*
1449          * If no free blockref count be found we must create an indirect
1450          * block and move a number of blockrefs into it.  With the parent
1451          * locked we can safely lock each child in order to move it without
1452          * causing a deadlock.
1453          *
1454          * This may return the new indirect block or the old parent depending
1455          * on where the key falls.
1456          */
1457         if (i == count) {
1458                 hammer2_chain_t *nparent;
1459
1460                 nparent = hammer2_chain_create_indirect(hmp, parent,
1461                                                         key, keybits);
1462                 if (nparent == NULL) {
1463                         if (allocated)
1464                                 hammer2_chain_free(hmp, chain);
1465                         chain = NULL;
1466                         goto done;
1467                 }
1468                 if (parent != nparent) {
1469                         if (unlock_parent)
1470                                 hammer2_chain_unlock(hmp, parent);
1471                         parent = nparent;
1472                         unlock_parent = 1;
1473                 }
1474                 goto again;
1475         }
1476
1477         /*
1478          * Link the chain into its parent.
1479          */
1480         if (chain->parent != NULL)
1481                 panic("hammer2: hammer2_chain_create: chain already connected");
1482         KKASSERT(chain->parent == NULL);
1483         chain->parent = parent;
1484         chain->index = i;
1485         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1486                 panic("hammer2_chain_link: collision");
1487         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1488         KKASSERT(parent->refs > 0);
1489         atomic_add_int(&parent->refs, 1);
1490
1491         /*
1492          * Additional linkage for inodes.  Reuse the parent pointer to
1493          * find the parent directory.
1494          */
1495         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1496                 hammer2_chain_t *scan = parent;
1497                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1498                         scan = scan->parent;
1499                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE)
1500                         chain->u.ip->pip = scan->u.ip;
1501         }
1502
1503         /*
1504          * WAS_MODIFIED indicates that this is a newly-created chain element
1505          * rather than a renamed chain element.  In this situation we want
1506          * to place the chain element in the MODIFIED1 state.
1507          *
1508          * The data area will be set up as follows:
1509          *
1510          *      VOLUME          not allowed here.
1511          *
1512          *      INODE           embedded data are will be set-up.
1513          *
1514          *      INDIRECT        not allowed here.
1515          *
1516          *      DATA            no data area will be set-up (caller is expected
1517          *                      to have logical buffers, we don't want to alias
1518          *                      the data onto device buffers!).
1519          */
1520         if (chain->flags & HAMMER2_CHAIN_WAS_MODIFIED) {
1521                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
1522                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1523                         hammer2_chain_modify(hmp, chain,
1524                                              HAMMER2_MODIFY_OPTDATA);
1525                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1526                         /* not supported in this function */
1527                         panic("hammer2_chain_create: bad type");
1528                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1529                         hammer2_chain_modify(hmp, chain,
1530                                              HAMMER2_MODIFY_OPTDATA);
1531                 } else {
1532                         hammer2_chain_modify(hmp, chain, 0);
1533                 }
1534         }
1535
1536 done:
1537         if (unlock_parent)
1538                 hammer2_chain_unlock(hmp, parent);
1539         return (chain);
1540 }
1541
1542 /*
1543  * Create an indirect block that covers one or more of the elements in the
1544  * current parent.  Either returns the existing parent with no locking or
1545  * ref changes or returns the new indirect block locked and referenced,
1546  * depending on what the specified key falls into.
1547  *
1548  * The key/keybits for the indirect mode only needs to follow three rules:
1549  *
1550  * (1) That all elements underneath it fit within its key space and
1551  *
1552  * (2) That all elements outside it are outside its key space.
1553  *
1554  * (3) When creating the new indirect block any elements in the current
1555  *     parent that fit within the new indirect block's keyspace must be
1556  *     moved into the new indirect block.
1557  *
1558  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1559  *     keyspace the the current parent, but lookup/iteration rules will
1560  *     ensure (and must ensure) that rule (2) for all parents leading up
1561  *     to the nearest inode or the root volume header is adhered to.  This
1562  *     is accomplished by always recursing through matching keyspaces in
1563  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1564  *
1565  * The current implementation calculates the current worst-case keyspace by
1566  * iterating the current parent and then divides it into two halves, choosing
1567  * whichever half has the most elements (not necessarily the half containing
1568  * the requested key).
1569  *
1570  * We can also opt to use the half with the least number of elements.  This
1571  * causes lower-numbered keys (aka logical file offsets) to recurse through
1572  * fewer indirect blocks and higher-numbered keys to recurse through more.
1573  * This also has the risk of not moving enough elements to the new indirect
1574  * block and being forced to create several indirect blocks before the element
1575  * can be inserted.
1576  */
1577 static
1578 hammer2_chain_t *
1579 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1580                               hammer2_key_t create_key, int create_bits)
1581 {
1582         hammer2_blockref_t *base;
1583         hammer2_blockref_t *bref;
1584         hammer2_chain_t *chain;
1585         hammer2_chain_t *ichain;
1586         hammer2_chain_t dummy;
1587         hammer2_key_t key = create_key;
1588         int keybits = create_bits;
1589         int locount = 0;
1590         int hicount = 0;
1591         int count;
1592         int nbytes;
1593         int i;
1594
1595         /*
1596          * Calculate the base blockref pointer or NULL if the chain
1597          * is known to be empty.
1598          */
1599         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1600         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1601                 base = NULL;
1602
1603                 /*
1604                  * We still need to calculate the count for SPLAY lookups
1605                  */
1606                 switch(parent->bref.type) {
1607                 case HAMMER2_BREF_TYPE_INODE:
1608                         count = HAMMER2_SET_COUNT;
1609                         break;
1610                 case HAMMER2_BREF_TYPE_INDIRECT:
1611                         count = parent->bytes / sizeof(hammer2_blockref_t);
1612                         break;
1613                 case HAMMER2_BREF_TYPE_VOLUME:
1614                         count = HAMMER2_SET_COUNT;
1615                         break;
1616                 default:
1617                         panic("hammer2_chain_create_indirect: "
1618                               "unrecognized blockref type: %d",
1619                               parent->bref.type);
1620                         count = 0;
1621                         break;
1622                 }
1623         } else {
1624                 /*
1625                  * Locate a free blockref in the parent's array
1626                  */
1627                 switch(parent->bref.type) {
1628                 case HAMMER2_BREF_TYPE_INODE:
1629                         base = &parent->data->ipdata.u.blockset.blockref[0];
1630                         count = HAMMER2_SET_COUNT;
1631                         break;
1632                 case HAMMER2_BREF_TYPE_INDIRECT:
1633                         base = &parent->data->npdata.blockref[0];
1634                         count = parent->bytes / sizeof(hammer2_blockref_t);
1635                         break;
1636                 case HAMMER2_BREF_TYPE_VOLUME:
1637                         base = &hmp->voldata.sroot_blockset.blockref[0];
1638                         count = HAMMER2_SET_COUNT;
1639                         break;
1640                 default:
1641                         panic("hammer2_chain_create_indirect: "
1642                               "unrecognized blockref type: %d",
1643                               parent->bref.type);
1644                         count = 0;
1645                         break;
1646                 }
1647         }
1648
1649         /*
1650          * Scan for an unallocated bref, also skipping any slots occupied
1651          * by in-memory chain elements that may not yet have been updated
1652          * in the parent's bref array.
1653          */
1654         bzero(&dummy, sizeof(dummy));
1655         for (i = 0; i < count; ++i) {
1656                 int nkeybits;
1657
1658                 /*
1659                  * Optimize the case where the parent is still in its
1660                  * initially created state.
1661                  */
1662                 if (base == NULL || base[i].type == 0) {
1663                         dummy.index = i;
1664                         chain = SPLAY_FIND(hammer2_chain_splay,
1665                                            &parent->shead, &dummy);
1666                         if (chain == NULL)
1667                                 continue;
1668                         bref = &chain->bref;
1669                 } else {
1670                         bref = &base[i];
1671                 }
1672
1673                 /*
1674                  * Expand our calculated key range (key, keybits) to fit
1675                  * the scanned key.  nkeybits represents the full range
1676                  * that we will later cut in half (two halves @ nkeybits - 1).
1677                  */
1678                 nkeybits = keybits;
1679                 if (nkeybits < bref->keybits)
1680                         nkeybits = bref->keybits;
1681                 while ((~(((hammer2_key_t)1 << nkeybits) - 1) &
1682                         (key ^ bref->key)) != 0) {
1683                         ++nkeybits;
1684                 }
1685
1686                 /*
1687                  * If the new key range is larger we have to determine
1688                  * which side of the new key range the existing keys fall
1689                  * under by checking the high bit, then collapsing the
1690                  * locount into the hicount or vise-versa.
1691                  */
1692                 if (keybits != nkeybits) {
1693                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1694                                 hicount += locount;
1695                                 locount = 0;
1696                         } else {
1697                                 locount += hicount;
1698                                 hicount = 0;
1699                         }
1700                         keybits = nkeybits;
1701                 }
1702
1703                 /*
1704                  * The newly scanned key will be in the lower half or the
1705                  * higher half of the (new) key range.
1706                  */
1707                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1708                         ++hicount;
1709                 else
1710                         ++locount;
1711         }
1712
1713         /*
1714          * Adjust keybits to represent half of the full range calculated
1715          * above.
1716          */
1717         --keybits;
1718
1719         /*
1720          * Select whichever half contains the most elements.  Theoretically
1721          * we can select either side as long as it contains at least one
1722          * element (in order to ensure that a free slot is present to hold
1723          * the indirect block).
1724          */
1725         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1726         if (hammer2_indirect_optimize) {
1727                 /*
1728                  * Insert node for least number of keys, this will arrange
1729                  * the first few blocks of a large file or the first few
1730                  * inodes in a directory with fewer indirect blocks when
1731                  * created linearly.
1732                  */
1733                 if (hicount < locount && hicount != 0)
1734                         key |= (hammer2_key_t)1 << keybits;
1735                 else
1736                         key &= ~(hammer2_key_t)1 << keybits;
1737         } else {
1738                 /*
1739                  * Insert node for most number of keys, best for heavily
1740                  * fragmented files.
1741                  */
1742                 if (hicount > locount)
1743                         key |= (hammer2_key_t)1 << keybits;
1744                 else
1745                         key &= ~(hammer2_key_t)1 << keybits;
1746         }
1747
1748         /*
1749          * How big should our new indirect block be?  It has to be at least
1750          * as large as its parent.
1751          */
1752         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
1753                 nbytes = HAMMER2_IND_BYTES_MIN;
1754         else
1755                 nbytes = HAMMER2_IND_BYTES_MAX;
1756         if (nbytes < count * sizeof(hammer2_blockref_t))
1757                 nbytes = count * sizeof(hammer2_blockref_t);
1758
1759         /*
1760          * Ok, create our new indirect block
1761          */
1762         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1763         dummy.bref.key = key;
1764         dummy.bref.keybits = keybits;
1765         dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
1766         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1767         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
1768
1769         /*
1770          * Iterate the original parent and move the matching brefs into
1771          * the new indirect block.
1772          */
1773         for (i = 0; i < count; ++i) {
1774                 /*
1775                  * For keying purposes access the bref from the media or
1776                  * from our in-memory cache.  In cases where the in-memory
1777                  * cache overrides the media the keyrefs will be the same
1778                  * anyway so we can avoid checking the cache when the media
1779                  * has a key.
1780                  */
1781                 if (base == NULL || base[i].type == 0) {
1782                         dummy.index = i;
1783                         chain = SPLAY_FIND(hammer2_chain_splay,
1784                                            &parent->shead, &dummy);
1785                         if (chain == NULL) {
1786                                 /*
1787                                  * Select index indirect block is placed in
1788                                  */
1789                                 if (ichain->index < 0)
1790                                         ichain->index = i;
1791                                 continue;
1792                         }
1793                         bref = &chain->bref;
1794                 } else {
1795                         bref = &base[i];
1796                 }
1797
1798                 /*
1799                  * Skip keys not in the chosen half (low or high), only bit
1800                  * (keybits - 1) needs to be compared but for safety we
1801                  * will compare all msb bits plus that bit again.
1802                  */
1803                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1804                     (key ^ bref->key)) != 0) {
1805                         continue;
1806                 }
1807
1808                 /*
1809                  * This element is being moved, its slot is available
1810                  * for our indirect block.
1811                  */
1812                 if (ichain->index < 0)
1813                         ichain->index = i;
1814
1815                 /*
1816                  * Load the new indirect block by acquiring or allocating
1817                  * the related chain entries, then simply move it to the
1818                  * new parent (ichain).
1819                  *
1820                  * Flagging the new chain entry MOVED will cause a flush
1821                  * to synchronize its block into the new indirect block.
1822                  * The chain is unlocked after being moved but needs to
1823                  * retain a reference for the MOVED state
1824                  *
1825                  * We must still set SUBMODIFIED in the parent but we do
1826                  * that after the loop.
1827                  *
1828                  * XXX we really need a lock here but we don't need the
1829                  *     data.  NODATA feature needed.
1830                  */
1831                 chain = hammer2_chain_get(hmp, parent, i,
1832                                           HAMMER2_LOOKUP_NODATA);
1833                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1834                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1835                         panic("hammer2_chain_create_indirect: collision");
1836                 chain->parent = ichain;
1837                 if (base)
1838                         bzero(&base[i], sizeof(base[i]));
1839                 atomic_add_int(&parent->refs, -1);
1840                 atomic_add_int(&ichain->refs, 1);
1841                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1842                         hammer2_chain_ref(hmp, chain);
1843                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1844                 }
1845                 hammer2_chain_unlock(hmp, chain);
1846                 KKASSERT(parent->refs > 0);
1847                 chain = NULL;
1848         }
1849
1850         /*
1851          * Insert the new indirect block into the parent now that we've
1852          * cleared out some entries in the parent.  We calculated a good
1853          * insertion index in the loop above (ichain->index).
1854          */
1855         KKASSERT(ichain->index >= 0);
1856         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1857                 panic("hammer2_chain_create_indirect: ichain insertion");
1858         ichain->parent = parent;
1859         atomic_add_int(&parent->refs, 1);
1860
1861         /*
1862          * Mark the new indirect block modified after insertion, which
1863          * will propagate up through parent all the way to the root and
1864          * also allocate the physical block in ichain for our caller,
1865          * and assign ichain->data to a pre-zero'd space (because there
1866          * is not prior data to copy into it).
1867          *
1868          * We have to set SUBMODIFIED in ichain's flags manually so the
1869          * flusher knows it has to recurse through it to get to all of
1870          * our moved blocks, then call setsubmod() to set the bit
1871          * recursively.
1872          */
1873         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
1874         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1875         hammer2_chain_parent_setsubmod(hmp, ichain);
1876
1877         /*
1878          * Figure out what to return.
1879          */
1880         if (create_bits >= keybits) {
1881                 /*
1882                  * Key being created is way outside the key range,
1883                  * return the original parent.
1884                  */
1885                 hammer2_chain_unlock(hmp, ichain);
1886         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1887                    (create_key ^ key)) {
1888                 /*
1889                  * Key being created is outside the key range,
1890                  * return the original parent.
1891                  */
1892                 hammer2_chain_unlock(hmp, ichain);
1893         } else {
1894                 /*
1895                  * Otherwise its in the range, return the new parent.
1896                  */
1897                 parent = ichain;
1898         }
1899
1900         return(parent);
1901 }
1902
1903 /*
1904  * Physically delete the specified chain element.  Note that inodes with
1905  * open descriptors should not be deleted (as with other filesystems) until
1906  * the last open descriptor is closed.
1907  *
1908  * This routine will remove the chain element from its parent and potentially
1909  * also recurse upward and delete indirect blocks which become empty as a
1910  * side effect.
1911  *
1912  * The caller must pass a pointer to the chain's parent, also locked and
1913  * referenced.  (*parentp) will be modified in a manner similar to a lookup
1914  * or iteration when indirect blocks are also deleted as a side effect.
1915  */
1916 void
1917 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1918                      hammer2_chain_t *chain)
1919 {
1920         hammer2_blockref_t *base;
1921         int count;
1922
1923         if (chain->parent != parent)
1924                 panic("hammer2_chain_delete: parent mismatch");
1925
1926         /*
1927          * Mark the parent modified so our base[] pointer remains valid
1928          * while we move entries.  For the optimized indirect block
1929          * case mark the parent moved instead.
1930          *
1931          * Calculate the blockref reference in the parent
1932          */
1933         switch(parent->bref.type) {
1934         case HAMMER2_BREF_TYPE_INODE:
1935                 hammer2_chain_modify(hmp, parent, 0);
1936                 base = &parent->data->ipdata.u.blockset.blockref[0];
1937                 count = HAMMER2_SET_COUNT;
1938                 break;
1939         case HAMMER2_BREF_TYPE_INDIRECT:
1940                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1941                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
1942                         base = NULL;
1943                 else
1944                         base = &parent->data->npdata.blockref[0];
1945                 count = parent->bytes / sizeof(hammer2_blockref_t);
1946                 break;
1947         case HAMMER2_BREF_TYPE_VOLUME:
1948                 hammer2_chain_modify(hmp, parent, 0);
1949                 base = &hmp->voldata.sroot_blockset.blockref[0];
1950                 count = HAMMER2_SET_COUNT;
1951                 break;
1952         default:
1953                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
1954                       parent->bref.type);
1955                 count = 0;
1956                 break;
1957         }
1958
1959         /*
1960          * Disconnect the bref in the parent, remove the chain, and
1961          * disconnect in-memory fields from the parent.
1962          */
1963         KKASSERT(chain->index >= 0 && chain->index < count);
1964         if (base)
1965                 bzero(&base[chain->index], sizeof(*base));
1966
1967         SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1968         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1969         atomic_add_int(&parent->refs, -1);      /* for splay entry */
1970         chain->index = -1;
1971         chain->parent = NULL;
1972
1973         /*
1974          * If this is an inode clear the pip.
1975          */
1976         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
1977                 chain->u.ip->pip = NULL;
1978
1979         /*
1980          * The chain is still likely referenced, possibly even by a vnode
1981          * (if an inode), so defer further action until the chain gets
1982          * dropped.
1983          */
1984 }
1985
1986 /*
1987  * Recursively flush the specified chain.  The chain is locked and
1988  * referenced by the caller and will remain so on return.
1989  *
1990  * This cannot be called with the volume header's vchain (yet).
1991  *
1992  * PASS1 - clear the MODIFIED1 bit.
1993  */
1994 static void
1995 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain, int tab)
1996 {
1997         hammer2_blockref_t *bref;
1998         hammer2_off_t pbase;
1999         size_t bbytes;
2000         size_t boff;
2001         char *bdata;
2002         struct buf *bp;
2003         int error;
2004
2005         if (hammer2_debug & 0x0008)
2006                 kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
2007                         tab, tab, "",
2008                         chain->bref.type, chain->bref.data_off,
2009                         chain, chain->refs, chain->flags);
2010
2011         /*
2012          * Flush any children of this chain entry.
2013          *
2014          * NOTE: If we use a while() here an active filesystem can
2015          *       prevent the flush from ever finishing.
2016          */
2017         if (chain->flags &
2018             (HAMMER2_CHAIN_SUBMODIFIED | HAMMER2_CHAIN_DESTROYED)) {
2019                 hammer2_blockref_t *base;
2020                 hammer2_chain_t *scan;
2021                 hammer2_chain_t *next;
2022                 int count;
2023                 int submodified = 0;
2024                 int submoved = 0;
2025
2026                 /*
2027                  * Clear SUBMODIFIED now.  Flag any races during the flush
2028                  * with the (submodified) local variable and re-arm it
2029                  * as necessary after the loop is done.
2030                  *
2031                  * Delaying the setting of the chain to MODIFIED can reduce
2032                  * unnecessary I/O.
2033                  *
2034                  * Modifications to the children will propagate up, forcing
2035                  * us to become modified and copy-on-write too.  Be sure
2036                  * to modify chain (as a side effect of the recursive
2037                  * flush) ONLY if it is actually being modified by the
2038                  * recursive flush.
2039                  */
2040                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2041
2042                 /*
2043                  * Flush the children and update the blockrefs in the parent.
2044                  * Be careful of ripouts during the loop.
2045                  */
2046                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2047                 while ((scan = next) != NULL) {
2048                         next = SPLAY_NEXT(hammer2_chain_splay,
2049                                           &chain->shead, scan);
2050                         if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2051                                             HAMMER2_CHAIN_MODIFIED1 |
2052                                             HAMMER2_CHAIN_MOVED)) == 0) {
2053                                 continue;
2054                         }
2055
2056                         /*
2057                          * Recurse the flush
2058                          */
2059                         hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
2060                         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2061                                 atomic_set_int(&scan->flags,
2062                                                HAMMER2_CHAIN_DESTROYED);
2063                         }
2064                         hammer2_chain_flush_pass1(hmp, scan, tab + 4);
2065
2066                         /*
2067                          * No point loading blockrefs yet if the
2068                          * child (recursively) is still dirty.
2069                          */
2070                         if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2071                                            HAMMER2_CHAIN_MODIFIED1)) {
2072                                 submodified = 1;
2073                                 if (hammer2_debug & 0x0008)
2074                                         kprintf("s");
2075                         }
2076                         if (scan->flags & HAMMER2_CHAIN_MOVED) {
2077                                 if (hammer2_debug & 0x0008)
2078                                         kprintf("m");
2079                                 submoved = 1;
2080                         }
2081                         if (hammer2_debug & 0x0008)
2082                                 kprintf("\n");
2083                         hammer2_chain_unlock(hmp, scan);
2084                 }
2085
2086                 if (submodified || (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
2087                         /*
2088                          * No point loading up the blockrefs if submodified
2089                          * got re-set.
2090                          *
2091                          * NOTE: Even though we cleared the SUBMODIFIED flag
2092                          *       it can still get re-set by operations
2093                          *       occuring under our chain, so check both.
2094                          */
2095                         atomic_set_int(&chain->flags,
2096                                        HAMMER2_CHAIN_SUBMODIFIED);
2097                 } else if (submoved) {
2098                         /*
2099                          * Ok, we can modify the blockrefs in this chain
2100                          * entry.  Mark it modified.  Calculate the
2101                          * blockref array after marking it modified (since
2102                          * that may change the underlying data ptr).
2103                          *
2104                          * NOTE: We only do this if submoved != 0, otherwise
2105                          *       there may not be any changes and setting
2106                          *       the chain modified will re-arm the MOVED
2107                          *       bit recursively, resulting in O(N^2)
2108                          *       flushes.
2109                          *
2110                          * NOTE: We don't want hammer2_chain_modify() to
2111                          *       recursively set the SUBMODIFIED flag
2112                          *       upward in this case!
2113                          */
2114                         hammer2_chain_modify(hmp, chain, HAMMER2_MODIFY_NOSUB);
2115
2116                         switch(chain->bref.type) {
2117                         case HAMMER2_BREF_TYPE_INODE:
2118                                 base = &chain->data->ipdata.u.blockset.
2119                                         blockref[0];
2120                                 count = HAMMER2_SET_COUNT;
2121                                 break;
2122                         case HAMMER2_BREF_TYPE_INDIRECT:
2123                                 base = &chain->data->npdata.blockref[0];
2124                                 count = chain->bytes /
2125                                         sizeof(hammer2_blockref_t);
2126                                 break;
2127                         case HAMMER2_BREF_TYPE_VOLUME:
2128                                 base = &hmp->voldata.sroot_blockset.blockref[0];
2129                                 count = HAMMER2_SET_COUNT;
2130                                 break;
2131                         default:
2132                                 base = NULL;
2133                                 panic("hammer2_chain_get: "
2134                                       "unrecognized blockref type: %d",
2135                                       chain->bref.type);
2136                         }
2137
2138                         /*
2139                          * Update the blockrefs.
2140                          */
2141                         next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2142                         while ((scan = next) != NULL) {
2143                                 next = SPLAY_NEXT(hammer2_chain_splay,
2144                                                   &chain->shead, scan);
2145                                 KKASSERT(scan->index >= 0 &&
2146                                          scan->index < count);
2147                                 hammer2_chain_lock(hmp, scan,
2148                                                    HAMMER2_RESOLVE_NEVER);
2149                                 base[scan->index] = scan->bref;
2150                                 if (scan->flags & HAMMER2_CHAIN_MOVED) {
2151                                         atomic_clear_int(&scan->flags,
2152                                                  HAMMER2_CHAIN_MOVED);
2153                                         hammer2_chain_drop(hmp, scan);
2154                                 }
2155                                 hammer2_chain_unlock(hmp, scan);
2156                         }
2157                 }
2158         }
2159
2160         /*
2161          * If destroying the object we unconditonally clear the MODIFIED1
2162          * and MOVED bits, and we destroy the buffer without writing it
2163          * out.
2164          *
2165          * We don't bother updating the hash/crc or the parent bref.
2166          *
2167          * XXX allocations for unflushed data can be returned to the
2168          *     free pool.
2169          */
2170         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2171                 if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
2172                         if (chain->bp) {
2173                                 chain->bp->b_flags |= B_INVAL|B_RELBUF;
2174                         }
2175                         atomic_clear_int(&chain->flags,
2176                                          HAMMER2_CHAIN_MODIFIED1);
2177                         hammer2_chain_drop(hmp, chain);
2178                 }
2179                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2180                         atomic_clear_int(&chain->flags,
2181                                          HAMMER2_CHAIN_MOVED);
2182                         hammer2_chain_drop(hmp, chain);
2183                 }
2184                 return;
2185         }
2186
2187         /*
2188          * Flush this chain entry only if it is marked modified.
2189          */
2190         if ((chain->flags & HAMMER2_CHAIN_MODIFIED1) == 0) {
2191                 goto done;
2192                 return;
2193         }
2194
2195         /*
2196          * Clear MODIFIED1 and set HAMMER2_CHAIN_MOVED.  The caller
2197          * will re-test the MOVED bit.
2198          *
2199          * bits own a single parent ref and the MOVED bit owns its own
2200          * parent ref.
2201          */
2202         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
2203         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2204                 hammer2_chain_drop(hmp, chain);
2205         } else {
2206                 /* inherit ref from the MODIFIED1 we cleared */
2207                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2208         }
2209
2210         /*
2211          * If this is part of a recursive flush we can go ahead and write
2212          * out the buffer cache buffer and pass a new bref back up the chain.
2213          *
2214          * This will never be a volume header.
2215          */
2216         switch(chain->bref.type) {
2217         case HAMMER2_BREF_TYPE_VOLUME:
2218                 /*
2219                  * The volume header is flushed manually by the syncer, not
2220                  * here.
2221                  */
2222                 break;
2223         case HAMMER2_BREF_TYPE_DATA:
2224                 /*
2225                  * Data elements have already been flushed via the logical
2226                  * file buffer cache.  Their hash was set in the bref by
2227                  * the vop_write code.
2228                  */
2229                 break;
2230         case HAMMER2_BREF_TYPE_INDIRECT:
2231                 /*
2232                  * Indirect blocks may be in an INITIAL state.
2233                  */
2234                 break;
2235         default:
2236                 /*
2237                  * Embedded elements have to be flushed out.
2238                  */
2239                 KKASSERT(chain->data != NULL);
2240                 bref = &chain->bref;
2241
2242                 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
2243
2244                 if (chain->bp == NULL) {
2245                         /*
2246                          * The data is embedded, we have to acquire the
2247                          * buffer cache buffer and copy the data into it.
2248                          */
2249                         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
2250                                 bbytes = HAMMER2_MINIOSIZE;
2251                         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
2252                         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2253
2254                         /*
2255                          * The getblk() optimization can only be used if the
2256                          * physical block size matches the request.
2257                          */
2258                         if (chain->bytes == bbytes) {
2259                                 bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
2260                                 error = 0;
2261                         } else {
2262                                 error = bread(hmp->devvp, pbase, bbytes, &bp);
2263                                 KKASSERT(error == 0);
2264                         }
2265                         bdata = (char *)bp->b_data + boff;
2266
2267                         /*
2268                          * Copy the data to the buffer, mark the buffer
2269                          * dirty, and convert the chain to unmodified.
2270                          */
2271                         bcopy(chain->data, bdata, chain->bytes);
2272                         bp->b_flags |= B_CLUSTEROK;
2273                         bdwrite(bp);
2274                         bp = NULL;
2275                         chain->bref.check.iscsi32.value =
2276                                 hammer2_icrc32(chain->data, chain->bytes);
2277                         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
2278                                 ++hammer2_iod_meta_write;
2279                         else
2280                                 ++hammer2_iod_indr_write;
2281                 } else {
2282                         chain->bref.check.iscsi32.value =
2283                                 hammer2_icrc32(chain->data, chain->bytes);
2284                 }
2285         }
2286
2287         /*
2288          * Special handling
2289          */
2290         bref = &chain->bref;
2291
2292         switch(bref->type) {
2293         case HAMMER2_BREF_TYPE_VOLUME:
2294                 KKASSERT(chain->data != NULL);
2295                 KKASSERT(chain->bp == NULL);
2296
2297                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
2298                         hammer2_icrc32(
2299                                 (char *)&hmp->voldata +
2300                                  HAMMER2_VOLUME_ICRC1_OFF,
2301                                 HAMMER2_VOLUME_ICRC1_SIZE);
2302                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
2303                         hammer2_icrc32(
2304                                 (char *)&hmp->voldata +
2305                                  HAMMER2_VOLUME_ICRC0_OFF,
2306                                 HAMMER2_VOLUME_ICRC0_SIZE);
2307                 hmp->voldata.icrc_volheader =
2308                         hammer2_icrc32(
2309                                 (char *)&hmp->voldata +
2310                                  HAMMER2_VOLUME_ICRCVH_OFF,
2311                                 HAMMER2_VOLUME_ICRCVH_SIZE);
2312                 break;
2313         }
2314 done:
2315         if (hammer2_debug & 0x0008)
2316                 kprintf("%*.*s} %p/%d %04x ",
2317                         tab, tab, "", chain, chain->refs, chain->flags);
2318 }
2319
2320 #if 0
2321 /*
2322  * PASS2 - not yet implemented (should be called only with the root chain?)
2323  */
2324 static void
2325 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2326 {
2327 }
2328 #endif
2329
2330 /*
2331  * Stand-alone flush.  If the chain is unable to completely flush we have
2332  * to be sure that SUBMODIFIED propagates up the parent chain.
2333  *
2334  * This routine can be called from several places but the most important
2335  * is from the hammer2_vop_reclaim() function.  We want to try to completely
2336  * clean out the inode structure to prevent disconnected inodes from
2337  * building up and blowing out the kmalloc pool.
2338  */
2339 void
2340 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2341 {
2342         hammer2_chain_t *parent;
2343         hammer2_blockref_t *base;
2344         int count;
2345
2346         /*
2347          * Recursive flush
2348          */
2349         hammer2_chain_flush_pass1(hmp, chain, 0);
2350
2351         /*
2352          * The SUBMODIFIED bit must propagate upward if the chain could not
2353          * be completely flushed.
2354          */
2355         if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2356                             HAMMER2_CHAIN_MODIFIED1 |
2357                             HAMMER2_CHAIN_MOVED)) {
2358                 hammer2_chain_parent_setsubmod(hmp, chain);
2359         }
2360
2361         /*
2362          * If the only thing left is a simple bref update try to
2363          * pro-actively update the parent, otherwise return early.
2364          */
2365         parent = chain->parent;
2366         if (parent == NULL ||
2367             chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
2368             (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2369                              HAMMER2_CHAIN_MODIFIED1 |
2370                              HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
2371                 return;
2372         }
2373
2374         /*
2375          * We are locking backwards so allow the lock to fail
2376          */
2377         if (lockmgr(&parent->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
2378                 return;
2379         }
2380
2381         /*
2382          * We are updating brefs but we have to call chain_modify() w/
2383          * setsubmod = TRUE because our caller is not a recursive
2384          * flush.
2385          */
2386         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
2387         hammer2_chain_modify(hmp, parent, 0);
2388
2389         switch(parent->bref.type) {
2390         case HAMMER2_BREF_TYPE_INODE:
2391                 base = &parent->data->ipdata.u.blockset.
2392                         blockref[0];
2393                 count = HAMMER2_SET_COUNT;
2394                 break;
2395         case HAMMER2_BREF_TYPE_INDIRECT:
2396                 base = &parent->data->npdata.blockref[0];
2397                 count = parent->bytes /
2398                         sizeof(hammer2_blockref_t);
2399                 break;
2400         case HAMMER2_BREF_TYPE_VOLUME:
2401                 base = &hmp->voldata.sroot_blockset.blockref[0];
2402                 count = HAMMER2_SET_COUNT;
2403                 break;
2404         default:
2405                 base = NULL;
2406                 panic("hammer2_chain_flush: "
2407                       "unrecognized blockref type: %d",
2408                       parent->bref.type);
2409         }
2410
2411         /*
2412          * Update the blockref in the parent
2413          */
2414         KKASSERT(chain->index >= 0 &&
2415                  chain->index < count);
2416         base[chain->index] = chain->bref;
2417         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2418                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2419                 hammer2_chain_drop(hmp, chain);
2420         }
2421
2422         lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
2423         hammer2_chain_unlock(hmp, parent);
2424 }