Merge branches 'hammer2' and 'master' of ssh://crater.dragonflybsd.org/repository...
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/cdefs.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/types.h>
47 #include <sys/lock.h>
48 #include <sys/uuid.h>
49
50 #include "hammer2.h"
51
52 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
53
54 static hammer2_chain_t *hammer2_chain_create_indirect(
55                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
56                         hammer2_key_t key, int keybits);
57
58 /*
59  * Splay tree
60  */
61 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
62
63 int
64 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
65 {
66         return(chain2->index - chain1->index);
67 }
68
69 /*
70  * Recursively mark the parent chain elements so flushes can find
71  * modified elements.
72  *
73  * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
74  *       during the flush recursion after clearing the parent's
75  *       SUBMODIFIED bit.  We don't want to re-set the parent's
76  *       SUBMODIFIED bit in this case!
77  *
78  * XXX rename of parent can create a SMP race
79  */
80 static void
81 hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
82 {
83         hammer2_chain_t *parent;
84
85         if ((chain->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
86                 parent = chain->parent;
87                 while (parent &&
88                        (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
89                         atomic_set_int(&parent->flags,
90                                        HAMMER2_CHAIN_SUBMODIFIED);
91                         parent = parent->parent;
92                 }
93         }
94 }
95
96 /*
97  * Allocate a new disconnected chain element representing the specified
98  * bref.  The chain element is locked exclusively and refs is set to 1.
99  *
100  * This essentially allocates a system memory structure representing one
101  * of the media structure types, including inodes.
102  */
103 hammer2_chain_t *
104 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
105 {
106         hammer2_chain_t *chain;
107         hammer2_inode_t *ip;
108         hammer2_indblock_t *np;
109         hammer2_data_t *dp;
110         u_int bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
111
112         /*
113          * Construct the appropriate system structure.
114          */
115         switch(bref->type) {
116         case HAMMER2_BREF_TYPE_INODE:
117                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
118                 chain = &ip->chain;
119                 chain->u.ip = ip;
120                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
121                 ip->hmp = hmp;
122                 break;
123         case HAMMER2_BREF_TYPE_INDIRECT:
124                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
125                 chain = &np->chain;
126                 chain->u.np = np;
127                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
128                 break;
129         case HAMMER2_BREF_TYPE_DATA:
130                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
131                 chain = &dp->chain;
132                 chain->u.dp = dp;
133                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
134                 break;
135         case HAMMER2_BREF_TYPE_VOLUME:
136                 chain = NULL;
137                 panic("hammer2_chain_alloc volume type illegal for op");
138         default:
139                 chain = NULL;
140                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
141                       bref->type);
142         }
143         chain->bref = *bref;
144         chain->index = -1;      /* not yet assigned */
145         chain->refs = 1;
146         chain->bytes = bytes;
147         lockmgr(&chain->lk, LK_EXCLUSIVE);
148
149         return (chain);
150 }
151
152 /*
153  * Free a disconnected chain element
154  */
155 void
156 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
157 {
158         void *mem;
159
160         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE ||
161             chain->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
162                 chain->data = NULL;
163         }
164
165         KKASSERT(chain->bp == NULL);
166         KKASSERT(chain->data == NULL);
167         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
168                  chain->u.ip->vp == NULL);
169
170         if ((mem = chain->u.mem) != NULL) {
171                 chain->u.mem = NULL;
172                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
173                         kfree(mem, hmp->minode);
174                 else
175                         kfree(mem, hmp->mchain);
176         }
177 }
178
179 /*
180  * Add a reference to a chain element (for shared access).  The chain
181  * element must already have at least 1 ref controlled by the caller.
182  */
183 void
184 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
185 {
186         KKASSERT(chain->refs > 0);
187         atomic_add_int(&chain->refs, 1);
188 }
189
190 /*
191  * Drop the callers reference to the chain element.  If the ref count
192  * reaches zero the chain element and its related structure (typically an
193  * inode or indirect block) will be freed and the parent will be
194  * recursively dropped.
195  *
196  * Modified elements hold an additional reference so it should not be
197  * possible for the count on a modified element to drop to 0.
198  *
199  * The chain element must NOT be locked by the caller.
200  *
201  * The parent might or might not be locked by the caller but if so it
202  * will also be referenced so we shouldn't recurse upward.
203  */
204 void
205 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
206 {
207         hammer2_chain_t *parent;
208         u_int refs;
209
210         while (chain) {
211                 refs = chain->refs;
212                 cpu_ccfence();
213                 KKASSERT(refs > 0);
214                 if (refs == 1) {
215                         KKASSERT(chain != &hmp->vchain);
216                         parent = chain->parent;
217                         if (parent)
218                                 lockmgr(&parent->lk, LK_EXCLUSIVE);
219                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
220                                 /*
221                                  * Succeeded, recurse and drop parent
222                                  */
223                                 if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
224                                         SPLAY_REMOVE(hammer2_chain_splay,
225                                                      &parent->shead, chain);
226                                         atomic_set_int(&chain->flags,
227                                                        HAMMER2_CHAIN_DELETED);
228                                         /* parent refs dropped via recursion */
229                                 }
230                                 chain->parent = NULL;
231                                 if (parent)
232                                         lockmgr(&parent->lk, LK_RELEASE);
233                                 hammer2_chain_free(hmp, chain);
234                                 chain = parent;
235                                 /* recurse on parent */
236                         } else {
237                                 if (parent)
238                                         lockmgr(&parent->lk, LK_RELEASE);
239                                 /* retry the same chain */
240                         }
241                 } else {
242                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
243                                 /*
244                                  * Succeeded, count did not reach zero so
245                                  * cut out of the loop.
246                                  */
247                                 break;
248                         }
249                         /* retry the same chain */
250                 }
251         }
252 }
253
254 /*
255  * Ref and lock a chain element, acquiring its data with I/O if necessary,
256  * and specify how you would like the data to be resolved.
257  *
258  * Returns 0 on success or an error code if the data could not be acquired.
259  * The chain element is locked either way.
260  *
261  * The lock is allowed to recurse, multiple locking ops will aggregate
262  * the requested resolve types.  Once data is assigned it will not be
263  * removed until the last unlock.
264  *
265  * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
266  *                         (typically used to avoid device/logical buffer
267  *                          aliasing for data)
268  *
269  * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
270  *                         the INITIAL-create state (indirect blocks only).
271  *
272  *                         Do not resolve data elements for DATA chains.
273  *                         (typically used to avoid device/logical buffer
274  *                          aliasing for data)
275  *
276  * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
277  *
278  *
279  * NOTE: Embedded elements (volume header, inodes) are always resolved
280  *       regardless.
281  *
282  * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
283  *       element will instantiate and zero its buffer, and flush it on
284  *       release.
285  *
286  * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
287  *       so as not to instantiate a device buffer, which could alias against
288  *       a logical file buffer.  However, if ALWAYS is specified the
289  *       device buffer will be instantiated anyway.
290  */
291 int
292 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
293 {
294         hammer2_blockref_t *bref;
295         hammer2_off_t pbase;
296         hammer2_off_t peof;
297         size_t boff;
298         size_t bbytes;
299         int error;
300         char *bdata;
301
302         /*
303          * Lock the element.  Under certain conditions this might end up
304          * being a recursive lock.
305          */
306         KKASSERT(chain->refs > 0);
307         atomic_add_int(&chain->refs, 1);
308         lockmgr(&chain->lk, LK_EXCLUSIVE);
309
310         /*
311          * If we already have a valid data pointer no further action is
312          * necessary.
313          */
314         if (chain->data)
315                 return (0);
316
317         /*
318          * Do we have to resolve the data?
319          */
320         switch(how) {
321         case HAMMER2_RESOLVE_NEVER:
322                 return(0);
323         case HAMMER2_RESOLVE_MAYBE:
324                 if (chain->flags & HAMMER2_CHAIN_INITIAL)
325                         return(0);
326                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
327                         return(0);
328                 /* fall through */
329         case HAMMER2_RESOLVE_ALWAYS:
330                 break;
331         }
332
333         /*
334          * We must resolve to a device buffer, either by issuing I/O or
335          * by creating a zero-fill element.  We do not mark the buffer
336          * dirty when creating a zero-fill element (the hammer2_chain_modify()
337          * API must still be used to do that).
338          *
339          * The device buffer is variable-sized in powers of 2 down
340          * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
341          * chunk always contains buffers of the same size. (XXX)
342          *
343          * The minimum physical IO size may be larger than the variable
344          * block size.
345          */
346         bref = &chain->bref;
347
348         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
349                 bbytes = HAMMER2_MINIOSIZE;
350         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
351         peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
352         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
353         KKASSERT(pbase != 0);
354
355         /*
356          * The getblk() optimization can only be used on newly created
357          * elements if the physical block size matches the request.
358          */
359         if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
360             chain->bytes == bbytes) {
361                 chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
362                 error = 0;
363         } else if (hammer2_cluster_enable) {
364                 error = cluster_read(hmp->devvp, peof, pbase, bbytes,
365                                      HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
366                                      &chain->bp);
367         } else {
368                 error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
369         }
370
371         if (error) {
372                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
373                         (intmax_t)pbase, error);
374                 bqrelse(chain->bp);
375                 chain->bp = NULL;
376                 return (error);
377         }
378
379         /*
380          * Zero the data area if the chain is in the INITIAL-create state
381          */
382         bdata = (char *)chain->bp->b_data + boff;
383         if (chain->flags & HAMMER2_CHAIN_INITIAL)
384                 bzero(bdata, chain->bytes);
385
386         /*
387          * Setup the data pointer, either pointing it to an embedded data
388          * structure and copying the data from the buffer, or pointing it
389          * into the buffer.
390          *
391          * The buffer is not retained when copying to an embedded data
392          * structure in order to avoid potential deadlocks or recursions
393          * on the same physical buffer.
394          */
395         switch (bref->type) {
396         case HAMMER2_BREF_TYPE_VOLUME:
397                 /*
398                  * Copy data from bp to embedded buffer
399                  */
400                 panic("hammer2_chain_lock: called on unresolved volume header");
401 #if 0
402                 /* NOT YET */
403                 KKASSERT(pbase == 0);
404                 KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
405                 bcopy(bdata, &hmp->voldata, chain->bytes);
406                 chain->data = (void *)&hmp->voldata;
407                 bqrelse(chain->bp);
408                 chain->bp = NULL;
409 #endif
410                 break;
411         case HAMMER2_BREF_TYPE_INODE:
412                 /*
413                  * Copy data from bp to embedded buffer, do not retain the
414                  * device buffer.
415                  */
416                 bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
417                 chain->data = (void *)&chain->u.ip->ip_data;
418                 bqrelse(chain->bp);
419                 chain->bp = NULL;
420                 break;
421         case HAMMER2_BREF_TYPE_INDIRECT:
422         case HAMMER2_BREF_TYPE_DATA:
423         default:
424                 /*
425                  * Point data at the device buffer and leave bp intact.
426                  */
427                 chain->data = (void *)bdata;
428                 break;
429         }
430         return (0);
431 }
432
433 /*
434  * Unlock and deref a chain element.
435  *
436  * On the last lock release any non-embedded data (chain->bp) will be
437  * retired.
438  */
439 void
440 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
441 {
442         long *counterp;
443
444         /*
445          * Undo a recursive lock
446          */
447         if (lockcountnb(&chain->lk) > 1) {
448                 KKASSERT(chain->refs > 1);
449                 atomic_add_int(&chain->refs, -1);
450                 lockmgr(&chain->lk, LK_RELEASE);
451                 return;
452         }
453
454         /*
455          * Shortcut the case if the data is embedded or not resolved.
456          * Do NOT null-out pointers to embedded data (e.g. inode).
457          */
458         if (chain->bp == NULL) {
459                 lockmgr(&chain->lk, LK_RELEASE);
460                 hammer2_chain_drop(hmp, chain);
461                 return;
462         }
463
464         /*
465          * Statistics
466          */
467         if ((chain->flags & HAMMER2_CHAIN_DIRTYBP) == 0) {
468                 ;
469         } else if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
470                 switch(chain->bref.type) {
471                 case HAMMER2_BREF_TYPE_DATA:
472                         counterp = &hammer2_ioa_file_write;
473                         break;
474                 case HAMMER2_BREF_TYPE_INODE:
475                         counterp = &hammer2_ioa_meta_write;
476                         break;
477                 case HAMMER2_BREF_TYPE_INDIRECT:
478                         counterp = &hammer2_ioa_indr_write;
479                         break;
480                 default:
481                         counterp = &hammer2_ioa_volu_write;
482                         break;
483                 }
484                 ++*counterp;
485         } else {
486                 switch(chain->bref.type) {
487                 case HAMMER2_BREF_TYPE_DATA:
488                         counterp = &hammer2_iod_file_write;
489                         break;
490                 case HAMMER2_BREF_TYPE_INODE:
491                         counterp = &hammer2_iod_meta_write;
492                         break;
493                 case HAMMER2_BREF_TYPE_INDIRECT:
494                         counterp = &hammer2_iod_indr_write;
495                         break;
496                 default:
497                         counterp = &hammer2_iod_volu_write;
498                         break;
499                 }
500                 ++*counterp;
501         }
502
503         /*
504          * Clean out the bp.
505          *
506          * If a device buffer was used for data be sure to destroy the
507          * buffer when we are done to avoid aliases (XXX what about the
508          * underlying VM pages?).
509          */
510         if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
511                 chain->bp->b_flags |= B_RELBUF;
512
513         chain->data = NULL;
514         if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
515                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
516                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
517                         atomic_clear_int(&chain->flags,
518                                          HAMMER2_CHAIN_IOFLUSH);
519                         chain->bp->b_flags |= B_RELBUF;
520                         bawrite(chain->bp);
521                 } else {
522                         chain->bp->b_flags |= B_CLUSTEROK;
523                         bdwrite(chain->bp);
524                 }
525         } else {
526                 if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
527                         atomic_clear_int(&chain->flags,
528                                          HAMMER2_CHAIN_IOFLUSH);
529                         chain->bp->b_flags |= B_RELBUF;
530                         brelse(chain->bp);
531                 } else {
532                         /* bp might still be dirty */
533                         bqrelse(chain->bp);
534                 }
535         }
536         chain->bp = NULL;
537         lockmgr(&chain->lk, LK_RELEASE);
538         hammer2_chain_drop(hmp, chain);
539 }
540
541 /*
542  * Resize the chain's physical storage allocation.  Chains can be resized
543  * smaller without reallocating the storage.  Resizing larger will reallocate
544  * the storage.
545  *
546  * Must be passed a locked chain.  If you want the resize to copy the data
547  * you should lock the chain with RESOLVE_MAYBE or RESOLVE_ALWAYS, otherwise
548  * the resize operation will not copy the data.
549  *
550  * This function is mostly used with DATA blocks locked RESOLVE_NEVER in order
551  * to avoid instantiating a device buffer that conflicts with the vnode
552  * data buffer.
553  *
554  * XXX flags currently ignored, uses chain->bp to detect data/no-data.
555  */
556 void
557 hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain,
558                      int nradix, int flags)
559 {
560         struct buf *nbp;
561         hammer2_off_t pbase;
562         size_t obytes;
563         size_t nbytes;
564         size_t bbytes;
565         int boff;
566         char *bdata;
567         int error;
568
569         /*
570          * Only data and indirect blocks can be resized for now
571          */
572         KKASSERT(chain != &hmp->vchain);
573         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
574                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
575
576         /*
577          * Nothing to do if the element is already the proper size
578          */
579         obytes = chain->bytes;
580         nbytes = 1 << nradix;
581         if (obytes == nbytes)
582                 return;
583
584         /*
585          * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
586          * modified flags share the same ref.
587          */
588         if ((chain->flags & HAMMER2_CHAIN_MODIFIED1) == 0) {
589                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
590                 hammer2_chain_ref(hmp, chain);
591         }
592
593         /*
594          * Relocate the block, even if making it smaller (because different
595          * block sizes may be in different regions).
596          */
597         chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
598                                                      nbytes);
599         chain->bytes = nbytes;
600
601         /*
602          * The device buffer may be larger than the allocation size.
603          */
604         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
605                 bbytes = HAMMER2_MINIOSIZE;
606         pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
607         boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
608
609         /*
610          * Only copy the data if resolved, otherwise the caller is
611          * responsible.
612          */
613         if (chain->bp) {
614                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
615                          chain->bref.type == HAMMER2_BREF_TYPE_DATA);
616                 KKASSERT(chain != &hmp->vchain);        /* safety */
617
618                 /*
619                  * The getblk() optimization can only be used if the
620                  * physical block size matches the request.
621                  */
622                 if (nbytes == bbytes) {
623                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
624                         error = 0;
625                 } else {
626                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
627                         KKASSERT(error == 0);
628                 }
629                 bdata = (char *)nbp->b_data + boff;
630
631                 if (nbytes < obytes) {
632                         bcopy(chain->data, bdata, nbytes);
633                 } else {
634                         bcopy(chain->data, bdata, obytes);
635                         bzero(bdata + obytes, nbytes - obytes);
636                 }
637
638                 /*
639                  * NOTE: The INITIAL state of the chain is left intact.
640                  *
641                  * NOTE: Because of the reallocation we have to set DIRTYBP
642                  *       if INITIAL is not set.
643                  */
644                 chain->bp->b_flags |= B_RELBUF;
645                 brelse(chain->bp);
646                 chain->bp = nbp;
647                 chain->data = (void *)bdata;
648                 if ((chain->flags & HAMMER2_CHAIN_INITIAL) == 0)
649                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
650         }
651         hammer2_chain_parent_setsubmod(hmp, chain);
652 }
653
654 /*
655  * Convert a locked chain that was retrieved read-only to read-write.
656  *
657  * If not already marked modified a new physical block will be allocated
658  * and assigned to the bref.
659  *
660  * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
661  *                   level or the COW operation will not work.
662  *
663  * Data blocks     - The chain is usually locked RESOLVE_NEVER so as not to
664  *                   run the data through the device buffers.
665  */
666 void
667 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
668 {
669         struct buf *nbp;
670         int error;
671         hammer2_off_t pbase;
672         size_t bbytes;
673         size_t boff;
674         void *bdata;
675
676         /*
677          * If the chain is already marked MODIFIED1 we can just return.
678          */
679         if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
680                 if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
681                     chain->bp == NULL) {
682                         goto skip1;
683                 }
684                 return;
685         }
686
687         /*
688          * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
689          * modified flags share the same ref.
690          */
691         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
692         hammer2_chain_ref(hmp, chain);
693
694         /*
695          * We must allocate the copy-on-write block.
696          *
697          * If the data is embedded no other action is required.
698          *
699          * If the data is not embedded we acquire and clear the
700          * new block.  If chain->data is not NULL we then do the
701          * copy-on-write.  chain->data will then be repointed to the new
702          * buffer and the old buffer will be released.
703          *
704          * For newly created elements with no prior allocation we go
705          * through the copy-on-write steps except without the copying part.
706          */
707         if (chain != &hmp->vchain) {
708                 if ((hammer2_debug & 0x0001) &&
709                     (chain->bref.data_off & HAMMER2_OFF_MASK)) {
710                         kprintf("Replace %d\n", chain->bytes);
711                 }
712                 chain->bref.data_off =
713                         hammer2_freemap_alloc(hmp, chain->bref.type,
714                                               chain->bytes);
715                 /* XXX failed allocation */
716         }
717
718         /*
719          * If data instantiation is optional and the chain has no current
720          * data association (typical for DATA and newly-created INDIRECT
721          * elements), don't instantiate the buffer now.
722          */
723         if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
724                 goto skip2;
725
726 skip1:
727         /*
728          * Setting the DIRTYBP flag will cause the buffer to be dirtied or
729          * written-out on unlock.  This bit is independent of the MODIFIED1
730          * bit because the chain may still need meta-data adjustments done
731          * by virtue of MODIFIED1 for its parent, and the buffer can be
732          * flushed out (possibly multiple times) by the OS before that.
733          *
734          * Clearing the INITIAL flag (for indirect blocks) indicates that
735          * a zero-fill buffer has been instantiated.
736          */
737         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
738         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
739
740         /*
741          * We currently should never instantiate a device buffer for a
742          * data chain.
743          */
744         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
745
746         /*
747          * Execute COW operation
748          */
749         switch(chain->bref.type) {
750         case HAMMER2_BREF_TYPE_VOLUME:
751         case HAMMER2_BREF_TYPE_INODE:
752                 /*
753                  * The data is embedded, no copy-on-write operation is
754                  * needed.
755                  */
756                 KKASSERT(chain->bp == NULL);
757                 break;
758         case HAMMER2_BREF_TYPE_DATA:
759         case HAMMER2_BREF_TYPE_INDIRECT:
760                 /*
761                  * Perform the copy-on-write operation
762                  */
763                 KKASSERT(chain != &hmp->vchain);        /* safety */
764                 /*
765                  * The device buffer may be larger than the allocation size.
766                  */
767                 if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
768                         bbytes = HAMMER2_MINIOSIZE;
769                 pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
770                 boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
771
772                 /*
773                  * The getblk() optimization can only be used if the
774                  * physical block size matches the request.
775                  */
776                 if (chain->bytes == bbytes) {
777                         nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
778                         error = 0;
779                 } else {
780                         error = bread(hmp->devvp, pbase, bbytes, &nbp);
781                         KKASSERT(error == 0);
782                 }
783                 bdata = (char *)nbp->b_data + boff;
784
785                 /*
786                  * Copy or zero-fill on write depending on whether
787                  * chain->data exists or not.
788                  */
789                 if (chain->data) {
790                         bcopy(chain->data, bdata, chain->bytes);
791                         KKASSERT(chain->bp != NULL);
792                 } else {
793                         bzero(bdata, chain->bytes);
794                 }
795                 if (chain->bp) {
796                         chain->bp->b_flags |= B_RELBUF;
797                         brelse(chain->bp);
798                 }
799                 chain->bp = nbp;
800                 chain->data = bdata;
801                 break;
802         default:
803                 panic("hammer2_chain_modify: illegal non-embedded type %d",
804                       chain->bref.type);
805                 break;
806
807         }
808 skip2:
809         if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
810                 hammer2_chain_parent_setsubmod(hmp, chain);
811 }
812
813 /*
814  * Locate an in-memory chain.  The parent must be locked.  The in-memory
815  * chain is returned or NULL if no in-memory chain is present.
816  *
817  * NOTE: A chain on-media might exist for this index when NULL is returned.
818  */
819 hammer2_chain_t *
820 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
821 {
822         hammer2_chain_t dummy;
823         hammer2_chain_t *chain;
824
825         dummy.index = index;
826         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
827         return (chain);
828 }
829
830 /*
831  * Return a locked chain structure with all associated data acquired.
832  *
833  * Caller must lock the parent on call, the returned child will be locked.
834  */
835 hammer2_chain_t *
836 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
837                   int index, int flags)
838 {
839         hammer2_blockref_t *bref;
840         hammer2_chain_t *chain;
841         hammer2_chain_t dummy;
842         int how;
843
844         /*
845          * Figure out how to lock.  MAYBE can be used to optimized
846          * the initial-create state for indirect blocks.
847          */
848         if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
849                 how = HAMMER2_RESOLVE_NEVER;
850         else
851                 how = HAMMER2_RESOLVE_MAYBE;
852
853         /*
854          * First see if we have a (possibly modified) chain element cached
855          * for this (parent, index).  Acquire the data if necessary.
856          *
857          * If chain->data is non-NULL the chain should already be marked
858          * modified.
859          */
860         dummy.index = index;
861         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
862         if (chain) {
863                 if (flags & HAMMER2_LOOKUP_NOLOCK)
864                         hammer2_chain_ref(hmp, chain);
865                 else
866                         hammer2_chain_lock(hmp, chain, how);
867                 return(chain);
868         }
869
870         /*
871          * the get function must always succeed, panic if there's no
872          * data to index.
873          */
874         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
875                 panic("hammer2_chain_get: Missing bref(1)");
876                 /* NOT REACHED */
877         }
878
879         /*
880          * Otherwise lookup the bref and issue I/O (switch on the parent)
881          */
882         switch(parent->bref.type) {
883         case HAMMER2_BREF_TYPE_INODE:
884                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
885                 bref = &parent->data->ipdata.u.blockset.blockref[index];
886                 break;
887         case HAMMER2_BREF_TYPE_INDIRECT:
888                 KKASSERT(parent->data != NULL);
889                 KKASSERT(index >= 0 &&
890                          index < parent->bytes / sizeof(hammer2_blockref_t));
891                 bref = &parent->data->npdata.blockref[index];
892                 break;
893         case HAMMER2_BREF_TYPE_VOLUME:
894                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
895                 bref = &hmp->voldata.sroot_blockset.blockref[index];
896                 break;
897         default:
898                 bref = NULL;
899                 panic("hammer2_chain_get: unrecognized blockref type: %d",
900                       parent->bref.type);
901         }
902         if (bref->type == 0) {
903                 panic("hammer2_chain_get: Missing bref(2)");
904                 /* NOT REACHED */
905         }
906
907         /*
908          * Allocate a chain structure representing the existing media
909          * entry.
910          *
911          * The locking operation we do later will issue I/O to read it.
912          */
913         chain = hammer2_chain_alloc(hmp, bref);
914
915         /*
916          * Link the chain into its parent.  Caller is expected to hold an
917          * exclusive lock on the parent.
918          */
919         chain->parent = parent;
920         chain->index = index;
921         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
922                 panic("hammer2_chain_link: collision");
923         KKASSERT(parent->refs > 0);
924         atomic_add_int(&parent->refs, 1);       /* for splay entry */
925
926         /*
927          * Additional linkage for inodes.  Reuse the parent pointer to
928          * find the parent directory.
929          */
930         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
931                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
932                         parent = parent->parent;
933                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
934                         chain->u.ip->pip = parent->u.ip;
935         }
936
937         /*
938          * Our new chain structure has already been referenced and locked
939          * but the lock code handles the I/O so call it to resolve the data.
940          * Then release one of our two exclusive locks.
941          *
942          * If NOLOCK is set the release will release the one-and-only lock.
943          */
944         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
945                 hammer2_chain_lock(hmp, chain, how);    /* recusive lock */
946                 hammer2_chain_drop(hmp, chain);         /* excess ref */
947         }
948         lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
949
950         return (chain);
951 }
952
953 /*
954  * Locate any key between key_beg and key_end inclusive.  (*parentp)
955  * typically points to an inode but can also point to a related indirect
956  * block and this function will recurse upwards and find the inode again.
957  *
958  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
959  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
960  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
961  *
962  * (*parentp) must be exclusively locked and referenced and can be an inode
963  * or an existing indirect block within the inode.
964  *
965  * On return (*parentp) will be modified to point at the deepest parent chain
966  * element encountered during the search, as a helper for an insertion or
967  * deletion.   The new (*parentp) will be locked and referenced and the old
968  * will be unlocked and dereferenced (no change if they are both the same).
969  *
970  * The matching chain will be returned exclusively locked and referenced.
971  *
972  * NULL is returned if no match was found, but (*parentp) will still
973  * potentially be adjusted.
974  *
975  * This function will also recurse up the chain if the key is not within the
976  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
977  * can simply allow (*parentp) to float inside the loop.
978  */
979 hammer2_chain_t *
980 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
981                      hammer2_key_t key_beg, hammer2_key_t key_end,
982                      int flags)
983 {
984         hammer2_chain_t *parent;
985         hammer2_chain_t *chain;
986         hammer2_chain_t *tmp;
987         hammer2_blockref_t *base;
988         hammer2_blockref_t *bref;
989         hammer2_key_t scan_beg;
990         hammer2_key_t scan_end;
991         int count = 0;
992         int i;
993
994         /*
995          * Recurse (*parentp) upward if necessary until the parent completely
996          * encloses the key range or we hit the inode.
997          */
998         parent = *parentp;
999         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1000                 scan_beg = parent->bref.key;
1001                 scan_end = scan_beg +
1002                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1003                 if (key_beg >= scan_beg && key_end <= scan_end)
1004                         break;
1005                 hammer2_chain_ref(hmp, parent);         /* ref old parent */
1006                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1007                 parent = parent->parent;
1008                                                         /* lock new parent */
1009                 hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1010                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
1011                 *parentp = parent;                      /* new parent */
1012         }
1013
1014 again:
1015         /*
1016          * Locate the blockref array.  Currently we do a fully associative
1017          * search through the array.
1018          */
1019         switch(parent->bref.type) {
1020         case HAMMER2_BREF_TYPE_INODE:
1021                 /*
1022                  * Special shortcut for embedded data returns the inode
1023                  * itself.  Callers must detect this condition and access
1024                  * the embedded data (the strategy code does this for us).
1025                  *
1026                  * This is only applicable to regular files and softlinks.
1027                  */
1028                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
1029                         if (flags & HAMMER2_LOOKUP_NOLOCK)
1030                                 hammer2_chain_ref(hmp, parent);
1031                         else
1032                                 hammer2_chain_lock(hmp, parent,
1033                                                    HAMMER2_RESOLVE_ALWAYS);
1034                         return (parent);
1035                 }
1036                 base = &parent->data->ipdata.u.blockset.blockref[0];
1037                 count = HAMMER2_SET_COUNT;
1038                 break;
1039         case HAMMER2_BREF_TYPE_INDIRECT:
1040                 /*
1041                  * Optimize indirect blocks in the INITIAL state to avoid
1042                  * I/O.
1043                  */
1044                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1045                         base = NULL;
1046                 } else {
1047                         if (parent->data == NULL)
1048                                 panic("parent->data is NULL");
1049                         base = &parent->data->npdata.blockref[0];
1050                 }
1051                 count = parent->bytes / sizeof(hammer2_blockref_t);
1052                 break;
1053         case HAMMER2_BREF_TYPE_VOLUME:
1054                 base = &hmp->voldata.sroot_blockset.blockref[0];
1055                 count = HAMMER2_SET_COUNT;
1056                 break;
1057         default:
1058                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
1059                       parent->bref.type);
1060                 base = NULL;    /* safety */
1061                 count = 0;      /* safety */
1062         }
1063
1064         /*
1065          * If the element and key overlap we use the element.
1066          */
1067         bref = NULL;
1068         for (i = 0; i < count; ++i) {
1069                 tmp = hammer2_chain_find(hmp, parent, i);
1070                 if (tmp) {
1071                         bref = &tmp->bref;
1072                         KKASSERT(bref->type != 0);
1073                 } else if (base == NULL || base[i].type == 0) {
1074                         continue;
1075                 } else {
1076                         bref = &base[i];
1077                 }
1078                 scan_beg = bref->key;
1079                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1080                 if (key_beg <= scan_end && key_end >= scan_beg)
1081                         break;
1082         }
1083         if (i == count) {
1084                 if (key_beg == key_end)
1085                         return (NULL);
1086                 return (hammer2_chain_next(hmp, parentp, NULL,
1087                                            key_beg, key_end, flags));
1088         }
1089
1090         /*
1091          * Acquire the new chain element.  If the chain element is an
1092          * indirect block we must search recursively.
1093          */
1094         chain = hammer2_chain_get(hmp, parent, i, flags);
1095         if (chain == NULL)
1096                 return (NULL);
1097
1098         /*
1099          * If the chain element is an indirect block it becomes the new
1100          * parent and we loop on it.
1101          *
1102          * The parent always has to be locked with at least RESOLVE_MAYBE,
1103          * so it might need a fixup if the caller passed incompatible flags.
1104          */
1105         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1106                 hammer2_chain_unlock(hmp, parent);
1107                 *parentp = parent = chain;
1108                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1109                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1110                         hammer2_chain_drop(hmp, chain); /* excess ref */
1111                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1112                         hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
1113                         hammer2_chain_unlock(hmp, chain);
1114                 }
1115                 goto again;
1116         }
1117
1118         /*
1119          * All done, return chain
1120          */
1121         return (chain);
1122 }
1123
1124 /*
1125  * After having issued a lookup we can iterate all matching keys.
1126  *
1127  * If chain is non-NULL we continue the iteration from just after it's index.
1128  *
1129  * If chain is NULL we assume the parent was exhausted and continue the
1130  * iteration at the next parent.
1131  *
1132  * parent must be locked on entry and remains locked throughout.  chain's
1133  * lock status must match flags.
1134  */
1135 hammer2_chain_t *
1136 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1137                    hammer2_chain_t *chain,
1138                    hammer2_key_t key_beg, hammer2_key_t key_end,
1139                    int flags)
1140 {
1141         hammer2_chain_t *parent;
1142         hammer2_chain_t *tmp;
1143         hammer2_blockref_t *base;
1144         hammer2_blockref_t *bref;
1145         hammer2_key_t scan_beg;
1146         hammer2_key_t scan_end;
1147         int i;
1148         int count;
1149
1150         parent = *parentp;
1151
1152 again:
1153         /*
1154          * Calculate the next index and recalculate the parent if necessary.
1155          */
1156         if (chain) {
1157                 /*
1158                  * Continue iteration within current parent.  If not NULL
1159                  * the passed-in chain may or may not be locked, based on
1160                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
1161                  * or a prior next).
1162                  */
1163                 i = chain->index + 1;
1164                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1165                         hammer2_chain_drop(hmp, chain);
1166                 else
1167                         hammer2_chain_unlock(hmp, chain);
1168
1169                 /*
1170                  * Any scan where the lookup returned degenerate data embedded
1171                  * in the inode has an invalid index and must terminate.
1172                  */
1173                 if (chain == parent)
1174                         return(NULL);
1175                 chain = NULL;
1176         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
1177                 /*
1178                  * We reached the end of the iteration.
1179                  */
1180                 return (NULL);
1181         } else {
1182                 /*
1183                  * Continue iteration with next parent unless the current
1184                  * parent covers the range.
1185                  */
1186                 hammer2_chain_t *nparent;
1187
1188                 scan_beg = parent->bref.key;
1189                 scan_end = scan_beg +
1190                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
1191                 if (key_beg >= scan_beg && key_end <= scan_end)
1192                         return (NULL);
1193
1194                 i = parent->index + 1;
1195                 nparent = parent->parent;
1196                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
1197                 hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
1198                                                         /* lock new parent */
1199                 hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
1200                 hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
1201                 *parentp = parent = nparent;
1202         }
1203
1204 again2:
1205         /*
1206          * Locate the blockref array.  Currently we do a fully associative
1207          * search through the array.
1208          */
1209         switch(parent->bref.type) {
1210         case HAMMER2_BREF_TYPE_INODE:
1211                 base = &parent->data->ipdata.u.blockset.blockref[0];
1212                 count = HAMMER2_SET_COUNT;
1213                 break;
1214         case HAMMER2_BREF_TYPE_INDIRECT:
1215                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1216                         base = NULL;
1217                 } else {
1218                         KKASSERT(parent->data != NULL);
1219                         base = &parent->data->npdata.blockref[0];
1220                 }
1221                 count = parent->bytes / sizeof(hammer2_blockref_t);
1222                 break;
1223         case HAMMER2_BREF_TYPE_VOLUME:
1224                 base = &hmp->voldata.sroot_blockset.blockref[0];
1225                 count = HAMMER2_SET_COUNT;
1226                 break;
1227         default:
1228                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1229                       parent->bref.type);
1230                 base = NULL;    /* safety */
1231                 count = 0;      /* safety */
1232                 break;
1233         }
1234         KKASSERT(i <= count);
1235
1236         /*
1237          * Look for the key.  If we are unable to find a match and an exact
1238          * match was requested we return NULL.  If a range was requested we
1239          * run hammer2_chain_next() to iterate.
1240          */
1241         bref = NULL;
1242         while (i < count) {
1243                 tmp = hammer2_chain_find(hmp, parent, i);
1244                 if (tmp) {
1245                         bref = &tmp->bref;
1246                 } else if (base == NULL || base[i].type == 0) {
1247                         ++i;
1248                         continue;
1249                 } else {
1250                         bref = &base[i];
1251                 }
1252                 scan_beg = bref->key;
1253                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1254                 if (key_beg <= scan_end && key_end >= scan_beg)
1255                         break;
1256                 ++i;
1257         }
1258
1259         /*
1260          * If we couldn't find a match recurse up a parent to continue the
1261          * search.
1262          */
1263         if (i == count)
1264                 goto again;
1265
1266         /*
1267          * Acquire the new chain element.  If the chain element is an
1268          * indirect block we must search recursively.
1269          */
1270         chain = hammer2_chain_get(hmp, parent, i, flags);
1271         if (chain == NULL)
1272                 return (NULL);
1273
1274         /*
1275          * If the chain element is an indirect block it becomes the new
1276          * parent and we loop on it.
1277          *
1278          * The parent always has to be locked with at least RESOLVE_MAYBE,
1279          * so it might need a fixup if the caller passed incompatible flags.
1280          */
1281         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1282                 hammer2_chain_unlock(hmp, parent);
1283                 *parentp = parent = chain;
1284                 chain = NULL;
1285                 if (flags & HAMMER2_LOOKUP_NOLOCK) {
1286                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1287                         hammer2_chain_drop(hmp, parent);        /* excess ref */
1288                 } else if (flags & HAMMER2_LOOKUP_NODATA) {
1289                         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
1290                         hammer2_chain_unlock(hmp, parent);
1291                 }
1292                 i = 0;
1293                 goto again2;
1294         }
1295
1296         /*
1297          * All done, return chain
1298          */
1299         return (chain);
1300 }
1301
1302 /*
1303  * Create and return a new hammer2 system memory structure of the specified
1304  * key, type and size and insert it RELATIVE TO (PARENT).
1305  *
1306  * (parent) is typically either an inode or an indirect block, acquired
1307  * acquired as a side effect of issuing a prior failed lookup.  parent
1308  * must be locked and held.  Do not pass the inode chain to this function
1309  * unless that is the chain returned by the failed lookup.
1310  *
1311  * Non-indirect types will automatically allocate indirect blocks as required
1312  * if the new item does not fit in the current (parent).
1313  *
1314  * Indirect types will move a portion of the existing blockref array in
1315  * (parent) into the new indirect type and then use one of the free slots
1316  * to emplace the new indirect type.
1317  *
1318  * A new locked, referenced chain element is returned of the specified type.
1319  * The element may or may not have a data area associated with it:
1320  *
1321  *      VOLUME          not allowed here
1322  *      INODE           embedded data are will be set-up
1323  *      INDIRECT        not allowed here
1324  *      DATA            no data area will be set-up (caller is expected
1325  *                      to have logical buffers, we don't want to alias
1326  *                      the data onto device buffers!).
1327  */
1328 hammer2_chain_t *
1329 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1330                      hammer2_chain_t *chain,
1331                      hammer2_key_t key, int keybits, int type, size_t bytes)
1332 {
1333         hammer2_blockref_t dummy;
1334         hammer2_blockref_t *base;
1335         hammer2_chain_t dummy_chain;
1336         int unlock_parent = 0;
1337         int allocated = 0;
1338         int count;
1339         int i;
1340
1341         if (chain == NULL) {
1342                 /*
1343                  * First allocate media space and construct the dummy bref,
1344                  * then allocate the in-memory chain structure.
1345                  */
1346                 bzero(&dummy, sizeof(dummy));
1347                 dummy.type = type;
1348                 dummy.key = key;
1349                 dummy.keybits = keybits;
1350                 dummy.data_off = hammer2_bytes_to_radix(bytes);
1351                 chain = hammer2_chain_alloc(hmp, &dummy);
1352                 allocated = 1;
1353
1354                 /*
1355                  * We do NOT set INITIAL here (yet).  INITIAL is only
1356                  * used for indirect blocks.
1357                  *
1358                  * Recalculate bytes to reflect the actual media block
1359                  * allocation.
1360                  */
1361                 bytes = (hammer2_off_t)1 <<
1362                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1363                 chain->bytes = bytes;
1364
1365                 switch(type) {
1366                 case HAMMER2_BREF_TYPE_VOLUME:
1367                         panic("hammer2_chain_create: called with volume type");
1368                         break;
1369                 case HAMMER2_BREF_TYPE_INODE:
1370                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1371                         chain->data = (void *)&chain->u.ip->ip_data;
1372                         break;
1373                 case HAMMER2_BREF_TYPE_INDIRECT:
1374                         panic("hammer2_chain_create: cannot be used to"
1375                               "create indirect block");
1376                         break;
1377                 case HAMMER2_BREF_TYPE_DATA:
1378                 default:
1379                         /* leave chain->data NULL */
1380                         KKASSERT(chain->data == NULL);
1381                         break;
1382                 }
1383         } else {
1384                 /*
1385                  * Potentially update the chain's key/keybits.
1386                  */
1387                 chain->bref.key = key;
1388                 chain->bref.keybits = keybits;
1389         }
1390
1391 again:
1392         /*
1393          * Locate a free blockref in the parent's array
1394          */
1395         switch(parent->bref.type) {
1396         case HAMMER2_BREF_TYPE_INODE:
1397                 KKASSERT(parent->data != NULL);
1398                 base = &parent->data->ipdata.u.blockset.blockref[0];
1399                 count = HAMMER2_SET_COUNT;
1400                 break;
1401         case HAMMER2_BREF_TYPE_INDIRECT:
1402                 if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1403                         base = NULL;
1404                 } else {
1405                         KKASSERT(parent->data != NULL);
1406                         base = &parent->data->npdata.blockref[0];
1407                 }
1408                 count = parent->bytes / sizeof(hammer2_blockref_t);
1409                 break;
1410         case HAMMER2_BREF_TYPE_VOLUME:
1411                 KKASSERT(parent->data != NULL);
1412                 base = &hmp->voldata.sroot_blockset.blockref[0];
1413                 count = HAMMER2_SET_COUNT;
1414                 break;
1415         default:
1416                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1417                       parent->bref.type);
1418                 count = 0;
1419                 break;
1420         }
1421
1422         /*
1423          * Scan for an unallocated bref, also skipping any slots occupied
1424          * by in-memory chain elements that may not yet have been updated
1425          * in the parent's bref array.
1426          */
1427         bzero(&dummy_chain, sizeof(dummy_chain));
1428         for (i = 0; i < count; ++i) {
1429                 if (base == NULL) {
1430                         dummy_chain.index = i;
1431                         if (SPLAY_FIND(hammer2_chain_splay,
1432                                        &parent->shead, &dummy_chain) == NULL) {
1433                                 break;
1434                         }
1435                 } else if (base[i].type == 0) {
1436                         dummy_chain.index = i;
1437                         if (SPLAY_FIND(hammer2_chain_splay,
1438                                        &parent->shead, &dummy_chain) == NULL) {
1439                                 break;
1440                         }
1441                 }
1442         }
1443
1444         /*
1445          * If no free blockref count be found we must create an indirect
1446          * block and move a number of blockrefs into it.  With the parent
1447          * locked we can safely lock each child in order to move it without
1448          * causing a deadlock.
1449          *
1450          * This may return the new indirect block or the old parent depending
1451          * on where the key falls.
1452          */
1453         if (i == count) {
1454                 hammer2_chain_t *nparent;
1455
1456                 nparent = hammer2_chain_create_indirect(hmp, parent,
1457                                                         key, keybits);
1458                 if (nparent == NULL) {
1459                         if (allocated)
1460                                 hammer2_chain_free(hmp, chain);
1461                         chain = NULL;
1462                         goto done;
1463                 }
1464                 if (parent != nparent) {
1465                         if (unlock_parent)
1466                                 hammer2_chain_unlock(hmp, parent);
1467                         parent = nparent;
1468                         unlock_parent = 1;
1469                 }
1470                 goto again;
1471         }
1472
1473         /*
1474          * Link the chain into its parent.
1475          */
1476         if (chain->parent != NULL)
1477                 panic("hammer2: hammer2_chain_create: chain already connected");
1478         KKASSERT(chain->parent == NULL);
1479         chain->parent = parent;
1480         chain->index = i;
1481         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1482                 panic("hammer2_chain_link: collision");
1483         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1484         KKASSERT(parent->refs > 0);
1485         atomic_add_int(&parent->refs, 1);
1486
1487         /*
1488          * Additional linkage for inodes.  Reuse the parent pointer to
1489          * find the parent directory.
1490          */
1491         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1492                 hammer2_chain_t *scan = parent;
1493                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1494                         scan = scan->parent;
1495                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE)
1496                         chain->u.ip->pip = scan->u.ip;
1497         }
1498
1499         /*
1500          * (allocated) indicates that this is a newly-created chain element
1501          * rather than a renamed chain element.  In this situation we want
1502          * to place the chain element in the MODIFIED1 state.
1503          *
1504          * The data area will be set up as follows:
1505          *
1506          *      VOLUME          not allowed here.
1507          *
1508          *      INODE           embedded data are will be set-up.
1509          *
1510          *      INDIRECT        not allowed here.
1511          *
1512          *      DATA            no data area will be set-up (caller is expected
1513          *                      to have logical buffers, we don't want to alias
1514          *                      the data onto device buffers!).
1515          */
1516         if (allocated) {
1517                 if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
1518                         hammer2_chain_modify(hmp, chain,
1519                                              HAMMER2_MODIFY_OPTDATA);
1520                 } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1521                         /* not supported in this function */
1522                         panic("hammer2_chain_create: bad type");
1523                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1524                         hammer2_chain_modify(hmp, chain,
1525                                              HAMMER2_MODIFY_OPTDATA);
1526                 } else {
1527                         hammer2_chain_modify(hmp, chain, 0);
1528                 }
1529         } else {
1530                 /*
1531                  * When reconnecting inodes we have to call setsubmod()
1532                  * to ensure that its state propagates up the newly
1533                  * connected parent.
1534                  *
1535                  * We cannot depend on the chain being in a MODIFIED1
1536                  * state, or it might already be in that state, so
1537                  * even if the parent calls hammer2_chain_modify()
1538                  * MOVED might not get set.  Thus we have to set it
1539                  * here, too.
1540                  */
1541                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1542                         hammer2_chain_ref(hmp, chain);
1543                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1544                 }
1545                 hammer2_chain_parent_setsubmod(hmp, chain);
1546         }
1547
1548 done:
1549         if (unlock_parent)
1550                 hammer2_chain_unlock(hmp, parent);
1551         return (chain);
1552 }
1553
1554 /*
1555  * Create an indirect block that covers one or more of the elements in the
1556  * current parent.  Either returns the existing parent with no locking or
1557  * ref changes or returns the new indirect block locked and referenced,
1558  * depending on what the specified key falls into.
1559  *
1560  * The key/keybits for the indirect mode only needs to follow three rules:
1561  *
1562  * (1) That all elements underneath it fit within its key space and
1563  *
1564  * (2) That all elements outside it are outside its key space.
1565  *
1566  * (3) When creating the new indirect block any elements in the current
1567  *     parent that fit within the new indirect block's keyspace must be
1568  *     moved into the new indirect block.
1569  *
1570  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1571  *     keyspace the the current parent, but lookup/iteration rules will
1572  *     ensure (and must ensure) that rule (2) for all parents leading up
1573  *     to the nearest inode or the root volume header is adhered to.  This
1574  *     is accomplished by always recursing through matching keyspaces in
1575  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1576  *
1577  * The current implementation calculates the current worst-case keyspace by
1578  * iterating the current parent and then divides it into two halves, choosing
1579  * whichever half has the most elements (not necessarily the half containing
1580  * the requested key).
1581  *
1582  * We can also opt to use the half with the least number of elements.  This
1583  * causes lower-numbered keys (aka logical file offsets) to recurse through
1584  * fewer indirect blocks and higher-numbered keys to recurse through more.
1585  * This also has the risk of not moving enough elements to the new indirect
1586  * block and being forced to create several indirect blocks before the element
1587  * can be inserted.
1588  */
1589 static
1590 hammer2_chain_t *
1591 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1592                               hammer2_key_t create_key, int create_bits)
1593 {
1594         hammer2_blockref_t *base;
1595         hammer2_blockref_t *bref;
1596         hammer2_chain_t *chain;
1597         hammer2_chain_t *ichain;
1598         hammer2_chain_t dummy;
1599         hammer2_key_t key = create_key;
1600         int keybits = create_bits;
1601         int locount = 0;
1602         int hicount = 0;
1603         int count;
1604         int nbytes;
1605         int i;
1606
1607         /*
1608          * Calculate the base blockref pointer or NULL if the chain
1609          * is known to be empty.
1610          */
1611         hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1612         if (parent->flags & HAMMER2_CHAIN_INITIAL) {
1613                 base = NULL;
1614
1615                 /*
1616                  * We still need to calculate the count for SPLAY lookups
1617                  */
1618                 switch(parent->bref.type) {
1619                 case HAMMER2_BREF_TYPE_INODE:
1620                         count = HAMMER2_SET_COUNT;
1621                         break;
1622                 case HAMMER2_BREF_TYPE_INDIRECT:
1623                         count = parent->bytes / sizeof(hammer2_blockref_t);
1624                         break;
1625                 case HAMMER2_BREF_TYPE_VOLUME:
1626                         count = HAMMER2_SET_COUNT;
1627                         break;
1628                 default:
1629                         panic("hammer2_chain_create_indirect: "
1630                               "unrecognized blockref type: %d",
1631                               parent->bref.type);
1632                         count = 0;
1633                         break;
1634                 }
1635         } else {
1636                 /*
1637                  * Locate a free blockref in the parent's array
1638                  */
1639                 switch(parent->bref.type) {
1640                 case HAMMER2_BREF_TYPE_INODE:
1641                         base = &parent->data->ipdata.u.blockset.blockref[0];
1642                         count = HAMMER2_SET_COUNT;
1643                         break;
1644                 case HAMMER2_BREF_TYPE_INDIRECT:
1645                         base = &parent->data->npdata.blockref[0];
1646                         count = parent->bytes / sizeof(hammer2_blockref_t);
1647                         break;
1648                 case HAMMER2_BREF_TYPE_VOLUME:
1649                         base = &hmp->voldata.sroot_blockset.blockref[0];
1650                         count = HAMMER2_SET_COUNT;
1651                         break;
1652                 default:
1653                         panic("hammer2_chain_create_indirect: "
1654                               "unrecognized blockref type: %d",
1655                               parent->bref.type);
1656                         count = 0;
1657                         break;
1658                 }
1659         }
1660
1661         /*
1662          * Scan for an unallocated bref, also skipping any slots occupied
1663          * by in-memory chain elements that may not yet have been updated
1664          * in the parent's bref array.
1665          */
1666         bzero(&dummy, sizeof(dummy));
1667         for (i = 0; i < count; ++i) {
1668                 int nkeybits;
1669
1670                 /*
1671                  * Optimize the case where the parent is still in its
1672                  * initially created state.
1673                  */
1674                 if (base == NULL || base[i].type == 0) {
1675                         dummy.index = i;
1676                         chain = SPLAY_FIND(hammer2_chain_splay,
1677                                            &parent->shead, &dummy);
1678                         if (chain == NULL)
1679                                 continue;
1680                         bref = &chain->bref;
1681                 } else {
1682                         bref = &base[i];
1683                 }
1684
1685                 /*
1686                  * Expand our calculated key range (key, keybits) to fit
1687                  * the scanned key.  nkeybits represents the full range
1688                  * that we will later cut in half (two halves @ nkeybits - 1).
1689                  */
1690                 nkeybits = keybits;
1691                 if (nkeybits < bref->keybits)
1692                         nkeybits = bref->keybits;
1693                 while ((~(((hammer2_key_t)1 << nkeybits) - 1) &
1694                         (key ^ bref->key)) != 0) {
1695                         ++nkeybits;
1696                 }
1697
1698                 /*
1699                  * If the new key range is larger we have to determine
1700                  * which side of the new key range the existing keys fall
1701                  * under by checking the high bit, then collapsing the
1702                  * locount into the hicount or vise-versa.
1703                  */
1704                 if (keybits != nkeybits) {
1705                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1706                                 hicount += locount;
1707                                 locount = 0;
1708                         } else {
1709                                 locount += hicount;
1710                                 hicount = 0;
1711                         }
1712                         keybits = nkeybits;
1713                 }
1714
1715                 /*
1716                  * The newly scanned key will be in the lower half or the
1717                  * higher half of the (new) key range.
1718                  */
1719                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1720                         ++hicount;
1721                 else
1722                         ++locount;
1723         }
1724
1725         /*
1726          * Adjust keybits to represent half of the full range calculated
1727          * above.
1728          */
1729         --keybits;
1730
1731         /*
1732          * Select whichever half contains the most elements.  Theoretically
1733          * we can select either side as long as it contains at least one
1734          * element (in order to ensure that a free slot is present to hold
1735          * the indirect block).
1736          */
1737         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1738         if (hammer2_indirect_optimize) {
1739                 /*
1740                  * Insert node for least number of keys, this will arrange
1741                  * the first few blocks of a large file or the first few
1742                  * inodes in a directory with fewer indirect blocks when
1743                  * created linearly.
1744                  */
1745                 if (hicount < locount && hicount != 0)
1746                         key |= (hammer2_key_t)1 << keybits;
1747                 else
1748                         key &= ~(hammer2_key_t)1 << keybits;
1749         } else {
1750                 /*
1751                  * Insert node for most number of keys, best for heavily
1752                  * fragmented files.
1753                  */
1754                 if (hicount > locount)
1755                         key |= (hammer2_key_t)1 << keybits;
1756                 else
1757                         key &= ~(hammer2_key_t)1 << keybits;
1758         }
1759
1760         /*
1761          * How big should our new indirect block be?  It has to be at least
1762          * as large as its parent.
1763          */
1764         if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
1765                 nbytes = HAMMER2_IND_BYTES_MIN;
1766         else
1767                 nbytes = HAMMER2_IND_BYTES_MAX;
1768         if (nbytes < count * sizeof(hammer2_blockref_t))
1769                 nbytes = count * sizeof(hammer2_blockref_t);
1770
1771         /*
1772          * Ok, create our new indirect block
1773          */
1774         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1775         dummy.bref.key = key;
1776         dummy.bref.keybits = keybits;
1777         dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
1778         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1779         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
1780
1781         /*
1782          * Iterate the original parent and move the matching brefs into
1783          * the new indirect block.
1784          */
1785         for (i = 0; i < count; ++i) {
1786                 /*
1787                  * For keying purposes access the bref from the media or
1788                  * from our in-memory cache.  In cases where the in-memory
1789                  * cache overrides the media the keyrefs will be the same
1790                  * anyway so we can avoid checking the cache when the media
1791                  * has a key.
1792                  */
1793                 if (base == NULL || base[i].type == 0) {
1794                         dummy.index = i;
1795                         chain = SPLAY_FIND(hammer2_chain_splay,
1796                                            &parent->shead, &dummy);
1797                         if (chain == NULL) {
1798                                 /*
1799                                  * Select index indirect block is placed in
1800                                  */
1801                                 if (ichain->index < 0)
1802                                         ichain->index = i;
1803                                 continue;
1804                         }
1805                         bref = &chain->bref;
1806                 } else {
1807                         bref = &base[i];
1808                 }
1809
1810                 /*
1811                  * Skip keys not in the chosen half (low or high), only bit
1812                  * (keybits - 1) needs to be compared but for safety we
1813                  * will compare all msb bits plus that bit again.
1814                  */
1815                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1816                     (key ^ bref->key)) != 0) {
1817                         continue;
1818                 }
1819
1820                 /*
1821                  * This element is being moved, its slot is available
1822                  * for our indirect block.
1823                  */
1824                 if (ichain->index < 0)
1825                         ichain->index = i;
1826
1827                 /*
1828                  * Load the new indirect block by acquiring or allocating
1829                  * the related chain entries, then simply move it to the
1830                  * new parent (ichain).
1831                  *
1832                  * Flagging the new chain entry MOVED will cause a flush
1833                  * to synchronize its block into the new indirect block.
1834                  * The chain is unlocked after being moved but needs to
1835                  * retain a reference for the MOVED state
1836                  *
1837                  * We must still set SUBMODIFIED in the parent but we do
1838                  * that after the loop.
1839                  *
1840                  * XXX we really need a lock here but we don't need the
1841                  *     data.  NODATA feature needed.
1842                  */
1843                 chain = hammer2_chain_get(hmp, parent, i,
1844                                           HAMMER2_LOOKUP_NODATA);
1845                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1846                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1847                         panic("hammer2_chain_create_indirect: collision");
1848                 chain->parent = ichain;
1849                 if (base)
1850                         bzero(&base[i], sizeof(base[i]));
1851                 atomic_add_int(&parent->refs, -1);
1852                 atomic_add_int(&ichain->refs, 1);
1853                 if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
1854                         hammer2_chain_ref(hmp, chain);
1855                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1856                 }
1857                 hammer2_chain_unlock(hmp, chain);
1858                 KKASSERT(parent->refs > 0);
1859                 chain = NULL;
1860         }
1861
1862         /*
1863          * Insert the new indirect block into the parent now that we've
1864          * cleared out some entries in the parent.  We calculated a good
1865          * insertion index in the loop above (ichain->index).
1866          */
1867         KKASSERT(ichain->index >= 0);
1868         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1869                 panic("hammer2_chain_create_indirect: ichain insertion");
1870         ichain->parent = parent;
1871         atomic_add_int(&parent->refs, 1);
1872
1873         /*
1874          * Mark the new indirect block modified after insertion, which
1875          * will propagate up through parent all the way to the root and
1876          * also allocate the physical block in ichain for our caller,
1877          * and assign ichain->data to a pre-zero'd space (because there
1878          * is not prior data to copy into it).
1879          *
1880          * We have to set SUBMODIFIED in ichain's flags manually so the
1881          * flusher knows it has to recurse through it to get to all of
1882          * our moved blocks, then call setsubmod() to set the bit
1883          * recursively.
1884          */
1885         hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
1886         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1887         hammer2_chain_parent_setsubmod(hmp, ichain);
1888
1889         /*
1890          * Figure out what to return.
1891          */
1892         if (create_bits >= keybits) {
1893                 /*
1894                  * Key being created is way outside the key range,
1895                  * return the original parent.
1896                  */
1897                 hammer2_chain_unlock(hmp, ichain);
1898         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1899                    (create_key ^ key)) {
1900                 /*
1901                  * Key being created is outside the key range,
1902                  * return the original parent.
1903                  */
1904                 hammer2_chain_unlock(hmp, ichain);
1905         } else {
1906                 /*
1907                  * Otherwise its in the range, return the new parent.
1908                  */
1909                 parent = ichain;
1910         }
1911
1912         return(parent);
1913 }
1914
1915 /*
1916  * Physically delete the specified chain element.  Note that inodes with
1917  * open descriptors should not be deleted (as with other filesystems) until
1918  * the last open descriptor is closed.
1919  *
1920  * This routine will remove the chain element from its parent and potentially
1921  * also recurse upward and delete indirect blocks which become empty as a
1922  * side effect.
1923  *
1924  * The caller must pass a pointer to the chain's parent, also locked and
1925  * referenced.  (*parentp) will be modified in a manner similar to a lookup
1926  * or iteration when indirect blocks are also deleted as a side effect.
1927  */
1928 void
1929 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1930                      hammer2_chain_t *chain)
1931 {
1932         hammer2_blockref_t *base;
1933         int count;
1934
1935         if (chain->parent != parent)
1936                 panic("hammer2_chain_delete: parent mismatch");
1937
1938         /*
1939          * Mark the parent modified so our base[] pointer remains valid
1940          * while we move entries.  For the optimized indirect block
1941          * case mark the parent moved instead.
1942          *
1943          * Calculate the blockref reference in the parent
1944          */
1945         switch(parent->bref.type) {
1946         case HAMMER2_BREF_TYPE_INODE:
1947                 hammer2_chain_modify(hmp, parent, 0);
1948                 base = &parent->data->ipdata.u.blockset.blockref[0];
1949                 count = HAMMER2_SET_COUNT;
1950                 break;
1951         case HAMMER2_BREF_TYPE_INDIRECT:
1952                 hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
1953                 if (parent->flags & HAMMER2_CHAIN_INITIAL)
1954                         base = NULL;
1955                 else
1956                         base = &parent->data->npdata.blockref[0];
1957                 count = parent->bytes / sizeof(hammer2_blockref_t);
1958                 break;
1959         case HAMMER2_BREF_TYPE_VOLUME:
1960                 hammer2_chain_modify(hmp, parent, 0);
1961                 base = &hmp->voldata.sroot_blockset.blockref[0];
1962                 count = HAMMER2_SET_COUNT;
1963                 break;
1964         default:
1965                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
1966                       parent->bref.type);
1967                 count = 0;
1968                 break;
1969         }
1970
1971         /*
1972          * Disconnect the bref in the parent, remove the chain, and
1973          * disconnect in-memory fields from the parent.
1974          */
1975         KKASSERT(chain->index >= 0 && chain->index < count);
1976         if (base)
1977                 bzero(&base[chain->index], sizeof(*base));
1978
1979         SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1980         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1981         atomic_add_int(&parent->refs, -1);      /* for splay entry */
1982         chain->index = -1;
1983         chain->parent = NULL;
1984
1985         /*
1986          * If this is an inode clear the pip.
1987          */
1988         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
1989                 chain->u.ip->pip = NULL;
1990
1991         /*
1992          * The chain is still likely referenced, possibly even by a vnode
1993          * (if an inode), so defer further action until the chain gets
1994          * dropped.
1995          */
1996 }
1997
1998 /*
1999  * Recursively flush the specified chain.  The chain is locked and
2000  * referenced by the caller and will remain so on return.
2001  *
2002  * This cannot be called with the volume header's vchain (yet).
2003  *
2004  * PASS1 - clear the MODIFIED1 bit.
2005  */
2006 static void
2007 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain, int tab)
2008 {
2009         hammer2_blockref_t *bref;
2010         hammer2_off_t pbase;
2011         size_t bbytes;
2012         size_t boff;
2013         char *bdata;
2014         struct buf *bp;
2015         int error;
2016
2017         if (hammer2_debug & 0x0008)
2018                 kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
2019                         tab, tab, "",
2020                         chain->bref.type, chain->bref.data_off,
2021                         chain, chain->refs, chain->flags);
2022
2023         /*
2024          * Flush any children of this chain entry.
2025          *
2026          * NOTE: If we use a while() here an active filesystem can
2027          *       prevent the flush from ever finishing.
2028          */
2029         if (chain->flags &
2030             (HAMMER2_CHAIN_SUBMODIFIED | HAMMER2_CHAIN_DESTROYED)) {
2031                 hammer2_blockref_t *base;
2032                 hammer2_chain_t *scan;
2033                 hammer2_chain_t *next;
2034                 int count;
2035                 int submodified = 0;
2036                 int submoved = 0;
2037
2038                 /*
2039                  * Clear SUBMODIFIED now.  Flag any races during the flush
2040                  * with the (submodified) local variable and re-arm it
2041                  * as necessary after the loop is done.
2042                  *
2043                  * Delaying the setting of the chain to MODIFIED can reduce
2044                  * unnecessary I/O.
2045                  *
2046                  * Modifications to the children will propagate up, forcing
2047                  * us to become modified and copy-on-write too.  Be sure
2048                  * to modify chain (as a side effect of the recursive
2049                  * flush) ONLY if it is actually being modified by the
2050                  * recursive flush.
2051                  */
2052                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
2053
2054                 /*
2055                  * Flush the children and update the blockrefs in the parent.
2056                  * Be careful of ripouts during the loop.
2057                  */
2058                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2059                 while ((scan = next) != NULL) {
2060                         next = SPLAY_NEXT(hammer2_chain_splay,
2061                                           &chain->shead, scan);
2062                         if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2063                                             HAMMER2_CHAIN_MODIFIED1 |
2064                                             HAMMER2_CHAIN_MOVED)) == 0) {
2065                                 continue;
2066                         }
2067
2068                         /*
2069                          * Recurse the flush
2070                          */
2071                         hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
2072                         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2073                                 atomic_set_int(&scan->flags,
2074                                                HAMMER2_CHAIN_DESTROYED);
2075                         }
2076                         hammer2_chain_flush_pass1(hmp, scan, tab + 4);
2077
2078                         /*
2079                          * No point loading blockrefs yet if the
2080                          * child (recursively) is still dirty.
2081                          */
2082                         if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2083                                            HAMMER2_CHAIN_MODIFIED1)) {
2084                                 submodified = 1;
2085                                 if (hammer2_debug & 0x0008)
2086                                         kprintf("s");
2087                         }
2088                         if (scan->flags & HAMMER2_CHAIN_MOVED) {
2089                                 if (hammer2_debug & 0x0008)
2090                                         kprintf("m");
2091                                 submoved = 1;
2092                         }
2093                         if (hammer2_debug & 0x0008)
2094                                 kprintf("\n");
2095                         hammer2_chain_unlock(hmp, scan);
2096                 }
2097
2098                 if (submodified || (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
2099                         /*
2100                          * No point loading up the blockrefs if submodified
2101                          * got re-set.
2102                          *
2103                          * NOTE: Even though we cleared the SUBMODIFIED flag
2104                          *       it can still get re-set by operations
2105                          *       occuring under our chain, so check both.
2106                          */
2107                         atomic_set_int(&chain->flags,
2108                                        HAMMER2_CHAIN_SUBMODIFIED);
2109                 } else if (submoved) {
2110                         /*
2111                          * Ok, we can modify the blockrefs in this chain
2112                          * entry.  Mark it modified.  Calculate the
2113                          * blockref array after marking it modified (since
2114                          * that may change the underlying data ptr).
2115                          *
2116                          * NOTE: We only do this if submoved != 0, otherwise
2117                          *       there may not be any changes and setting
2118                          *       the chain modified will re-arm the MOVED
2119                          *       bit recursively, resulting in O(N^2)
2120                          *       flushes.
2121                          *
2122                          * NOTE: We don't want hammer2_chain_modify() to
2123                          *       recursively set the SUBMODIFIED flag
2124                          *       upward in this case!
2125                          */
2126                         hammer2_chain_modify(hmp, chain, HAMMER2_MODIFY_NOSUB);
2127
2128                         switch(chain->bref.type) {
2129                         case HAMMER2_BREF_TYPE_INODE:
2130                                 base = &chain->data->ipdata.u.blockset.
2131                                         blockref[0];
2132                                 count = HAMMER2_SET_COUNT;
2133                                 break;
2134                         case HAMMER2_BREF_TYPE_INDIRECT:
2135                                 base = &chain->data->npdata.blockref[0];
2136                                 count = chain->bytes /
2137                                         sizeof(hammer2_blockref_t);
2138                                 break;
2139                         case HAMMER2_BREF_TYPE_VOLUME:
2140                                 base = &hmp->voldata.sroot_blockset.blockref[0];
2141                                 count = HAMMER2_SET_COUNT;
2142                                 break;
2143                         default:
2144                                 base = NULL;
2145                                 panic("hammer2_chain_get: "
2146                                       "unrecognized blockref type: %d",
2147                                       chain->bref.type);
2148                         }
2149
2150                         /*
2151                          * Update the blockrefs.
2152                          */
2153                         next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
2154                         while ((scan = next) != NULL) {
2155                                 next = SPLAY_NEXT(hammer2_chain_splay,
2156                                                   &chain->shead, scan);
2157                                 KKASSERT(scan->index >= 0 &&
2158                                          scan->index < count);
2159                                 hammer2_chain_lock(hmp, scan,
2160                                                    HAMMER2_RESOLVE_NEVER);
2161                                 base[scan->index] = scan->bref;
2162                                 if (scan->flags & HAMMER2_CHAIN_MOVED) {
2163                                         atomic_clear_int(&scan->flags,
2164                                                  HAMMER2_CHAIN_MOVED);
2165                                         hammer2_chain_drop(hmp, scan);
2166                                 }
2167                                 hammer2_chain_unlock(hmp, scan);
2168                         }
2169                 }
2170         }
2171
2172         /*
2173          * If destroying the object we unconditonally clear the MODIFIED1
2174          * and MOVED bits, and we destroy the buffer without writing it
2175          * out.
2176          *
2177          * We don't bother updating the hash/crc or the parent bref.
2178          *
2179          * XXX allocations for unflushed data can be returned to the
2180          *     free pool.
2181          */
2182         if (chain->flags & HAMMER2_CHAIN_DESTROYED) {
2183                 if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
2184                         if (chain->bp) {
2185                                 chain->bp->b_flags |= B_INVAL|B_RELBUF;
2186                         }
2187                         atomic_clear_int(&chain->flags,
2188                                          HAMMER2_CHAIN_MODIFIED1);
2189                         hammer2_chain_drop(hmp, chain);
2190                 }
2191                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
2192                         atomic_clear_int(&chain->flags,
2193                                          HAMMER2_CHAIN_MOVED);
2194                         hammer2_chain_drop(hmp, chain);
2195                 }
2196                 return;
2197         }
2198
2199         /*
2200          * Flush this chain entry only if it is marked modified.
2201          */
2202         if ((chain->flags & HAMMER2_CHAIN_MODIFIED1) == 0) {
2203                 goto done;
2204                 return;
2205         }
2206
2207         /*
2208          * Clear MODIFIED1 and set HAMMER2_CHAIN_MOVED.  The caller
2209          * will re-test the MOVED bit.
2210          *
2211          * bits own a single parent ref and the MOVED bit owns its own
2212          * parent ref.
2213          */
2214         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
2215         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2216                 hammer2_chain_drop(hmp, chain);
2217         } else {
2218                 /* inherit ref from the MODIFIED1 we cleared */
2219                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2220         }
2221
2222         /*
2223          * If this is part of a recursive flush we can go ahead and write
2224          * out the buffer cache buffer and pass a new bref back up the chain.
2225          *
2226          * This will never be a volume header.
2227          */
2228         switch(chain->bref.type) {
2229         case HAMMER2_BREF_TYPE_VOLUME:
2230                 /*
2231                  * The volume header is flushed manually by the syncer, not
2232                  * here.
2233                  */
2234                 break;
2235         case HAMMER2_BREF_TYPE_DATA:
2236                 /*
2237                  * Data elements have already been flushed via the logical
2238                  * file buffer cache.  Their hash was set in the bref by
2239                  * the vop_write code.
2240                  */
2241                 break;
2242         case HAMMER2_BREF_TYPE_INDIRECT:
2243                 /*
2244                  * Indirect blocks may be in an INITIAL state.
2245                  */
2246                 break;
2247         default:
2248                 /*
2249                  * Embedded elements have to be flushed out.
2250                  */
2251                 KKASSERT(chain->data != NULL);
2252                 bref = &chain->bref;
2253
2254                 KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
2255
2256                 if (chain->bp == NULL) {
2257                         /*
2258                          * The data is embedded, we have to acquire the
2259                          * buffer cache buffer and copy the data into it.
2260                          */
2261                         if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
2262                                 bbytes = HAMMER2_MINIOSIZE;
2263                         pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
2264                         boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
2265
2266                         /*
2267                          * The getblk() optimization can only be used if the
2268                          * physical block size matches the request.
2269                          */
2270                         if (chain->bytes == bbytes) {
2271                                 bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
2272                                 error = 0;
2273                         } else {
2274                                 error = bread(hmp->devvp, pbase, bbytes, &bp);
2275                                 KKASSERT(error == 0);
2276                         }
2277                         bdata = (char *)bp->b_data + boff;
2278
2279                         /*
2280                          * Copy the data to the buffer, mark the buffer
2281                          * dirty, and convert the chain to unmodified.
2282                          */
2283                         bcopy(chain->data, bdata, chain->bytes);
2284                         bp->b_flags |= B_CLUSTEROK;
2285                         bdwrite(bp);
2286                         bp = NULL;
2287                         chain->bref.check.iscsi32.value =
2288                                 hammer2_icrc32(chain->data, chain->bytes);
2289                         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
2290                                 ++hammer2_iod_meta_write;
2291                         else
2292                                 ++hammer2_iod_indr_write;
2293                 } else {
2294                         chain->bref.check.iscsi32.value =
2295                                 hammer2_icrc32(chain->data, chain->bytes);
2296                 }
2297         }
2298
2299         /*
2300          * Special handling
2301          */
2302         bref = &chain->bref;
2303
2304         switch(bref->type) {
2305         case HAMMER2_BREF_TYPE_VOLUME:
2306                 KKASSERT(chain->data != NULL);
2307                 KKASSERT(chain->bp == NULL);
2308
2309                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
2310                         hammer2_icrc32(
2311                                 (char *)&hmp->voldata +
2312                                  HAMMER2_VOLUME_ICRC1_OFF,
2313                                 HAMMER2_VOLUME_ICRC1_SIZE);
2314                 hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
2315                         hammer2_icrc32(
2316                                 (char *)&hmp->voldata +
2317                                  HAMMER2_VOLUME_ICRC0_OFF,
2318                                 HAMMER2_VOLUME_ICRC0_SIZE);
2319                 hmp->voldata.icrc_volheader =
2320                         hammer2_icrc32(
2321                                 (char *)&hmp->voldata +
2322                                  HAMMER2_VOLUME_ICRCVH_OFF,
2323                                 HAMMER2_VOLUME_ICRCVH_SIZE);
2324                 break;
2325         }
2326 done:
2327         if (hammer2_debug & 0x0008)
2328                 kprintf("%*.*s} %p/%d %04x ",
2329                         tab, tab, "", chain, chain->refs, chain->flags);
2330 }
2331
2332 #if 0
2333 /*
2334  * PASS2 - not yet implemented (should be called only with the root chain?)
2335  */
2336 static void
2337 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2338 {
2339 }
2340 #endif
2341
2342 /*
2343  * Stand-alone flush.  If the chain is unable to completely flush we have
2344  * to be sure that SUBMODIFIED propagates up the parent chain.
2345  *
2346  * This routine can be called from several places but the most important
2347  * is from the hammer2_vop_reclaim() function.  We want to try to completely
2348  * clean out the inode structure to prevent disconnected inodes from
2349  * building up and blowing out the kmalloc pool.
2350  */
2351 void
2352 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain)
2353 {
2354         hammer2_chain_t *parent;
2355         hammer2_blockref_t *base;
2356         int count;
2357
2358         /*
2359          * Recursive flush
2360          */
2361         hammer2_chain_flush_pass1(hmp, chain, 0);
2362
2363         /*
2364          * The SUBMODIFIED bit must propagate upward if the chain could not
2365          * be completely flushed.
2366          */
2367         if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2368                             HAMMER2_CHAIN_MODIFIED1 |
2369                             HAMMER2_CHAIN_MOVED)) {
2370                 hammer2_chain_parent_setsubmod(hmp, chain);
2371         }
2372
2373         /*
2374          * If the only thing left is a simple bref update try to
2375          * pro-actively update the parent, otherwise return early.
2376          */
2377         parent = chain->parent;
2378         if (parent == NULL ||
2379             chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
2380             (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
2381                              HAMMER2_CHAIN_MODIFIED1 |
2382                              HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
2383                 return;
2384         }
2385
2386         /*
2387          * We are locking backwards so allow the lock to fail
2388          */
2389         if (lockmgr(&parent->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
2390                 return;
2391         }
2392
2393         /*
2394          * We are updating brefs but we have to call chain_modify() w/
2395          * setsubmod = TRUE because our caller is not a recursive
2396          * flush.
2397          */
2398         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
2399         hammer2_chain_modify(hmp, parent, 0);
2400
2401         switch(parent->bref.type) {
2402         case HAMMER2_BREF_TYPE_INODE:
2403                 base = &parent->data->ipdata.u.blockset.
2404                         blockref[0];
2405                 count = HAMMER2_SET_COUNT;
2406                 break;
2407         case HAMMER2_BREF_TYPE_INDIRECT:
2408                 base = &parent->data->npdata.blockref[0];
2409                 count = parent->bytes /
2410                         sizeof(hammer2_blockref_t);
2411                 break;
2412         case HAMMER2_BREF_TYPE_VOLUME:
2413                 base = &hmp->voldata.sroot_blockset.blockref[0];
2414                 count = HAMMER2_SET_COUNT;
2415                 break;
2416         default:
2417                 base = NULL;
2418                 panic("hammer2_chain_flush: "
2419                       "unrecognized blockref type: %d",
2420                       parent->bref.type);
2421         }
2422
2423         /*
2424          * Update the blockref in the parent
2425          */
2426         KKASSERT(chain->index >= 0 &&
2427                  chain->index < count);
2428         base[chain->index] = chain->bref;
2429         if (chain->flags & HAMMER2_CHAIN_MOVED) {
2430                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
2431                 hammer2_chain_drop(hmp, chain);
2432         }
2433
2434         lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
2435         hammer2_chain_unlock(hmp, parent);
2436 }