hammer2 - Implelment variable-sized physical storage allocations
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/cdefs.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/types.h>
47 #include <sys/lock.h>
48 #include <sys/uuid.h>
49
50 #include "hammer2.h"
51
52 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
53
54 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
55
56 static hammer2_chain_t *hammer2_chain_create_indirect(
57                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
58                         hammer2_key_t key, int keybits);
59
60 /*
61  * Compare function for chain splay tree
62  */
63 int
64 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
65 {
66         return(chain2->index - chain1->index);
67 }
68
69 /*
70  * Allocate a new disconnected chain element representing the specified
71  * bref.  The chain element is locked exclusively and refs is set to 1.
72  *
73  * This essentially allocates a system memory structure representing one
74  * of the media structure types, including inodes.
75  */
76 hammer2_chain_t *
77 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
78 {
79         hammer2_chain_t *chain;
80         hammer2_inode_t *ip;
81         hammer2_indblock_t *np;
82         hammer2_data_t *dp;
83
84         /*
85          * Construct the appropriate system structure.
86          */
87         switch(bref->type) {
88         case HAMMER2_BREF_TYPE_INODE:
89                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
90                 chain = &ip->chain;
91                 chain->u.ip = ip;
92                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
93                 ip->hmp = hmp;
94                 break;
95         case HAMMER2_BREF_TYPE_INDIRECT:
96                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
97                 chain = &np->chain;
98                 chain->u.np = np;
99                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
100                 break;
101         case HAMMER2_BREF_TYPE_DATA:
102                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
103                 chain = &dp->chain;
104                 chain->u.dp = dp;
105                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
106                 break;
107         case HAMMER2_BREF_TYPE_VOLUME:
108                 chain = NULL;
109                 panic("hammer2_chain_alloc volume type illegal for op");
110         default:
111                 chain = NULL;
112                 panic("hammer2_chain_alloc: unrecognized blockref type: %d",
113                       bref->type);
114         }
115         chain->bref = *bref;
116         chain->index = -1;      /* not yet assigned */
117         chain->refs = 1;
118         chain->bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
119         lockmgr(&chain->lk, LK_EXCLUSIVE);
120
121         return (chain);
122 }
123
124 /*
125  * Free a disconnected chain element
126  */
127 void
128 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
129 {
130         void *mem;
131
132         KKASSERT(chain->bp == NULL);
133         KKASSERT(chain->data == NULL);
134         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
135                  chain->u.ip->vp == NULL);
136
137         if ((mem = chain->u.mem) != NULL) {
138                 chain->u.mem = NULL;
139                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
140                         kfree(mem, hmp->minode);
141                 else
142                         kfree(mem, hmp->mchain);
143         }
144 }
145
146 /*
147  * Add a reference to a chain element (for shared access).  The chain
148  * element must already have at least 1 ref controlled by the caller.
149  */
150 void
151 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
152 {
153         KKASSERT(chain->refs > 0);
154         atomic_add_int(&chain->refs, 1);
155 }
156
157 /*
158  * Drop the callers reference to the chain element.  If the ref count
159  * reaches zero the chain element and its related structure (typically an
160  * inode or indirect block) will be freed and the parent will be
161  * recursively dropped.
162  *
163  * Modified elements hold an additional reference so it should not be
164  * possible for the count on a modified element to drop to 0.
165  *
166  * The chain element must NOT be locked by the caller.
167  *
168  * The parent might or might not be locked by the caller but if so it
169  * will also be referenced so we shouldn't recurse upward.
170  */
171 void
172 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
173 {
174         hammer2_chain_t *parent;
175         u_int refs;
176
177         while (chain) {
178                 refs = chain->refs;
179                 cpu_ccfence();
180                 KKASSERT(refs > 0);
181                 if (refs == 1) {
182                         KKASSERT(chain != &hmp->vchain);
183                         parent = chain->parent;
184                         if (parent)
185                                 lockmgr(&parent->lk, LK_EXCLUSIVE);
186                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
187                                 /*
188                                  * Succeeded, recurse and drop parent
189                                  */
190                                 if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
191                                         SPLAY_REMOVE(hammer2_chain_splay,
192                                                      &parent->shead, chain);
193                                         atomic_set_int(&chain->flags,
194                                                        HAMMER2_CHAIN_DELETED);
195                                         /* parent refs dropped via recursion */
196                                 }
197                                 chain->parent = NULL;
198                                 if (parent)
199                                         lockmgr(&parent->lk, LK_RELEASE);
200                                 hammer2_chain_free(hmp, chain);
201                                 chain = parent;
202                                 /* recurse on parent */
203                         } else {
204                                 if (parent)
205                                         lockmgr(&parent->lk, LK_RELEASE);
206                                 /* retry the same chain */
207                         }
208                 } else {
209                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
210                                 /*
211                                  * Succeeded, count did not reach zero so
212                                  * cut out of the loop.
213                                  */
214                                 break;
215                         }
216                         /* retry the same chain */
217                 }
218         }
219 }
220
221 /*
222  * Lock a chain element, acquiring its data with I/O if necessary.
223  *
224  * Returns 0 on success or an error code if the data could not be acquired.
225  * The chain element is locked either way.
226  *
227  * chain->data will be pointed either at the embedded data (e.g. for
228  * inodes), in which case the buffer cache buffer is released, or will
229  * point into the bp->b_data buffer with the bp left intact while locked.
230  */
231 int
232 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
233 {
234         hammer2_blockref_t *bref;
235         hammer2_off_t off_hi;
236         size_t off_lo;
237         int error;
238         void *data;
239
240         /*
241          * Lock the element.  Under certain conditions this might end up
242          * being a recursive lock.
243          */
244         KKASSERT(chain->refs > 0);
245         lockmgr(&chain->lk, LK_EXCLUSIVE);
246
247         /*
248          * The volume header is a special case
249          */
250         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME)
251                 return(0);
252
253         /*
254          * bp must be NULL, so if the data pointer is valid here it points
255          * to embedded data and no I/O is necessary (whether modified or not).
256          */
257         KKASSERT(chain->bp == NULL);
258         if (chain->data)
259                 return (0);
260
261         /*
262          * If data is NULL we must issue I/O.  Any error returns the error
263          * code but leaves the chain locked.
264          *
265          * If the chain was modified a new bref will have already been
266          * allocated and its related bp is probably still sitting in the
267          * buffer cache.
268          */
269         bref = &chain->bref;
270
271         off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
272         off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
273         KKASSERT(off_hi != 0);
274         error = bread(hmp->devvp, off_hi, HAMMER2_PBUFSIZE, &chain->bp);
275
276         /*
277          * Even though this can be synthesized from bref->data_off we
278          * store it in the in-memory chain structure for convenience.
279          */
280         chain->bytes = 1U << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
281
282         if (error) {
283                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
284                         (intmax_t)off_hi, error);
285                 brelse(chain->bp);
286                 chain->bp = NULL;
287                 return (error);
288         }
289
290         /*
291          * Setup the data pointer, either pointing it to an embedded data
292          * structure and copying the data from the buffer, or pointint it
293          * into the buffer.
294          *
295          * The buffer is not retained when copying to an embedded data
296          * structure in order to avoid potential deadlocks or recursions
297          * on the same physical buffer.
298          */
299         switch (bref->type) {
300         case HAMMER2_BREF_TYPE_VOLUME:
301                 /*
302                  * Copy data from bp to embedded buffer
303                  */
304                 KKASSERT(0);    /* not yet - have mount use this soon */
305                 KKASSERT(off_hi == 0);
306                 bcopy((char *)chain->bp->b_data + off_lo,
307                       &hmp->voldata, HAMMER2_PBUFSIZE);
308                 chain->data = (void *)&hmp->voldata;
309                 brelse(chain->bp);
310                 chain->bp = NULL;
311                 break;
312         case HAMMER2_BREF_TYPE_INODE:
313                 /*
314                  * Copy data from bp to embedded buffer.
315                  */
316                 bcopy((char *)chain->bp->b_data + off_lo,
317                       &chain->u.ip->ip_data,
318                       HAMMER2_INODE_BYTES);
319                 chain->data = (void *)&chain->u.ip->ip_data;
320                 brelse(chain->bp);
321                 chain->bp = NULL;
322                 break;
323         default:
324                 /*
325                  * Leave bp intact
326                  */
327                 data = (char *)chain->bp->b_data + off_lo;
328                 chain->data = data;
329                 break;
330         }
331         return (0);
332 }
333
334 /*
335  * Resdize the chain's physical storage allocation.  Chains can be resized
336  * smaller without reallocating the storage.  Resizing larger will reallocate
337  * the storage.
338  */
339 void
340 hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain, int nradix)
341 {
342         hammer2_chain_t *parent;
343         struct buf *nbp;
344         size_t obytes;
345         size_t nbytes;
346         void *ndata;
347         int error;
348
349         /*
350          * Only data blocks can be resized for now
351          */
352         KKASSERT(chain != &hmp->vchain);
353         KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
354                  chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
355
356         /*
357          * Nothing to do if the element is already the proper size
358          */
359         obytes = chain->bytes;
360         nbytes = 1 << nradix;
361         if (obytes == nbytes)
362                 return;
363
364         /*
365          * A deleted inode may still be active but unreachable via sync
366          * because it has been disconnected from the tree.  Do not allow
367          * deleted inodes to be marked as being modified because this will
368          * bump the refs and never get resolved by the sync, leaving the
369          * inode structure allocated after umount.
370          */
371         if ((chain->flags & HAMMER2_CHAIN_DELETED) &&
372             chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
373                 KKASSERT(chain->data != NULL);
374                 return;
375         }
376
377         /*
378          * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
379          * modified flags share the same ref.
380          */
381         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
382         if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
383                 hammer2_chain_ref(hmp, chain);
384
385         if (nbytes < obytes) {
386                 /*
387                  * If we are making it smaller we don't have to reallocate
388                  * the block.
389                  */
390                 chain->bref.data_off &= ~ HAMMER2_OFF_MASK_RADIX;
391                 chain->bref.data_off |= (nradix & HAMMER2_OFF_MASK_RADIX);
392                 chain->bytes = nbytes;
393         } else {
394                 /*
395                  * Otherwise we do
396                  */
397                 if (chain != &hmp->vchain) {
398                         chain->bref.data_off =
399                                         hammer2_freemap_alloc(hmp, nbytes);
400                         chain->bytes = nbytes;
401                 }
402                 /* XXX failed allocation */
403
404                 switch(chain->bref.type) {
405                 case HAMMER2_BREF_TYPE_VOLUME:          /* embedded */
406                 case HAMMER2_BREF_TYPE_INODE:           /* embedded */
407                         /*
408                          * data points to embedded structure, no copy needed
409                          */
410                         error = 0;
411                         break;
412                 case HAMMER2_BREF_TYPE_INDIRECT:
413                 case HAMMER2_BREF_TYPE_DATA:
414                         /*
415                          * data (if not NULL) points into original bp,
416                          * copy-on-write to new block.
417                          */
418                         KKASSERT(chain != &hmp->vchain);        /* safety */
419                         if (nbytes == HAMMER2_PBUFSIZE) {
420                                 nbp = getblk(hmp->devvp,
421                                              chain->bref.data_off & HAMMER2_OFF_MASK_HI,
422                                              HAMMER2_PBUFSIZE, 0, 0);
423                                 vfs_bio_clrbuf(nbp);
424                                 error = 0;
425                         } else {
426                                 error = bread(hmp->devvp,
427                                              chain->bref.data_off & HAMMER2_OFF_MASK_HI,
428                                              HAMMER2_PBUFSIZE, &nbp);
429                                 KKASSERT(error == 0);/* XXX handle error */
430                         }
431
432                         /*
433                          * The new block may be smaller or larger than the
434                          * old block, only copy what fits.
435                          */
436                         ndata = nbp->b_data + (chain->bref.data_off &
437                                                HAMMER2_OFF_MASK_LO);
438                         if (chain->data) {
439                                 if (nbytes < obytes)
440                                         bcopy(chain->data, ndata, nbytes);
441                                 else
442                                         bcopy(chain->data, ndata, obytes);
443                                 KKASSERT(chain->bp != NULL);
444                                 brelse(chain->bp);
445                         }
446                         chain->bp = nbp;
447                         chain->data = ndata;
448                         break;
449                 default:
450                         panic("hammer2_chain_modify: unknown bref type");
451                         break;
452
453                 }
454         }
455
456         /*
457          * Recursively mark the parent chain elements so flushes can find
458          * modified elements.
459          *
460          * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
461          *       during the flush recursion after clearing the parent's
462          *       SUBMODIFIED bit.  We don't want to re-set the parent's
463          *       SUBMODIFIED bit in this case!
464          */
465         if ((chain->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
466                 parent = chain->parent;
467                 while (parent &&
468                        (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
469                         atomic_set_int(&parent->flags,
470                                        HAMMER2_CHAIN_SUBMODIFIED);
471                         parent = parent->parent;
472                 }
473         }
474 }
475
476 /*
477  * Convert a locked chain that was retrieved read-only to read-write.
478  *
479  * If not already marked modified a new physical block will be allocated
480  * and assigned to the bref.  If the data is pointing into an existing
481  * bp it will be copied to the new bp and the new bp will replace the
482  * existing bp.
483  *
484  * If the data is embedded we allocate the new physical block but don't
485  * bother copying the data into it (yet).
486  */
487 void
488 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
489 {
490         hammer2_chain_t *parent;
491         struct buf *nbp;
492         void *ndata;
493         int error;
494
495         /*
496          * If the chain is already marked MODIFIED1 we can just return.
497          */
498         if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
499                 KKASSERT(chain->data != NULL);
500                 return;
501         }
502
503         /*
504          * A deleted inode may still be active but unreachable via sync
505          * because it has been disconnected from the tree.  Do not allow
506          * deleted inodes to be marked as being modified because this will
507          * bump the refs and never get resolved by the sync, leaving the
508          * inode structure allocated after umount.
509          */
510         if ((chain->flags & HAMMER2_CHAIN_DELETED) &&
511             chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
512                 KKASSERT(chain->data != NULL);
513                 return;
514         }
515
516         /*
517          * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
518          * modified flags share the same ref.
519          */
520         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
521         if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
522                 hammer2_chain_ref(hmp, chain);
523
524         /*
525          * We must allocate the copy-on-write block.
526          *
527          * If the data is embedded no other action is required.
528          *
529          * If the data is not embedded we acquire and clear the
530          * new block.  If chain->data is not NULL we then do the
531          * copy-on-write.  chain->data will then be repointed to the new
532          * buffer and the old buffer will be released.
533          *
534          * For newly created elements with no prior allocation we go
535          * through the copy-on-write steps except without the copying part.
536          */
537         if (chain != &hmp->vchain) {
538                 chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bytes);
539                 /* XXX failed allocation */
540         }
541
542         switch(chain->bref.type) {
543         case HAMMER2_BREF_TYPE_VOLUME:          /* embedded */
544         case HAMMER2_BREF_TYPE_INODE:           /* embedded */
545                 /*
546                  * data points to embedded structure, no copy needed
547                  */
548                 error = 0;
549                 break;
550         case HAMMER2_BREF_TYPE_INDIRECT:
551         case HAMMER2_BREF_TYPE_DATA:
552                 /*
553                  * data (if not NULL) points into original bp, copy-on-write
554                  * to new block.
555                  */
556                 KKASSERT(chain != &hmp->vchain);        /* safety */
557                 if (chain->bytes == HAMMER2_PBUFSIZE) {
558                         nbp = getblk(hmp->devvp,
559                                      chain->bref.data_off & HAMMER2_OFF_MASK_HI,
560                                      HAMMER2_PBUFSIZE, 0, 0);
561                         vfs_bio_clrbuf(nbp);
562                         error = 0;
563                 } else {
564                         error = bread(hmp->devvp,
565                                      chain->bref.data_off & HAMMER2_OFF_MASK_HI,
566                                      HAMMER2_PBUFSIZE, &nbp);
567                         KKASSERT(error == 0);/* XXX handle error */
568                 }
569
570                 /*
571                  * The new block may be smaller or larger than the old block,
572                  * only copy what fits.
573                  */
574                 ndata = nbp->b_data + (chain->bref.data_off &
575                                        HAMMER2_OFF_MASK_LO);
576                 if (chain->data) {
577                         bcopy(chain->data, ndata, chain->bytes);
578                         KKASSERT(chain->bp != NULL);
579                         brelse(chain->bp);
580                 }
581                 chain->bp = nbp;
582                 chain->data = ndata;
583                 break;
584         default:
585                 panic("hammer2_chain_modify: unknown bref type");
586                 break;
587
588         }
589
590         /*
591          * Recursively mark the parent chain elements so flushes can find
592          * modified elements.
593          *
594          * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
595          *       during the flush recursion after clearing the parent's
596          *       SUBMODIFIED bit.  We don't want to re-set the parent's
597          *       SUBMODIFIED bit in this case!
598          */
599         if ((chain->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
600                 parent = chain->parent;
601                 while (parent &&
602                        (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
603                         atomic_set_int(&parent->flags,
604                                        HAMMER2_CHAIN_SUBMODIFIED);
605                         parent = parent->parent;
606                 }
607         }
608 }
609
610 /*
611  * Unlock a chain element without dropping its reference count.
612  * (see hammer2_chain_put() to do both).
613  *
614  * Non-embedded data references (chain->bp != NULL) are returned to the
615  * system and the data field is cleared in that case.  If modified the
616  * dirty buffer is still returned to the system, can be flushed to disk by
617  * the system at any time, and will be reconstituted/re-read as needed.
618  */
619 void
620 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
621 {
622         if (chain->bp) {
623                 chain->data = NULL;
624                 if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
625                         if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
626                                 atomic_clear_int(&chain->flags,
627                                                  HAMMER2_CHAIN_IOFLUSH);
628                                 bdwrite(chain->bp);
629                         } else {
630                                 bdwrite(chain->bp);
631                         }
632                 } else {
633                         /* bp might still be dirty */
634                         bqrelse(chain->bp);
635                 }
636                 chain->bp = NULL;
637         }
638         lockmgr(&chain->lk, LK_RELEASE);
639 }
640
641 /*
642  * Locate an in-memory chain.  The parent must be locked.  The in-memory
643  * chain is returned or NULL if no in-memory chain is present.
644  *
645  * NOTE: A chain on-media might exist for this index when NULL is returned.
646  */
647 hammer2_chain_t *
648 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
649 {
650         hammer2_chain_t dummy;
651         hammer2_chain_t *chain;
652
653         dummy.index = index;
654         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
655         return (chain);
656 }
657
658 /*
659  * Return a locked chain structure with all associated data acquired.
660  *
661  * Caller must lock the parent on call, the returned child will be locked.
662  */
663 hammer2_chain_t *
664 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
665                   int index, int flags)
666 {
667         hammer2_blockref_t *bref;
668         hammer2_chain_t *chain;
669         hammer2_chain_t dummy;
670
671         /*
672          * First see if we have a (possibly modified) chain element cached
673          * for this (parent, index).  Acquire the data if necessary.
674          *
675          * If chain->data is non-NULL the chain should already be marked
676          * modified.
677          */
678         dummy.index = index;
679         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
680         if (chain) {
681                 hammer2_chain_ref(hmp, chain);
682                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
683                         hammer2_chain_lock(hmp, chain);
684                 return(chain);
685         }
686
687         /*
688          * Otherwise lookup the bref and issue I/O (switch on the parent)
689          */
690         switch(parent->bref.type) {
691         case HAMMER2_BREF_TYPE_INODE:
692                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
693                 bref = &parent->data->ipdata.u.blockset.blockref[index];
694                 break;
695         case HAMMER2_BREF_TYPE_INDIRECT:
696                 KKASSERT(index >= 0 && index < HAMMER2_IND_COUNT);
697                 bref = &parent->data->npdata.blockref[index];
698                 break;
699         case HAMMER2_BREF_TYPE_VOLUME:
700                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
701                 bref = &hmp->voldata.sroot_blockset.blockref[index];
702                 break;
703         default:
704                 bref = NULL;
705                 panic("hammer2_chain_get: unrecognized blockref type: %d",
706                       parent->bref.type);
707         }
708
709         /*
710          * Allocate a chain structure representing the existing media
711          * entry.  Thus the chain is *not* INITIAL and certainly not
712          * MODIFIED (yet).
713          */
714         chain = hammer2_chain_alloc(hmp, bref);
715
716         /*
717          * Link the chain into its parent.  Caller is expected to hold an
718          * exclusive lock on the parent.
719          */
720         chain->parent = parent;
721         chain->index = index;
722         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
723                 panic("hammer2_chain_link: collision");
724         KKASSERT(parent->refs > 1);
725         atomic_add_int(&parent->refs, 1);       /* for splay entry */
726
727         /*
728          * Additional linkage for inodes.  Reuse the parent pointer to
729          * find the parent directory.
730          */
731         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
732                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
733                         parent = parent->parent;
734                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
735                         chain->u.ip->pip = parent->u.ip;
736         }
737
738         /*
739          * Our new chain structure has already been referenced and locked
740          * but the lock code handles the I/O so call it to resolve the data.
741          * Then release one of our two exclusive locks.
742          *
743          * If NOLOCK is set the release will release the one-and-only lock.
744          */
745         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
746                 hammer2_chain_lock(hmp, chain);
747         lockmgr(&chain->lk, LK_RELEASE);
748
749         return (chain);
750 }
751
752 /*
753  * Unlock and dereference a chain after use.  It is possible for this to
754  * recurse up the chain.
755  */
756 void
757 hammer2_chain_put(hammer2_mount_t *hmp, hammer2_chain_t *chain)
758 {
759         hammer2_chain_unlock(hmp, chain);
760         hammer2_chain_drop(hmp, chain);
761 }
762
763 /*
764  * Locate any key between key_beg and key_end inclusive.  (*parentp)
765  * typically points to an inode but can also point to a related indirect
766  * block and this function will recurse upwards and find the inode again.
767  *
768  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
769  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
770  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
771  *
772  * (*parentp) must be exclusively locked and referenced and can be an inode
773  * or an existing indirect block within the inode.
774  *
775  * On return (*parentp) will be modified to point at the deepest parent chain
776  * element encountered during the search, as a helper for an insertion or
777  * deletion.   The new (*parentp) will be locked and referenced and the old
778  * will be unlocked and dereferenced (no change if they are both the same).
779  *
780  * The matching chain will be returned exclusively locked and referenced.
781  *
782  * NULL is returned if no match was found, but (*parentp) will still
783  * potentially be adjusted.
784  *
785  * This function will also recurse up the chain if the key is not within the
786  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
787  * can simply allow (*parentp) to float inside the loop.
788  */
789 hammer2_chain_t *
790 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
791                      hammer2_key_t key_beg, hammer2_key_t key_end,
792                      int flags)
793 {
794         hammer2_chain_t *parent;
795         hammer2_chain_t *chain;
796         hammer2_chain_t *tmp;
797         hammer2_blockref_t *base;
798         hammer2_blockref_t *bref;
799         hammer2_key_t scan_beg;
800         hammer2_key_t scan_end;
801         int count = 0;
802         int i;
803
804         /*
805          * Recurse (*parentp) upward if necessary until the parent completely
806          * encloses the key range or we hit the inode.
807          */
808         parent = *parentp;
809         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
810                 scan_beg = parent->bref.key;
811                 scan_end = scan_beg +
812                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
813                 if (key_beg >= scan_beg && key_end <= scan_end)
814                         break;
815                 hammer2_chain_unlock(hmp, parent);
816                 parent = parent->parent;
817                 hammer2_chain_ref(hmp, parent);         /* ref new parent */
818                 hammer2_chain_lock(hmp, parent);        /* lock new parent */
819                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
820                 *parentp = parent;                      /* new parent */
821         }
822
823 again:
824         /*
825          * Locate the blockref array.  Currently we do a fully associative
826          * search through the array.
827          */
828         switch(parent->bref.type) {
829         case HAMMER2_BREF_TYPE_INODE:
830                 /*
831                  * Special shortcut for embedded data returns the inode
832                  * itself.  Callers must detect this condition and access
833                  * the embedded data (the strategy code does this for us).
834                  *
835                  * This is only applicable to regular files and softlinks.
836                  */
837                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
838                         hammer2_chain_ref(hmp, parent);
839                         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
840                                 hammer2_chain_lock(hmp, parent);
841                         return (parent);
842                 }
843                 base = &parent->data->ipdata.u.blockset.blockref[0];
844                 count = HAMMER2_SET_COUNT;
845                 break;
846         case HAMMER2_BREF_TYPE_INDIRECT:
847                 if (parent->data == NULL)
848                         panic("parent->data is NULL");
849                 base = &parent->data->npdata.blockref[0];
850                 count = HAMMER2_IND_COUNT;
851                 break;
852         case HAMMER2_BREF_TYPE_VOLUME:
853                 base = &hmp->voldata.sroot_blockset.blockref[0];
854                 count = HAMMER2_SET_COUNT;
855                 break;
856         default:
857                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
858                       parent->bref.type);
859                 base = NULL;    /* safety */
860                 count = 0;      /* safety */
861         }
862
863         /*
864          * If the element and key overlap we use the element.
865          */
866         bref = NULL;
867         for (i = 0; i < count; ++i) {
868                 tmp = hammer2_chain_find(hmp, parent, i);
869                 bref = (tmp) ? &tmp->bref : &base[i];
870                 if (bref->type == 0)
871                         continue;
872                 scan_beg = bref->key;
873                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
874                 if (key_beg <= scan_end && key_end >= scan_beg)
875                         break;
876         }
877         if (i == count) {
878                 if (key_beg == key_end)
879                         return (NULL);
880                 return (hammer2_chain_next(hmp, parentp, NULL,
881                                            key_beg, key_end, flags));
882         }
883
884         /*
885          * Acquire the new chain element.  If the chain element is an
886          * indirect block we must search recursively.
887          */
888         chain = hammer2_chain_get(hmp, parent, i, flags);
889         if (chain == NULL)
890                 return (NULL);
891
892         /*
893          * If the chain element is an indirect block it becomes the new
894          * parent and we loop on it.  We must fixup the chain we loop on
895          * if the caller passed flags to us that aren't sufficient for our
896          * needs.
897          */
898         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
899                 hammer2_chain_put(hmp, parent);
900                 *parentp = parent = chain;
901                 if (flags & HAMMER2_LOOKUP_NOLOCK)
902                         hammer2_chain_lock(hmp, chain);
903                 goto again;
904         }
905
906         /*
907          * All done, return chain
908          */
909         return (chain);
910 }
911
912 /*
913  * After having issued a lookup we can iterate all matching keys.
914  *
915  * If chain is non-NULL we continue the iteration from just after it's index.
916  *
917  * If chain is NULL we assume the parent was exhausted and continue the
918  * iteration at the next parent.
919  */
920 hammer2_chain_t *
921 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
922                    hammer2_chain_t *chain,
923                    hammer2_key_t key_beg, hammer2_key_t key_end,
924                    int flags)
925 {
926         hammer2_chain_t *parent;
927         hammer2_chain_t *tmp;
928         hammer2_blockref_t *base;
929         hammer2_blockref_t *bref;
930         hammer2_key_t scan_beg;
931         hammer2_key_t scan_end;
932         int i;
933         int count;
934
935         parent = *parentp;
936
937 again:
938         /*
939          * Calculate the next index and recalculate the parent if necessary.
940          */
941         if (chain) {
942                 /*
943                  * Continue iteration within current parent.  If not NULL
944                  * the passed-in chain may or may not be locked, based on
945                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
946                  * or a prior next).
947                  */
948                 i = chain->index + 1;
949                 if (flags & HAMMER2_LOOKUP_NOLOCK)
950                         hammer2_chain_drop(hmp, chain);
951                 else
952                         hammer2_chain_put(hmp, chain);
953
954                 /*
955                  * Any scan where the lookup returned degenerate data embedded
956                  * in the inode has an invalid index and must terminate.
957                  */
958                 if (chain == parent)
959                         return(NULL);
960                 chain = NULL;
961         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
962                 /*
963                  * We reached the end of the iteration.
964                  */
965                 return (NULL);
966         } else {
967                 /*
968                  * Continue iteration with next parent unless the current
969                  * parent covers the range.
970                  */
971                 hammer2_chain_t *nparent;
972
973                 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT)
974                         return (NULL);
975
976                 scan_beg = parent->bref.key;
977                 scan_end = scan_beg +
978                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
979                 if (key_beg >= scan_beg && key_end <= scan_end)
980                         return (NULL);
981
982                 i = parent->index + 1;
983                 nparent = parent->parent;
984                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
985                 hammer2_chain_unlock(hmp, parent);
986                 hammer2_chain_lock(hmp, nparent);       /* lock new parent */
987                 hammer2_chain_drop(hmp, parent);        /* drop old parent */
988                 *parentp = parent = nparent;
989         }
990
991 again2:
992         /*
993          * Locate the blockref array.  Currently we do a fully associative
994          * search through the array.
995          */
996         switch(parent->bref.type) {
997         case HAMMER2_BREF_TYPE_INODE:
998                 base = &parent->data->ipdata.u.blockset.blockref[0];
999                 count = HAMMER2_SET_COUNT;
1000                 break;
1001         case HAMMER2_BREF_TYPE_INDIRECT:
1002                 base = &parent->data->npdata.blockref[0];
1003                 count = HAMMER2_IND_COUNT;
1004                 break;
1005         case HAMMER2_BREF_TYPE_VOLUME:
1006                 base = &hmp->voldata.sroot_blockset.blockref[0];
1007                 count = HAMMER2_SET_COUNT;
1008                 break;
1009         default:
1010                 panic("hammer2_chain_next: unrecognized blockref type: %d",
1011                       parent->bref.type);
1012                 base = NULL;    /* safety */
1013                 count = 0;      /* safety */
1014                 break;
1015         }
1016         KKASSERT(i <= count);
1017
1018         /*
1019          * Look for the key.  If we are unable to find a match and an exact
1020          * match was requested we return NULL.  If a range was requested we
1021          * run hammer2_chain_next() to iterate.
1022          */
1023         bref = NULL;
1024         while (i < count) {
1025                 tmp = hammer2_chain_find(hmp, parent, i);
1026                 bref = (tmp) ? &tmp->bref : &base[i];
1027                 if (bref->type == 0) {
1028                         ++i;
1029                         continue;
1030                 }
1031                 scan_beg = bref->key;
1032                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
1033                 if (key_beg <= scan_end && key_end >= scan_beg)
1034                         break;
1035                 ++i;
1036         }
1037
1038         /*
1039          * If we couldn't find a match recurse up a parent to continue the
1040          * search.
1041          */
1042         if (i == count)
1043                 goto again;
1044
1045         /*
1046          * Acquire the new chain element.  If the chain element is an
1047          * indirect block we must search recursively.
1048          */
1049         chain = hammer2_chain_get(hmp, parent, i, flags);
1050         if (chain == NULL)
1051                 return (NULL);
1052
1053         /*
1054          * If the chain element is an indirect block it becomes the new
1055          * parent and we loop on it.  We may have to lock the chain when
1056          * cycling it in as the new parent as it will not be locked if the
1057          * caller passed NOLOCK.
1058          */
1059         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
1060                 hammer2_chain_put(hmp, parent);
1061                 *parentp = parent = chain;
1062                 if (flags & HAMMER2_LOOKUP_NOLOCK)
1063                         hammer2_chain_lock(hmp, chain);
1064                 i = 0;
1065                 goto again2;
1066         }
1067
1068         /*
1069          * All done, return chain
1070          */
1071         return (chain);
1072 }
1073
1074 /*
1075  * Create and return a new hammer2 system memory structure of the specified
1076  * key, type and size and insert it RELATIVE TO (PARENT).
1077  *
1078  * (parent) is typically either an inode or an indirect  block, acquired
1079  * acquired as a side effect of issuing a prior failed lookup.  parent
1080  * must be locked and held.  Do not pass the inode chain to this function
1081  * unless that is the chain returned by the failed lookup.
1082  *
1083  * Non-indirect types will automatically allocate indirect blocks as required
1084  * if the new item does not fit in the current (parent).
1085  *
1086  * Indirect types will move a portion of the existing blockref array in
1087  * (parent) into the new indirect type and then use one of the free slots
1088  * to emplace the new indirect type.
1089  *
1090  * A new locked, referenced chain element is returned of the specified type.
1091  * This element will also be marked as modified and contain a data area
1092  * ready for initialization.
1093  */
1094 hammer2_chain_t *
1095 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1096                      hammer2_chain_t *chain,
1097                      hammer2_key_t key, int keybits, int type, size_t bytes)
1098 {
1099         hammer2_blockref_t dummy;
1100         hammer2_blockref_t *base;
1101         hammer2_blockref_t *bref;
1102         hammer2_chain_t dummy_chain;
1103         int unlock_parent = 0;
1104         int allocated = 0;
1105         int count;
1106         int i;
1107
1108         if (chain == NULL) {
1109                 /*
1110                  * First allocate media space and construct the dummy bref,
1111                  * then allocate the in-memory chain structure.
1112                  */
1113                 bzero(&dummy, sizeof(dummy));
1114                 dummy.type = type;
1115                 dummy.key = key;
1116                 dummy.keybits = keybits;
1117                 dummy.data_off = hammer2_freemap_bytes_to_radix(bytes);
1118                 chain = hammer2_chain_alloc(hmp, &dummy);
1119                 allocated = 1;
1120
1121                 /*
1122                  * We set the WAS_MODIFIED flag here so the chain gets
1123                  * marked as modified below.
1124                  */
1125                 chain->flags |= HAMMER2_CHAIN_INITIAL |
1126                                 HAMMER2_CHAIN_WAS_MODIFIED;
1127
1128                 /*
1129                  * Recalculate bytes to reflect the actual media block
1130                  * allocation.
1131                  */
1132                 bytes = (hammer2_off_t)1 <<
1133                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
1134                 chain->bytes = bytes;
1135
1136                 switch(type) {
1137                 case HAMMER2_BREF_TYPE_VOLUME:
1138                         panic("hammer2_chain_create: called with volume type");
1139                         break;
1140                 case HAMMER2_BREF_TYPE_INODE:
1141                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
1142                         chain->data = (void *)&chain->u.ip->ip_data;
1143                         break;
1144                 default:
1145                         /* leave chain->data NULL */
1146                         KKASSERT(chain->data == NULL);
1147                         break;
1148                 }
1149         } else {
1150                 /*
1151                  * Potentially update the chain's key/keybits, but it will
1152                  * only be marked modified if WAS_MODIFIED is set (if it
1153                  * was modified at the time of its removal during a rename).
1154                  */
1155                 chain->bref.key = key;
1156                 chain->bref.keybits = keybits;
1157         }
1158
1159 again:
1160         /*
1161          * Locate a free blockref in the parent's array
1162          */
1163         switch(parent->bref.type) {
1164         case HAMMER2_BREF_TYPE_INODE:
1165                 KKASSERT(parent->data != NULL);
1166                 base = &parent->data->ipdata.u.blockset.blockref[0];
1167                 count = HAMMER2_SET_COUNT;
1168                 break;
1169         case HAMMER2_BREF_TYPE_INDIRECT:
1170                 KKASSERT(parent->data != NULL);
1171                 base = &parent->data->npdata.blockref[0];
1172                 count = HAMMER2_IND_COUNT;
1173                 break;
1174         case HAMMER2_BREF_TYPE_VOLUME:
1175                 KKASSERT(parent->data != NULL);
1176                 base = &hmp->voldata.sroot_blockset.blockref[0];
1177                 count = HAMMER2_SET_COUNT;
1178                 break;
1179         default:
1180                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1181                       parent->bref.type);
1182                 count = 0;
1183                 break;
1184         }
1185
1186         /*
1187          * Scan for an unallocated bref, also skipping any slots occupied
1188          * by in-memory chain elements that may not yet have been updated
1189          * in the parent's bref array.
1190          */
1191         bzero(&dummy_chain, sizeof(dummy_chain));
1192         bref = NULL;
1193         for (i = 0; i < count; ++i) {
1194                 bref = &base[i];
1195                 dummy_chain.index = i;
1196                 if (bref->type == 0 &&
1197                     SPLAY_FIND(hammer2_chain_splay,
1198                                &parent->shead, &dummy_chain) == NULL) {
1199                         break;
1200                 }
1201         }
1202
1203         /*
1204          * If no free blockref count be found we must create an indirect
1205          * block and move a number of blockrefs into it.  With the parent
1206          * locked we can safely lock each child in order to move it without
1207          * causing a deadlock.
1208          *
1209          * This may return the new indirect block or the old parent depending
1210          * on where the key falls.
1211          */
1212         if (i == count) {
1213                 hammer2_chain_t *nparent;
1214
1215                 nparent = hammer2_chain_create_indirect(hmp, parent,
1216                                                         key, keybits);
1217                 if (nparent == NULL) {
1218                         if (allocated)
1219                                 hammer2_chain_free(hmp, chain);
1220                         chain = NULL;
1221                         goto done;
1222                 }
1223                 if (parent != nparent) {
1224                         if (unlock_parent)
1225                                 hammer2_chain_put(hmp, parent);
1226                         parent = nparent;
1227                         unlock_parent = 1;
1228                 }
1229                 goto again;
1230         }
1231
1232         /*
1233          * Link the chain into its parent.
1234          */
1235         if (chain->parent != NULL)
1236                 panic("hammer2: hammer2_chain_create: chain already connected");
1237         KKASSERT(chain->parent == NULL);
1238         chain->parent = parent;
1239         chain->index = i;
1240         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1241                 panic("hammer2_chain_link: collision");
1242         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1243         KKASSERT(parent->refs > 1);
1244         atomic_add_int(&parent->refs, 1);
1245
1246         /*
1247          * Additional linkage for inodes.  Reuse the parent pointer to
1248          * find the parent directory.
1249          */
1250         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1251                 hammer2_chain_t *scan = parent;
1252                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1253                         scan = scan->parent;
1254                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE)
1255                         chain->u.ip->pip = scan->u.ip;
1256         }
1257
1258         /*
1259          * Mark the newly created or previously disconnected chain element
1260          * as modified and fully resolve the chain->data pointer.  The
1261          * WAS_MODIFIED bit will be set in both cases.
1262          *
1263          * Chain elements with embedded data will not issue I/O at this time.
1264          * A new block will be allocated for the buffer but not instantiated.
1265          *
1266          * Chain elements which do not use embedded data will allocate
1267          * the new block AND instantiate its buffer cache buffer, pointing
1268          * the data at the bp.
1269          */
1270         if (chain->flags & HAMMER2_CHAIN_WAS_MODIFIED) {
1271                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
1272                 hammer2_chain_modify(hmp, chain);
1273         }
1274
1275 done:
1276         if (unlock_parent)
1277                 hammer2_chain_put(hmp, parent);
1278         return (chain);
1279 }
1280
1281 /*
1282  * Create an indirect block that covers one or more of the elements in the
1283  * current parent.  Either returns the existing parent with no locking or
1284  * ref changes or returns the new indirect block locked and referenced,
1285  * depending on what the specified key falls into.
1286  *
1287  * The key/keybits for the indirect mode only needs to follow three rules:
1288  *
1289  * (1) That all elements underneath it fit within its key space and
1290  *
1291  * (2) That all elements outside it are outside its key space.
1292  *
1293  * (3) When creating the new indirect block any elements in the current
1294  *     parent that fit within the new indirect block's keyspace must be
1295  *     moved into the new indirect block.
1296  *
1297  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1298  *     keyspace the the current parent, but lookup/iteration rules will
1299  *     ensure (and must ensure) that rule (2) for all parents leading up
1300  *     to the nearest inode or the root volume header is adhered to.  This
1301  *     is accomplished by always recursing through matching keyspaces in
1302  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1303  *
1304  * The current implementation calculates the current worst-case keyspace by
1305  * iterating the current parent and then divides it into two halves, choosing
1306  * whichever half has the most elements (not necessarily the half containing
1307  * the requested key).
1308  *
1309  * We can also opt to use the half with the least number of elements.  This
1310  * causes lower-numbered keys (aka logical file offsets) to recurse through
1311  * fewer indirect blocks and higher-numbered keys to recurse through more.
1312  * This also has the risk of not moving enough elements to the new indirect
1313  * block and being forced to create several indirect blocks before the element
1314  * can be inserted.
1315  */
1316 static
1317 hammer2_chain_t *
1318 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1319                               hammer2_key_t create_key, int create_bits)
1320 {
1321         hammer2_blockref_t *base;
1322         hammer2_blockref_t *bref;
1323         hammer2_chain_t *chain;
1324         hammer2_chain_t *ichain;
1325         hammer2_chain_t dummy;
1326         hammer2_key_t key = create_key;
1327         int keybits = create_bits;
1328         int locount = 0;
1329         int hicount = 0;
1330         int count;
1331         int i;
1332
1333         /*
1334          * Mark the parent modified so our base[] pointer remains valid
1335          * while we move entries.
1336          */
1337         hammer2_chain_modify(hmp, parent);
1338
1339         /*
1340          * Locate a free blockref in the parent's array
1341          */
1342         switch(parent->bref.type) {
1343         case HAMMER2_BREF_TYPE_INODE:
1344                 base = &parent->data->ipdata.u.blockset.blockref[0];
1345                 count = HAMMER2_SET_COUNT;
1346                 break;
1347         case HAMMER2_BREF_TYPE_INDIRECT:
1348                 base = &parent->data->npdata.blockref[0];
1349                 count = HAMMER2_IND_COUNT;
1350                 break;
1351         case HAMMER2_BREF_TYPE_VOLUME:
1352                 base = &hmp->voldata.sroot_blockset.blockref[0];
1353                 count = HAMMER2_SET_COUNT;
1354                 break;
1355         default:
1356                 panic("hammer2_chain_create_indirect: "
1357                       "unrecognized blockref type: %d",
1358                       parent->bref.type);
1359                 count = 0;
1360                 break;
1361         }
1362
1363         /*
1364          * Scan for an unallocated bref, also skipping any slots occupied
1365          * by in-memory chain elements that may not yet have been updated
1366          * in the parent's bref array.
1367          */
1368         bzero(&dummy, sizeof(dummy));
1369         for (i = 0; i < count; ++i) {
1370                 int nkeybits;
1371
1372                 bref = &base[i];
1373                 if (bref->type == 0) {
1374                         dummy.index = i;
1375                         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
1376                                            &dummy);
1377                         if (chain == NULL)
1378                                 continue;
1379                         bref = &chain->bref;
1380                 }
1381
1382                 /*
1383                  * Expand our calculated key range (key, keybits) to fit
1384                  * the scanned key.  nkeybits represents the full range
1385                  * that we will later cut in half (two halves @ nkeybits - 1).
1386                  */
1387                 nkeybits = keybits;
1388                 if (nkeybits < bref->keybits)
1389                         nkeybits = bref->keybits;
1390                 while ((~(((hammer2_key_t)1 << nkeybits) - 1) &
1391                         (key ^ bref->key)) != 0) {
1392                         ++nkeybits;
1393                 }
1394
1395                 /*
1396                  * If the new key range is larger we have to determine
1397                  * which side of the new key range the existing keys fall
1398                  * under by checking the high bit, then collapsing the
1399                  * locount into the hicount or vise-versa.
1400                  */
1401                 if (keybits != nkeybits) {
1402                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1403                                 hicount += locount;
1404                                 locount = 0;
1405                         } else {
1406                                 locount += hicount;
1407                                 hicount = 0;
1408                         }
1409                         keybits = nkeybits;
1410                 }
1411
1412                 /*
1413                  * The newly scanned key will be in the lower half or the
1414                  * higher half of the (new) key range.
1415                  */
1416                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1417                         ++hicount;
1418                 else
1419                         ++locount;
1420         }
1421
1422         /*
1423          * Adjust keybits to represent half of the full range calculated
1424          * above.
1425          */
1426         --keybits;
1427
1428         /*
1429          * Select whichever half contains the most elements.  Theoretically
1430          * we can select either side as long as it contains at least one
1431          * element (in order to ensure that a free slot is present to hold
1432          * the indirect block).
1433          */
1434         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1435         if (hammer2_indirect_optimize) {
1436                 /*
1437                  * Insert node for least number of keys, this will arrange
1438                  * the first few blocks of a large file or the first few
1439                  * inodes in a directory with fewer indirect blocks when
1440                  * created linearly.
1441                  */
1442                 if (hicount < locount && hicount != 0)
1443                         key |= (hammer2_key_t)1 << keybits;
1444                 else
1445                         key &= ~(hammer2_key_t)1 << keybits;
1446         } else {
1447                 /*
1448                  * Insert node for most number of keys, best for heavily
1449                  * fragmented files.
1450                  */
1451                 if (hicount > locount)
1452                         key |= (hammer2_key_t)1 << keybits;
1453                 else
1454                         key &= ~(hammer2_key_t)1 << keybits;
1455         }
1456
1457         /*
1458          * Ok, create our new indirect block
1459          */
1460         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1461         dummy.bref.key = key;
1462         dummy.bref.keybits = keybits;
1463         dummy.bref.data_off = (hammer2_off_t)
1464                             hammer2_freemap_bytes_to_radix(HAMMER2_PBUFSIZE);
1465         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1466         ichain->flags |= HAMMER2_CHAIN_INITIAL;
1467
1468         /*
1469          * Iterate the original parent and move the matching brefs into
1470          * the new indirect block.
1471          */
1472         for (i = 0; i < count; ++i) {
1473                 /*
1474                  * For keying purposes access the bref from the media or
1475                  * from our in-memory cache.  In cases where the in-memory
1476                  * cache overrides the media the keyrefs will be the same
1477                  * anyway so we can avoid checking the cache when the media
1478                  * has a key.
1479                  */
1480                 bref = &base[i];
1481                 if (bref->type == 0) {
1482                         dummy.index = i;
1483                         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
1484                                            &dummy);
1485                         if (chain == NULL) {
1486                                 /*
1487                                  * Select index indirect block is placed in
1488                                  */
1489                                 if (ichain->index < 0)
1490                                         ichain->index = i;
1491                                 continue;
1492                         }
1493                         bref = &chain->bref;
1494                 }
1495
1496                 /*
1497                  * Skip keys not in the chosen half (low or high), only bit
1498                  * (keybits - 1) needs to be compared but for safety we
1499                  * will compare all msb bits plus that bit again.
1500                  */
1501                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1502                     (key ^ bref->key)) != 0) {
1503                         continue;
1504                 }
1505
1506                 /*
1507                  * This element is being moved, its slot is available
1508                  * for our indirect block.
1509                  */
1510                 if (ichain->index < 0)
1511                         ichain->index = i;
1512
1513                 /*
1514                  * Load the new indirect block by acquiring or allocating
1515                  * the related chain entries, then simply move it to the
1516                  * new parent (ichain).
1517                  *
1518                  * Flagging the new chain entry MOVED will cause a flush
1519                  * to synchronize its block into the new indirect block.
1520                  * The chain is unlocked after being moved but needs to
1521                  * retain a reference for the MOVED state
1522                  *
1523                  * We must still set SUBMODIFIED in the parent but we do
1524                  * that after the loop.
1525                  *
1526                  * XXX we really need a lock here but we don't need the
1527                  *     data.  NODATA feature needed.
1528                  */
1529                 chain = hammer2_chain_get(hmp, parent, i,
1530                                           HAMMER2_LOOKUP_NOLOCK);
1531                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1532                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1533                         panic("hammer2_chain_create_indirect: collision");
1534                 chain->parent = ichain;
1535                 bzero(&base[i], sizeof(base[i]));
1536                 atomic_add_int(&parent->refs, -1);
1537                 atomic_add_int(&ichain->refs, 1);
1538                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
1539                         /* We don't need the ref from the chain_get */
1540                         hammer2_chain_drop(hmp, chain);
1541                 } else {
1542                         /* MOVED bit inherits the ref from the chain_get */
1543                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1544                 }
1545                 KKASSERT(parent->refs > 0);
1546                 chain = NULL;
1547         }
1548
1549         /*
1550          * Insert the new indirect block into the parent now that we've
1551          * cleared out some entries in the parent.  We calculated a good
1552          * insertion index in the loop above (ichain->index).
1553          */
1554         KKASSERT(ichain->index >= 0);
1555         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1556                 panic("hammer2_chain_create_indirect: ichain insertion");
1557         ichain->parent = parent;
1558         atomic_add_int(&parent->refs, 1);
1559
1560         /*
1561          * Mark the new indirect block modified after insertion, which
1562          * will propagate up through parent all the way to the root and
1563          * also allocate the physical block in ichain for our caller.
1564          *
1565          * We have to set SUBMODIFIED in ichain's flags manually so the
1566          * flusher knows it has to recurse through it to get to all of
1567          * our moved blocks.
1568          */
1569         hammer2_chain_modify(hmp, ichain);
1570         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1571
1572         /*
1573          * Figure out what to return.
1574          */
1575         if (create_bits >= keybits) {
1576                 /*
1577                  * Key being created is way outside the key range,
1578                  * return the original parent.
1579                  */
1580                 hammer2_chain_put(hmp, ichain);
1581         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1582                    (create_key ^ key)) {
1583                 /*
1584                  * Key being created is outside the key range,
1585                  * return the original parent.
1586                  */
1587                 hammer2_chain_put(hmp, ichain);
1588         } else {
1589                 /*
1590                  * Otherwise its in the range, return the new parent.
1591                  */
1592                 parent = ichain;
1593         }
1594
1595         return(parent);
1596 }
1597
1598 /*
1599  * Physically delete the specified chain element.  Note that inodes with
1600  * open descriptors should not be deleted (as with other filesystems) until
1601  * the last open descriptor is closed.
1602  *
1603  * This routine will remove the chain element from its parent and potentially
1604  * also recurse upward and delete indirect blocks which become empty as a
1605  * side effect.
1606  *
1607  * The caller must pass a pointer to the chain's parent, also locked and
1608  * referenced.  (*parentp) will be modified in a manner similar to a lookup
1609  * or iteration when indirect blocks are also deleted as a side effect.
1610  */
1611 void
1612 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1613                      hammer2_chain_t *chain)
1614 {
1615         hammer2_blockref_t *base;
1616         int count;
1617
1618         if (chain->parent != parent)
1619                 panic("hammer2_chain_delete: parent mismatch");
1620
1621         /*
1622          * Mark the parent modified so our base[] pointer remains valid
1623          * while we move entries.
1624          *
1625          * Calculate the blockref reference in the parent
1626          */
1627         hammer2_chain_modify(hmp, parent);
1628
1629         switch(parent->bref.type) {
1630         case HAMMER2_BREF_TYPE_INODE:
1631                 base = &parent->data->ipdata.u.blockset.blockref[0];
1632                 count = HAMMER2_SET_COUNT;
1633                 break;
1634         case HAMMER2_BREF_TYPE_INDIRECT:
1635                 base = &parent->data->npdata.blockref[0];
1636                 count = HAMMER2_IND_COUNT;
1637                 break;
1638         case HAMMER2_BREF_TYPE_VOLUME:
1639                 base = &hmp->voldata.sroot_blockset.blockref[0];
1640                 count = HAMMER2_SET_COUNT;
1641                 break;
1642         default:
1643                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
1644                       parent->bref.type);
1645                 count = 0;
1646                 break;
1647         }
1648
1649         /*
1650          * Disconnect the bref in the parent, remove the chain, and
1651          * disconnect in-memory fields from the parent.
1652          */
1653         KKASSERT(chain->index >= 0 && chain->index < count);
1654         base += chain->index;
1655         bzero(base, sizeof(*base));
1656
1657         SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1658         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1659         atomic_add_int(&parent->refs, -1);      /* for splay entry */
1660         chain->index = -1;
1661
1662         chain->parent = NULL;
1663         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
1664                 chain->u.ip->pip = NULL;
1665
1666         /*
1667          * Nobody references the underlying object any more so we can
1668          * clear any pending modification(s) on it.  This can theoretically
1669          * recurse downward but even just clearing the bit on this item
1670          * will effectively recurse if someone is doing a rm -rf and greatly
1671          * reduce the I/O required.
1672          *
1673          * The MODIFIED1 bit is cleared but we have to remember the old state
1674          * in case this deletion is related to a rename.  The ref on the
1675          * chain is shared by both modified flags.
1676          */
1677         if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
1678                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
1679                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
1680                 if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
1681                         hammer2_chain_drop(hmp, chain);
1682         }
1683 }
1684
1685 /*
1686  * Recursively flush the specified chain.  The chain is locked and
1687  * referenced by the caller and will remain so on return.
1688  *
1689  * This cannot be called with the volume header's vchain (yet).
1690  *
1691  * PASS1 - clear the MODIFIED1 bit (and set the MODIFIED2 bit XXX)
1692  *
1693  */
1694 static void
1695 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain)
1696 {
1697         /*
1698          * Flush any children of this chain entry.
1699          */
1700         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
1701                 hammer2_blockref_t *base;
1702                 hammer2_chain_t *scan;
1703                 hammer2_chain_t *next;
1704                 int count;
1705                 int submodified = 0;
1706
1707                 /*
1708                  * Modifications to the children will propagate up, forcing
1709                  * us to become modified and copy-on-write too.
1710                  *
1711                  * Clear SUBMODIFIED now, races during the flush will re-set
1712                  * it.
1713                  */
1714                 hammer2_chain_modify(hmp, chain);
1715                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1716
1717                 /*
1718                  * The blockref in the parent's array must be repointed at
1719                  * the new block allocated by the child after its flush.
1720                  *
1721                  * Calculate the base of the array.
1722                  */
1723                 switch(chain->bref.type) {
1724                 case HAMMER2_BREF_TYPE_INODE:
1725                         KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1726                         base = &chain->data->ipdata.u.blockset.blockref[0];
1727                         count = HAMMER2_SET_COUNT;
1728                         break;
1729                 case HAMMER2_BREF_TYPE_INDIRECT:
1730                         base = &chain->data->npdata.blockref[0];
1731                         count = HAMMER2_IND_COUNT;
1732                         break;
1733                 case HAMMER2_BREF_TYPE_VOLUME:
1734                         KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1735                         base = &hmp->voldata.sroot_blockset.blockref[0];
1736                         count = HAMMER2_SET_COUNT;
1737                         break;
1738                 default:
1739                         base = NULL;
1740                         panic("hammer2_chain_get: unrecognized blockref type: %d",
1741                               chain->bref.type);
1742                 }
1743
1744                 /*
1745                  * Flush the children and update the blockrefs in the parent.
1746                  * Be careful of ripouts during the loop.
1747                  */
1748                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
1749                 while ((scan = next) != NULL) {
1750                         next = SPLAY_NEXT(hammer2_chain_splay, &chain->shead,
1751                                           scan);
1752                         if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
1753                                            HAMMER2_CHAIN_MODIFIED1 |
1754                                            HAMMER2_CHAIN_MOVED)) {
1755                                 hammer2_chain_ref(hmp, scan);
1756                                 hammer2_chain_lock(hmp, scan);
1757                                 hammer2_chain_flush_pass1(hmp, scan);
1758                                 if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
1759                                                    HAMMER2_CHAIN_MODIFIED1)) {
1760                                         submodified = 1;
1761                                         kprintf("x");
1762                                 } else {
1763                                         KKASSERT(scan->index < count);
1764                                         base[scan->index] = scan->bref;
1765                                         if (scan->flags & HAMMER2_CHAIN_MOVED) {
1766                                                 atomic_clear_int(&scan->flags,
1767                                                          HAMMER2_CHAIN_MOVED);
1768                                                 hammer2_chain_drop(hmp, scan);
1769                                         }
1770                                 }
1771                                 hammer2_chain_put(hmp, scan);
1772                         }
1773                 }
1774                 if (submodified) {
1775                         atomic_set_int(&chain->flags,
1776                                        HAMMER2_CHAIN_SUBMODIFIED);
1777                 }
1778         }
1779
1780         /*
1781          * Flush this chain entry only if it is marked modified.
1782          */
1783         if ((chain->flags & HAMMER2_CHAIN_MODIFIED1) == 0)
1784                 return;
1785
1786         /*
1787          * Clear MODIFIED1 and set HAMMER2_CHAIN_MOVED.  The MODIFIED{1,2}
1788          * bits own a single parent ref and the MOVED bit owns its own
1789          * parent ref.
1790          */
1791         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
1792         if (chain->flags & HAMMER2_CHAIN_MOVED) {
1793                 if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
1794                         hammer2_chain_drop(hmp, chain);
1795         } else {
1796                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1797                 if (chain->flags & HAMMER2_CHAIN_MODIFIED2)
1798                         hammer2_chain_ref(hmp, chain);
1799         }
1800
1801         /*
1802          * If this is part of a recursive flush we can go ahead and write
1803          * out the buffer cache buffer and pass a new bref back up the chain.
1804          *
1805          * This will never be a volume header.
1806          */
1807         if (chain != &hmp->vchain) {
1808                 hammer2_blockref_t *bref;
1809                 hammer2_off_t off_hi;
1810                 struct buf *bp;
1811                 size_t off_lo;
1812                 size_t bytes;
1813                 int error;
1814
1815                 KKASSERT(chain->data != NULL);
1816                 bref = &chain->bref;
1817
1818                 off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
1819                 off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
1820                 bytes = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
1821                 KKASSERT(off_hi != 0);  /* not the root volume header */
1822
1823                 if (chain->bp) {
1824                         /*
1825                          * The data is mapped directly to the bp.  Dirty the
1826                          * bp so it gets flushed out by the kernel later on.
1827                          */
1828                         bdirty(chain->bp);
1829                 } else {
1830                         /*
1831                          * The data is embedded, we have to acquire the
1832                          * buffer cache buffer and copy the data into it.
1833                          */
1834                         bp = NULL;
1835                         error = bread(hmp->devvp, off_hi,
1836                                       HAMMER2_PBUFSIZE, &bp);
1837                         KKASSERT(error == 0); /* XXX */
1838
1839                         /*
1840                          * Copy the data to the buffer, mark the buffer
1841                          * dirty, and convert the chain to unmodified.
1842                          */
1843                         bcopy(chain->data, (char *)bp->b_data + off_lo, bytes);
1844                         bdwrite(bp);
1845                         bp = NULL;
1846
1847                         chain->bref.check.iscsi32.value =
1848                                         hammer2_icrc32(chain->data, bytes);
1849                 }
1850         }
1851         {
1852                 hammer2_blockref_t *bref;
1853
1854                 bref = &chain->bref;
1855
1856                 switch(bref->type) {
1857                 case HAMMER2_BREF_TYPE_VOLUME:
1858                         KKASSERT(chain->data != NULL);
1859                         KKASSERT(chain->bp == NULL);
1860
1861                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
1862                                 hammer2_icrc32(
1863                                         (char *)&hmp->voldata +
1864                                          HAMMER2_VOLUME_ICRC1_OFF,
1865                                         HAMMER2_VOLUME_ICRC1_SIZE);
1866                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
1867                                 hammer2_icrc32(
1868                                         (char *)&hmp->voldata +
1869                                          HAMMER2_VOLUME_ICRC0_OFF,
1870                                         HAMMER2_VOLUME_ICRC0_SIZE);
1871                         hmp->voldata.icrc_volheader =
1872                                 hammer2_icrc32(
1873                                         (char *)&hmp->voldata +
1874                                          HAMMER2_VOLUME_ICRCVH_OFF,
1875                                         HAMMER2_VOLUME_ICRCVH_SIZE);
1876                         break;
1877                 }
1878         }
1879 }
1880
1881 #if 0
1882 /*
1883  * PASS2 - not yet implemented (should be called only with the root chain?)
1884  */
1885 static void
1886 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
1887 {
1888 }
1889 #endif
1890
1891 void
1892 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain)
1893 {
1894         hammer2_chain_flush_pass1(hmp, chain);
1895 }