Merge branches 'hammer2' and 'master' of ssh://crater.dragonflybsd.org/repository...
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/cdefs.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/types.h>
47 #include <sys/lock.h>
48 #include <sys/uuid.h>
49
50 #include "hammer2.h"
51
52 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
53
54 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
55
56 static hammer2_chain_t *hammer2_chain_create_indirect(
57                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
58                         hammer2_key_t key, int keybits);
59
60 /*
61  * Compare function for chain splay tree
62  */
63 int
64 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
65 {
66         return(chain2->index - chain1->index);
67 }
68
69 /*
70  * Allocate a new disconnected chain element representing the specified
71  * bref.  The chain element is locked exclusively and refs is set to 1.
72  *
73  * This essentially allocates a system memory structure representing one
74  * of the media structure types, including inodes.
75  */
76 hammer2_chain_t *
77 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
78 {
79         hammer2_chain_t *chain;
80         hammer2_inode_t *ip;
81         hammer2_indblock_t *np;
82         hammer2_data_t *dp;
83
84         /*
85          * Construct the appropriate system structure.
86          */
87         switch(bref->type) {
88         case HAMMER2_BREF_TYPE_INODE:
89                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
90                 chain = &ip->chain;
91                 chain->u.ip = ip;
92                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
93                 ip->hmp = hmp;
94                 break;
95         case HAMMER2_BREF_TYPE_INDIRECT:
96                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
97                 chain = &np->chain;
98                 chain->u.np = np;
99                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
100                 break;
101         case HAMMER2_BREF_TYPE_DATA:
102                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
103                 chain = &dp->chain;
104                 chain->u.dp = dp;
105                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
106                 break;
107         case HAMMER2_BREF_TYPE_VOLUME:
108                 chain = NULL;
109                 panic("hammer2_chain_get: volume type illegal for op");
110         default:
111                 chain = NULL;
112                 panic("hammer2_chain_get: unrecognized blockref type: %d",
113                       bref->type);
114         }
115         chain->bref = *bref;
116         chain->index = -1;      /* not yet assigned */
117         chain->refs = 1;
118         lockmgr(&chain->lk, LK_EXCLUSIVE);
119
120         return (chain);
121 }
122
123 /*
124  * Free a disconnected chain element
125  */
126 void
127 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
128 {
129         void *mem;
130
131         KKASSERT(chain->bp == NULL);
132         KKASSERT(chain->data == NULL);
133         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
134                  chain->u.ip->vp == NULL);
135
136         if ((mem = chain->u.mem) != NULL) {
137                 chain->u.mem = NULL;
138                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
139                         kfree(mem, hmp->minode);
140                 else
141                         kfree(mem, hmp->mchain);
142         }
143 }
144
145 /*
146  * Add a reference to a chain element (for shared access).  The chain
147  * element must already have at least 1 ref controlled by the caller.
148  */
149 void
150 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
151 {
152         KKASSERT(chain->refs > 0);
153         atomic_add_int(&chain->refs, 1);
154 }
155
156 /*
157  * Drop the callers reference to the chain element.  If the ref count
158  * reaches zero the chain element and its related structure (typically an
159  * inode or indirect block) will be freed and the parent will be
160  * recursively dropped.
161  *
162  * Modified elements hold an additional reference so it should not be
163  * possible for the count on a modified element to drop to 0.
164  *
165  * The chain element must NOT be locked by the caller.
166  *
167  * The parent might or might not be locked by the caller but if so it
168  * will also be referenced so we shouldn't recurse upward.
169  */
170 void
171 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
172 {
173         hammer2_chain_t *parent;
174         u_int refs;
175
176         while (chain) {
177                 refs = chain->refs;
178                 cpu_ccfence();
179                 KKASSERT(refs > 0);
180                 if (refs == 1) {
181                         KKASSERT(chain != &hmp->vchain);
182                         parent = chain->parent;
183                         if (parent)
184                                 lockmgr(&parent->lk, LK_EXCLUSIVE);
185                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
186                                 /*
187                                  * Succeeded, recurse and drop parent
188                                  */
189                                 if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
190                                         SPLAY_REMOVE(hammer2_chain_splay,
191                                                      &parent->shead, chain);
192                                         atomic_set_int(&chain->flags,
193                                                        HAMMER2_CHAIN_DELETED);
194                                         /* parent refs dropped via recursion */
195                                 }
196                                 chain->parent = NULL;
197                                 if (parent)
198                                         lockmgr(&parent->lk, LK_RELEASE);
199                                 hammer2_chain_free(hmp, chain);
200                                 chain = parent;
201                                 /* recurse on parent */
202                         } else {
203                                 if (parent)
204                                         lockmgr(&parent->lk, LK_RELEASE);
205                                 /* retry the same chain */
206                         }
207                 } else {
208                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
209                                 /*
210                                  * Succeeded, count did not reach zero so
211                                  * cut out of the loop.
212                                  */
213                                 break;
214                         }
215                         /* retry the same chain */
216                 }
217         }
218 }
219
220 /*
221  * Lock a chain element, acquiring its data with I/O if necessary.
222  *
223  * Returns 0 on success or an error code if the data could not be acquired.
224  * The chain element is locked either way.
225  *
226  * chain->data will be pointed either at the embedded data (e.g. for
227  * inodes), in which case the buffer cache buffer is released, or will
228  * point into the bp->b_data buffer with the bp left intact while locked.
229  */
230 int
231 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
232 {
233         hammer2_blockref_t *bref;
234         hammer2_off_t off_hi;
235         size_t off_lo;
236         int error;
237         void *data;
238
239         /*
240          * Lock the element.  Under certain conditions this might end up
241          * being a recursive lock.
242          */
243         KKASSERT(chain->refs > 0);
244         lockmgr(&chain->lk, LK_EXCLUSIVE);
245
246         /*
247          * The volume header is a special case
248          */
249         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME)
250                 return(0);
251
252         /*
253          * bp must be NULL, so if the data pointer is valid here it points
254          * to embedded data and no I/O is necessary (whether modified or not).
255          */
256         KKASSERT(chain->bp == NULL);
257         if (chain->data)
258                 return (0);
259
260         /*
261          * If data is NULL we must issue I/O.  Any error returns the error
262          * code but leaves the chain locked.
263          *
264          * If the chain was modified a new bref will have already been
265          * allocated and its related bp is probably still sitting in the
266          * buffer cache.
267          */
268         bref = &chain->bref;
269
270         off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
271         off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
272         KKASSERT(off_hi != 0);
273         error = bread(hmp->devvp, off_hi, HAMMER2_PBUFSIZE, &chain->bp);
274
275         if (error) {
276                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
277                         (intmax_t)off_hi, error);
278                 brelse(chain->bp);
279                 chain->bp = NULL;
280                 return (error);
281         }
282
283         /*
284          * Setup the data pointer, either pointing it to an embedded data
285          * structure and copying the data from the buffer, or pointint it
286          * into the buffer.
287          *
288          * The buffer is not retained when copying to an embedded data
289          * structure in order to avoid potential deadlocks or recursions
290          * on the same physical buffer.
291          */
292         switch (bref->type) {
293         case HAMMER2_BREF_TYPE_VOLUME:
294                 /*
295                  * Copy data from bp to embedded buffer
296                  */
297                 KKASSERT(0);    /* not yet - have mount use this soon */
298                 KKASSERT(off_hi == 0);
299                 bcopy((char *)chain->bp->b_data + off_lo,
300                       &hmp->voldata, HAMMER2_PBUFSIZE);
301                 chain->data = (void *)&hmp->voldata;
302                 brelse(chain->bp);
303                 chain->bp = NULL;
304                 break;
305         case HAMMER2_BREF_TYPE_INODE:
306                 /*
307                  * Copy data from bp to embedded buffer.
308                  */
309                 bcopy((char *)chain->bp->b_data + off_lo,
310                       &chain->u.ip->ip_data,
311                       HAMMER2_INODE_BYTES);
312                 chain->data = (void *)&chain->u.ip->ip_data;
313                 brelse(chain->bp);
314                 chain->bp = NULL;
315                 break;
316         default:
317                 /*
318                  * Leave bp intact
319                  */
320                 data = (char *)chain->bp->b_data + off_lo;
321                 chain->data = data;
322                 break;
323         }
324         return (0);
325 }
326
327 /*
328  * Convert a locked chain that was retrieved read-only to read-write.
329  *
330  * If not already marked modified a new physical block will be allocated
331  * and assigned to the bref.  If the data is pointing into an existing
332  * bp it will be copied to the new bp and the new bp will replace the
333  * existing bp.
334  *
335  * If the data is embedded we allocate the new physical block but don't
336  * bother copying the data into it (yet).
337  */
338 void
339 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
340 {
341         hammer2_chain_t *parent;
342         struct buf *nbp;
343         size_t bytes;
344         void *ndata;
345         int error;
346
347         /*
348          * If the chain is already marked MODIFIED1 we can just return.
349          */
350         if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
351                 KKASSERT(chain->data != NULL);
352                 return;
353         }
354
355         /*
356          * A deleted inode may still be active but unreachable via sync
357          * because it has been disconnected from the tree.  Do not allow
358          * deleted inodes to be marked as being modified because this will
359          * bump the refs and never get resolved by the sync, leaving the
360          * inode structure allocated after umount.
361          */
362         if ((chain->flags & HAMMER2_CHAIN_DELETED) &&
363             chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
364                 KKASSERT(chain->data != NULL);
365                 return;
366         }
367
368         /*
369          * Set MODIFIED1 and add a chain ref to prevent destruction.  Both
370          * modified flags share the same ref.
371          */
372         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
373         if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
374                 hammer2_chain_ref(hmp, chain);
375
376         /*
377          * We must allocate the copy-on-write block.
378          *
379          * If the data is embedded no other action is required.
380          *
381          * If the data is not embedded we acquire and clear the
382          * new block.  If chain->data is not NULL we then do the
383          * copy-on-write.  chain->data will then be repointed to the new
384          * buffer and the old buffer will be released.
385          *
386          * For newly created elements with no prior allocation we go
387          * through the copy-on-write steps except without the copying part.
388          */
389
390         bytes = 1 << (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
391         if (chain != &hmp->vchain) {
392                 chain->bref.data_off = hammer2_freemap_alloc(hmp, bytes);
393                 /* XXX failed allocation */
394         }
395
396         switch(chain->bref.type) {
397         case HAMMER2_BREF_TYPE_VOLUME:          /* embedded */
398         case HAMMER2_BREF_TYPE_INODE:           /* embedded */
399                 /*
400                  * data points to embedded structure, no copy needed
401                  */
402                 error = 0;
403                 break;
404         case HAMMER2_BREF_TYPE_INDIRECT:
405         case HAMMER2_BREF_TYPE_DATA:
406                 /*
407                  * data (if not NULL) points into original bp, copy-on-write
408                  * to new block.
409                  */
410                 KKASSERT(chain != &hmp->vchain);        /* safety */
411                 if (bytes == HAMMER2_PBUFSIZE) {
412                         nbp = getblk(hmp->devvp,
413                                      chain->bref.data_off & HAMMER2_OFF_MASK_HI,
414                                      HAMMER2_PBUFSIZE, 0, 0);
415                         vfs_bio_clrbuf(nbp);
416                         error = 0;
417                 } else {
418                         error = bread(hmp->devvp,
419                                      chain->bref.data_off & HAMMER2_OFF_MASK_HI,
420                                      HAMMER2_PBUFSIZE, &nbp);
421                         KKASSERT(error == 0);/* XXX handle error */
422                 }
423                 ndata = nbp->b_data + (chain->bref.data_off &
424                                        HAMMER2_OFF_MASK_LO);
425                 if (chain->data) {
426                         bcopy(chain->data, ndata, bytes);
427                         KKASSERT(chain->bp != NULL);
428                         brelse(chain->bp);
429                 }
430                 chain->bp = nbp;
431                 chain->data = ndata;
432                 break;
433         default:
434                 panic("hammer2_chain_modify: unknown bref type");
435                 break;
436
437         }
438
439         /*
440          * Recursively mark the parent chain elements so flushes can find
441          * modified elements.
442          *
443          * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
444          *       during the flush recursion after clearing the parent's
445          *       SUBMODIFIED bit.  We don't want to re-set the parent's
446          *       SUBMODIFIED bit in this case!
447          */
448         if ((chain->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
449                 parent = chain->parent;
450                 while (parent &&
451                        (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
452                         atomic_set_int(&parent->flags,
453                                        HAMMER2_CHAIN_SUBMODIFIED);
454                         parent = parent->parent;
455                 }
456         }
457 }
458
459 /*
460  * Unlock a chain element without dropping its reference count.
461  * (see hammer2_chain_put() to do both).
462  *
463  * Non-embedded data references (chain->bp != NULL) are returned to the
464  * system and the data field is cleared in that case.  If modified the
465  * dirty buffer is still returned to the system, can be flushed to disk by
466  * the system at any time, and will be reconstituted/re-read as needed.
467  */
468 void
469 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
470 {
471         if (chain->bp) {
472                 chain->data = NULL;
473                 if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
474                         if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
475                                 atomic_clear_int(&chain->flags,
476                                                  HAMMER2_CHAIN_IOFLUSH);
477                                 bdwrite(chain->bp);
478                         } else {
479                                 bdwrite(chain->bp);
480                         }
481                 } else {
482                         /* bp might still be dirty */
483                         bqrelse(chain->bp);
484                 }
485                 chain->bp = NULL;
486         }
487         lockmgr(&chain->lk, LK_RELEASE);
488 }
489
490 /*
491  * Locate an in-memory chain.  The parent must be locked.  The in-memory
492  * chain is returned or NULL if no in-memory chain is present.
493  *
494  * NOTE: A chain on-media might exist for this index when NULL is returned.
495  */
496 hammer2_chain_t *
497 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
498 {
499         hammer2_chain_t dummy;
500         hammer2_chain_t *chain;
501
502         dummy.index = index;
503         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
504         return (chain);
505 }
506
507 /*
508  * Return a locked chain structure with all associated data acquired.
509  *
510  * Caller must lock the parent on call, the returned child will be locked.
511  */
512 hammer2_chain_t *
513 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
514                   int index, int flags)
515 {
516         hammer2_blockref_t *bref;
517         hammer2_chain_t *chain;
518         hammer2_chain_t dummy;
519
520         /*
521          * First see if we have a (possibly modified) chain element cached
522          * for this (parent, index).  Acquire the data if necessary.
523          *
524          * If chain->data is non-NULL the chain should already be marked
525          * modified.
526          */
527         dummy.index = index;
528         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
529         if (chain) {
530                 hammer2_chain_ref(hmp, chain);
531                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
532                         hammer2_chain_lock(hmp, chain);
533                 return(chain);
534         }
535
536         /*
537          * Otherwise lookup the bref and issue I/O (switch on the parent)
538          */
539         switch(parent->bref.type) {
540         case HAMMER2_BREF_TYPE_INODE:
541                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
542                 bref = &parent->data->ipdata.u.blockset.blockref[index];
543                 break;
544         case HAMMER2_BREF_TYPE_INDIRECT:
545                 KKASSERT(index >= 0 && index < HAMMER2_IND_COUNT);
546                 bref = &parent->data->npdata.blockref[index];
547                 break;
548         case HAMMER2_BREF_TYPE_VOLUME:
549                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
550                 bref = &hmp->voldata.sroot_blockset.blockref[index];
551                 break;
552         default:
553                 bref = NULL;
554                 panic("hammer2_chain_get: unrecognized blockref type: %d",
555                       parent->bref.type);
556         }
557
558         /*
559          * Allocate a chain structure representing the existing media
560          * entry.  Thus the chain is *not* INITIAL and certainly not
561          * MODIFIED (yet).
562          */
563         chain = hammer2_chain_alloc(hmp, bref);
564
565         /*
566          * Link the chain into its parent.  Caller is expected to hold an
567          * exclusive lock on the parent.
568          */
569         chain->parent = parent;
570         chain->index = index;
571         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
572                 panic("hammer2_chain_link: collision");
573         KKASSERT(parent->refs > 1);
574         atomic_add_int(&parent->refs, 1);       /* for splay entry */
575
576         /*
577          * Additional linkage for inodes.  Reuse the parent pointer to
578          * find the parent directory.
579          */
580         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
581                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
582                         parent = parent->parent;
583                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
584                         chain->u.ip->pip = parent->u.ip;
585         }
586
587         /*
588          * Our new chain structure has already been referenced and locked
589          * but the lock code handles the I/O so call it to resolve the data.
590          * Then release one of our two exclusive locks.
591          *
592          * If NOLOCK is set the release will release the one-and-only lock.
593          */
594         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
595                 hammer2_chain_lock(hmp, chain);
596         lockmgr(&chain->lk, LK_RELEASE);
597
598         return (chain);
599 }
600
601 /*
602  * Unlock and dereference a chain after use.  It is possible for this to
603  * recurse up the chain.
604  */
605 void
606 hammer2_chain_put(hammer2_mount_t *hmp, hammer2_chain_t *chain)
607 {
608         hammer2_chain_unlock(hmp, chain);
609         hammer2_chain_drop(hmp, chain);
610 }
611
612 /*
613  * Locate any key between key_beg and key_end inclusive.  (*parentp)
614  * typically points to an inode but can also point to a related indirect
615  * block and this function will recurse upwards and find the inode again.
616  *
617  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
618  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
619  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
620  *
621  * (*parentp) must be exclusively locked and referenced and can be an inode
622  * or an existing indirect block within the inode.
623  *
624  * On return (*parentp) will be modified to point at the deepest parent chain
625  * element encountered during the search, as a helper for an insertion or
626  * deletion.   The new (*parentp) will be locked and referenced and the old
627  * will be unlocked and dereferenced (no change if they are both the same).
628  *
629  * The matching chain will be returned exclusively locked and referenced.
630  *
631  * NULL is returned if no match was found, but (*parentp) will still
632  * potentially be adjusted.
633  *
634  * This function will also recurse up the chain if the key is not within the
635  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
636  * can simply allow (*parentp) to float inside the loop.
637  */
638 hammer2_chain_t *
639 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
640                      hammer2_key_t key_beg, hammer2_key_t key_end,
641                      int flags)
642 {
643         hammer2_chain_t *parent;
644         hammer2_chain_t *chain;
645         hammer2_chain_t *tmp;
646         hammer2_blockref_t *base;
647         hammer2_blockref_t *bref;
648         hammer2_key_t scan_beg;
649         hammer2_key_t scan_end;
650         int count = 0;
651         int i;
652
653         /*
654          * Recurse (*parentp) upward if necessary until the parent completely
655          * encloses the key range or we hit the inode.
656          */
657         parent = *parentp;
658         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
659                 scan_beg = parent->bref.key;
660                 scan_end = scan_beg +
661                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
662                 if (key_beg >= scan_beg && key_end <= scan_end)
663                         break;
664                 hammer2_chain_unlock(hmp, parent);
665                 parent = parent->parent;
666                 hammer2_chain_ref(hmp, parent);         /* ref new parent */
667                 hammer2_chain_lock(hmp, parent);        /* lock new parent */
668                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
669                 *parentp = parent;                      /* new parent */
670         }
671
672 again:
673         /*
674          * Locate the blockref array.  Currently we do a fully associative
675          * search through the array.
676          */
677         switch(parent->bref.type) {
678         case HAMMER2_BREF_TYPE_INODE:
679                 /*
680                  * Special shortcut for embedded data returns the inode
681                  * itself.  Callers must detect this condition and access
682                  * the embedded data (the strategy code does this for us).
683                  *
684                  * This is only applicable to regular files and softlinks.
685                  */
686                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
687                         hammer2_chain_ref(hmp, parent);
688                         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
689                                 hammer2_chain_lock(hmp, parent);
690                         return (parent);
691                 }
692                 base = &parent->data->ipdata.u.blockset.blockref[0];
693                 count = HAMMER2_SET_COUNT;
694                 break;
695         case HAMMER2_BREF_TYPE_INDIRECT:
696                 if (parent->data == NULL)
697                         panic("parent->data is NULL");
698                 base = &parent->data->npdata.blockref[0];
699                 count = HAMMER2_IND_COUNT;
700                 break;
701         case HAMMER2_BREF_TYPE_VOLUME:
702                 base = &hmp->voldata.sroot_blockset.blockref[0];
703                 count = HAMMER2_SET_COUNT;
704                 break;
705         default:
706                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
707                       parent->bref.type);
708                 base = NULL;    /* safety */
709                 count = 0;      /* safety */
710         }
711
712         /*
713          * If the element and key overlap we use the element.
714          */
715         bref = NULL;
716         for (i = 0; i < count; ++i) {
717                 tmp = hammer2_chain_find(hmp, parent, i);
718                 bref = (tmp) ? &tmp->bref : &base[i];
719                 if (bref->type == 0)
720                         continue;
721                 scan_beg = bref->key;
722                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
723                 if (key_beg <= scan_end && key_end >= scan_beg)
724                         break;
725         }
726         if (i == count) {
727                 if (key_beg == key_end)
728                         return (NULL);
729                 return (hammer2_chain_next(hmp, parentp, NULL,
730                                            key_beg, key_end, flags));
731         }
732
733         /*
734          * Acquire the new chain element.  If the chain element is an
735          * indirect block we must search recursively.
736          */
737         chain = hammer2_chain_get(hmp, parent, i, flags);
738         if (chain == NULL)
739                 return (NULL);
740
741         /*
742          * If the chain element is an indirect block it becomes the new
743          * parent and we loop on it.  We must fixup the chain we loop on
744          * if the caller passed flags to us that aren't sufficient for our
745          * needs.
746          */
747         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
748                 hammer2_chain_put(hmp, parent);
749                 *parentp = parent = chain;
750                 if (flags & HAMMER2_LOOKUP_NOLOCK)
751                         hammer2_chain_lock(hmp, chain);
752                 goto again;
753         }
754
755         /*
756          * All done, return chain
757          */
758         return (chain);
759 }
760
761 /*
762  * After having issued a lookup we can iterate all matching keys.
763  *
764  * If chain is non-NULL we continue the iteration from just after it's index.
765  *
766  * If chain is NULL we assume the parent was exhausted and continue the
767  * iteration at the next parent.
768  */
769 hammer2_chain_t *
770 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
771                    hammer2_chain_t *chain,
772                    hammer2_key_t key_beg, hammer2_key_t key_end,
773                    int flags)
774 {
775         hammer2_chain_t *parent;
776         hammer2_chain_t *tmp;
777         hammer2_blockref_t *base;
778         hammer2_blockref_t *bref;
779         hammer2_key_t scan_beg;
780         hammer2_key_t scan_end;
781         int i;
782         int count;
783
784         parent = *parentp;
785
786 again:
787         /*
788          * Calculate the next index and recalculate the parent if necessary.
789          */
790         if (chain) {
791                 /*
792                  * Continue iteration within current parent.  If not NULL
793                  * the passed-in chain may or may not be locked, based on
794                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
795                  * or a prior next).
796                  */
797                 i = chain->index + 1;
798                 if (flags & HAMMER2_LOOKUP_NOLOCK)
799                         hammer2_chain_drop(hmp, chain);
800                 else
801                         hammer2_chain_put(hmp, chain);
802
803                 /*
804                  * Any scan where the lookup returned degenerate data embedded
805                  * in the inode has an invalid index and must terminate.
806                  */
807                 if (chain == parent)
808                         return(NULL);
809                 chain = NULL;
810         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
811                 /*
812                  * We reached the end of the iteration.
813                  */
814                 return (NULL);
815         } else {
816                 /*
817                  * Continue iteration with next parent unless the current
818                  * parent covers the range.
819                  */
820                 hammer2_chain_t *nparent;
821
822                 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT)
823                         return (NULL);
824
825                 scan_beg = parent->bref.key;
826                 scan_end = scan_beg +
827                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
828                 if (key_beg >= scan_beg && key_end <= scan_end)
829                         return (NULL);
830
831                 i = parent->index + 1;
832                 nparent = parent->parent;
833                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
834                 hammer2_chain_unlock(hmp, parent);
835                 hammer2_chain_lock(hmp, nparent);       /* lock new parent */
836                 hammer2_chain_drop(hmp, parent);        /* drop old parent */
837                 *parentp = parent = nparent;
838         }
839
840 again2:
841         /*
842          * Locate the blockref array.  Currently we do a fully associative
843          * search through the array.
844          */
845         switch(parent->bref.type) {
846         case HAMMER2_BREF_TYPE_INODE:
847                 base = &parent->data->ipdata.u.blockset.blockref[0];
848                 count = HAMMER2_SET_COUNT;
849                 break;
850         case HAMMER2_BREF_TYPE_INDIRECT:
851                 base = &parent->data->npdata.blockref[0];
852                 count = HAMMER2_IND_COUNT;
853                 break;
854         case HAMMER2_BREF_TYPE_VOLUME:
855                 base = &hmp->voldata.sroot_blockset.blockref[0];
856                 count = HAMMER2_SET_COUNT;
857                 break;
858         default:
859                 panic("hammer2_chain_next: unrecognized blockref type: %d",
860                       parent->bref.type);
861                 base = NULL;    /* safety */
862                 count = 0;      /* safety */
863                 break;
864         }
865         KKASSERT(i <= count);
866
867         /*
868          * Look for the key.  If we are unable to find a match and an exact
869          * match was requested we return NULL.  If a range was requested we
870          * run hammer2_chain_next() to iterate.
871          */
872         bref = NULL;
873         while (i < count) {
874                 tmp = hammer2_chain_find(hmp, parent, i);
875                 bref = (tmp) ? &tmp->bref : &base[i];
876                 if (bref->type == 0) {
877                         ++i;
878                         continue;
879                 }
880                 scan_beg = bref->key;
881                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
882                 if (key_beg <= scan_end && key_end >= scan_beg)
883                         break;
884                 ++i;
885         }
886
887         /*
888          * If we couldn't find a match recurse up a parent to continue the
889          * search.
890          */
891         if (i == count)
892                 goto again;
893
894         /*
895          * Acquire the new chain element.  If the chain element is an
896          * indirect block we must search recursively.
897          */
898         chain = hammer2_chain_get(hmp, parent, i, flags);
899         if (chain == NULL)
900                 return (NULL);
901
902         /*
903          * If the chain element is an indirect block it becomes the new
904          * parent and we loop on it.  We may have to lock the chain when
905          * cycling it in as the new parent as it will not be locked if the
906          * caller passed NOLOCK.
907          */
908         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
909                 hammer2_chain_put(hmp, parent);
910                 *parentp = parent = chain;
911                 if (flags & HAMMER2_LOOKUP_NOLOCK)
912                         hammer2_chain_lock(hmp, chain);
913                 i = 0;
914                 goto again2;
915         }
916
917         /*
918          * All done, return chain
919          */
920         return (chain);
921 }
922
923 /*
924  * Create and return a new hammer2 system memory structure of the specified
925  * key, type and size and insert it RELATIVE TO (PARENT).
926  *
927  * (parent) is typically either an inode or an indirect  block, acquired
928  * acquired as a side effect of issuing a prior failed lookup.  parent
929  * must be locked and held.  Do not pass the inode chain to this function
930  * unless that is the chain returned by the failed lookup.
931  *
932  * Non-indirect types will automatically allocate indirect blocks as required
933  * if the new item does not fit in the current (parent).
934  *
935  * Indirect types will move a portion of the existing blockref array in
936  * (parent) into the new indirect type and then use one of the free slots
937  * to emplace the new indirect type.
938  *
939  * A new locked, referenced chain element is returned of the specified type.
940  * This element will also be marked as modified and contain a data area
941  * ready for initialization.
942  */
943 hammer2_chain_t *
944 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
945                      hammer2_chain_t *chain,
946                      hammer2_key_t key, int keybits, int type, size_t bytes)
947 {
948         hammer2_blockref_t dummy;
949         hammer2_blockref_t *base;
950         hammer2_blockref_t *bref;
951         hammer2_chain_t dummy_chain;
952         int unlock_parent = 0;
953         int allocated = 0;
954         int count;
955         int i;
956
957         if (chain == NULL) {
958                 /*
959                  * First allocate media space and construct the dummy bref,
960                  * then allocate the in-memory chain structure.
961                  */
962                 bzero(&dummy, sizeof(dummy));
963                 dummy.type = type;
964                 dummy.key = key;
965                 dummy.keybits = keybits;
966                 dummy.data_off = hammer2_freemap_bytes_to_radix(bytes);
967                 chain = hammer2_chain_alloc(hmp, &dummy);
968                 allocated = 1;
969
970                 /*
971                  * We set the WAS_MODIFIED flag here so the chain gets
972                  * marked as modified below.
973                  */
974                 chain->flags |= HAMMER2_CHAIN_INITIAL |
975                                 HAMMER2_CHAIN_WAS_MODIFIED;
976
977                 /*
978                  * Recalculate bytes to reflect the actual media block
979                  * allocation.
980                  */
981                 bytes = (hammer2_off_t)1 <<
982                         (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
983
984                 switch(type) {
985                 case HAMMER2_BREF_TYPE_VOLUME:
986                         panic("hammer2_chain_create: called with volume type");
987                         break;
988                 case HAMMER2_BREF_TYPE_INODE:
989                         KKASSERT(bytes == HAMMER2_INODE_BYTES);
990                         chain->data = (void *)&chain->u.ip->ip_data;
991                         break;
992                 default:
993                         /* leave chain->data NULL */
994                         KKASSERT(chain->data == NULL);
995                         break;
996                 }
997         } else {
998                 /*
999                  * Potentially update the chain's key/keybits, but it will
1000                  * only be marked modified if WAS_MODIFIED is set (if it
1001                  * was modified at the time of its removal during a rename).
1002                  */
1003                 chain->bref.key = key;
1004                 chain->bref.keybits = keybits;
1005         }
1006
1007 again:
1008         /*
1009          * Locate a free blockref in the parent's array
1010          */
1011         switch(parent->bref.type) {
1012         case HAMMER2_BREF_TYPE_INODE:
1013                 KKASSERT(parent->data != NULL);
1014                 base = &parent->data->ipdata.u.blockset.blockref[0];
1015                 count = HAMMER2_SET_COUNT;
1016                 break;
1017         case HAMMER2_BREF_TYPE_INDIRECT:
1018                 KKASSERT(parent->data != NULL);
1019                 base = &parent->data->npdata.blockref[0];
1020                 count = HAMMER2_IND_COUNT;
1021                 break;
1022         case HAMMER2_BREF_TYPE_VOLUME:
1023                 KKASSERT(parent->data != NULL);
1024                 base = &hmp->voldata.sroot_blockset.blockref[0];
1025                 count = HAMMER2_SET_COUNT;
1026                 break;
1027         default:
1028                 panic("hammer2_chain_create: unrecognized blockref type: %d",
1029                       parent->bref.type);
1030                 count = 0;
1031                 break;
1032         }
1033
1034         /*
1035          * Scan for an unallocated bref, also skipping any slots occupied
1036          * by in-memory chain elements that may not yet have been updated
1037          * in the parent's bref array.
1038          */
1039         bzero(&dummy_chain, sizeof(dummy_chain));
1040         bref = NULL;
1041         for (i = 0; i < count; ++i) {
1042                 bref = &base[i];
1043                 dummy_chain.index = i;
1044                 if (bref->type == 0 &&
1045                     SPLAY_FIND(hammer2_chain_splay,
1046                                &parent->shead, &dummy_chain) == NULL) {
1047                         break;
1048                 }
1049         }
1050
1051         /*
1052          * If no free blockref count be found we must create an indirect
1053          * block and move a number of blockrefs into it.  With the parent
1054          * locked we can safely lock each child in order to move it without
1055          * causing a deadlock.
1056          *
1057          * This may return the new indirect block or the old parent depending
1058          * on where the key falls.
1059          */
1060         if (i == count) {
1061                 hammer2_chain_t *nparent;
1062
1063                 nparent = hammer2_chain_create_indirect(hmp, parent,
1064                                                         key, keybits);
1065                 if (nparent == NULL) {
1066                         if (allocated)
1067                                 hammer2_chain_free(hmp, chain);
1068                         chain = NULL;
1069                         goto done;
1070                 }
1071                 if (parent != nparent) {
1072                         if (unlock_parent)
1073                                 hammer2_chain_put(hmp, parent);
1074                         parent = nparent;
1075                         unlock_parent = 1;
1076                 }
1077                 goto again;
1078         }
1079
1080         /*
1081          * Link the chain into its parent.
1082          */
1083         if (chain->parent != NULL)
1084                 panic("hammer2: hammer2_chain_create: chain already connected");
1085         KKASSERT(chain->parent == NULL);
1086         chain->parent = parent;
1087         chain->index = i;
1088         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1089                 panic("hammer2_chain_link: collision");
1090         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1091         KKASSERT(parent->refs > 1);
1092         atomic_add_int(&parent->refs, 1);
1093
1094         /*
1095          * Additional linkage for inodes.  Reuse the parent pointer to
1096          * find the parent directory.
1097          */
1098         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1099                 hammer2_chain_t *scan = parent;
1100                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1101                         scan = scan->parent;
1102                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE)
1103                         chain->u.ip->pip = scan->u.ip;
1104         }
1105
1106         /*
1107          * Mark the newly created or previously disconnected chain element
1108          * as modified and fully resolve the chain->data pointer.  The
1109          * WAS_MODIFIED bit will be set in both cases.
1110          *
1111          * Chain elements with embedded data will not issue I/O at this time.
1112          * A new block will be allocated for the buffer but not instantiated.
1113          *
1114          * Chain elements which do not use embedded data will allocate
1115          * the new block AND instantiate its buffer cache buffer, pointing
1116          * the data at the bp.
1117          */
1118         if (chain->flags & HAMMER2_CHAIN_WAS_MODIFIED) {
1119                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
1120                 hammer2_chain_modify(hmp, chain);
1121         }
1122
1123 done:
1124         if (unlock_parent)
1125                 hammer2_chain_put(hmp, parent);
1126         return (chain);
1127 }
1128
1129 /*
1130  * Create an indirect block that covers one or more of the elements in the
1131  * current parent.  Either returns the existing parent with no locking or
1132  * ref changes or returns the new indirect block locked and referenced,
1133  * depending on what the specified key falls into.
1134  *
1135  * The key/keybits for the indirect mode only needs to follow three rules:
1136  *
1137  * (1) That all elements underneath it fit within its key space and
1138  *
1139  * (2) That all elements outside it are outside its key space.
1140  *
1141  * (3) When creating the new indirect block any elements in the current
1142  *     parent that fit within the new indirect block's keyspace must be
1143  *     moved into the new indirect block.
1144  *
1145  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1146  *     keyspace the the current parent, but lookup/iteration rules will
1147  *     ensure (and must ensure) that rule (2) for all parents leading up
1148  *     to the nearest inode or the root volume header is adhered to.  This
1149  *     is accomplished by always recursing through matching keyspaces in
1150  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1151  *
1152  * The current implementation calculates the current worst-case keyspace by
1153  * iterating the current parent and then divides it into two halves, choosing
1154  * whichever half has the most elements (not necessarily the half containing
1155  * the requested key).
1156  *
1157  * We can also opt to use the half with the least number of elements.  This
1158  * causes lower-numbered keys (aka logical file offsets) to recurse through
1159  * fewer indirect blocks and higher-numbered keys to recurse through more.
1160  * This also has the risk of not moving enough elements to the new indirect
1161  * block and being forced to create several indirect blocks before the element
1162  * can be inserted.
1163  */
1164 static
1165 hammer2_chain_t *
1166 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1167                               hammer2_key_t create_key, int create_bits)
1168 {
1169         hammer2_blockref_t *base;
1170         hammer2_blockref_t *bref;
1171         hammer2_chain_t *chain;
1172         hammer2_chain_t *ichain;
1173         hammer2_chain_t dummy;
1174         hammer2_key_t key = create_key;
1175         int keybits = create_bits;
1176         int locount = 0;
1177         int hicount = 0;
1178         int count;
1179         int i;
1180
1181         /*
1182          * Mark the parent modified so our base[] pointer remains valid
1183          * while we move entries.
1184          */
1185         hammer2_chain_modify(hmp, parent);
1186
1187         /*
1188          * Locate a free blockref in the parent's array
1189          */
1190         switch(parent->bref.type) {
1191         case HAMMER2_BREF_TYPE_INODE:
1192                 base = &parent->data->ipdata.u.blockset.blockref[0];
1193                 count = HAMMER2_SET_COUNT;
1194                 break;
1195         case HAMMER2_BREF_TYPE_INDIRECT:
1196                 base = &parent->data->npdata.blockref[0];
1197                 count = HAMMER2_IND_COUNT;
1198                 break;
1199         case HAMMER2_BREF_TYPE_VOLUME:
1200                 base = &hmp->voldata.sroot_blockset.blockref[0];
1201                 count = HAMMER2_SET_COUNT;
1202                 break;
1203         default:
1204                 panic("hammer2_chain_create_indirect: "
1205                       "unrecognized blockref type: %d",
1206                       parent->bref.type);
1207                 count = 0;
1208                 break;
1209         }
1210
1211         /*
1212          * Scan for an unallocated bref, also skipping any slots occupied
1213          * by in-memory chain elements that may not yet have been updated
1214          * in the parent's bref array.
1215          */
1216         bzero(&dummy, sizeof(dummy));
1217         for (i = 0; i < count; ++i) {
1218                 int nkeybits;
1219
1220                 bref = &base[i];
1221                 if (bref->type == 0) {
1222                         dummy.index = i;
1223                         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
1224                                            &dummy);
1225                         if (chain == NULL)
1226                                 continue;
1227                         bref = &chain->bref;
1228                 }
1229
1230                 /*
1231                  * Expand our calculated key range (key, keybits) to fit
1232                  * the scanned key.  nkeybits represents the full range
1233                  * that we will later cut in half (two halves @ nkeybits - 1).
1234                  */
1235                 nkeybits = keybits;
1236                 if (nkeybits < bref->keybits)
1237                         nkeybits = bref->keybits;
1238                 while ((~(((hammer2_key_t)1 << nkeybits) - 1) &
1239                         (key ^ bref->key)) != 0) {
1240                         ++nkeybits;
1241                 }
1242
1243                 /*
1244                  * If the new key range is larger we have to determine
1245                  * which side of the new key range the existing keys fall
1246                  * under by checking the high bit, then collapsing the
1247                  * locount into the hicount or vise-versa.
1248                  */
1249                 if (keybits != nkeybits) {
1250                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1251                                 hicount += locount;
1252                                 locount = 0;
1253                         } else {
1254                                 locount += hicount;
1255                                 hicount = 0;
1256                         }
1257                         keybits = nkeybits;
1258                 }
1259
1260                 /*
1261                  * The newly scanned key will be in the lower half or the
1262                  * higher half of the (new) key range.
1263                  */
1264                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1265                         ++hicount;
1266                 else
1267                         ++locount;
1268         }
1269
1270         /*
1271          * Adjust keybits to represent half of the full range calculated
1272          * above.
1273          */
1274         --keybits;
1275
1276         /*
1277          * Select whichever half contains the most elements.  Theoretically
1278          * we can select either side as long as it contains at least one
1279          * element (in order to ensure that a free slot is present to hold
1280          * the indirect block).
1281          */
1282         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1283         if (hammer2_indirect_optimize) {
1284                 /*
1285                  * Insert node for least number of keys, this will arrange
1286                  * the first few blocks of a large file or the first few
1287                  * inodes in a directory with fewer indirect blocks when
1288                  * created linearly.
1289                  */
1290                 if (hicount < locount && hicount != 0)
1291                         key |= (hammer2_key_t)1 << keybits;
1292                 else
1293                         key &= ~(hammer2_key_t)1 << keybits;
1294         } else {
1295                 /*
1296                  * Insert node for most number of keys, best for heavily
1297                  * fragmented files.
1298                  */
1299                 if (hicount > locount)
1300                         key |= (hammer2_key_t)1 << keybits;
1301                 else
1302                         key &= ~(hammer2_key_t)1 << keybits;
1303         }
1304
1305         /*
1306          * Ok, create our new indirect block
1307          */
1308         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1309         dummy.bref.key = key;
1310         dummy.bref.keybits = keybits;
1311         dummy.bref.data_off = (hammer2_off_t)
1312                             hammer2_freemap_bytes_to_radix(HAMMER2_PBUFSIZE);
1313         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1314         ichain->flags |= HAMMER2_CHAIN_INITIAL;
1315
1316         /*
1317          * Iterate the original parent and move the matching brefs into
1318          * the new indirect block.
1319          */
1320         for (i = 0; i < count; ++i) {
1321                 /*
1322                  * For keying purposes access the bref from the media or
1323                  * from our in-memory cache.  In cases where the in-memory
1324                  * cache overrides the media the keyrefs will be the same
1325                  * anyway so we can avoid checking the cache when the media
1326                  * has a key.
1327                  */
1328                 bref = &base[i];
1329                 if (bref->type == 0) {
1330                         dummy.index = i;
1331                         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
1332                                            &dummy);
1333                         if (chain == NULL) {
1334                                 /*
1335                                  * Select index indirect block is placed in
1336                                  */
1337                                 if (ichain->index < 0)
1338                                         ichain->index = i;
1339                                 continue;
1340                         }
1341                         bref = &chain->bref;
1342                 }
1343
1344                 /*
1345                  * Skip keys not in the chosen half (low or high), only bit
1346                  * (keybits - 1) needs to be compared but for safety we
1347                  * will compare all msb bits plus that bit again.
1348                  */
1349                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1350                     (key ^ bref->key)) != 0) {
1351                         continue;
1352                 }
1353
1354                 /*
1355                  * This element is being moved, its slot is available
1356                  * for our indirect block.
1357                  */
1358                 if (ichain->index < 0)
1359                         ichain->index = i;
1360
1361                 /*
1362                  * Load the new indirect block by acquiring or allocating
1363                  * the related chain entries, then simply move it to the
1364                  * new parent (ichain).
1365                  *
1366                  * Flagging the new chain entry MOVED will cause a flush
1367                  * to synchronize its block into the new indirect block.
1368                  * The chain is unlocked after being moved but needs to
1369                  * retain a reference for the MOVED state
1370                  *
1371                  * We must still set SUBMODIFIED in the parent but we do
1372                  * that after the loop.
1373                  *
1374                  * XXX we really need a lock here but we don't need the
1375                  *     data.  NODATA feature needed.
1376                  */
1377                 chain = hammer2_chain_get(hmp, parent, i,
1378                                           HAMMER2_LOOKUP_NOLOCK);
1379                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1380                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1381                         panic("hammer2_chain_create_indirect: collision");
1382                 chain->parent = ichain;
1383                 bzero(&base[i], sizeof(base[i]));
1384                 atomic_add_int(&parent->refs, -1);
1385                 atomic_add_int(&ichain->refs, 1);
1386                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
1387                         /* We don't need the ref from the chain_get */
1388                         hammer2_chain_drop(hmp, chain);
1389                 } else {
1390                         /* MOVED bit inherits the ref from the chain_get */
1391                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1392                 }
1393                 KKASSERT(parent->refs > 0);
1394                 chain = NULL;
1395         }
1396
1397         /*
1398          * Insert the new indirect block into the parent now that we've
1399          * cleared out some entries in the parent.  We calculated a good
1400          * insertion index in the loop above (ichain->index).
1401          */
1402         KKASSERT(ichain->index >= 0);
1403         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1404                 panic("hammer2_chain_create_indirect: ichain insertion");
1405         ichain->parent = parent;
1406         atomic_add_int(&parent->refs, 1);
1407
1408         /*
1409          * Mark the new indirect block modified after insertion, which
1410          * will propagate up through parent all the way to the root and
1411          * also allocate the physical block in ichain for our caller.
1412          *
1413          * We have to set SUBMODIFIED in ichain's flags manually so the
1414          * flusher knows it has to recurse through it to get to all of
1415          * our moved blocks.
1416          */
1417         hammer2_chain_modify(hmp, ichain);
1418         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1419
1420         /*
1421          * Figure out what to return.
1422          */
1423         if (create_bits >= keybits) {
1424                 /*
1425                  * Key being created is way outside the key range,
1426                  * return the original parent.
1427                  */
1428                 hammer2_chain_put(hmp, ichain);
1429         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1430                    (create_key ^ key)) {
1431                 /*
1432                  * Key being created is outside the key range,
1433                  * return the original parent.
1434                  */
1435                 hammer2_chain_put(hmp, ichain);
1436         } else {
1437                 /*
1438                  * Otherwise its in the range, return the new parent.
1439                  */
1440                 parent = ichain;
1441         }
1442
1443         return(parent);
1444 }
1445
1446 /*
1447  * Physically delete the specified chain element.  Note that inodes with
1448  * open descriptors should not be deleted (as with other filesystems) until
1449  * the last open descriptor is closed.
1450  *
1451  * This routine will remove the chain element from its parent and potentially
1452  * also recurse upward and delete indirect blocks which become empty as a
1453  * side effect.
1454  *
1455  * The caller must pass a pointer to the chain's parent, also locked and
1456  * referenced.  (*parentp) will be modified in a manner similar to a lookup
1457  * or iteration when indirect blocks are also deleted as a side effect.
1458  */
1459 void
1460 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1461                      hammer2_chain_t *chain)
1462 {
1463         hammer2_blockref_t *base;
1464         int count;
1465
1466         if (chain->parent != parent)
1467                 panic("hammer2_chain_delete: parent mismatch");
1468
1469         /*
1470          * Mark the parent modified so our base[] pointer remains valid
1471          * while we move entries.
1472          *
1473          * Calculate the blockref reference in the parent
1474          */
1475         hammer2_chain_modify(hmp, parent);
1476
1477         switch(parent->bref.type) {
1478         case HAMMER2_BREF_TYPE_INODE:
1479                 base = &parent->data->ipdata.u.blockset.blockref[0];
1480                 count = HAMMER2_SET_COUNT;
1481                 break;
1482         case HAMMER2_BREF_TYPE_INDIRECT:
1483                 base = &parent->data->npdata.blockref[0];
1484                 count = HAMMER2_IND_COUNT;
1485                 break;
1486         case HAMMER2_BREF_TYPE_VOLUME:
1487                 base = &hmp->voldata.sroot_blockset.blockref[0];
1488                 count = HAMMER2_SET_COUNT;
1489                 break;
1490         default:
1491                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
1492                       parent->bref.type);
1493                 count = 0;
1494                 break;
1495         }
1496
1497         /*
1498          * Disconnect the bref in the parent, remove the chain, and
1499          * disconnect in-memory fields from the parent.
1500          */
1501         KKASSERT(chain->index >= 0 && chain->index < count);
1502         base += chain->index;
1503         bzero(base, sizeof(*base));
1504
1505         SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1506         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1507         atomic_add_int(&parent->refs, -1);      /* for splay entry */
1508         chain->index = -1;
1509
1510         chain->parent = NULL;
1511         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
1512                 chain->u.ip->pip = NULL;
1513
1514         /*
1515          * Nobody references the underlying object any more so we can
1516          * clear any pending modification(s) on it.  This can theoretically
1517          * recurse downward but even just clearing the bit on this item
1518          * will effectively recurse if someone is doing a rm -rf and greatly
1519          * reduce the I/O required.
1520          *
1521          * The MODIFIED1 bit is cleared but we have to remember the old state
1522          * in case this deletion is related to a rename.  The ref on the
1523          * chain is shared by both modified flags.
1524          */
1525         if (chain->flags & HAMMER2_CHAIN_MODIFIED1) {
1526                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
1527                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_WAS_MODIFIED);
1528                 if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
1529                         hammer2_chain_drop(hmp, chain);
1530         }
1531 }
1532
1533 /*
1534  * Recursively flush the specified chain.  The chain is locked and
1535  * referenced by the caller and will remain so on return.
1536  *
1537  * This cannot be called with the volume header's vchain (yet).
1538  *
1539  * PASS1 - clear the MODIFIED1 bit (and set the MODIFIED2 bit XXX)
1540  *
1541  */
1542 static void
1543 hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *chain)
1544 {
1545         /*
1546          * Flush any children of this chain entry.
1547          */
1548         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
1549                 hammer2_blockref_t *base;
1550                 hammer2_chain_t *scan;
1551                 hammer2_chain_t *next;
1552                 int count;
1553                 int submodified = 0;
1554
1555                 /*
1556                  * Modifications to the children will propagate up, forcing
1557                  * us to become modified and copy-on-write too.
1558                  *
1559                  * Clear SUBMODIFIED now, races during the flush will re-set
1560                  * it.
1561                  */
1562                 hammer2_chain_modify(hmp, chain);
1563                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1564
1565                 /*
1566                  * The blockref in the parent's array must be repointed at
1567                  * the new block allocated by the child after its flush.
1568                  *
1569                  * Calculate the base of the array.
1570                  */
1571                 switch(chain->bref.type) {
1572                 case HAMMER2_BREF_TYPE_INODE:
1573                         KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1574                         base = &chain->data->ipdata.u.blockset.blockref[0];
1575                         count = HAMMER2_SET_COUNT;
1576                         break;
1577                 case HAMMER2_BREF_TYPE_INDIRECT:
1578                         base = &chain->data->npdata.blockref[0];
1579                         count = HAMMER2_IND_COUNT;
1580                         break;
1581                 case HAMMER2_BREF_TYPE_VOLUME:
1582                         KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1583                         base = &hmp->voldata.sroot_blockset.blockref[0];
1584                         count = HAMMER2_SET_COUNT;
1585                         break;
1586                 default:
1587                         base = NULL;
1588                         panic("hammer2_chain_get: unrecognized blockref type: %d",
1589                               chain->bref.type);
1590                 }
1591
1592                 /*
1593                  * Flush the children and update the blockrefs in the parent.
1594                  * Be careful of ripouts during the loop.
1595                  */
1596                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
1597                 while ((scan = next) != NULL) {
1598                         next = SPLAY_NEXT(hammer2_chain_splay, &chain->shead,
1599                                           scan);
1600                         if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
1601                                            HAMMER2_CHAIN_MODIFIED1 |
1602                                            HAMMER2_CHAIN_MOVED)) {
1603                                 hammer2_chain_ref(hmp, scan);
1604                                 hammer2_chain_lock(hmp, scan);
1605                                 hammer2_chain_flush_pass1(hmp, scan);
1606                                 if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
1607                                                    HAMMER2_CHAIN_MODIFIED1)) {
1608                                         submodified = 1;
1609                                         kprintf("x");
1610                                 } else {
1611                                         KKASSERT(scan->index < count);
1612                                         base[scan->index] = scan->bref;
1613                                         if (scan->flags & HAMMER2_CHAIN_MOVED) {
1614                                                 atomic_clear_int(&scan->flags,
1615                                                          HAMMER2_CHAIN_MOVED);
1616                                                 hammer2_chain_drop(hmp, scan);
1617                                         }
1618                                 }
1619                                 hammer2_chain_put(hmp, scan);
1620                         }
1621                 }
1622                 if (submodified) {
1623                         atomic_set_int(&chain->flags,
1624                                        HAMMER2_CHAIN_SUBMODIFIED);
1625                 }
1626         }
1627
1628         /*
1629          * Flush this chain entry only if it is marked modified.
1630          */
1631         if ((chain->flags & HAMMER2_CHAIN_MODIFIED1) == 0)
1632                 return;
1633
1634         /*
1635          * Clear MODIFIED1 and set HAMMER2_CHAIN_MOVED.  The MODIFIED{1,2}
1636          * bits own a single parent ref and the MOVED bit owns its own
1637          * parent ref.
1638          */
1639         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED1);
1640         if (chain->flags & HAMMER2_CHAIN_MOVED) {
1641                 if ((chain->flags & HAMMER2_CHAIN_MODIFIED2) == 0)
1642                         hammer2_chain_drop(hmp, chain);
1643         } else {
1644                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1645                 if (chain->flags & HAMMER2_CHAIN_MODIFIED2)
1646                         hammer2_chain_ref(hmp, chain);
1647         }
1648
1649         /*
1650          * If this is part of a recursive flush we can go ahead and write
1651          * out the buffer cache buffer and pass a new bref back up the chain.
1652          *
1653          * This will never be a volume header.
1654          */
1655         if (chain != &hmp->vchain) {
1656                 hammer2_blockref_t *bref;
1657                 hammer2_off_t off_hi;
1658                 struct buf *bp;
1659                 size_t off_lo;
1660                 size_t bytes;
1661                 int error;
1662
1663                 KKASSERT(chain->data != NULL);
1664                 bref = &chain->bref;
1665
1666                 off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
1667                 off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
1668                 bytes = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
1669                 KKASSERT(off_hi != 0);  /* not the root volume header */
1670
1671                 if (chain->bp) {
1672                         /*
1673                          * The data is mapped directly to the bp.  Dirty the
1674                          * bp so it gets flushed out by the kernel later on.
1675                          */
1676                         bdirty(chain->bp);
1677                 } else {
1678                         /*
1679                          * The data is embedded, we have to acquire the
1680                          * buffer cache buffer and copy the data into it.
1681                          */
1682                         bp = NULL;
1683                         error = bread(hmp->devvp, off_hi,
1684                                       HAMMER2_PBUFSIZE, &bp);
1685                         KKASSERT(error == 0); /* XXX */
1686
1687                         /*
1688                          * Copy the data to the buffer, mark the buffer
1689                          * dirty, and convert the chain to unmodified.
1690                          */
1691                         bcopy(chain->data, (char *)bp->b_data + off_lo, bytes);
1692                         bdwrite(bp);
1693                         bp = NULL;
1694
1695                         chain->bref.check.iscsi32.value =
1696                                         hammer2_icrc32(chain->data, bytes);
1697                 }
1698         }
1699         {
1700                 hammer2_blockref_t *bref;
1701
1702                 bref = &chain->bref;
1703
1704                 switch(bref->type) {
1705                 case HAMMER2_BREF_TYPE_VOLUME:
1706                         KKASSERT(chain->data != NULL);
1707                         KKASSERT(chain->bp == NULL);
1708
1709                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
1710                                 hammer2_icrc32(
1711                                         (char *)&hmp->voldata +
1712                                          HAMMER2_VOLUME_ICRC1_OFF,
1713                                         HAMMER2_VOLUME_ICRC1_SIZE);
1714                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
1715                                 hammer2_icrc32(
1716                                         (char *)&hmp->voldata +
1717                                          HAMMER2_VOLUME_ICRC0_OFF,
1718                                         HAMMER2_VOLUME_ICRC0_SIZE);
1719                         hmp->voldata.icrc_volheader =
1720                                 hammer2_icrc32(
1721                                         (char *)&hmp->voldata +
1722                                          HAMMER2_VOLUME_ICRCVH_OFF,
1723                                         HAMMER2_VOLUME_ICRCVH_SIZE);
1724                         break;
1725                 }
1726         }
1727 }
1728
1729 #if 0
1730 /*
1731  * PASS2 - not yet implemented (should be called only with the root chain?)
1732  */
1733 static void
1734 hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
1735 {
1736 }
1737 #endif
1738
1739 void
1740 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain)
1741 {
1742         hammer2_chain_flush_pass1(hmp, chain);
1743 }