hammer2 - more indirect block work, add advlock
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/cdefs.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/types.h>
47 #include <sys/lock.h>
48 #include <sys/uuid.h>
49
50 #include "hammer2.h"
51
52 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
53
54 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
55
56 static hammer2_chain_t *hammer2_chain_create_indirect(
57                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
58                         hammer2_key_t key, int keybits);
59
60 /*
61  * Compare function for chain splay tree
62  */
63 int
64 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
65 {
66         return(chain2->index - chain1->index);
67 }
68
69 /*
70  * Allocate a new disconnected chain element representing the specified
71  * bref.  The chain element is locked exclusively and refs is set to 1.
72  *
73  * This essentially allocates a system memory structure representing one
74  * of the media structure types, including inodes.
75  */
76 hammer2_chain_t *
77 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
78 {
79         hammer2_chain_t *chain;
80         hammer2_inode_t *ip;
81         hammer2_indblock_t *np;
82         hammer2_data_t *dp;
83
84         /*
85          * Construct the appropriate system structure.
86          */
87         switch(bref->type) {
88         case HAMMER2_BREF_TYPE_INODE:
89                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
90                 chain = &ip->chain;
91                 chain->u.ip = ip;
92                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
93                 ip->hmp = hmp;
94                 break;
95         case HAMMER2_BREF_TYPE_INDIRECT:
96                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
97                 chain = &np->chain;
98                 chain->u.np = np;
99                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
100                 break;
101         case HAMMER2_BREF_TYPE_DATA:
102                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
103                 chain = &dp->chain;
104                 chain->u.dp = dp;
105                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
106                 break;
107         case HAMMER2_BREF_TYPE_VOLUME:
108                 chain = NULL;
109                 panic("hammer2_chain_get: volume type illegal for op");
110         default:
111                 chain = NULL;
112                 panic("hammer2_chain_get: unrecognized blockref type: %d",
113                       bref->type);
114         }
115         chain->bref = *bref;
116         chain->index = -1;      /* not yet assigned */
117
118         chain->refs = 1;
119         lockmgr(&chain->lk, LK_EXCLUSIVE);
120
121         return (chain);
122 }
123
124 /*
125  * Free a disconnected chain element
126  */
127 void
128 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
129 {
130         void *mem;
131
132         KKASSERT(chain->bp == NULL);
133         KKASSERT(chain->data == NULL);
134         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
135                  chain->u.ip->vp == NULL);
136
137         if ((mem = chain->u.mem) != NULL) {
138                 chain->u.mem = NULL;
139                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
140                         kfree(mem, hmp->minode);
141                 else
142                         kfree(mem, hmp->mchain);
143         }
144 }
145
146 /*
147  * Add a reference to a chain element (for shared access).  The chain
148  * element must already have at least 1 ref controlled by the caller.
149  */
150 void
151 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
152 {
153         KKASSERT(chain->refs > 0);
154         atomic_add_int(&chain->refs, 1);
155 }
156
157 /*
158  * Drop the callers reference to the chain element.  If the ref count
159  * reaches zero the chain element and its related structure (typically an
160  * inode or indirect block) will be freed and the parent will be
161  * recursively dropped.
162  *
163  * Modified elements hold an additional reference so it should not be
164  * possible for the count on a modified element to drop to 0.
165  *
166  * The chain element must NOT be locked by the caller.
167  *
168  * The parent might or might not be locked by the caller but if so it
169  * will also be referenced so we shouldn't recurse upward.
170  */
171 void
172 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
173 {
174         hammer2_chain_t *parent;
175         u_int refs;
176
177         while (chain) {
178                 refs = chain->refs;
179                 cpu_ccfence();
180                 if (refs == 1) {
181                         KKASSERT(chain != &hmp->vchain);
182                         parent = chain->parent;
183                         lockmgr(&parent->lk, LK_EXCLUSIVE);
184                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
185                                 /*
186                                  * Succeeded, recurse and drop parent
187                                  */
188                                 SPLAY_REMOVE(hammer2_chain_splay,
189                                              &parent->shead, chain);
190                                 chain->parent = NULL;
191                                 lockmgr(&parent->lk, LK_RELEASE);
192                                 hammer2_chain_free(hmp, chain);
193                                 chain = parent;
194                         } else {
195                                 lockmgr(&parent->lk, LK_RELEASE);
196                         }
197                 } else {
198                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
199                                 /*
200                                  * Succeeded, count did not reach zero so
201                                  * cut out of the loop.
202                                  */
203                                 break;
204                         }
205                 }
206         }
207 }
208
209 /*
210  * Lock a chain element, acquiring its data with I/O if necessary.
211  *
212  * Returns 0 on success or an error code if the data could not be acquired.
213  * The chain element is locked either way.
214  *
215  * chain->data will be pointed either at the embedded data (e.g. for
216  * inodes), in which case the buffer cache buffer is released, or will
217  * point into the bp->b_data buffer with the bp left intact while locked.
218  */
219 int
220 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
221 {
222         hammer2_blockref_t *bref;
223         hammer2_off_t off_hi;
224         size_t off_lo;
225         int error;
226         void *data;
227
228         /*
229          * Lock the element.  Under certain conditions this might end up
230          * being a recursive lock.
231          */
232         KKASSERT(chain->refs > 0);
233         lockmgr(&chain->lk, LK_EXCLUSIVE);
234
235         /*
236          * The volume header is a special case
237          */
238         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME)
239                 return(0);
240
241         /*
242          * bp must be NULL, so if the data pointer is valid here it points
243          * to embedded data and no I/O is necessary (whether modified or not).
244          */
245         KKASSERT(chain->bp == NULL);
246         if (chain->data)
247                 return (0);
248
249         /*
250          * If data is NULL we must issue I/O.  Any error returns the error
251          * code but leaves the chain locked.
252          *
253          * If the chain was modified a new bref will have already been
254          * allocated and its related bp is probably still sitting in the
255          * buffer cache.
256          */
257         bref = &chain->bref;
258
259         off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
260         off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
261         KKASSERT(off_hi != 0);
262         error = bread(hmp->devvp, off_hi, HAMMER2_PBUFSIZE, &chain->bp);
263
264         if (error) {
265                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
266                         (intmax_t)off_hi, error);
267                 brelse(chain->bp);
268                 chain->bp = NULL;
269                 return (error);
270         }
271
272         /*
273          * Setup the data pointer, either pointing it to an embedded data
274          * structure and copying the data from the buffer, or pointint it
275          * into the buffer.
276          *
277          * The buffer is not retained when copying to an embedded data
278          * structure in order to avoid potential deadlocks or recursions
279          * on the same physical buffer.
280          */
281         switch (bref->type) {
282         case HAMMER2_BREF_TYPE_VOLUME:
283                 /*
284                  * Copy data from bp to embedded buffer
285                  */
286                 KKASSERT(0);    /* not yet - have mount use this soon */
287                 KKASSERT(off_hi == 0);
288                 bcopy((char *)chain->bp->b_data + off_lo,
289                       &hmp->voldata, HAMMER2_PBUFSIZE);
290                 chain->data = (void *)&hmp->voldata;
291                 brelse(chain->bp);
292                 chain->bp = NULL;
293                 break;
294         case HAMMER2_BREF_TYPE_INODE:
295                 /*
296                  * Copy data from bp to embedded buffer.
297                  */
298                 bcopy((char *)chain->bp->b_data + off_lo,
299                       &chain->u.ip->ip_data,
300                       HAMMER2_INODE_BYTES);
301                 chain->data = (void *)&chain->u.ip->ip_data;
302                 brelse(chain->bp);
303                 chain->bp = NULL;
304                 break;
305         default:
306                 /*
307                  * Leave bp intact
308                  */
309                 data = (char *)chain->bp->b_data + off_lo;
310                 chain->data = data;
311                 break;
312         }
313         return (0);
314 }
315
316 /*
317  * Convert a locked chain that was retrieved read-only to read-write.
318  *
319  * If not already marked modified a new physical block will be allocated
320  * and assigned to the bref.  If the data is pointing into an existing
321  * bp it will be copied to the new bp and the new bp will replace the
322  * existing bp.
323  *
324  * If the data is embedded we allocate the new physical block but don't
325  * bother copying the data into it (yet).
326  */
327 void
328 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
329 {
330         hammer2_chain_t *parent;
331         struct buf *nbp;
332         size_t bytes;
333         void *ndata;
334         int error;
335
336         /*
337          * If the chain is already marked modified we can just return.
338          */
339         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
340                 KKASSERT(chain->data != NULL);
341                 return;
342         }
343
344         /*
345          * The MODIFIED bit is not yet set, we must allocate the
346          * copy-on-write block.
347          *
348          * If the data is embedded no other action is required.
349          *
350          * If the data is not embedded we acquire and clear the
351          * new block.  If chain->data is not NULL we then do the
352          * copy-on-write.  chain->data will then be repointed to the new
353          * buffer and the old buffer will be released.
354          *
355          * For newly created elements with no prior allocation we go
356          * through the copy-on-write steps except without the copying part.
357          */
358         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
359         hammer2_chain_ref(hmp, chain);  /* ref for modified bit */
360
361         bytes = 1 << (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
362         if (chain != &hmp->vchain) {
363                 chain->bref.data_off = hammer2_freemap_alloc(hmp, bytes);
364                 /* XXX failed allocation */
365         }
366
367         switch(chain->bref.type) {
368         case HAMMER2_BREF_TYPE_VOLUME:          /* embedded */
369         case HAMMER2_BREF_TYPE_INODE:           /* embedded */
370                 /*
371                  * data points to embedded structure, no copy needed
372                  */
373                 error = 0;
374                 break;
375         case HAMMER2_BREF_TYPE_INDIRECT:
376         case HAMMER2_BREF_TYPE_DATA:
377                 /*
378                  * data (if not NULL) points into original bp, copy-on-write
379                  * to new block.
380                  */
381                 KKASSERT(chain != &hmp->vchain);        /* safety */
382                 if (bytes == HAMMER2_PBUFSIZE) {
383                         nbp = getblk(hmp->devvp,
384                                      chain->bref.data_off & HAMMER2_OFF_MASK_HI,
385                                      HAMMER2_PBUFSIZE, 0, 0);
386                         vfs_bio_clrbuf(nbp);
387                         error = 0;
388                 } else {
389                         error = bread(hmp->devvp,
390                                      chain->bref.data_off & HAMMER2_OFF_MASK_HI,
391                                      HAMMER2_PBUFSIZE, &nbp);
392                         KKASSERT(error == 0);/* XXX handle error */
393                 }
394                 ndata = nbp->b_data + (chain->bref.data_off &
395                                        HAMMER2_OFF_MASK_LO);
396                 if (chain->data) {
397                         bcopy(chain->data, ndata, bytes);
398                         KKASSERT(chain->bp != NULL);
399                         brelse(chain->bp);
400                 }
401                 chain->bp = nbp;
402                 chain->data = ndata;
403                 break;
404         default:
405                 panic("hammer2_chain_modify: unknown bref type");
406                 break;
407
408         }
409
410         /*
411          * Recursively mark the parent chain elements so flushes can find
412          * modified elements.
413          */
414         parent = chain->parent;
415         while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
416                 atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
417                 parent = parent->parent;
418         }
419 }
420
421 /*
422  * Unlock a chain element without dropping its reference count.
423  * (see hammer2_chain_put() to do both).
424  *
425  * Non-embedded data references (chain->bp != NULL) are returned to the
426  * system and the data field is cleared in that case.  If modified the
427  * dirty buffer is still returned to the system, can be flushed to disk by
428  * the system at any time, and will be reconstituted/re-read as needed.
429  */
430 void
431 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
432 {
433         if (chain->bp) {
434                 chain->data = NULL;
435                 if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
436                                     HAMMER2_CHAIN_FLUSHED)) {
437                         if (chain->flags & HAMMER2_CHAIN_IOFLUSH)
438                                 bawrite(chain->bp);
439                         else
440                                 bdwrite(chain->bp);
441                 } else {
442                         bqrelse(chain->bp);
443                 }
444                 chain->bp = NULL;
445         }
446         lockmgr(&chain->lk, LK_RELEASE);
447 }
448
449 /*
450  * Locate an in-memory chain.  The parent must be locked.  The in-memory
451  * chain is returned or NULL if no in-memory chain is present.
452  *
453  * NOTE: A chain on-media might exist for this index when NULL is returned.
454  */
455 hammer2_chain_t *
456 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
457 {
458         hammer2_chain_t dummy;
459         hammer2_chain_t *chain;
460
461         dummy.index = index;
462         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
463         return (chain);
464 }
465
466 /*
467  * Return a locked chain structure with all associated data acquired.
468  *
469  * Caller must lock the parent on call, the returned child will be locked.
470  */
471 hammer2_chain_t *
472 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
473                   int index, int flags)
474 {
475         hammer2_blockref_t *bref;
476         hammer2_chain_t *chain;
477         hammer2_chain_t dummy;
478
479         /*
480          * First see if we have a (possibly modified) chain element cached
481          * for this (parent, index).  Acquire the data if necessary.
482          *
483          * If chain->data is non-NULL the chain should already be marked
484          * modified.
485          */
486         dummy.index = index;
487         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
488         if (chain) {
489                 hammer2_chain_ref(hmp, chain);
490                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
491                         hammer2_chain_lock(hmp, chain);
492                 return(chain);
493         }
494
495         /*
496          * Otherwise lookup the bref and issue I/O (switch on the parent)
497          */
498         switch(parent->bref.type) {
499         case HAMMER2_BREF_TYPE_INODE:
500                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
501                 bref = &parent->data->ipdata.u.blockset.blockref[index];
502                 break;
503         case HAMMER2_BREF_TYPE_INDIRECT:
504                 KKASSERT(index >= 0 && index < HAMMER2_IND_COUNT);
505                 bref = &parent->data->npdata.blockref[index];
506                 break;
507         case HAMMER2_BREF_TYPE_VOLUME:
508                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
509                 bref = &hmp->voldata.sroot_blockset.blockref[index];
510                 break;
511         default:
512                 bref = NULL;
513                 panic("hammer2_chain_get: unrecognized blockref type: %d",
514                       parent->bref.type);
515         }
516         chain = hammer2_chain_alloc(hmp, bref);
517
518         /*
519          * Link the chain into its parent.  Caller is expected to hold an
520          * exclusive lock on the parent.
521          */
522         chain->parent = parent;
523         chain->index = index;
524         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
525                 panic("hammer2_chain_link: collision");
526         KKASSERT(parent->refs > 1);
527         atomic_add_int(&parent->refs, 1);       /* for splay entry */
528
529         /*
530          * Additional linkage for inodes.  Reuse the parent pointer to
531          * find the parent directory.
532          */
533         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
534                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
535                         parent = parent->parent;
536                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
537                         chain->u.ip->pip = parent->u.ip;
538         }
539
540         /*
541          * Our new chain structure has already been referenced and locked
542          * but the lock code handles the I/O so call it to resolve the data.
543          * Then release one of our two exclusive locks.
544          *
545          * If NOLOCK is set the release will release the one-and-only lock.
546          */
547         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
548                 hammer2_chain_lock(hmp, chain);
549         lockmgr(&chain->lk, LK_RELEASE);
550
551         return (chain);
552 }
553
554 /*
555  * Unlock and dereference a chain after use.  It is possible for this to
556  * recurse up the chain.
557  */
558 void
559 hammer2_chain_put(hammer2_mount_t *hmp, hammer2_chain_t *chain)
560 {
561         hammer2_chain_unlock(hmp, chain);
562         hammer2_chain_drop(hmp, chain);
563 }
564
565 /*
566  * Locate any key between key_beg and key_end inclusive.  (*parentp)
567  * typically points to an inode but can also point to a related indirect
568  * block and this function will recurse upwards and find the inode again.
569  *
570  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
571  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
572  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
573  *
574  * (*parentp) must be exclusively locked and referenced and can be an inode
575  * or an existing indirect block within the inode.
576  *
577  * On return (*parentp) will be modified to point at the deepest parent chain
578  * element encountered during the search, as a helper for an insertion or
579  * deletion.   The new (*parentp) will be locked and referenced and the old
580  * will be unlocked and dereferenced (no change if they are both the same).
581  *
582  * The matching chain will be returned exclusively locked and referenced.
583  *
584  * NULL is returned if no match was found, but (*parentp) will still
585  * potentially be adjusted.
586  *
587  * This function will also recurse up the chain if the key is not within the
588  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
589  * can simply allow (*parentp) to float inside the loop.
590  */
591 hammer2_chain_t *
592 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
593                      hammer2_key_t key_beg, hammer2_key_t key_end,
594                      int flags)
595 {
596         hammer2_chain_t *parent;
597         hammer2_chain_t *chain;
598         hammer2_chain_t *tmp;
599         hammer2_blockref_t *base;
600         hammer2_blockref_t *bref;
601         hammer2_key_t scan_beg;
602         hammer2_key_t scan_end;
603         int count = 0;
604         int i;
605
606         /*
607          * Recurse (*parentp) upward if necessary until the parent completely
608          * encloses the key range or we hit the inode.
609          */
610         parent = *parentp;
611         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
612                 scan_beg = parent->bref.key;
613                 scan_end = scan_beg +
614                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
615                 if (key_beg >= scan_beg && key_end <= scan_end)
616                         break;
617                 hammer2_chain_unlock(hmp, parent);
618                 parent = parent->parent;
619                 hammer2_chain_ref(hmp, parent);         /* ref new parent */
620                 hammer2_chain_lock(hmp, parent);        /* lock new parent */
621                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
622                 *parentp = parent;                      /* new parent */
623         }
624
625 again:
626         /*
627          * Locate the blockref array.  Currently we do a fully associative
628          * search through the array.
629          */
630         switch(parent->bref.type) {
631         case HAMMER2_BREF_TYPE_INODE:
632                 base = &parent->data->ipdata.u.blockset.blockref[0];
633                 count = HAMMER2_SET_COUNT;
634                 break;
635         case HAMMER2_BREF_TYPE_INDIRECT:
636                 if (parent->data == NULL)
637                         panic("parent->data is NULL");
638                 base = &parent->data->npdata.blockref[0];
639                 count = HAMMER2_IND_COUNT;
640                 break;
641         case HAMMER2_BREF_TYPE_VOLUME:
642                 base = &hmp->voldata.sroot_blockset.blockref[0];
643                 count = HAMMER2_SET_COUNT;
644                 break;
645         default:
646                 panic("hammer2_chain_push: unrecognized blockref type: %d",
647                       parent->bref.type);
648                 base = NULL;    /* safety */
649                 count = 0;      /* safety */
650         }
651
652         /*
653          * If the element and key overlap we use the element.
654          */
655         bref = NULL;
656         for (i = 0; i < count; ++i) {
657                 tmp = hammer2_chain_find(hmp, parent, i);
658                 bref = (tmp) ? &tmp->bref : &base[i];
659                 if (bref->type == 0)
660                         continue;
661                 scan_beg = bref->key;
662                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
663                 if (key_beg <= scan_end && key_end >= scan_beg)
664                         break;
665         }
666         if (i == count) {
667                 if (key_beg == key_end)
668                         return (NULL);
669                 return (hammer2_chain_next(hmp, parentp, NULL,
670                                            key_beg, key_end, flags));
671         }
672
673         /*
674          * Acquire the new chain element.  If the chain element is an
675          * indirect block we must search recursively.
676          */
677         chain = hammer2_chain_get(hmp, parent, i, flags);
678         if (chain == NULL)
679                 return (NULL);
680
681         /*
682          * If the chain element is an indirect block it becomes the new
683          * parent and we loop on it.  We must fixup the chain we loop on
684          * if the caller passed flags to us that aren't sufficient for our
685          * needs.
686          */
687         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
688                 hammer2_chain_put(hmp, parent);
689                 *parentp = parent = chain;
690                 if (flags & HAMMER2_LOOKUP_NOLOCK)
691                         hammer2_chain_lock(hmp, chain);
692                 goto again;
693         }
694
695         /*
696          * All done, return chain
697          */
698         return (chain);
699 }
700
701 /*
702  * After having issued a lookup we can iterate all matching keys.
703  *
704  * If chain is non-NULL we continue the iteration from just after it's index.
705  *
706  * If chain is NULL we assume the parent was exhausted and continue the
707  * iteration at the next parent.
708  */
709 hammer2_chain_t *
710 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
711                    hammer2_chain_t *chain,
712                    hammer2_key_t key_beg, hammer2_key_t key_end,
713                    int flags)
714 {
715         hammer2_chain_t *parent;
716         hammer2_chain_t *tmp;
717         hammer2_blockref_t *base;
718         hammer2_blockref_t *bref;
719         hammer2_key_t scan_beg;
720         hammer2_key_t scan_end;
721         int i;
722         int count;
723
724         parent = *parentp;
725
726 again:
727         /*
728          * Calculate the next index and recalculate the parent if necessary.
729          */
730         if (chain) {
731                 /*
732                  * Continue iteration within current parent
733                  */
734                 i = chain->index + 1;
735                 hammer2_chain_put(hmp, chain);
736                 chain = NULL;
737         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
738                 /*
739                  * We reached the end of the iteration.
740                  */
741                 return (NULL);
742         } else {
743                 /*
744                  * Continue iteration with next parent unless the current
745                  * parent covers the range.
746                  */
747                 hammer2_chain_t *nparent;
748
749                 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT)
750                         return (NULL);
751
752                 scan_beg = parent->bref.key;
753                 scan_end = scan_beg +
754                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
755                 if (key_beg >= scan_beg && key_end <= scan_end)
756                         return (NULL);
757
758                 i = parent->index + 1;
759                 nparent = parent->parent;
760                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
761                 hammer2_chain_unlock(hmp, parent);
762                 hammer2_chain_lock(hmp, nparent);       /* lock new parent */
763                 hammer2_chain_drop(hmp, parent);        /* drop old parent */
764                 *parentp = parent = nparent;
765         }
766
767 again2:
768         /*
769          * Locate the blockref array.  Currently we do a fully associative
770          * search through the array.
771          */
772         switch(parent->bref.type) {
773         case HAMMER2_BREF_TYPE_INODE:
774                 base = &parent->data->ipdata.u.blockset.blockref[0];
775                 count = HAMMER2_SET_COUNT;
776                 break;
777         case HAMMER2_BREF_TYPE_INDIRECT:
778                 base = &parent->data->npdata.blockref[0];
779                 count = HAMMER2_IND_COUNT;
780                 break;
781         case HAMMER2_BREF_TYPE_VOLUME:
782                 base = &hmp->voldata.sroot_blockset.blockref[0];
783                 count = HAMMER2_SET_COUNT;
784                 break;
785         default:
786                 panic("hammer2_chain_push: unrecognized blockref type: %d",
787                       parent->bref.type);
788                 base = NULL;    /* safety */
789                 count = 0;      /* safety */
790                 break;
791         }
792         KKASSERT(i <= count);
793
794         /*
795          * Look for the key.  If we are unable to find a match and an exact
796          * match was requested we return NULL.  If a range was requested we
797          * run hammer2_chain_next() to iterate.
798          */
799         bref = NULL;
800         while (i < count) {
801                 tmp = hammer2_chain_find(hmp, parent, i);
802                 bref = (tmp) ? &tmp->bref : &base[i];
803                 if (bref->type == 0) {
804                         ++i;
805                         continue;
806                 }
807                 scan_beg = bref->key;
808                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
809                 if (key_beg <= scan_end && key_end >= scan_beg)
810                         break;
811                 ++i;
812         }
813
814         /*
815          * If we couldn't find a match recurse up a parent to continue the
816          * search.
817          */
818         if (i == count)
819                 goto again;
820
821         /*
822          * Acquire the new chain element.  If the chain element is an
823          * indirect block we must search recursively.
824          */
825         chain = hammer2_chain_get(hmp, parent, i, flags);
826         if (chain == NULL)
827                 return (NULL);
828
829         /*
830          * If the chain element is an indirect block it becomes the new
831          * parent and we loop on it.
832          */
833         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
834                 hammer2_chain_put(hmp, parent);
835                 *parentp = parent = chain;
836                 i = 0;
837                 goto again2;
838         }
839
840         /*
841          * All done, return chain
842          */
843         return (chain);
844 }
845
846 /*
847  * Create and return a new hammer2 system memory structure of the specified
848  * key, type and size and insert it RELATIVE TO (PARENT).
849  *
850  * (parent) is typically either an inode or an indirect  block, acquired
851  * acquired as a side effect of issuing a prior failed lookup.  parent
852  * must be locked and held.  Do not pass the inode chain to this function
853  * unless that is the chain returned by the failed lookup.
854  *
855  * Non-indirect types will automatically allocate indirect blocks as required
856  * if the new item does not fit in the current (parent).
857  *
858  * Indirect types will move a portion of the existing blockref array in
859  * (parent) into the new indirect type and then use one of the free slots
860  * to emplace the new indirect type.
861  *
862  * A new locked, referenced chain element is returned of the specified type.
863  * This element will also be marked as modified and contain a data area
864  * ready for initialization.
865  */
866 hammer2_chain_t *
867 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
868                      hammer2_key_t key, int keybits, int type, size_t bytes)
869 {
870         hammer2_blockref_t dummy;
871         hammer2_blockref_t *base;
872         hammer2_blockref_t *bref;
873         hammer2_chain_t *chain;
874         hammer2_chain_t dummy_chain;
875         int count;
876         int i;
877         int unlock_parent = 0;
878
879         /*
880          * First allocate media space and construct the dummy bref, then
881          * allocate the in-memory chain structure.
882          */
883         bzero(&dummy, sizeof(dummy));
884         dummy.type = type;
885         dummy.key = key;
886         dummy.keybits = keybits;
887         dummy.data_off = (hammer2_off_t)hammer2_freemap_bytes_to_radix(bytes);
888         chain = hammer2_chain_alloc(hmp, &dummy);
889
890         /*
891          * Recalculate bytes to reflect the actual media block allocation,
892          * then allocate the local memory copy.  This is a new structure
893          * so no I/O is performed.
894          */
895         bytes = (hammer2_off_t)1 <<
896                 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
897
898         switch(type) {
899         case HAMMER2_BREF_TYPE_VOLUME:
900                 panic("hammer2_chain_create: called with volume type");
901                 break;
902         case HAMMER2_BREF_TYPE_INODE:
903                 KKASSERT(bytes == HAMMER2_INODE_BYTES);
904                 chain->data = (void *)&chain->u.ip->ip_data;
905                 break;
906         default:
907                 /* leave chain->data NULL */
908                 KKASSERT(chain->data == NULL);
909                 break;
910         }
911         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
912
913 again:
914         /*
915          * Locate a free blockref in the parent's array
916          */
917         switch(parent->bref.type) {
918         case HAMMER2_BREF_TYPE_INODE:
919                 KKASSERT(parent->data != NULL);
920                 base = &parent->data->ipdata.u.blockset.blockref[0];
921                 count = HAMMER2_SET_COUNT;
922                 break;
923         case HAMMER2_BREF_TYPE_INDIRECT:
924                 KKASSERT(parent->data != NULL);
925                 base = &parent->data->npdata.blockref[0];
926                 count = HAMMER2_IND_COUNT;
927                 break;
928         case HAMMER2_BREF_TYPE_VOLUME:
929                 KKASSERT(parent->data != NULL);
930                 base = &hmp->voldata.sroot_blockset.blockref[0];
931                 count = HAMMER2_SET_COUNT;
932                 break;
933         default:
934                 panic("hammer2_chain_push: unrecognized blockref type: %d",
935                       parent->bref.type);
936                 count = 0;
937                 break;
938         }
939
940         /*
941          * Scan for an unallocated bref, also skipping any slots occupied
942          * by in-memory chain elements that may not yet have been updated
943          * in the parent's bref array.
944          */
945         bzero(&dummy_chain, sizeof(dummy_chain));
946         bref = NULL;
947         for (i = 0; i < count; ++i) {
948                 bref = &base[i];
949                 dummy_chain.index = i;
950                 if (bref->type == 0 &&
951                     SPLAY_FIND(hammer2_chain_splay,
952                                &parent->shead, &dummy_chain) == NULL) {
953                         break;
954                 }
955         }
956
957         /*
958          * If no free blockref count be found we must create an indirect
959          * block and move a number of blockrefs into it.  With the parent
960          * locked we can safely lock each child in order to move it without
961          * causing a deadlock.
962          *
963          * This may return the new indirect block or the old parent depending
964          * on where the key falls.
965          */
966         if (i == count) {
967                 hammer2_chain_t *nparent;
968
969                 nparent = hammer2_chain_create_indirect(hmp, parent,
970                                                         key, keybits);
971                 if (nparent == NULL) {
972                         hammer2_chain_free(hmp, chain);
973                         chain = NULL;
974                         goto done;
975                 }
976                 if (parent != nparent) {
977                         if (unlock_parent)
978                                 hammer2_chain_put(hmp, parent);
979                         parent = nparent;
980                         unlock_parent = 1;
981                 }
982                 goto again;
983         }
984
985         /*
986          * Link the chain into its parent.
987          */
988         chain->parent = parent;
989         chain->index = i;
990         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
991                 panic("hammer2_chain_link: collision");
992         KKASSERT(parent->refs > 1);
993         atomic_add_int(&parent->refs, 1);
994
995         /*
996          * Additional linkage for inodes.  Reuse the parent pointer to
997          * find the parent directory.
998          */
999         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1000                 hammer2_chain_t *scan = parent;
1001                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1002                         scan = scan->parent;
1003                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE)
1004                         chain->u.ip->pip = scan->u.ip;
1005         }
1006
1007         /*
1008          * Mark the newly created chain element as modified and fully
1009          * resolve the chain->data pointer.
1010          *
1011          * Chain elements with embedded data will not issue I/O at this time.
1012          * A new block will be allocated for the buffer but not instantiated.
1013          *
1014          * Chain elements which do not use embedded data will allocate
1015          * the new block AND instantiate its buffer cache buffer, pointing
1016          * the data at the bp.
1017          */
1018         hammer2_chain_modify(hmp, chain);
1019
1020 done:
1021         if (unlock_parent)
1022                 hammer2_chain_put(hmp, parent);
1023         return (chain);
1024 }
1025
1026 /*
1027  * Create an indirect block that covers one or more of the elements in the
1028  * current parent.  Either returns the existing parent with no locking or
1029  * ref changes or returns the new indirect block locked and referenced,
1030  * depending on what the specified key falls into.
1031  *
1032  * The key/keybits for the indirect mode only needs to follow three rules:
1033  *
1034  * (1) That all elements underneath it fit within its key space and
1035  *
1036  * (2) That all elements outside it are outside its key space.
1037  *
1038  * (3) When creating the new indirect block any elements in the current
1039  *     parent that fit within the new indirect block's keyspace must be
1040  *     moved into the new indirect block.
1041  *
1042  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1043  *     keyspace the the current parent, but lookup/iteration rules will
1044  *     ensure (and must ensure) that rule (2) for all parents leading up
1045  *     to the nearest inode or the root volume header is adhered to.  This
1046  *     is accomplished by always recursing through matching keyspaces in
1047  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1048  *
1049  * The current implementation calculates the current worst-case keyspace by
1050  * iterating the current parent and then divides it into two halves, choosing
1051  * whichever half has the most elements (not necessarily the half containing
1052  * the requested key).
1053  *
1054  * We can also opt to use the half with the least number of elements.  This
1055  * causes lower-numbered keys (aka logical file offsets) to recurse through
1056  * fewer indirect blocks and higher-numbered keys to recurse through more.
1057  * This also has the risk of not moving enough elements to the new indirect
1058  * block and being forced to create several indirect blocks before the element
1059  * can be inserted.
1060  */
1061 static
1062 hammer2_chain_t *
1063 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1064                               hammer2_key_t create_key, int create_bits)
1065 {
1066         hammer2_blockref_t *base;
1067         hammer2_blockref_t *bref;
1068         hammer2_chain_t *chain;
1069         hammer2_chain_t *ichain;
1070         hammer2_chain_t dummy;
1071         hammer2_key_t key = create_key;
1072         int keybits = create_bits;
1073         int locount = 0;
1074         int hicount = 0;
1075         int count;
1076         int i;
1077
1078         /*
1079          * Mark the parent modified so our base[] pointer remains valid
1080          * while we move entries.
1081          */
1082         hammer2_chain_modify(hmp, parent);
1083
1084         /*
1085          * Locate a free blockref in the parent's array
1086          */
1087         switch(parent->bref.type) {
1088         case HAMMER2_BREF_TYPE_INODE:
1089                 base = &parent->data->ipdata.u.blockset.blockref[0];
1090                 count = HAMMER2_SET_COUNT;
1091                 break;
1092         case HAMMER2_BREF_TYPE_INDIRECT:
1093                 base = &parent->data->npdata.blockref[0];
1094                 count = HAMMER2_IND_COUNT;
1095                 break;
1096         case HAMMER2_BREF_TYPE_VOLUME:
1097                 base = &hmp->voldata.sroot_blockset.blockref[0];
1098                 count = HAMMER2_SET_COUNT;
1099                 break;
1100         default:
1101                 panic("hammer2_chain_push: unrecognized blockref type: %d",
1102                       parent->bref.type);
1103                 count = 0;
1104                 break;
1105         }
1106
1107         /*
1108          * Scan for an unallocated bref, also skipping any slots occupied
1109          * by in-memory chain elements that may not yet have been updated
1110          * in the parent's bref array.
1111          */
1112         bzero(&dummy, sizeof(dummy));
1113         for (i = 0; i < count; ++i) {
1114                 int nkeybits;
1115
1116                 bref = &base[i];
1117                 if (bref->type == 0) {
1118                         dummy.index = i;
1119                         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
1120                                            &dummy);
1121                         if (chain == NULL)
1122                                 continue;
1123                         bref = &chain->bref;
1124                 }
1125
1126                 /*
1127                  * Expand our calculated key range (key, keybits) to fit
1128                  * the scanned key.  nkeybits represents the full range
1129                  * that we will later cut in half (two halves @ nkeybits - 1).
1130                  */
1131                 nkeybits = keybits;
1132                 if (nkeybits < bref->keybits)
1133                         nkeybits = bref->keybits;
1134                 while ((~(((hammer2_key_t)1 << nkeybits) - 1) &
1135                         (key ^ bref->key)) != 0) {
1136                         ++nkeybits;
1137                 }
1138
1139                 /*
1140                  * If the new key range is larger we have to determine
1141                  * which side of the new key range the existing keys fall
1142                  * under by checking the high bit, then collapsing the
1143                  * locount into the hicount or vise-versa.
1144                  */
1145                 if (keybits != nkeybits) {
1146                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1147                                 hicount += locount;
1148                                 locount = 0;
1149                         } else {
1150                                 locount += hicount;
1151                                 hicount = 0;
1152                         }
1153                         keybits = nkeybits;
1154                 }
1155
1156                 /*
1157                  * The newly scanned key will be in the lower half or the
1158                  * higher half of the (new) key range.
1159                  */
1160                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1161                         ++hicount;
1162                 else
1163                         ++locount;
1164         }
1165
1166         /*
1167          * Adjust keybits to represent half of the full range calculated
1168          * above.
1169          */
1170         --keybits;
1171
1172         /*
1173          * Select whichever half contains the most elements.  Theoretically
1174          * we can select either side as long as it contains at least one
1175          * element (in order to ensure that a free slot is present to hold
1176          * the indirect block).
1177          */
1178         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1179         if (hammer2_indirect_optimize) {
1180                 /*
1181                  * Insert node for least number of keys, this will arrange
1182                  * the first few blocks of a large file or the first few
1183                  * inodes in a directory with fewer indirect blocks when
1184                  * created linearly.
1185                  */
1186                 if (hicount < locount && hicount != 0)
1187                         key |= (hammer2_key_t)1 << keybits;
1188                 else
1189                         key &= ~(hammer2_key_t)1 << keybits;
1190         } else {
1191                 /*
1192                  * Insert node for most number of keys, best for heavily
1193                  * fragmented files.
1194                  */
1195                 if (hicount > locount)
1196                         key |= (hammer2_key_t)1 << keybits;
1197                 else
1198                         key &= ~(hammer2_key_t)1 << keybits;
1199         }
1200
1201         /*
1202          * Ok, create our new indirect block
1203          */
1204         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1205         dummy.bref.key = key;
1206         dummy.bref.keybits = keybits;
1207         dummy.bref.data_off = (hammer2_off_t)
1208                             hammer2_freemap_bytes_to_radix(HAMMER2_PBUFSIZE);
1209         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1210
1211         /*
1212          * Iterate the original parent and move the matching brefs into
1213          * the new indirect block.
1214          */
1215         for (i = 0; i < count; ++i) {
1216                 /*
1217                  * For keying purposes access the bref from the media or
1218                  * from our in-memory cache.  In cases where the in-memory
1219                  * cache overrides the media the keyrefs will be the same
1220                  * anyway so we can avoid checking the cache when the media
1221                  * has a key.
1222                  */
1223                 bref = &base[i];
1224                 if (bref->type == 0) {
1225                         dummy.index = i;
1226                         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
1227                                            &dummy);
1228                         if (chain == NULL) {
1229                                 /*
1230                                  * Select index indirect block is placed in
1231                                  */
1232                                 if (ichain->index < 0)
1233                                         ichain->index = i;
1234                                 continue;
1235                         }
1236                         bref = &chain->bref;
1237                 }
1238
1239                 /*
1240                  * Skip keys not in the chosen half (low or high), only bit
1241                  * (keybits - 1) needs to be compared but for safety we
1242                  * will compare all msb bits plus that bit again.
1243                  */
1244                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1245                     (key ^ bref->key)) != 0) {
1246                         continue;
1247                 }
1248
1249                 /*
1250                  * This element is being moved, its slot is available
1251                  * for our indirect block.
1252                  */
1253                 if (ichain->index < 0)
1254                         ichain->index = i;
1255
1256                 /*
1257                  * Load the new indirect block by acquiring or allocating
1258                  * the related chain entries, then simply move it to the
1259                  * new parent (ichain).
1260                  *
1261                  * Flagging the new chain entry MOVED will cause a flush
1262                  * to synchronize its block into the new indirect block.
1263                  * The chain is unlocked after being moved but needs to
1264                  * retain a reference for the MOVED state
1265                  *
1266                  * We must still set SUBMODIFIED in the parent but we do
1267                  * that after the loop.
1268                  *
1269                  * XXX we really need a lock here but we don't need the
1270                  *     data.  NODATA feature needed.
1271                  */
1272                 chain = hammer2_chain_get(hmp, parent, i,
1273                                           HAMMER2_LOOKUP_NOLOCK);
1274                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1275                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1276                         panic("hammer2_chain_create_indirect: collision");
1277                 chain->parent = ichain;
1278                 bzero(&base[i], sizeof(base[i]));
1279                 atomic_add_int(&parent->refs, -1);
1280                 atomic_add_int(&ichain->refs, 1);
1281                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
1282                         hammer2_chain_drop(hmp, chain);
1283                 } else {
1284                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1285                 }
1286                 KKASSERT(parent->refs > 0);
1287                 chain = NULL;
1288         }
1289
1290         /*
1291          * Insert the new indirect block into the parent now that we've
1292          * cleared out some entries in the parent.  We calculated a good
1293          * insertion index in the loop above (ichain->index).
1294          */
1295         KKASSERT(ichain->index >= 0);
1296         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1297                 panic("hammer2_chain_create_indirect: ichain insertion");
1298         ichain->parent = parent;
1299         atomic_add_int(&parent->refs, 1);
1300
1301         /*
1302          * Mark the new indirect block modified after insertion, which
1303          * will propagate up through parent all the way to the root and
1304          * also allocate the physical block in ichain for our caller.
1305          *
1306          * We have to set SUBMODIFIED in ichain's flags manually so the
1307          * flusher knows it has to recurse through it to get to all of
1308          * our moved blocks.
1309          */
1310         hammer2_chain_modify(hmp, ichain);
1311         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1312
1313         /*
1314          * Figure out what to return.
1315          */
1316         if (create_bits >= keybits) {
1317                 /*
1318                  * Key being created is way outside the key range,
1319                  * return the original parent.
1320                  */
1321                 hammer2_chain_put(hmp, ichain);
1322         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1323                    (create_key ^ key)) {
1324                 /*
1325                  * Key being created is outside the key range,
1326                  * return the original parent.
1327                  */
1328                 hammer2_chain_put(hmp, ichain);
1329         } else {
1330                 /*
1331                  * Otherwise its in the range, return the new parent.
1332                  */
1333                 parent = ichain;
1334         }
1335
1336         return(parent);
1337 }
1338
1339 /*
1340  * Physically delete the specified chain element.  Note that inodes with
1341  * open descriptors should not be deleted (as with other filesystems) until
1342  * the last open descriptor is closed.
1343  *
1344  * This routine will remove the chain element from its parent and potentially
1345  * also recurse upward and delete indirect blocks which become empty as a
1346  * side effect.
1347  *
1348  * The caller must pass a pointer to the chain's parent, also locked and
1349  * referenced.  (*parentp) will be modified in a manner similar to a lookup
1350  * or iteration when indirect blocks are also deleted as a side effect.
1351  */
1352 void
1353 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
1354                      hammer2_chain_t *chain)
1355 {
1356 }
1357
1358 /*
1359  * Recursively flush the specified chain.  The chain is locked and
1360  * referenced by the caller and will remain so on return.
1361  *
1362  * This cannot be called with the volume header's vchain
1363  */
1364 void
1365 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
1366                     hammer2_blockref_t *parent_bref)
1367 {
1368         /*
1369          * Flush any children of this chain entry.
1370          */
1371         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
1372                 hammer2_blockref_t *base;
1373                 hammer2_blockref_t bref;
1374                 hammer2_chain_t *scan;
1375                 hammer2_chain_t *next;
1376                 int count;
1377                 int submodified = 0;
1378
1379                 /*
1380                  * Modifications to the children will propagate up, forcing
1381                  * us to become modified and copy-on-write too.
1382                  */
1383                 hammer2_chain_modify(hmp, chain);
1384
1385                 /*
1386                  * The blockref in the parent's array must be repointed at
1387                  * the new block allocated by the child after its flush.
1388                  *
1389                  * Calculate the base of the array.
1390                  */
1391                 switch(chain->bref.type) {
1392                 case HAMMER2_BREF_TYPE_INODE:
1393                         KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1394                         base = &chain->data->ipdata.u.blockset.blockref[0];
1395                         count = HAMMER2_SET_COUNT;
1396                         break;
1397                 case HAMMER2_BREF_TYPE_INDIRECT:
1398                         base = &chain->data->npdata.blockref[0];
1399                         count = HAMMER2_IND_COUNT;
1400                         break;
1401                 case HAMMER2_BREF_TYPE_VOLUME:
1402                         KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1403                         base = &hmp->voldata.sroot_blockset.blockref[0];
1404                         count = HAMMER2_SET_COUNT;
1405                         break;
1406                 default:
1407                         base = NULL;
1408                         panic("hammer2_chain_get: unrecognized blockref type: %d",
1409                               chain->bref.type);
1410                 }
1411
1412                 /*
1413                  * Flush the children and update the blockrefs in the parent.
1414                  * Be careful of ripouts during the loop.
1415                  */
1416                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
1417                 while ((scan = next) != NULL) {
1418                         next = SPLAY_NEXT(hammer2_chain_splay, &chain->shead,
1419                                           scan);
1420                         if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
1421                                            HAMMER2_CHAIN_MODIFIED |
1422                                            HAMMER2_CHAIN_MOVED)) {
1423                                 hammer2_chain_ref(hmp, scan);
1424                                 hammer2_chain_lock(hmp, scan);
1425                                 bref = base[scan->index];
1426                                 hammer2_chain_flush(hmp, scan, &bref);
1427                                 if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
1428                                                    HAMMER2_CHAIN_MODIFIED)) {
1429                                         submodified = 1;
1430                                         kprintf("flush race, sub dirty\n");
1431                                 } else {
1432                                         KKASSERT(scan->index < count);
1433                                         base[scan->index] = bref;
1434                                         if (scan->flags & HAMMER2_CHAIN_MOVED) {
1435                                                 atomic_clear_int(&scan->flags,
1436                                                          HAMMER2_CHAIN_MOVED);
1437                                                 hammer2_chain_drop(hmp, scan);
1438                                         }
1439                                 }
1440                                 hammer2_chain_put(hmp, scan);
1441                         }
1442                 }
1443                 if (submodified == 0) {
1444                         atomic_clear_int(&chain->flags,
1445                                          HAMMER2_CHAIN_SUBMODIFIED);
1446                 }
1447         }
1448
1449         /*
1450          * Flush this chain entry only if it is marked modified.
1451          *
1452          * If the chain entry was moved we must still updated *parent_bref
1453          * or the indirect block won't be adjusted to point to us.
1454          */
1455         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1456                 if (chain->flags & HAMMER2_CHAIN_MOVED)
1457                         *parent_bref = chain->bref;
1458                 return;
1459         }
1460
1461         /*
1462          * If this is part of a recursive flush we can go ahead and write
1463          * out the buffer cache buffer and pass a new bref back up the chain.
1464          *
1465          * This will never be a volume header.
1466          */
1467         if (parent_bref) {
1468                 hammer2_blockref_t *bref;
1469                 hammer2_off_t off_hi;
1470                 struct buf *bp;
1471                 size_t off_lo;
1472                 size_t bytes;
1473                 int error;
1474
1475                 KKASSERT(chain->data != NULL);
1476                 bref = &chain->bref;
1477
1478                 off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
1479                 off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
1480                 bytes = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
1481                 KKASSERT(off_hi != 0);  /* not the root volume header */
1482
1483                 if (chain->bp) {
1484                         /*
1485                          * The data is mapped directly to the bp and will be
1486                          * written out when the chain is unlocked by the
1487                          * parent.  However, since we are clearing the
1488                          * MODIFIED flag we have to set the FLUSHED flag
1489                          * so the hammer2_chain_unlock() code knows to
1490                          * bdwrite() the buffer.
1491                          */
1492                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSHED);
1493                 } else {
1494                         /*
1495                          * The data is embedded, we have to acquire the
1496                          * buffer cache buffer and copy the data into it.
1497                          */
1498                         bp = NULL;
1499                         error = bread(hmp->devvp, off_hi,
1500                                       HAMMER2_PBUFSIZE, &bp);
1501                         KKASSERT(error == 0); /* XXX */
1502
1503                         /*
1504                          * Copy the data to the buffer, mark the buffer
1505                          * dirty, and convert the chain to unmodified.
1506                          */
1507                         bcopy(chain->data, (char *)bp->b_data + off_lo, bytes);
1508                         bdwrite(bp);
1509                         bp = NULL;
1510
1511                         chain->bref.check.iscsi32.value =
1512                                         hammer2_icrc32(chain->data, bytes);
1513                 }
1514
1515                 /*
1516                  * Return information on the new block to the parent.
1517                  */
1518                 *parent_bref = chain->bref;
1519                 hammer2_chain_drop(hmp, chain); /* drop ref tracking mod bit */
1520                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1521         } else {
1522                 hammer2_blockref_t *bref;
1523
1524                 KKASSERT(chain->data != NULL);
1525                 KKASSERT(chain->bp == NULL);
1526                 bref = &chain->bref;
1527
1528                 switch(bref->type) {
1529                 case HAMMER2_BREF_TYPE_VOLUME:
1530                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
1531                                 hammer2_icrc32(
1532                                         (char *)&hmp->voldata +
1533                                          HAMMER2_VOLUME_ICRC1_OFF,
1534                                         HAMMER2_VOLUME_ICRC1_SIZE);
1535                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
1536                                 hammer2_icrc32(
1537                                         (char *)&hmp->voldata +
1538                                          HAMMER2_VOLUME_ICRC0_OFF,
1539                                         HAMMER2_VOLUME_ICRC0_SIZE);
1540                         hmp->voldata.icrc_volheader =
1541                                 hammer2_icrc32(
1542                                         (char *)&hmp->voldata +
1543                                          HAMMER2_VOLUME_ICRCVH_OFF,
1544                                         HAMMER2_VOLUME_ICRCVH_SIZE);
1545                         break;
1546                 }
1547         }
1548 }