Merge branches 'hammer2' and 'master' of ssh://crater.dragonflybsd.org/repository...
[dragonfly.git] / sys / vfs / hammer2 / hammer2_chain.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 /*
36  * This subsystem handles direct and indirect block searches, recursions,
37  * creation, and deletion.  Chains of blockrefs are tracked and modifications
38  * are flag for propagation... eventually all the way back to the volume
39  * header.
40  */
41
42 #include <sys/cdefs.h>
43 #include <sys/cdefs.h>
44 #include <sys/param.h>
45 #include <sys/systm.h>
46 #include <sys/types.h>
47 #include <sys/lock.h>
48 #include <sys/uuid.h>
49
50 #include "hammer2.h"
51
52 SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
53
54 static int hammer2_indirect_optimize;   /* XXX SYSCTL */
55
56 static hammer2_chain_t *hammer2_chain_create_indirect(
57                         hammer2_mount_t *hmp, hammer2_chain_t *parent,
58                         hammer2_key_t key, int keybits);
59
60 /*
61  * Compare function for chain splay tree
62  */
63 int
64 hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
65 {
66         return(chain2->index - chain1->index);
67 }
68
69 /*
70  * Allocate a new disconnected chain element representing the specified
71  * bref.  The chain element is locked exclusively and refs is set to 1.
72  *
73  * This essentially allocates a system memory structure representing one
74  * of the media structure types, including inodes.
75  */
76 hammer2_chain_t *
77 hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
78 {
79         hammer2_chain_t *chain;
80         hammer2_inode_t *ip;
81         hammer2_indblock_t *np;
82         hammer2_data_t *dp;
83
84         /*
85          * Construct the appropriate system structure.
86          */
87         switch(bref->type) {
88         case HAMMER2_BREF_TYPE_INODE:
89                 ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
90                 chain = &ip->chain;
91                 chain->u.ip = ip;
92                 lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
93                 ip->hmp = hmp;
94                 break;
95         case HAMMER2_BREF_TYPE_INDIRECT:
96                 np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
97                 chain = &np->chain;
98                 chain->u.np = np;
99                 lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
100                 break;
101         case HAMMER2_BREF_TYPE_DATA:
102                 dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
103                 chain = &dp->chain;
104                 chain->u.dp = dp;
105                 lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
106                 break;
107         case HAMMER2_BREF_TYPE_VOLUME:
108                 chain = NULL;
109                 panic("hammer2_chain_get: volume type illegal for op");
110         default:
111                 chain = NULL;
112                 panic("hammer2_chain_get: unrecognized blockref type: %d",
113                       bref->type);
114         }
115         chain->bref = *bref;
116         chain->index = -1;      /* not yet assigned */
117
118         chain->refs = 1;
119         lockmgr(&chain->lk, LK_EXCLUSIVE);
120
121         return (chain);
122 }
123
124 /*
125  * Free a disconnected chain element
126  */
127 void
128 hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
129 {
130         void *mem;
131
132         KKASSERT(chain->bp == NULL);
133         KKASSERT(chain->data == NULL);
134         KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
135                  chain->u.ip->vp == NULL);
136
137         if ((mem = chain->u.mem) != NULL) {
138                 chain->u.mem = NULL;
139                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
140                         kfree(mem, hmp->minode);
141                 else
142                         kfree(mem, hmp->mchain);
143         }
144 }
145
146 /*
147  * Add a reference to a chain element (for shared access).  The chain
148  * element must already have at least 1 ref controlled by the caller.
149  */
150 void
151 hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
152 {
153         KKASSERT(chain->refs > 0);
154         atomic_add_int(&chain->refs, 1);
155 }
156
157 /*
158  * Drop the callers reference to the chain element.  If the ref count
159  * reaches zero the chain element and its related structure (typically an
160  * inode or indirect block) will be freed and the parent will be
161  * recursively dropped.
162  *
163  * Modified elements hold an additional reference so it should not be
164  * possible for the count on a modified element to drop to 0.
165  *
166  * The chain element must NOT be locked by the caller.
167  *
168  * The parent might or might not be locked by the caller but if so it
169  * will also be referenced so we shouldn't recurse upward.
170  */
171 void
172 hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
173 {
174         hammer2_chain_t *parent;
175         u_int refs;
176
177         while (chain) {
178                 refs = chain->refs;
179                 cpu_ccfence();
180                 if (refs == 1) {
181                         KKASSERT(chain != &hmp->vchain);
182                         parent = chain->parent;
183                         lockmgr(&parent->lk, LK_EXCLUSIVE);
184                         if (atomic_cmpset_int(&chain->refs, 1, 0)) {
185                                 /*
186                                  * Succeeded, recurse and drop parent
187                                  */
188                                 if (!(chain->flags & HAMMER2_CHAIN_DELETED)) {
189                                         SPLAY_REMOVE(hammer2_chain_splay,
190                                                      &parent->shead, chain);
191                                 }
192                                 chain->parent = NULL;
193                                 lockmgr(&parent->lk, LK_RELEASE);
194                                 hammer2_chain_free(hmp, chain);
195                                 chain = parent;
196                         } else {
197                                 lockmgr(&parent->lk, LK_RELEASE);
198                         }
199                 } else {
200                         if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
201                                 /*
202                                  * Succeeded, count did not reach zero so
203                                  * cut out of the loop.
204                                  */
205                                 break;
206                         }
207                 }
208         }
209 }
210
211 /*
212  * Lock a chain element, acquiring its data with I/O if necessary.
213  *
214  * Returns 0 on success or an error code if the data could not be acquired.
215  * The chain element is locked either way.
216  *
217  * chain->data will be pointed either at the embedded data (e.g. for
218  * inodes), in which case the buffer cache buffer is released, or will
219  * point into the bp->b_data buffer with the bp left intact while locked.
220  */
221 int
222 hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
223 {
224         hammer2_blockref_t *bref;
225         hammer2_off_t off_hi;
226         size_t off_lo;
227         int error;
228         void *data;
229
230         /*
231          * Lock the element.  Under certain conditions this might end up
232          * being a recursive lock.
233          */
234         KKASSERT(chain->refs > 0);
235         lockmgr(&chain->lk, LK_EXCLUSIVE);
236
237         /*
238          * The volume header is a special case
239          */
240         if (chain->bref.type == HAMMER2_BREF_TYPE_VOLUME)
241                 return(0);
242
243         /*
244          * bp must be NULL, so if the data pointer is valid here it points
245          * to embedded data and no I/O is necessary (whether modified or not).
246          */
247         KKASSERT(chain->bp == NULL);
248         if (chain->data)
249                 return (0);
250
251         /*
252          * If data is NULL we must issue I/O.  Any error returns the error
253          * code but leaves the chain locked.
254          *
255          * If the chain was modified a new bref will have already been
256          * allocated and its related bp is probably still sitting in the
257          * buffer cache.
258          */
259         bref = &chain->bref;
260
261         off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
262         off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
263         KKASSERT(off_hi != 0);
264         error = bread(hmp->devvp, off_hi, HAMMER2_PBUFSIZE, &chain->bp);
265
266         if (error) {
267                 kprintf("hammer2_chain_get: I/O error %016jx: %d\n",
268                         (intmax_t)off_hi, error);
269                 brelse(chain->bp);
270                 chain->bp = NULL;
271                 return (error);
272         }
273
274         /*
275          * Setup the data pointer, either pointing it to an embedded data
276          * structure and copying the data from the buffer, or pointint it
277          * into the buffer.
278          *
279          * The buffer is not retained when copying to an embedded data
280          * structure in order to avoid potential deadlocks or recursions
281          * on the same physical buffer.
282          */
283         switch (bref->type) {
284         case HAMMER2_BREF_TYPE_VOLUME:
285                 /*
286                  * Copy data from bp to embedded buffer
287                  */
288                 KKASSERT(0);    /* not yet - have mount use this soon */
289                 KKASSERT(off_hi == 0);
290                 bcopy((char *)chain->bp->b_data + off_lo,
291                       &hmp->voldata, HAMMER2_PBUFSIZE);
292                 chain->data = (void *)&hmp->voldata;
293                 brelse(chain->bp);
294                 chain->bp = NULL;
295                 break;
296         case HAMMER2_BREF_TYPE_INODE:
297                 /*
298                  * Copy data from bp to embedded buffer.
299                  */
300                 bcopy((char *)chain->bp->b_data + off_lo,
301                       &chain->u.ip->ip_data,
302                       HAMMER2_INODE_BYTES);
303                 chain->data = (void *)&chain->u.ip->ip_data;
304                 brelse(chain->bp);
305                 chain->bp = NULL;
306                 break;
307         default:
308                 /*
309                  * Leave bp intact
310                  */
311                 data = (char *)chain->bp->b_data + off_lo;
312                 chain->data = data;
313                 break;
314         }
315         return (0);
316 }
317
318 /*
319  * Convert a locked chain that was retrieved read-only to read-write.
320  *
321  * If not already marked modified a new physical block will be allocated
322  * and assigned to the bref.  If the data is pointing into an existing
323  * bp it will be copied to the new bp and the new bp will replace the
324  * existing bp.
325  *
326  * If the data is embedded we allocate the new physical block but don't
327  * bother copying the data into it (yet).
328  */
329 void
330 hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain)
331 {
332         hammer2_chain_t *parent;
333         struct buf *nbp;
334         size_t bytes;
335         void *ndata;
336         int error;
337
338         /*
339          * If the chain is already marked modified we can just return.
340          */
341         if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
342                 KKASSERT(chain->data != NULL);
343                 return;
344         }
345
346         /*
347          * The MODIFIED bit is not yet set, we must allocate the
348          * copy-on-write block.
349          *
350          * If the data is embedded no other action is required.
351          *
352          * If the data is not embedded we acquire and clear the
353          * new block.  If chain->data is not NULL we then do the
354          * copy-on-write.  chain->data will then be repointed to the new
355          * buffer and the old buffer will be released.
356          *
357          * For newly created elements with no prior allocation we go
358          * through the copy-on-write steps except without the copying part.
359          */
360         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
361         hammer2_chain_ref(hmp, chain);  /* ref for modified bit */
362
363         bytes = 1 << (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
364         if (chain != &hmp->vchain) {
365                 chain->bref.data_off = hammer2_freemap_alloc(hmp, bytes);
366                 /* XXX failed allocation */
367         }
368
369         switch(chain->bref.type) {
370         case HAMMER2_BREF_TYPE_VOLUME:          /* embedded */
371         case HAMMER2_BREF_TYPE_INODE:           /* embedded */
372                 /*
373                  * data points to embedded structure, no copy needed
374                  */
375                 error = 0;
376                 break;
377         case HAMMER2_BREF_TYPE_INDIRECT:
378         case HAMMER2_BREF_TYPE_DATA:
379                 /*
380                  * data (if not NULL) points into original bp, copy-on-write
381                  * to new block.
382                  */
383                 KKASSERT(chain != &hmp->vchain);        /* safety */
384                 if (bytes == HAMMER2_PBUFSIZE) {
385                         nbp = getblk(hmp->devvp,
386                                      chain->bref.data_off & HAMMER2_OFF_MASK_HI,
387                                      HAMMER2_PBUFSIZE, 0, 0);
388                         vfs_bio_clrbuf(nbp);
389                         error = 0;
390                 } else {
391                         error = bread(hmp->devvp,
392                                      chain->bref.data_off & HAMMER2_OFF_MASK_HI,
393                                      HAMMER2_PBUFSIZE, &nbp);
394                         KKASSERT(error == 0);/* XXX handle error */
395                 }
396                 ndata = nbp->b_data + (chain->bref.data_off &
397                                        HAMMER2_OFF_MASK_LO);
398                 if (chain->data) {
399                         bcopy(chain->data, ndata, bytes);
400                         KKASSERT(chain->bp != NULL);
401                         brelse(chain->bp);
402                 }
403                 chain->bp = nbp;
404                 chain->data = ndata;
405                 break;
406         default:
407                 panic("hammer2_chain_modify: unknown bref type");
408                 break;
409
410         }
411
412         /*
413          * Recursively mark the parent chain elements so flushes can find
414          * modified elements.
415          */
416         parent = chain->parent;
417         while (parent && (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) == 0) {
418                 atomic_set_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
419                 parent = parent->parent;
420         }
421 }
422
423 /*
424  * Unlock a chain element without dropping its reference count.
425  * (see hammer2_chain_put() to do both).
426  *
427  * Non-embedded data references (chain->bp != NULL) are returned to the
428  * system and the data field is cleared in that case.  If modified the
429  * dirty buffer is still returned to the system, can be flushed to disk by
430  * the system at any time, and will be reconstituted/re-read as needed.
431  */
432 void
433 hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
434 {
435         if (chain->bp) {
436                 chain->data = NULL;
437                 if (chain->flags & (HAMMER2_CHAIN_MODIFIED |
438                                     HAMMER2_CHAIN_FLUSHED)) {
439                         if (chain->flags & HAMMER2_CHAIN_IOFLUSH)
440                                 bawrite(chain->bp);
441                         else
442                                 bdwrite(chain->bp);
443                 } else {
444                         bqrelse(chain->bp);
445                 }
446                 chain->bp = NULL;
447         }
448         lockmgr(&chain->lk, LK_RELEASE);
449 }
450
451 /*
452  * Locate an in-memory chain.  The parent must be locked.  The in-memory
453  * chain is returned or NULL if no in-memory chain is present.
454  *
455  * NOTE: A chain on-media might exist for this index when NULL is returned.
456  */
457 hammer2_chain_t *
458 hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
459 {
460         hammer2_chain_t dummy;
461         hammer2_chain_t *chain;
462
463         dummy.index = index;
464         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
465         return (chain);
466 }
467
468 /*
469  * Return a locked chain structure with all associated data acquired.
470  *
471  * Caller must lock the parent on call, the returned child will be locked.
472  */
473 hammer2_chain_t *
474 hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
475                   int index, int flags)
476 {
477         hammer2_blockref_t *bref;
478         hammer2_chain_t *chain;
479         hammer2_chain_t dummy;
480
481         /*
482          * First see if we have a (possibly modified) chain element cached
483          * for this (parent, index).  Acquire the data if necessary.
484          *
485          * If chain->data is non-NULL the chain should already be marked
486          * modified.
487          */
488         dummy.index = index;
489         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead, &dummy);
490         if (chain) {
491                 hammer2_chain_ref(hmp, chain);
492                 if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
493                         hammer2_chain_lock(hmp, chain);
494                 return(chain);
495         }
496
497         /*
498          * Otherwise lookup the bref and issue I/O (switch on the parent)
499          */
500         switch(parent->bref.type) {
501         case HAMMER2_BREF_TYPE_INODE:
502                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
503                 bref = &parent->data->ipdata.u.blockset.blockref[index];
504                 break;
505         case HAMMER2_BREF_TYPE_INDIRECT:
506                 KKASSERT(index >= 0 && index < HAMMER2_IND_COUNT);
507                 bref = &parent->data->npdata.blockref[index];
508                 break;
509         case HAMMER2_BREF_TYPE_VOLUME:
510                 KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
511                 bref = &hmp->voldata.sroot_blockset.blockref[index];
512                 break;
513         default:
514                 bref = NULL;
515                 panic("hammer2_chain_get: unrecognized blockref type: %d",
516                       parent->bref.type);
517         }
518         chain = hammer2_chain_alloc(hmp, bref);
519
520         /*
521          * Link the chain into its parent.  Caller is expected to hold an
522          * exclusive lock on the parent.
523          */
524         chain->parent = parent;
525         chain->index = index;
526         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
527                 panic("hammer2_chain_link: collision");
528         KKASSERT(parent->refs > 1);
529         atomic_add_int(&parent->refs, 1);       /* for splay entry */
530
531         /*
532          * Additional linkage for inodes.  Reuse the parent pointer to
533          * find the parent directory.
534          */
535         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
536                 while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
537                         parent = parent->parent;
538                 if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
539                         chain->u.ip->pip = parent->u.ip;
540         }
541
542         /*
543          * Our new chain structure has already been referenced and locked
544          * but the lock code handles the I/O so call it to resolve the data.
545          * Then release one of our two exclusive locks.
546          *
547          * If NOLOCK is set the release will release the one-and-only lock.
548          */
549         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
550                 hammer2_chain_lock(hmp, chain);
551         lockmgr(&chain->lk, LK_RELEASE);
552
553         return (chain);
554 }
555
556 /*
557  * Unlock and dereference a chain after use.  It is possible for this to
558  * recurse up the chain.
559  */
560 void
561 hammer2_chain_put(hammer2_mount_t *hmp, hammer2_chain_t *chain)
562 {
563         hammer2_chain_unlock(hmp, chain);
564         hammer2_chain_drop(hmp, chain);
565 }
566
567 /*
568  * Locate any key between key_beg and key_end inclusive.  (*parentp)
569  * typically points to an inode but can also point to a related indirect
570  * block and this function will recurse upwards and find the inode again.
571  *
572  * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
573  *           WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
574  *           WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
575  *
576  * (*parentp) must be exclusively locked and referenced and can be an inode
577  * or an existing indirect block within the inode.
578  *
579  * On return (*parentp) will be modified to point at the deepest parent chain
580  * element encountered during the search, as a helper for an insertion or
581  * deletion.   The new (*parentp) will be locked and referenced and the old
582  * will be unlocked and dereferenced (no change if they are both the same).
583  *
584  * The matching chain will be returned exclusively locked and referenced.
585  *
586  * NULL is returned if no match was found, but (*parentp) will still
587  * potentially be adjusted.
588  *
589  * This function will also recurse up the chain if the key is not within the
590  * current parent's range.  (*parentp) can never be set to NULL.  An iteration
591  * can simply allow (*parentp) to float inside the loop.
592  */
593 hammer2_chain_t *
594 hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
595                      hammer2_key_t key_beg, hammer2_key_t key_end,
596                      int flags)
597 {
598         hammer2_chain_t *parent;
599         hammer2_chain_t *chain;
600         hammer2_chain_t *tmp;
601         hammer2_blockref_t *base;
602         hammer2_blockref_t *bref;
603         hammer2_key_t scan_beg;
604         hammer2_key_t scan_end;
605         int count = 0;
606         int i;
607
608         /*
609          * Recurse (*parentp) upward if necessary until the parent completely
610          * encloses the key range or we hit the inode.
611          */
612         parent = *parentp;
613         while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
614                 scan_beg = parent->bref.key;
615                 scan_end = scan_beg +
616                            ((hammer2_key_t)1 << parent->bref.keybits) - 1;
617                 if (key_beg >= scan_beg && key_end <= scan_end)
618                         break;
619                 hammer2_chain_unlock(hmp, parent);
620                 parent = parent->parent;
621                 hammer2_chain_ref(hmp, parent);         /* ref new parent */
622                 hammer2_chain_lock(hmp, parent);        /* lock new parent */
623                 hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
624                 *parentp = parent;                      /* new parent */
625         }
626
627 again:
628         /*
629          * Locate the blockref array.  Currently we do a fully associative
630          * search through the array.
631          */
632         switch(parent->bref.type) {
633         case HAMMER2_BREF_TYPE_INODE:
634                 /*
635                  * Special shortcut for embedded data returns the inode
636                  * itself.  Callers must detect this condition and access
637                  * the embedded data (the strategy code does this for us).
638                  *
639                  * This is only applicable to regular files and softlinks.
640                  */
641                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
642                         hammer2_chain_ref(hmp, parent);
643                         if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0)
644                                 hammer2_chain_lock(hmp, parent);
645                         kprintf("DIRECT DATA RETURNED\n");
646                         return (parent);
647                 }
648                 base = &parent->data->ipdata.u.blockset.blockref[0];
649                 count = HAMMER2_SET_COUNT;
650                 break;
651         case HAMMER2_BREF_TYPE_INDIRECT:
652                 if (parent->data == NULL)
653                         panic("parent->data is NULL");
654                 base = &parent->data->npdata.blockref[0];
655                 count = HAMMER2_IND_COUNT;
656                 break;
657         case HAMMER2_BREF_TYPE_VOLUME:
658                 base = &hmp->voldata.sroot_blockset.blockref[0];
659                 count = HAMMER2_SET_COUNT;
660                 break;
661         default:
662                 panic("hammer2_chain_lookup: unrecognized blockref type: %d",
663                       parent->bref.type);
664                 base = NULL;    /* safety */
665                 count = 0;      /* safety */
666         }
667
668         /*
669          * If the element and key overlap we use the element.
670          */
671         bref = NULL;
672         for (i = 0; i < count; ++i) {
673                 tmp = hammer2_chain_find(hmp, parent, i);
674                 bref = (tmp) ? &tmp->bref : &base[i];
675                 if (bref->type == 0)
676                         continue;
677                 scan_beg = bref->key;
678                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
679                 if (key_beg <= scan_end && key_end >= scan_beg)
680                         break;
681         }
682         if (i == count) {
683                 if (key_beg == key_end)
684                         return (NULL);
685                 return (hammer2_chain_next(hmp, parentp, NULL,
686                                            key_beg, key_end, flags));
687         }
688
689         /*
690          * Acquire the new chain element.  If the chain element is an
691          * indirect block we must search recursively.
692          */
693         chain = hammer2_chain_get(hmp, parent, i, flags);
694         if (chain == NULL)
695                 return (NULL);
696
697         /*
698          * If the chain element is an indirect block it becomes the new
699          * parent and we loop on it.  We must fixup the chain we loop on
700          * if the caller passed flags to us that aren't sufficient for our
701          * needs.
702          */
703         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
704                 hammer2_chain_put(hmp, parent);
705                 *parentp = parent = chain;
706                 if (flags & HAMMER2_LOOKUP_NOLOCK)
707                         hammer2_chain_lock(hmp, chain);
708                 goto again;
709         }
710
711         /*
712          * All done, return chain
713          */
714         return (chain);
715 }
716
717 /*
718  * After having issued a lookup we can iterate all matching keys.
719  *
720  * If chain is non-NULL we continue the iteration from just after it's index.
721  *
722  * If chain is NULL we assume the parent was exhausted and continue the
723  * iteration at the next parent.
724  */
725 hammer2_chain_t *
726 hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
727                    hammer2_chain_t *chain,
728                    hammer2_key_t key_beg, hammer2_key_t key_end,
729                    int flags)
730 {
731         hammer2_chain_t *parent;
732         hammer2_chain_t *tmp;
733         hammer2_blockref_t *base;
734         hammer2_blockref_t *bref;
735         hammer2_key_t scan_beg;
736         hammer2_key_t scan_end;
737         int i;
738         int count;
739
740         parent = *parentp;
741
742 again:
743         /*
744          * Calculate the next index and recalculate the parent if necessary.
745          */
746         if (chain) {
747                 /*
748                  * Continue iteration within current parent.  If not NULL
749                  * the passed-in chain may or may not be locked, based on
750                  * the LOOKUP_NOLOCK flag (passed in as returned from lookup
751                  * or a prior next).
752                  */
753                 i = chain->index + 1;
754                 if (flags & HAMMER2_LOOKUP_NOLOCK)
755                         hammer2_chain_drop(hmp, chain);
756                 else
757                         hammer2_chain_put(hmp, chain);
758
759                 /*
760                  * Any scan where the lookup returned degenerate data embedded
761                  * in the inode has an invalid index and must terminate.
762                  */
763                 if (chain == parent)
764                         return(NULL);
765                 chain = NULL;
766         } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
767                 /*
768                  * We reached the end of the iteration.
769                  */
770                 return (NULL);
771         } else {
772                 /*
773                  * Continue iteration with next parent unless the current
774                  * parent covers the range.
775                  */
776                 hammer2_chain_t *nparent;
777
778                 if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT)
779                         return (NULL);
780
781                 scan_beg = parent->bref.key;
782                 scan_end = scan_beg +
783                             ((hammer2_key_t)1 << parent->bref.keybits) - 1;
784                 if (key_beg >= scan_beg && key_end <= scan_end)
785                         return (NULL);
786
787                 i = parent->index + 1;
788                 nparent = parent->parent;
789                 hammer2_chain_ref(hmp, nparent);        /* ref new parent */
790                 hammer2_chain_unlock(hmp, parent);
791                 hammer2_chain_lock(hmp, nparent);       /* lock new parent */
792                 hammer2_chain_drop(hmp, parent);        /* drop old parent */
793                 *parentp = parent = nparent;
794         }
795
796 again2:
797         /*
798          * Locate the blockref array.  Currently we do a fully associative
799          * search through the array.
800          */
801         switch(parent->bref.type) {
802         case HAMMER2_BREF_TYPE_INODE:
803                 base = &parent->data->ipdata.u.blockset.blockref[0];
804                 count = HAMMER2_SET_COUNT;
805                 break;
806         case HAMMER2_BREF_TYPE_INDIRECT:
807                 base = &parent->data->npdata.blockref[0];
808                 count = HAMMER2_IND_COUNT;
809                 break;
810         case HAMMER2_BREF_TYPE_VOLUME:
811                 base = &hmp->voldata.sroot_blockset.blockref[0];
812                 count = HAMMER2_SET_COUNT;
813                 break;
814         default:
815                 panic("hammer2_chain_next: unrecognized blockref type: %d",
816                       parent->bref.type);
817                 base = NULL;    /* safety */
818                 count = 0;      /* safety */
819                 break;
820         }
821         KKASSERT(i <= count);
822
823         /*
824          * Look for the key.  If we are unable to find a match and an exact
825          * match was requested we return NULL.  If a range was requested we
826          * run hammer2_chain_next() to iterate.
827          */
828         bref = NULL;
829         while (i < count) {
830                 tmp = hammer2_chain_find(hmp, parent, i);
831                 bref = (tmp) ? &tmp->bref : &base[i];
832                 if (bref->type == 0) {
833                         ++i;
834                         continue;
835                 }
836                 scan_beg = bref->key;
837                 scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
838                 if (key_beg <= scan_end && key_end >= scan_beg)
839                         break;
840                 ++i;
841         }
842
843         /*
844          * If we couldn't find a match recurse up a parent to continue the
845          * search.
846          */
847         if (i == count)
848                 goto again;
849
850         /*
851          * Acquire the new chain element.  If the chain element is an
852          * indirect block we must search recursively.
853          */
854         chain = hammer2_chain_get(hmp, parent, i, flags);
855         if (chain == NULL)
856                 return (NULL);
857
858         /*
859          * If the chain element is an indirect block it becomes the new
860          * parent and we loop on it.  We may have to lock the chain when
861          * cycling it in as the new parent as it will not be locked if the
862          * caller passed NOLOCK.
863          */
864         if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
865                 hammer2_chain_put(hmp, parent);
866                 *parentp = parent = chain;
867                 if (flags & HAMMER2_LOOKUP_NOLOCK)
868                         hammer2_chain_lock(hmp, chain);
869                 i = 0;
870                 goto again2;
871         }
872
873         /*
874          * All done, return chain
875          */
876         return (chain);
877 }
878
879 /*
880  * Create and return a new hammer2 system memory structure of the specified
881  * key, type and size and insert it RELATIVE TO (PARENT).
882  *
883  * (parent) is typically either an inode or an indirect  block, acquired
884  * acquired as a side effect of issuing a prior failed lookup.  parent
885  * must be locked and held.  Do not pass the inode chain to this function
886  * unless that is the chain returned by the failed lookup.
887  *
888  * Non-indirect types will automatically allocate indirect blocks as required
889  * if the new item does not fit in the current (parent).
890  *
891  * Indirect types will move a portion of the existing blockref array in
892  * (parent) into the new indirect type and then use one of the free slots
893  * to emplace the new indirect type.
894  *
895  * A new locked, referenced chain element is returned of the specified type.
896  * This element will also be marked as modified and contain a data area
897  * ready for initialization.
898  */
899 hammer2_chain_t *
900 hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
901                      hammer2_key_t key, int keybits, int type, size_t bytes)
902 {
903         hammer2_blockref_t dummy;
904         hammer2_blockref_t *base;
905         hammer2_blockref_t *bref;
906         hammer2_chain_t *chain;
907         hammer2_chain_t dummy_chain;
908         int count;
909         int i;
910         int unlock_parent = 0;
911
912         /*
913          * First allocate media space and construct the dummy bref, then
914          * allocate the in-memory chain structure.
915          */
916         bzero(&dummy, sizeof(dummy));
917         dummy.type = type;
918         dummy.key = key;
919         dummy.keybits = keybits;
920         dummy.data_off = (hammer2_off_t)hammer2_freemap_bytes_to_radix(bytes);
921         chain = hammer2_chain_alloc(hmp, &dummy);
922
923         /*
924          * Recalculate bytes to reflect the actual media block allocation,
925          * then allocate the local memory copy.  This is a new structure
926          * so no I/O is performed.
927          */
928         bytes = (hammer2_off_t)1 <<
929                 (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
930
931         switch(type) {
932         case HAMMER2_BREF_TYPE_VOLUME:
933                 panic("hammer2_chain_create: called with volume type");
934                 break;
935         case HAMMER2_BREF_TYPE_INODE:
936                 KKASSERT(bytes == HAMMER2_INODE_BYTES);
937                 chain->data = (void *)&chain->u.ip->ip_data;
938                 break;
939         default:
940                 /* leave chain->data NULL */
941                 KKASSERT(chain->data == NULL);
942                 break;
943         }
944         atomic_set_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
945
946 again:
947         /*
948          * Locate a free blockref in the parent's array
949          */
950         switch(parent->bref.type) {
951         case HAMMER2_BREF_TYPE_INODE:
952                 KKASSERT(parent->data != NULL);
953                 base = &parent->data->ipdata.u.blockset.blockref[0];
954                 count = HAMMER2_SET_COUNT;
955                 break;
956         case HAMMER2_BREF_TYPE_INDIRECT:
957                 KKASSERT(parent->data != NULL);
958                 base = &parent->data->npdata.blockref[0];
959                 count = HAMMER2_IND_COUNT;
960                 break;
961         case HAMMER2_BREF_TYPE_VOLUME:
962                 KKASSERT(parent->data != NULL);
963                 base = &hmp->voldata.sroot_blockset.blockref[0];
964                 count = HAMMER2_SET_COUNT;
965                 break;
966         default:
967                 panic("hammer2_chain_create: unrecognized blockref type: %d",
968                       parent->bref.type);
969                 count = 0;
970                 break;
971         }
972
973         /*
974          * Scan for an unallocated bref, also skipping any slots occupied
975          * by in-memory chain elements that may not yet have been updated
976          * in the parent's bref array.
977          */
978         bzero(&dummy_chain, sizeof(dummy_chain));
979         bref = NULL;
980         for (i = 0; i < count; ++i) {
981                 bref = &base[i];
982                 dummy_chain.index = i;
983                 if (bref->type == 0 &&
984                     SPLAY_FIND(hammer2_chain_splay,
985                                &parent->shead, &dummy_chain) == NULL) {
986                         break;
987                 }
988         }
989
990         /*
991          * If no free blockref count be found we must create an indirect
992          * block and move a number of blockrefs into it.  With the parent
993          * locked we can safely lock each child in order to move it without
994          * causing a deadlock.
995          *
996          * This may return the new indirect block or the old parent depending
997          * on where the key falls.
998          */
999         if (i == count) {
1000                 hammer2_chain_t *nparent;
1001
1002                 nparent = hammer2_chain_create_indirect(hmp, parent,
1003                                                         key, keybits);
1004                 if (nparent == NULL) {
1005                         hammer2_chain_free(hmp, chain);
1006                         chain = NULL;
1007                         goto done;
1008                 }
1009                 if (parent != nparent) {
1010                         if (unlock_parent)
1011                                 hammer2_chain_put(hmp, parent);
1012                         parent = nparent;
1013                         unlock_parent = 1;
1014                 }
1015                 goto again;
1016         }
1017
1018         /*
1019          * Link the chain into its parent.
1020          */
1021         chain->parent = parent;
1022         chain->index = i;
1023         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
1024                 panic("hammer2_chain_link: collision");
1025         KKASSERT(parent->refs > 1);
1026         atomic_add_int(&parent->refs, 1);
1027
1028         /*
1029          * Additional linkage for inodes.  Reuse the parent pointer to
1030          * find the parent directory.
1031          */
1032         if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1033                 hammer2_chain_t *scan = parent;
1034                 while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
1035                         scan = scan->parent;
1036                 if (scan->bref.type == HAMMER2_BREF_TYPE_INODE)
1037                         chain->u.ip->pip = scan->u.ip;
1038         }
1039
1040         /*
1041          * Mark the newly created chain element as modified and fully
1042          * resolve the chain->data pointer.
1043          *
1044          * Chain elements with embedded data will not issue I/O at this time.
1045          * A new block will be allocated for the buffer but not instantiated.
1046          *
1047          * Chain elements which do not use embedded data will allocate
1048          * the new block AND instantiate its buffer cache buffer, pointing
1049          * the data at the bp.
1050          */
1051         hammer2_chain_modify(hmp, chain);
1052
1053 done:
1054         if (unlock_parent)
1055                 hammer2_chain_put(hmp, parent);
1056         return (chain);
1057 }
1058
1059 /*
1060  * Create an indirect block that covers one or more of the elements in the
1061  * current parent.  Either returns the existing parent with no locking or
1062  * ref changes or returns the new indirect block locked and referenced,
1063  * depending on what the specified key falls into.
1064  *
1065  * The key/keybits for the indirect mode only needs to follow three rules:
1066  *
1067  * (1) That all elements underneath it fit within its key space and
1068  *
1069  * (2) That all elements outside it are outside its key space.
1070  *
1071  * (3) When creating the new indirect block any elements in the current
1072  *     parent that fit within the new indirect block's keyspace must be
1073  *     moved into the new indirect block.
1074  *
1075  * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
1076  *     keyspace the the current parent, but lookup/iteration rules will
1077  *     ensure (and must ensure) that rule (2) for all parents leading up
1078  *     to the nearest inode or the root volume header is adhered to.  This
1079  *     is accomplished by always recursing through matching keyspaces in
1080  *     the hammer2_chain_lookup() and hammer2_chain_next() API.
1081  *
1082  * The current implementation calculates the current worst-case keyspace by
1083  * iterating the current parent and then divides it into two halves, choosing
1084  * whichever half has the most elements (not necessarily the half containing
1085  * the requested key).
1086  *
1087  * We can also opt to use the half with the least number of elements.  This
1088  * causes lower-numbered keys (aka logical file offsets) to recurse through
1089  * fewer indirect blocks and higher-numbered keys to recurse through more.
1090  * This also has the risk of not moving enough elements to the new indirect
1091  * block and being forced to create several indirect blocks before the element
1092  * can be inserted.
1093  */
1094 static
1095 hammer2_chain_t *
1096 hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1097                               hammer2_key_t create_key, int create_bits)
1098 {
1099         hammer2_blockref_t *base;
1100         hammer2_blockref_t *bref;
1101         hammer2_chain_t *chain;
1102         hammer2_chain_t *ichain;
1103         hammer2_chain_t dummy;
1104         hammer2_key_t key = create_key;
1105         int keybits = create_bits;
1106         int locount = 0;
1107         int hicount = 0;
1108         int count;
1109         int i;
1110
1111         /*
1112          * Mark the parent modified so our base[] pointer remains valid
1113          * while we move entries.
1114          */
1115         hammer2_chain_modify(hmp, parent);
1116
1117         /*
1118          * Locate a free blockref in the parent's array
1119          */
1120         switch(parent->bref.type) {
1121         case HAMMER2_BREF_TYPE_INODE:
1122                 base = &parent->data->ipdata.u.blockset.blockref[0];
1123                 count = HAMMER2_SET_COUNT;
1124                 break;
1125         case HAMMER2_BREF_TYPE_INDIRECT:
1126                 base = &parent->data->npdata.blockref[0];
1127                 count = HAMMER2_IND_COUNT;
1128                 break;
1129         case HAMMER2_BREF_TYPE_VOLUME:
1130                 base = &hmp->voldata.sroot_blockset.blockref[0];
1131                 count = HAMMER2_SET_COUNT;
1132                 break;
1133         default:
1134                 panic("hammer2_chain_create_indirect: "
1135                       "unrecognized blockref type: %d",
1136                       parent->bref.type);
1137                 count = 0;
1138                 break;
1139         }
1140
1141         /*
1142          * Scan for an unallocated bref, also skipping any slots occupied
1143          * by in-memory chain elements that may not yet have been updated
1144          * in the parent's bref array.
1145          */
1146         bzero(&dummy, sizeof(dummy));
1147         for (i = 0; i < count; ++i) {
1148                 int nkeybits;
1149
1150                 bref = &base[i];
1151                 if (bref->type == 0) {
1152                         dummy.index = i;
1153                         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
1154                                            &dummy);
1155                         if (chain == NULL)
1156                                 continue;
1157                         bref = &chain->bref;
1158                 }
1159
1160                 /*
1161                  * Expand our calculated key range (key, keybits) to fit
1162                  * the scanned key.  nkeybits represents the full range
1163                  * that we will later cut in half (two halves @ nkeybits - 1).
1164                  */
1165                 nkeybits = keybits;
1166                 if (nkeybits < bref->keybits)
1167                         nkeybits = bref->keybits;
1168                 while ((~(((hammer2_key_t)1 << nkeybits) - 1) &
1169                         (key ^ bref->key)) != 0) {
1170                         ++nkeybits;
1171                 }
1172
1173                 /*
1174                  * If the new key range is larger we have to determine
1175                  * which side of the new key range the existing keys fall
1176                  * under by checking the high bit, then collapsing the
1177                  * locount into the hicount or vise-versa.
1178                  */
1179                 if (keybits != nkeybits) {
1180                         if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
1181                                 hicount += locount;
1182                                 locount = 0;
1183                         } else {
1184                                 locount += hicount;
1185                                 hicount = 0;
1186                         }
1187                         keybits = nkeybits;
1188                 }
1189
1190                 /*
1191                  * The newly scanned key will be in the lower half or the
1192                  * higher half of the (new) key range.
1193                  */
1194                 if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
1195                         ++hicount;
1196                 else
1197                         ++locount;
1198         }
1199
1200         /*
1201          * Adjust keybits to represent half of the full range calculated
1202          * above.
1203          */
1204         --keybits;
1205
1206         /*
1207          * Select whichever half contains the most elements.  Theoretically
1208          * we can select either side as long as it contains at least one
1209          * element (in order to ensure that a free slot is present to hold
1210          * the indirect block).
1211          */
1212         key &= ~(((hammer2_key_t)1 << keybits) - 1);
1213         if (hammer2_indirect_optimize) {
1214                 /*
1215                  * Insert node for least number of keys, this will arrange
1216                  * the first few blocks of a large file or the first few
1217                  * inodes in a directory with fewer indirect blocks when
1218                  * created linearly.
1219                  */
1220                 if (hicount < locount && hicount != 0)
1221                         key |= (hammer2_key_t)1 << keybits;
1222                 else
1223                         key &= ~(hammer2_key_t)1 << keybits;
1224         } else {
1225                 /*
1226                  * Insert node for most number of keys, best for heavily
1227                  * fragmented files.
1228                  */
1229                 if (hicount > locount)
1230                         key |= (hammer2_key_t)1 << keybits;
1231                 else
1232                         key &= ~(hammer2_key_t)1 << keybits;
1233         }
1234
1235         /*
1236          * Ok, create our new indirect block
1237          */
1238         dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
1239         dummy.bref.key = key;
1240         dummy.bref.keybits = keybits;
1241         dummy.bref.data_off = (hammer2_off_t)
1242                             hammer2_freemap_bytes_to_radix(HAMMER2_PBUFSIZE);
1243         ichain = hammer2_chain_alloc(hmp, &dummy.bref);
1244
1245         /*
1246          * Iterate the original parent and move the matching brefs into
1247          * the new indirect block.
1248          */
1249         for (i = 0; i < count; ++i) {
1250                 /*
1251                  * For keying purposes access the bref from the media or
1252                  * from our in-memory cache.  In cases where the in-memory
1253                  * cache overrides the media the keyrefs will be the same
1254                  * anyway so we can avoid checking the cache when the media
1255                  * has a key.
1256                  */
1257                 bref = &base[i];
1258                 if (bref->type == 0) {
1259                         dummy.index = i;
1260                         chain = SPLAY_FIND(hammer2_chain_splay, &parent->shead,
1261                                            &dummy);
1262                         if (chain == NULL) {
1263                                 /*
1264                                  * Select index indirect block is placed in
1265                                  */
1266                                 if (ichain->index < 0)
1267                                         ichain->index = i;
1268                                 continue;
1269                         }
1270                         bref = &chain->bref;
1271                 }
1272
1273                 /*
1274                  * Skip keys not in the chosen half (low or high), only bit
1275                  * (keybits - 1) needs to be compared but for safety we
1276                  * will compare all msb bits plus that bit again.
1277                  */
1278                 if ((~(((hammer2_key_t)1 << keybits) - 1) &
1279                     (key ^ bref->key)) != 0) {
1280                         continue;
1281                 }
1282
1283                 /*
1284                  * This element is being moved, its slot is available
1285                  * for our indirect block.
1286                  */
1287                 if (ichain->index < 0)
1288                         ichain->index = i;
1289
1290                 /*
1291                  * Load the new indirect block by acquiring or allocating
1292                  * the related chain entries, then simply move it to the
1293                  * new parent (ichain).
1294                  *
1295                  * Flagging the new chain entry MOVED will cause a flush
1296                  * to synchronize its block into the new indirect block.
1297                  * The chain is unlocked after being moved but needs to
1298                  * retain a reference for the MOVED state
1299                  *
1300                  * We must still set SUBMODIFIED in the parent but we do
1301                  * that after the loop.
1302                  *
1303                  * XXX we really need a lock here but we don't need the
1304                  *     data.  NODATA feature needed.
1305                  */
1306                 chain = hammer2_chain_get(hmp, parent, i,
1307                                           HAMMER2_LOOKUP_NOLOCK);
1308                 SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1309                 if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
1310                         panic("hammer2_chain_create_indirect: collision");
1311                 chain->parent = ichain;
1312                 bzero(&base[i], sizeof(base[i]));
1313                 atomic_add_int(&parent->refs, -1);
1314                 atomic_add_int(&ichain->refs, 1);
1315                 if (chain->flags & HAMMER2_CHAIN_MOVED) {
1316                         hammer2_chain_drop(hmp, chain);
1317                 } else {
1318                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
1319                 }
1320                 KKASSERT(parent->refs > 0);
1321                 chain = NULL;
1322         }
1323
1324         /*
1325          * Insert the new indirect block into the parent now that we've
1326          * cleared out some entries in the parent.  We calculated a good
1327          * insertion index in the loop above (ichain->index).
1328          */
1329         KKASSERT(ichain->index >= 0);
1330         if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
1331                 panic("hammer2_chain_create_indirect: ichain insertion");
1332         ichain->parent = parent;
1333         atomic_add_int(&parent->refs, 1);
1334
1335         /*
1336          * Mark the new indirect block modified after insertion, which
1337          * will propagate up through parent all the way to the root and
1338          * also allocate the physical block in ichain for our caller.
1339          *
1340          * We have to set SUBMODIFIED in ichain's flags manually so the
1341          * flusher knows it has to recurse through it to get to all of
1342          * our moved blocks.
1343          */
1344         hammer2_chain_modify(hmp, ichain);
1345         atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
1346
1347         /*
1348          * Figure out what to return.
1349          */
1350         if (create_bits >= keybits) {
1351                 /*
1352                  * Key being created is way outside the key range,
1353                  * return the original parent.
1354                  */
1355                 hammer2_chain_put(hmp, ichain);
1356         } else if (~(((hammer2_key_t)1 << keybits) - 1) &
1357                    (create_key ^ key)) {
1358                 /*
1359                  * Key being created is outside the key range,
1360                  * return the original parent.
1361                  */
1362                 hammer2_chain_put(hmp, ichain);
1363         } else {
1364                 /*
1365                  * Otherwise its in the range, return the new parent.
1366                  */
1367                 parent = ichain;
1368         }
1369
1370         return(parent);
1371 }
1372
1373 /*
1374  * Physically delete the specified chain element.  Note that inodes with
1375  * open descriptors should not be deleted (as with other filesystems) until
1376  * the last open descriptor is closed.
1377  *
1378  * This routine will remove the chain element from its parent and potentially
1379  * also recurse upward and delete indirect blocks which become empty as a
1380  * side effect.
1381  *
1382  * The caller must pass a pointer to the chain's parent, also locked and
1383  * referenced.  (*parentp) will be modified in a manner similar to a lookup
1384  * or iteration when indirect blocks are also deleted as a side effect.
1385  */
1386 void
1387 hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
1388                      hammer2_chain_t *chain)
1389 {
1390         hammer2_blockref_t *base;
1391         int count;
1392
1393         /*
1394          * Mark the parent modified so our base[] pointer remains valid
1395          * while we move entries.
1396          *
1397          * Calculate the blockref reference in the parent
1398          */
1399         hammer2_chain_modify(hmp, parent);
1400
1401         switch(parent->bref.type) {
1402         case HAMMER2_BREF_TYPE_INODE:
1403                 base = &parent->data->ipdata.u.blockset.blockref[0];
1404                 count = HAMMER2_SET_COUNT;
1405                 break;
1406         case HAMMER2_BREF_TYPE_INDIRECT:
1407                 base = &parent->data->npdata.blockref[0];
1408                 count = HAMMER2_IND_COUNT;
1409                 break;
1410         case HAMMER2_BREF_TYPE_VOLUME:
1411                 base = &hmp->voldata.sroot_blockset.blockref[0];
1412                 count = HAMMER2_SET_COUNT;
1413                 break;
1414         default:
1415                 panic("hammer2_chain_delete: unrecognized blockref type: %d",
1416                       parent->bref.type);
1417                 count = 0;
1418                 break;
1419         }
1420         KKASSERT(chain->index >= 0 && chain->index < count);
1421         base += chain->index;
1422         bzero(base, sizeof(*base));
1423         SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
1424         atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
1425 }
1426
1427 /*
1428  * Recursively flush the specified chain.  The chain is locked and
1429  * referenced by the caller and will remain so on return.
1430  *
1431  * This cannot be called with the volume header's vchain
1432  */
1433 void
1434 hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain,
1435                     hammer2_blockref_t *parent_bref)
1436 {
1437         /*
1438          * Flush any children of this chain entry.
1439          */
1440         if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED) {
1441                 hammer2_blockref_t *base;
1442                 hammer2_blockref_t bref;
1443                 hammer2_chain_t *scan;
1444                 hammer2_chain_t *next;
1445                 int count;
1446                 int submodified = 0;
1447
1448                 /*
1449                  * Modifications to the children will propagate up, forcing
1450                  * us to become modified and copy-on-write too.
1451                  */
1452                 hammer2_chain_modify(hmp, chain);
1453
1454                 /*
1455                  * The blockref in the parent's array must be repointed at
1456                  * the new block allocated by the child after its flush.
1457                  *
1458                  * Calculate the base of the array.
1459                  */
1460                 switch(chain->bref.type) {
1461                 case HAMMER2_BREF_TYPE_INODE:
1462                         KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1463                         base = &chain->data->ipdata.u.blockset.blockref[0];
1464                         count = HAMMER2_SET_COUNT;
1465                         break;
1466                 case HAMMER2_BREF_TYPE_INDIRECT:
1467                         base = &chain->data->npdata.blockref[0];
1468                         count = HAMMER2_IND_COUNT;
1469                         break;
1470                 case HAMMER2_BREF_TYPE_VOLUME:
1471                         KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
1472                         base = &hmp->voldata.sroot_blockset.blockref[0];
1473                         count = HAMMER2_SET_COUNT;
1474                         break;
1475                 default:
1476                         base = NULL;
1477                         panic("hammer2_chain_get: unrecognized blockref type: %d",
1478                               chain->bref.type);
1479                 }
1480
1481                 /*
1482                  * Flush the children and update the blockrefs in the parent.
1483                  * Be careful of ripouts during the loop.
1484                  */
1485                 next = SPLAY_MIN(hammer2_chain_splay, &chain->shead);
1486                 while ((scan = next) != NULL) {
1487                         next = SPLAY_NEXT(hammer2_chain_splay, &chain->shead,
1488                                           scan);
1489                         if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
1490                                            HAMMER2_CHAIN_MODIFIED |
1491                                            HAMMER2_CHAIN_MOVED)) {
1492                                 hammer2_chain_ref(hmp, scan);
1493                                 hammer2_chain_lock(hmp, scan);
1494                                 bref = base[scan->index];
1495                                 hammer2_chain_flush(hmp, scan, &bref);
1496                                 if (scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
1497                                                    HAMMER2_CHAIN_MODIFIED)) {
1498                                         submodified = 1;
1499                                         kprintf("flush race, sub dirty\n");
1500                                 } else {
1501                                         KKASSERT(scan->index < count);
1502                                         base[scan->index] = bref;
1503                                         if (scan->flags & HAMMER2_CHAIN_MOVED) {
1504                                                 atomic_clear_int(&scan->flags,
1505                                                          HAMMER2_CHAIN_MOVED);
1506                                                 hammer2_chain_drop(hmp, scan);
1507                                         }
1508                                 }
1509                                 hammer2_chain_put(hmp, scan);
1510                         }
1511                 }
1512                 if (submodified == 0) {
1513                         atomic_clear_int(&chain->flags,
1514                                          HAMMER2_CHAIN_SUBMODIFIED);
1515                 }
1516         }
1517
1518         /*
1519          * Flush this chain entry only if it is marked modified.
1520          *
1521          * If the chain entry was moved we must still updated *parent_bref
1522          * or the indirect block won't be adjusted to point to us.
1523          */
1524         if ((chain->flags & HAMMER2_CHAIN_MODIFIED) == 0) {
1525                 if (chain->flags & HAMMER2_CHAIN_MOVED)
1526                         *parent_bref = chain->bref;
1527                 return;
1528         }
1529
1530         /*
1531          * If this is part of a recursive flush we can go ahead and write
1532          * out the buffer cache buffer and pass a new bref back up the chain.
1533          *
1534          * This will never be a volume header.
1535          */
1536         if (parent_bref) {
1537                 hammer2_blockref_t *bref;
1538                 hammer2_off_t off_hi;
1539                 struct buf *bp;
1540                 size_t off_lo;
1541                 size_t bytes;
1542                 int error;
1543
1544                 KKASSERT(chain->data != NULL);
1545                 bref = &chain->bref;
1546
1547                 off_hi = bref->data_off & HAMMER2_OFF_MASK_HI;
1548                 off_lo = (size_t)bref->data_off & HAMMER2_OFF_MASK_LO;
1549                 bytes = 1 << (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
1550                 KKASSERT(off_hi != 0);  /* not the root volume header */
1551
1552                 if (chain->bp) {
1553                         /*
1554                          * The data is mapped directly to the bp and will be
1555                          * written out when the chain is unlocked by the
1556                          * parent.  However, since we are clearing the
1557                          * MODIFIED flag we have to set the FLUSHED flag
1558                          * so the hammer2_chain_unlock() code knows to
1559                          * bdwrite() the buffer.
1560                          */
1561                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_FLUSHED);
1562                 } else {
1563                         /*
1564                          * The data is embedded, we have to acquire the
1565                          * buffer cache buffer and copy the data into it.
1566                          */
1567                         bp = NULL;
1568                         error = bread(hmp->devvp, off_hi,
1569                                       HAMMER2_PBUFSIZE, &bp);
1570                         KKASSERT(error == 0); /* XXX */
1571
1572                         /*
1573                          * Copy the data to the buffer, mark the buffer
1574                          * dirty, and convert the chain to unmodified.
1575                          */
1576                         bcopy(chain->data, (char *)bp->b_data + off_lo, bytes);
1577                         bdwrite(bp);
1578                         bp = NULL;
1579
1580                         chain->bref.check.iscsi32.value =
1581                                         hammer2_icrc32(chain->data, bytes);
1582                 }
1583
1584                 /*
1585                  * Return information on the new block to the parent.
1586                  */
1587                 *parent_bref = chain->bref;
1588                 hammer2_chain_drop(hmp, chain); /* drop ref tracking mod bit */
1589                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
1590         } else {
1591                 hammer2_blockref_t *bref;
1592
1593                 KKASSERT(chain->data != NULL);
1594                 KKASSERT(chain->bp == NULL);
1595                 bref = &chain->bref;
1596
1597                 switch(bref->type) {
1598                 case HAMMER2_BREF_TYPE_VOLUME:
1599                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
1600                                 hammer2_icrc32(
1601                                         (char *)&hmp->voldata +
1602                                          HAMMER2_VOLUME_ICRC1_OFF,
1603                                         HAMMER2_VOLUME_ICRC1_SIZE);
1604                         hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
1605                                 hammer2_icrc32(
1606                                         (char *)&hmp->voldata +
1607                                          HAMMER2_VOLUME_ICRC0_OFF,
1608                                         HAMMER2_VOLUME_ICRC0_SIZE);
1609                         hmp->voldata.icrc_volheader =
1610                                 hammer2_icrc32(
1611                                         (char *)&hmp->voldata +
1612                                          HAMMER2_VOLUME_ICRCVH_OFF,
1613                                         HAMMER2_VOLUME_ICRCVH_SIZE);
1614                         break;
1615                 }
1616         }
1617 }