2 * Copyright (c) 2011-2018 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/fcntl.h>
41 #include <sys/mount.h>
42 #include <sys/vnode.h>
43 #include <sys/mountctl.h>
47 #define FREEMAP_DEBUG 0
49 struct hammer2_fiterate {
56 typedef struct hammer2_fiterate hammer2_fiterate_t;
58 static int hammer2_freemap_try_alloc(hammer2_chain_t **parentp,
59 hammer2_blockref_t *bref, int radix,
60 hammer2_fiterate_t *iter, hammer2_tid_t mtid);
61 static void hammer2_freemap_init(hammer2_dev_t *hmp,
62 hammer2_key_t key, hammer2_chain_t *chain);
63 static int hammer2_bmap_alloc(hammer2_dev_t *hmp,
64 hammer2_bmap_data_t *bmap, uint16_t class,
65 int n, int sub_key, int radix, hammer2_key_t *basep);
66 static int hammer2_freemap_iterate(hammer2_chain_t **parentp,
67 hammer2_chain_t **chainp,
68 hammer2_fiterate_t *iter);
71 * Calculate the device offset for the specified FREEMAP_NODE or FREEMAP_LEAF
72 * bref. Return a combined media offset and physical size radix. Freemap
73 * chains use fixed storage offsets in the 4MB reserved area at the
74 * beginning of each 2GB zone
76 * Rotate between four possibilities. Theoretically this means we have three
77 * good freemaps in case of a crash which we can use as a base for the fixup
82 hammer2_freemap_reserve(hammer2_chain_t *chain, int radix)
84 hammer2_blockref_t *bref = &chain->bref;
91 * Physical allocation size.
93 bytes = (size_t)1 << radix;
96 * Calculate block selection index 0..7 of current block. If this
97 * is the first allocation of the block (verses a modification of an
98 * existing block), we use index 0, otherwise we use the next rotating
101 if ((bref->data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) {
104 off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX &
106 off = off / HAMMER2_PBUFSIZE;
107 KKASSERT(off >= HAMMER2_ZONE_FREEMAP_00 &&
108 off < HAMMER2_ZONE_FREEMAP_END);
109 index = (int)(off - HAMMER2_ZONE_FREEMAP_00) /
110 HAMMER2_ZONE_FREEMAP_INC;
111 KKASSERT(index >= 0 && index < HAMMER2_NFREEMAPS);
112 if (++index == HAMMER2_NFREEMAPS)
117 * Calculate the block offset of the reserved block. This will
118 * point into the 4MB reserved area at the base of the appropriate
119 * 2GB zone, once added to the FREEMAP_x selection above.
121 index_inc = index * HAMMER2_ZONE_FREEMAP_INC;
123 switch(bref->keybits) {
124 /* case HAMMER2_FREEMAP_LEVEL6_RADIX: not applicable */
125 case HAMMER2_FREEMAP_LEVEL5_RADIX: /* 4EB */
126 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
127 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
128 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL5_RADIX) +
129 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
130 HAMMER2_ZONEFM_LEVEL5) * HAMMER2_PBUFSIZE;
132 case HAMMER2_FREEMAP_LEVEL4_RADIX: /* 16PB */
133 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
134 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
135 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL4_RADIX) +
136 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
137 HAMMER2_ZONEFM_LEVEL4) * HAMMER2_PBUFSIZE;
139 case HAMMER2_FREEMAP_LEVEL3_RADIX: /* 64TB */
140 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
141 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
142 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL3_RADIX) +
143 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
144 HAMMER2_ZONEFM_LEVEL3) * HAMMER2_PBUFSIZE;
146 case HAMMER2_FREEMAP_LEVEL2_RADIX: /* 256GB */
147 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
148 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
149 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL2_RADIX) +
150 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
151 HAMMER2_ZONEFM_LEVEL2) * HAMMER2_PBUFSIZE;
153 case HAMMER2_FREEMAP_LEVEL1_RADIX: /* 1GB */
154 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
155 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
156 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
157 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
158 HAMMER2_ZONEFM_LEVEL1) * HAMMER2_PBUFSIZE;
161 panic("freemap: bad radix(2) %p %d\n", bref, bref->keybits);
163 off = (hammer2_off_t)-1;
166 bref->data_off = off | radix;
168 kprintf("FREEMAP BLOCK TYPE %d %016jx/%d DATA_OFF=%016jx\n",
169 bref->type, bref->key, bref->keybits, bref->data_off);
175 * Normal freemap allocator
177 * Use available hints to allocate space using the freemap. Create missing
178 * freemap infrastructure on-the-fly as needed (including marking initial
179 * allocations using the iterator as allocated, instantiating new 2GB zones,
180 * and dealing with the end-of-media edge case).
182 * ip and bpref are only used as a heuristic to determine locality of
183 * reference. bref->key may also be used heuristically.
185 * This function is a NOP if bytes is 0.
188 hammer2_freemap_alloc(hammer2_chain_t *chain, size_t bytes)
190 hammer2_dev_t *hmp = chain->hmp;
191 hammer2_blockref_t *bref = &chain->bref;
192 hammer2_chain_t *parent;
197 hammer2_fiterate_t iter;
200 * If allocating or downsizing to zero we just get rid of whatever
204 chain->bref.data_off = 0;
209 mtid = hammer2_trans_sub(hmp->spmp);
212 * Validate the allocation size. It must be a power of 2.
214 * For now require that the caller be aware of the minimum
217 radix = hammer2_getradix(bytes);
218 KKASSERT((size_t)1 << radix == bytes);
220 if (bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
221 bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
223 * Freemap blocks themselves are assigned from the reserve
224 * area, not allocated from the freemap.
226 error = hammer2_freemap_reserve(chain, radix);
231 KKASSERT(bytes >= HAMMER2_ALLOC_MIN && bytes <= HAMMER2_ALLOC_MAX);
234 * Calculate the starting point for our allocation search.
236 * Each freemap leaf is dedicated to a specific freemap_radix.
237 * The freemap_radix can be more fine-grained than the device buffer
238 * radix which results in inodes being grouped together in their
239 * own segment, terminal-data (16K or less) and initial indirect
240 * block being grouped together, and then full-indirect and full-data
241 * blocks (64K) being grouped together.
243 * The single most important aspect of this is the inode grouping
244 * because that is what allows 'find' and 'ls' and other filesystem
245 * topology operations to run fast.
248 if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
249 bpref = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
250 else if (trans->tmp_bpref)
251 bpref = trans->tmp_bpref;
252 else if (trans->tmp_ip)
253 bpref = trans->tmp_ip->chain->bref.data_off;
257 * Heuristic tracking index. We would like one for each distinct
258 * bref type if possible. heur_freemap[] has room for two classes
259 * for each type. At a minimum we have to break-up our heuristic
260 * by device block sizes.
262 hindex = HAMMER2_PBUFRADIX - HAMMER2_LBUFRADIX;
263 KKASSERT(hindex < HAMMER2_FREEMAP_HEUR_NRADIX);
264 hindex += bref->type * HAMMER2_FREEMAP_HEUR_NRADIX;
265 hindex &= HAMMER2_FREEMAP_HEUR_TYPES * HAMMER2_FREEMAP_HEUR_NRADIX - 1;
266 KKASSERT(hindex < HAMMER2_FREEMAP_HEUR_SIZE);
268 iter.bpref = hmp->heur_freemap[hindex];
269 iter.relaxed = hmp->freemap_relaxed;
272 * Make sure bpref is in-bounds. It's ok if bpref covers a zone's
273 * reserved area, the try code will iterate past it.
275 if (iter.bpref > hmp->voldata.volu_size)
276 iter.bpref = hmp->voldata.volu_size - 1;
279 * Iterate the freemap looking for free space before and after.
281 parent = &hmp->fchain;
282 hammer2_chain_ref(parent);
283 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
284 error = HAMMER2_ERROR_EAGAIN;
285 iter.bnext = iter.bpref;
288 while (error == HAMMER2_ERROR_EAGAIN) {
289 error = hammer2_freemap_try_alloc(&parent, bref, radix,
292 hmp->freemap_relaxed |= iter.relaxed; /* heuristical, SMP race ok */
293 hmp->heur_freemap[hindex] = iter.bnext;
294 hammer2_chain_unlock(parent);
295 hammer2_chain_drop(parent);
301 hammer2_freemap_try_alloc(hammer2_chain_t **parentp,
302 hammer2_blockref_t *bref, int radix,
303 hammer2_fiterate_t *iter, hammer2_tid_t mtid)
305 hammer2_dev_t *hmp = (*parentp)->hmp;
306 hammer2_off_t l0size;
307 hammer2_off_t l1size;
308 hammer2_off_t l1mask;
309 hammer2_key_t key_dummy;
310 hammer2_chain_t *chain;
317 * Calculate the number of bytes being allocated, the number
318 * of contiguous bits of bitmap being allocated, and the bitmap
321 * WARNING! cpu hardware may mask bits == 64 -> 0 and blow up the
324 bytes = (size_t)1 << radix;
325 class = (bref->type << 8) | HAMMER2_PBUFRADIX;
328 * Lookup the level1 freemap chain, creating and initializing one
329 * if necessary. Intermediate levels will be created automatically
330 * when necessary by hammer2_chain_create().
332 key = H2FMBASE(iter->bnext, HAMMER2_FREEMAP_LEVEL1_RADIX);
333 l0size = HAMMER2_FREEMAP_LEVEL0_SIZE;
334 l1size = HAMMER2_FREEMAP_LEVEL1_SIZE;
337 chain = hammer2_chain_lookup(parentp, &key_dummy, key, key + l1mask,
339 HAMMER2_LOOKUP_ALWAYS |
340 HAMMER2_LOOKUP_MATCHIND);
344 * Create the missing leaf, be sure to initialize
345 * the auxillary freemap tracking information in
346 * the bref.check.freemap structure.
349 kprintf("freemap create L1 @ %016jx bpref %016jx\n",
352 error = hammer2_chain_create(parentp, &chain, NULL, hmp->spmp,
353 HAMMER2_METH_DEFAULT,
354 key, HAMMER2_FREEMAP_LEVEL1_RADIX,
355 HAMMER2_BREF_TYPE_FREEMAP_LEAF,
356 HAMMER2_FREEMAP_LEVELN_PSIZE,
358 KKASSERT(error == 0);
360 hammer2_chain_modify(chain, mtid, 0, 0);
361 bzero(&chain->data->bmdata[0],
362 HAMMER2_FREEMAP_LEVELN_PSIZE);
363 chain->bref.check.freemap.bigmask = (uint32_t)-1;
364 chain->bref.check.freemap.avail = l1size;
365 /* bref.methods should already be inherited */
367 hammer2_freemap_init(hmp, key, chain);
369 } else if (chain->error) {
371 * Error during lookup.
373 kprintf("hammer2_freemap_try_alloc: %016jx: error %s\n",
374 (intmax_t)bref->data_off,
375 hammer2_error_str(chain->error));
376 error = HAMMER2_ERROR_EIO;
377 } else if ((chain->bref.check.freemap.bigmask &
378 ((size_t)1 << radix)) == 0) {
380 * Already flagged as not having enough space
382 error = HAMMER2_ERROR_ENOSPC;
385 * Modify existing chain to setup for adjustment.
387 hammer2_chain_modify(chain, mtid, 0, 0);
394 hammer2_bmap_data_t *bmap;
395 hammer2_key_t base_key;
400 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
401 start = (int)((iter->bnext - key) >>
402 HAMMER2_FREEMAP_LEVEL0_RADIX);
403 KKASSERT(start >= 0 && start < HAMMER2_FREEMAP_COUNT);
404 hammer2_chain_modify(chain, mtid, 0, 0);
406 error = HAMMER2_ERROR_ENOSPC;
407 for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) {
410 if (start + count >= HAMMER2_FREEMAP_COUNT &&
416 * Calculate bmap pointer from thart starting index
419 * NOTE: bmap pointer is invalid if n >= FREEMAP_COUNT.
422 bmap = &chain->data->bmdata[n];
424 if (n >= HAMMER2_FREEMAP_COUNT) {
426 } else if (bmap->avail) {
428 } else if (radix < HAMMER2_FREEMAP_BLOCK_RADIX &&
429 (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK)) {
436 * Try to allocate from a matching freemap class
437 * superblock. If we are in relaxed mode we allocate
438 * from any freemap class superblock.
441 (bmap->class == 0 || bmap->class == class ||
443 base_key = key + n * l0size;
444 error = hammer2_bmap_alloc(hmp, bmap,
449 if (error != HAMMER2_ERROR_ENOSPC) {
456 * Calculate bmap pointer from thart starting index
457 * backwards (locality).
459 * Must recalculate after potentially having called
460 * hammer2_bmap_alloc() above in case chain was
463 * NOTE: bmap pointer is invalid if n < 0.
466 bmap = &chain->data->bmdata[n];
469 } else if (bmap->avail) {
471 } else if (radix < HAMMER2_FREEMAP_BLOCK_RADIX &&
472 (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK)) {
479 * Try to allocate from a matching freemap class
480 * superblock. If we are in relaxed mode we allocate
481 * from any freemap class superblock.
484 (bmap->class == 0 || bmap->class == class ||
486 base_key = key + n * l0size;
487 error = hammer2_bmap_alloc(hmp, bmap,
492 if (error != HAMMER2_ERROR_ENOSPC) {
500 * We only know for sure that we can clear the bitmap bit
501 * if we scanned the entire array (start == 0).
503 if (error == HAMMER2_ERROR_ENOSPC && start == 0) {
504 chain->bref.check.freemap.bigmask &=
505 (uint32_t)~((size_t)1 << radix);
507 /* XXX also scan down from original count */
512 * Assert validity. Must be beyond the static allocator used
513 * by newfs_hammer2 (and thus also beyond the aux area),
514 * not go past the volume size, and must not be in the
515 * reserved segment area for a zone.
517 KKASSERT(key >= hmp->voldata.allocator_beg &&
518 key + bytes <= hmp->voldata.volu_size);
519 KKASSERT((key & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
520 bref->data_off = key | radix;
523 * Record dedupability. The dedup bits are cleared
524 * when bulkfree transitions the freemap from 11->10,
525 * and asserted to be clear on the 10->00 transition.
527 * We must record the bitmask with the chain locked
528 * at the time we set the allocation bits to avoid
531 if (bref->type == HAMMER2_BREF_TYPE_DATA)
532 hammer2_io_dedup_set(hmp, bref);
534 kprintf("alloc cp=%p %016jx %016jx using %016jx\n",
536 bref->key, bref->data_off, chain->bref.data_off);
538 } else if (error == HAMMER2_ERROR_ENOSPC) {
540 * Return EAGAIN with next iteration in iter->bnext, or
541 * return ENOSPC if the allocation map has been exhausted.
543 error = hammer2_freemap_iterate(parentp, &chain, iter);
550 hammer2_chain_unlock(chain);
551 hammer2_chain_drop(chain);
557 * Allocate (1<<radix) bytes from the bmap whos base data offset is (*basep).
559 * If the linear iterator is mid-block we use it directly (the bitmap should
560 * already be marked allocated), otherwise we search for a block in the
561 * bitmap that fits the allocation request.
563 * A partial bitmap allocation sets the minimum bitmap granularity (16KB)
564 * to fully allocated and adjusts the linear allocator to allow the
565 * remaining space to be allocated.
567 * sub_key is the lower 32 bits of the chain->bref.key for the chain whos
568 * bref is being allocated. If the radix represents an allocation >= 16KB
569 * (aka HAMMER2_FREEMAP_BLOCK_RADIX) we try to use this key to select the
570 * blocks directly out of the bmap.
574 hammer2_bmap_alloc(hammer2_dev_t *hmp, hammer2_bmap_data_t *bmap,
575 uint16_t class, int n, int sub_key,
576 int radix, hammer2_key_t *basep)
581 hammer2_bitmap_t bmmask;
587 * Take into account 2-bits per block when calculating bmradix.
589 size = (size_t)1 << radix;
591 if (radix <= HAMMER2_FREEMAP_BLOCK_RADIX) {
593 /* (16K) 2 bits per allocation block */
595 bmradix = (hammer2_bitmap_t)2 <<
596 (radix - HAMMER2_FREEMAP_BLOCK_RADIX);
597 /* (32K-256K) 4, 8, 16, 32 bits per allocation block */
601 * Use the linear iterator to pack small allocations, otherwise
602 * fall-back to finding a free 16KB chunk. The linear iterator
603 * is only valid when *NOT* on a freemap chunking boundary (16KB).
604 * If it is the bitmap must be scanned. It can become invalid
605 * once we pack to the boundary. We adjust it after a bitmap
606 * allocation only for sub-16KB allocations (so the perfectly good
607 * previous value can still be used for fragments when 16KB+
608 * allocations are made inbetween fragmentary allocations).
610 * Beware of hardware artifacts when bmradix == 64 (intermediate
611 * result can wind up being '1' instead of '0' if hardware masks
614 * NOTE: j needs to be even in the j= calculation. As an artifact
615 * of the /2 division, our bitmask has to clear bit 0.
617 * NOTE: TODO this can leave little unallocatable fragments lying
620 if (((uint32_t)bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK) + size <=
621 HAMMER2_FREEMAP_BLOCK_SIZE &&
622 (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK) &&
623 bmap->linear < HAMMER2_SEGSIZE) {
625 * Use linear iterator if it is not block-aligned to avoid
628 * Calculate the bitmapq[] index (i) and calculate the
629 * shift count within the 64-bit bitmapq[] entry.
631 * The freemap block size is 16KB, but each bitmap
632 * entry is two bits so use a little trick to get
633 * a (j) shift of 0, 2, 4, ... 62 in 16KB chunks.
635 KKASSERT(bmap->linear >= 0 &&
636 bmap->linear + size <= HAMMER2_SEGSIZE &&
637 (bmap->linear & (HAMMER2_ALLOC_MIN - 1)) == 0);
638 offset = bmap->linear;
639 i = offset / (HAMMER2_SEGSIZE / HAMMER2_BMAP_ELEMENTS);
640 j = (offset / (HAMMER2_FREEMAP_BLOCK_SIZE / 2)) & 62;
641 bmmask = (bmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
642 HAMMER2_BMAP_ALLONES :
643 ((hammer2_bitmap_t)1 << bmradix) - 1;
645 bmap->linear = offset + size;
648 * Try to index a starting point based on sub_key. This
649 * attempts to restore sequential block ordering on-disk
650 * whenever possible, even if data is committed out of
653 * i - Index bitmapq[], full data range represented is
656 * j - Index within bitmapq[i], full data range represented is
657 * HAMMER2_BMAP_INDEX_SIZE.
665 case HAMMER2_BREF_TYPE_DATA:
666 if (radix >= HAMMER2_FREEMAP_BLOCK_RADIX) {
667 i = (sub_key & HAMMER2_BMAP_MASK) /
668 (HAMMER2_BMAP_SIZE / HAMMER2_BMAP_ELEMENTS);
669 j = (sub_key & HAMMER2_BMAP_INDEX_MASK) /
670 (HAMMER2_BMAP_INDEX_SIZE /
671 HAMMER2_BMAP_BLOCKS_PER_ELEMENT);
675 case HAMMER2_BREF_TYPE_INODE:
681 KKASSERT(i < HAMMER2_BMAP_ELEMENTS &&
682 j < 2 * HAMMER2_BMAP_BLOCKS_PER_ELEMENT);
683 KKASSERT(j + bmradix <= HAMMER2_BMAP_BITS_PER_ELEMENT);
684 bmmask = (bmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
685 HAMMER2_BMAP_ALLONES :
686 ((hammer2_bitmap_t)1 << bmradix) - 1;
689 if ((bmap->bitmapq[i] & bmmask) == 0)
694 * General element scan.
696 * WARNING: (j) is iterating a bit index (by 2's)
698 for (i = 0; i < HAMMER2_BMAP_ELEMENTS; ++i) {
699 bmmask = (bmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
700 HAMMER2_BMAP_ALLONES :
701 ((hammer2_bitmap_t)1 << bmradix) - 1;
703 j < HAMMER2_BMAP_BITS_PER_ELEMENT;
705 if ((bmap->bitmapq[i] & bmmask) == 0)
710 /*fragments might remain*/
711 /*KKASSERT(bmap->avail == 0);*/
712 return (HAMMER2_ERROR_ENOSPC);
714 offset = i * (HAMMER2_SEGSIZE / HAMMER2_BMAP_ELEMENTS) +
715 (j * (HAMMER2_FREEMAP_BLOCK_SIZE / 2));
716 if (size & HAMMER2_FREEMAP_BLOCK_MASK)
717 bmap->linear = offset + size;
720 /* 8 x (64/2) -> 256 x 16K -> 4MB */
721 KKASSERT(i >= 0 && i < HAMMER2_BMAP_ELEMENTS);
724 * Optimize the buffer cache to avoid unnecessary read-before-write
727 * The device block size could be larger than the allocation size
728 * so the actual bitmap test is somewhat more involved. We have
729 * to use a compatible buffer size for this operation.
731 if ((bmap->bitmapq[i] & bmmask) == 0 &&
732 HAMMER2_PBUFSIZE != size) {
733 size_t psize = HAMMER2_PBUFSIZE;
734 hammer2_off_t pmask = (hammer2_off_t)psize - 1;
735 int pbmradix = (hammer2_bitmap_t)2 <<
737 HAMMER2_FREEMAP_BLOCK_RADIX);
738 hammer2_bitmap_t pbmmask;
739 int pradix = hammer2_getradix(psize);
741 pbmmask = (pbmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
742 HAMMER2_BMAP_ALLONES :
743 ((hammer2_bitmap_t)1 << pbmradix) - 1;
744 while ((pbmmask & bmmask) == 0)
745 pbmmask <<= pbmradix;
748 kprintf("%016jx mask %016jx %016jx %016jx (%zd/%zd)\n",
749 *basep + offset, bmap->bitmapq[i],
750 pbmmask, bmmask, size, psize);
753 if ((bmap->bitmapq[i] & pbmmask) == 0) {
756 hammer2_io_newnz(hmp, class >> 8,
757 (*basep + (offset & ~pmask)) |
758 pradix, psize, &dio);
759 hammer2_io_putblk(&dio);
765 * When initializing a new inode segment also attempt to initialize
766 * an adjacent segment. Be careful not to index beyond the array
769 * We do this to try to localize inode accesses to improve
770 * directory scan rates. XXX doesn't improve scan rates.
772 if (size == HAMMER2_INODE_BYTES) {
774 if (bmap[-1].radix == 0 && bmap[-1].avail)
775 bmap[-1].radix = radix;
777 if (bmap[1].radix == 0 && bmap[1].avail)
778 bmap[1].radix = radix;
783 * Calculate the bitmap-granular change in bgsize for the volume
784 * header. We cannot use the fine-grained change here because
785 * the bulkfree code can't undo it. If the bitmap element is already
786 * marked allocated it has already been accounted for.
788 if (radix < HAMMER2_FREEMAP_BLOCK_RADIX) {
789 if (bmap->bitmapq[i] & bmmask)
792 bgsize = HAMMER2_FREEMAP_BLOCK_SIZE;
798 * Adjust the bitmap, set the class (it might have been 0),
799 * and available bytes, update the allocation offset (*basep)
800 * from the L0 base to the actual offset.
802 * Do not override the class if doing a relaxed class allocation.
804 * avail must reflect the bitmap-granular availability. The allocator
805 * tests will also check the linear iterator.
807 bmap->bitmapq[i] |= bmmask;
808 if (bmap->class == 0)
810 bmap->avail -= bgsize;
814 * Adjust the volume header's allocator_free parameter. This
815 * parameter has to be fixed up by bulkfree which has no way to
816 * figure out sub-16K chunking, so it must be adjusted by the
817 * bitmap-granular size.
820 hammer2_voldata_lock(hmp);
821 hammer2_voldata_modify(hmp);
822 hmp->voldata.allocator_free -= bgsize;
823 hammer2_voldata_unlock(hmp);
830 * Initialize a freemap for the storage area (in bytes) that begins at (key).
834 hammer2_freemap_init(hammer2_dev_t *hmp, hammer2_key_t key,
835 hammer2_chain_t *chain)
837 hammer2_off_t l1size;
840 hammer2_bmap_data_t *bmap;
844 * LEVEL1 is 1GB, there are two level1 1GB freemaps per 2GB zone.
846 l1size = HAMMER2_FREEMAP_LEVEL1_SIZE;
849 * Calculate the portion of the 1GB map that should be initialized
850 * as free. Portions below or after will be initialized as allocated.
851 * SEGMASK-align the areas so we don't have to worry about sub-scans
852 * or endianess when using memset.
854 * WARNING! It is possible for lokey to be larger than hikey if the
855 * entire 2GB segment is within the static allocation.
858 * (1) Ensure that all statically allocated space from newfs_hammer2
859 * is marked allocated, and take it up to the level1 base for
862 lokey = (hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) &
864 if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX))
865 lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX);
868 * (2) Ensure that the reserved area is marked allocated (typically
869 * the first 4MB of each 2GB area being represented). Since
870 * each LEAF represents 1GB of storage and the zone is 2GB, we
871 * have to adjust lowkey upward every other LEAF sequentially.
873 if (lokey < H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64)
874 lokey = H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64;
877 * (3) Ensure that any trailing space at the end-of-volume is marked
880 hikey = key + HAMMER2_FREEMAP_LEVEL1_SIZE;
881 if (hikey > hmp->voldata.volu_size) {
882 hikey = hmp->voldata.volu_size & ~HAMMER2_SEGMASK64;
886 * Heuristic highest possible value
888 chain->bref.check.freemap.avail = HAMMER2_FREEMAP_LEVEL1_SIZE;
889 bmap = &chain->data->bmdata[0];
892 * Initialize bitmap (bzero'd by caller)
894 for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) {
895 if (key < lokey || key >= hikey) {
896 memset(bmap->bitmapq, -1,
897 sizeof(bmap->bitmapq));
899 bmap->linear = HAMMER2_SEGSIZE;
900 chain->bref.check.freemap.avail -=
901 HAMMER2_FREEMAP_LEVEL0_SIZE;
903 bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
905 key += HAMMER2_FREEMAP_LEVEL0_SIZE;
911 * The current Level 1 freemap has been exhausted, iterate to the next
912 * one, return ENOSPC if no freemaps remain.
914 * At least two loops are required. If we are not in relaxed mode and
915 * we run out of storage we enter relaxed mode and do a third loop.
916 * The relaxed mode is recorded back in the hmp so once we enter the mode
917 * we remain relaxed until stuff begins to get freed and only do 2 loops.
919 * XXX this should rotate back to the beginning to handle freed-up space
920 * XXX or use intermediate entries to locate free space. TODO
923 hammer2_freemap_iterate(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
924 hammer2_fiterate_t *iter)
926 hammer2_dev_t *hmp = (*parentp)->hmp;
928 iter->bnext &= ~HAMMER2_FREEMAP_LEVEL1_MASK;
929 iter->bnext += HAMMER2_FREEMAP_LEVEL1_SIZE;
930 if (iter->bnext >= hmp->voldata.volu_size) {
932 if (++iter->loops >= 2) {
933 if (iter->relaxed == 0)
936 return (HAMMER2_ERROR_ENOSPC);
939 return(HAMMER2_ERROR_EAGAIN);
943 * Adjust the bit-pattern for data in the freemap bitmap according to
944 * (how). This code is called from on-mount recovery to fixup (mark
945 * as allocated) blocks whos freemap upates might not have been committed
946 * in the last crash and is used by the bulk freemap scan to stage frees.
948 * WARNING! Cannot be called with a empty-data bref (radix == 0).
950 * XXX currently disabled when how == 0 (the normal real-time case). At
951 * the moment we depend on the bulk freescan to actually free blocks. It
952 * will still call this routine with a non-zero how to stage possible frees
953 * and to do the actual free.
956 hammer2_freemap_adjust(hammer2_dev_t *hmp, hammer2_blockref_t *bref,
959 hammer2_off_t data_off = bref->data_off;
960 hammer2_chain_t *chain;
961 hammer2_chain_t *parent;
962 hammer2_bmap_data_t *bmap;
964 hammer2_key_t key_dummy;
965 hammer2_off_t l0size;
966 hammer2_off_t l1size;
967 hammer2_off_t l1mask;
969 hammer2_bitmap_t *bitmap;
970 const hammer2_bitmap_t bmmask00 = 0;
971 hammer2_bitmap_t bmmask01;
972 hammer2_bitmap_t bmmask10;
973 hammer2_bitmap_t bmmask11;
983 KKASSERT(how == HAMMER2_FREEMAP_DORECOVER);
986 mtid = hammer2_trans_sub(hmp->spmp);
988 radix = (int)data_off & HAMMER2_OFF_MASK_RADIX;
989 KKASSERT(radix != 0);
990 data_off &= ~HAMMER2_OFF_MASK_RADIX;
991 KKASSERT(radix <= HAMMER2_RADIX_MAX);
994 bytes = (size_t)1 << radix;
997 class = (bref->type << 8) | HAMMER2_PBUFRADIX;
1000 * We can't adjust the freemap for data allocations made by
1003 if (data_off < hmp->voldata.allocator_beg)
1006 KKASSERT((data_off & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
1009 * Lookup the level1 freemap chain. The chain must exist.
1011 key = H2FMBASE(data_off, HAMMER2_FREEMAP_LEVEL1_RADIX);
1012 l0size = HAMMER2_FREEMAP_LEVEL0_SIZE;
1013 l1size = HAMMER2_FREEMAP_LEVEL1_SIZE;
1014 l1mask = l1size - 1;
1016 parent = &hmp->fchain;
1017 hammer2_chain_ref(parent);
1018 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
1020 chain = hammer2_chain_lookup(&parent, &key_dummy, key, key + l1mask,
1022 HAMMER2_LOOKUP_ALWAYS |
1023 HAMMER2_LOOKUP_MATCHIND);
1026 * Stop early if we are trying to free something but no leaf exists.
1028 if (chain == NULL && how != HAMMER2_FREEMAP_DORECOVER) {
1029 kprintf("hammer2_freemap_adjust: %016jx: no chain\n",
1030 (intmax_t)bref->data_off);
1034 kprintf("hammer2_freemap_adjust: %016jx: error %s\n",
1035 (intmax_t)bref->data_off,
1036 hammer2_error_str(chain->error));
1037 hammer2_chain_unlock(chain);
1038 hammer2_chain_drop(chain);
1044 * Create any missing leaf(s) if we are doing a recovery (marking
1045 * the block(s) as being allocated instead of being freed). Be sure
1046 * to initialize the auxillary freemap tracking info in the
1047 * bref.check.freemap structure.
1049 if (chain == NULL && how == HAMMER2_FREEMAP_DORECOVER) {
1050 error = hammer2_chain_create(&parent, &chain, NULL, hmp->spmp,
1051 HAMMER2_METH_DEFAULT,
1052 key, HAMMER2_FREEMAP_LEVEL1_RADIX,
1053 HAMMER2_BREF_TYPE_FREEMAP_LEAF,
1054 HAMMER2_FREEMAP_LEVELN_PSIZE,
1057 if (hammer2_debug & 0x0040) {
1058 kprintf("fixup create chain %p %016jx:%d\n",
1059 chain, chain->bref.key, chain->bref.keybits);
1063 error = hammer2_chain_modify(chain, mtid, 0, 0);
1064 KKASSERT(error == 0);
1065 bzero(&chain->data->bmdata[0],
1066 HAMMER2_FREEMAP_LEVELN_PSIZE);
1067 chain->bref.check.freemap.bigmask = (uint32_t)-1;
1068 chain->bref.check.freemap.avail = l1size;
1069 /* bref.methods should already be inherited */
1071 hammer2_freemap_init(hmp, key, chain);
1073 /* XXX handle error */
1077 kprintf("FREEMAP ADJUST TYPE %d %016jx/%d DATA_OFF=%016jx\n",
1078 chain->bref.type, chain->bref.key,
1079 chain->bref.keybits, chain->bref.data_off);
1083 * Calculate the bitmask (runs in 2-bit pairs).
1085 start = ((int)(data_off >> HAMMER2_FREEMAP_BLOCK_RADIX) & 15) * 2;
1086 bmmask01 = (hammer2_bitmap_t)1 << start;
1087 bmmask10 = (hammer2_bitmap_t)2 << start;
1088 bmmask11 = (hammer2_bitmap_t)3 << start;
1091 * Fixup the bitmap. Partial blocks cannot be fully freed unless
1092 * a bulk scan is able to roll them up.
1094 if (radix < HAMMER2_FREEMAP_BLOCK_RADIX) {
1096 if (how == HAMMER2_FREEMAP_DOREALFREE)
1097 how = HAMMER2_FREEMAP_DOMAYFREE;
1099 count = 1 << (radix - HAMMER2_FREEMAP_BLOCK_RADIX);
1103 * [re]load the bmap and bitmap pointers. Each bmap entry covers
1104 * a 4MB swath. The bmap itself (LEVEL1) covers 2GB.
1106 * Be sure to reset the linear iterator to ensure that the adjustment
1110 bmap = &chain->data->bmdata[(int)(data_off >> HAMMER2_SEGRADIX) &
1111 (HAMMER2_FREEMAP_COUNT - 1)];
1112 bitmap = &bmap->bitmapq[(int)(data_off >> (HAMMER2_SEGRADIX - 3)) & 7];
1119 if (how == HAMMER2_FREEMAP_DORECOVER) {
1121 * Recovery request, mark as allocated.
1123 if ((*bitmap & bmmask11) != bmmask11) {
1124 if (modified == 0) {
1125 hammer2_chain_modify(chain, mtid, 0, 0);
1129 if ((*bitmap & bmmask11) == bmmask00) {
1131 HAMMER2_FREEMAP_BLOCK_SIZE;
1132 bgsize += HAMMER2_FREEMAP_BLOCK_SIZE;
1134 if (bmap->class == 0)
1135 bmap->class = class;
1136 *bitmap |= bmmask11;
1137 if (hammer2_debug & 0x0040) {
1138 kprintf("hammer2_freemap_adjust: "
1140 "block=%016jx/%zd\n",
1141 bref->type, data_off, bytes);
1145 kprintf("hammer2_freemap_adjust: good "
1146 "type=%02x block=%016jx/%zd\n",
1147 bref->type, data_off, bytes);
1153 * XXX this stuff doesn't work, avail is miscalculated and
1154 * code 10 means something else now.
1156 else if ((*bitmap & bmmask11) == bmmask11) {
1158 * Mayfree/Realfree request and bitmap is currently
1159 * marked as being fully allocated.
1162 hammer2_chain_modify(chain, 0);
1166 if (how == HAMMER2_FREEMAP_DOREALFREE)
1167 *bitmap &= ~bmmask11;
1169 *bitmap = (*bitmap & ~bmmask11) | bmmask10;
1170 } else if ((*bitmap & bmmask11) == bmmask10) {
1172 * Mayfree/Realfree request and bitmap is currently
1173 * marked as being possibly freeable.
1175 if (how == HAMMER2_FREEMAP_DOREALFREE) {
1177 hammer2_chain_modify(chain, 0);
1181 *bitmap &= ~bmmask11;
1185 * 01 - Not implemented, currently illegal state
1186 * 00 - Not allocated at all, illegal free.
1188 panic("hammer2_freemap_adjust: "
1189 "Illegal state %08x(%08x)",
1190 *bitmap, *bitmap & bmmask11);
1198 #if HAMMER2_BMAP_ELEMENTS != 8
1199 #error "hammer2_freemap.c: HAMMER2_BMAP_ELEMENTS expected to be 8"
1201 if (how == HAMMER2_FREEMAP_DOREALFREE && modified) {
1202 bmap->avail += 1 << radix;
1203 KKASSERT(bmap->avail <= HAMMER2_SEGSIZE);
1204 if (bmap->avail == HAMMER2_SEGSIZE &&
1205 bmap->bitmapq[0] == 0 &&
1206 bmap->bitmapq[1] == 0 &&
1207 bmap->bitmapq[2] == 0 &&
1208 bmap->bitmapq[3] == 0 &&
1209 bmap->bitmapq[4] == 0 &&
1210 bmap->bitmapq[5] == 0 &&
1211 bmap->bitmapq[6] == 0 &&
1212 bmap->bitmapq[7] == 0) {
1213 key = H2FMBASE(data_off, HAMMER2_FREEMAP_LEVEL0_RADIX);
1214 kprintf("Freeseg %016jx\n", (intmax_t)key);
1220 * chain->bref.check.freemap.bigmask (XXX)
1222 * Setting bigmask is a hint to the allocation code that there might
1223 * be something allocatable. We also set this in recovery... it
1224 * doesn't hurt and we might want to use the hint for other validation
1225 * operations later on.
1227 * We could calculate the largest possible allocation and set the
1228 * radii that could fit, but its easier just to set bigmask to -1.
1231 chain->bref.check.freemap.bigmask = -1;
1232 hmp->freemap_relaxed = 0; /* reset heuristic */
1235 hammer2_chain_unlock(chain);
1236 hammer2_chain_drop(chain);
1238 hammer2_chain_unlock(parent);
1239 hammer2_chain_drop(parent);
1242 hammer2_voldata_lock(hmp);
1243 hammer2_voldata_modify(hmp);
1244 hmp->voldata.allocator_free -= bgsize;
1245 hammer2_voldata_unlock(hmp);
1250 * Validate the freemap, in three stages.
1252 * stage-1 ALLOCATED -> POSSIBLY FREE
1253 * POSSIBLY FREE -> POSSIBLY FREE (type corrected)
1255 * This transitions bitmap entries from ALLOCATED to POSSIBLY FREE.
1256 * The POSSIBLY FREE state does not mean that a block is actually free
1257 * and may be transitioned back to ALLOCATED in stage-2.
1259 * This is typically done during normal filesystem operations when
1260 * something is deleted or a block is replaced.
1262 * This is done by bulkfree in-bulk after a memory-bounded meta-data
1263 * scan to try to determine what might be freeable.
1265 * This can be done unconditionally through a freemap scan when the
1266 * intention is to brute-force recover the proper state of the freemap.
1268 * stage-2 POSSIBLY FREE -> ALLOCATED (scan metadata topology)
1270 * This is done by bulkfree during a meta-data scan to ensure that
1271 * all blocks still actually allocated by the filesystem are marked
1274 * NOTE! Live filesystem transitions to POSSIBLY FREE can occur while
1275 * the bulkfree stage-2 and stage-3 is running. The live filesystem
1276 * will use the alternative POSSIBLY FREE type (2) to prevent
1277 * stage-3 from improperly transitioning unvetted possibly-free
1280 * stage-3 POSSIBLY FREE (type 1) -> FREE (scan freemap)
1282 * This is done by bulkfree to finalize POSSIBLY FREE states.