2 * Copyright (c) 2011-2013 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/fcntl.h>
41 #include <sys/namei.h>
42 #include <sys/mount.h>
43 #include <sys/vnode.h>
44 #include <sys/mountctl.h>
48 #define USE_SIMPLE_ALLOC 0
52 static int hammer2_freemap_try_alloc(hammer2_trans_t *trans,
53 hammer2_chain_t **parentp, hammer2_blockref_t *bref,
54 int radix, hammer2_off_t bpref, hammer2_off_t *bnext);
55 static int hammer2_freemap_iterate(hammer2_trans_t *trans,
56 hammer2_chain_t **parentp, hammer2_chain_t **chainp,
57 hammer2_off_t bpref, hammer2_off_t *bnextp);
63 hammer2_freemapradix(int radix)
69 * Calculate the device offset for the specified FREEMAP_NODE or FREEMAP_LEAF
70 * bref. Return a combined media offset and physical size radix. Freemap
71 * chains use fixed storage offsets in the 4MB reserved area at the
72 * beginning of each 2GB zone
74 * Rotate between four possibilities. Theoretically this means we have three
75 * good freemaps in case of a crash which we can use as a base for the fixup
78 #define H2FMBASE(key, radix) ((key) & ~(((hammer2_off_t)1 << (radix)) - 1))
79 #define H2FMSHIFT(radix) ((hammer2_off_t)1 << (radix))
83 hammer2_freemap_reserve(hammer2_mount_t *hmp, hammer2_blockref_t *bref,
90 * Physical allocation size -> radix. Typically either 256 for
91 * a level 0 freemap leaf or 65536 for a level N freemap node.
93 * NOTE: A 256 byte bitmap represents 256 x 8 x 1024 = 2MB of storage.
94 * Do not use hammer2_allocsize() here as it has a min cap.
99 * Adjust by HAMMER2_ZONE_FREEMAP_{A,B,C,D} using the existing
100 * offset as a basis. Start in zone A if previously unallocated.
102 if ((bref->data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) {
103 off = HAMMER2_ZONE_FREEMAP_A;
105 off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX &
106 (((hammer2_off_t)1 << HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
107 off = off / HAMMER2_PBUFSIZE;
108 KKASSERT(off >= HAMMER2_ZONE_FREEMAP_A);
109 KKASSERT(off < HAMMER2_ZONE_FREEMAP_D + 8);
111 if (off >= HAMMER2_ZONE_FREEMAP_D)
112 off = HAMMER2_ZONE_FREEMAP_A;
113 else if (off >= HAMMER2_ZONE_FREEMAP_C)
114 off = HAMMER2_ZONE_FREEMAP_D;
115 else if (off >= HAMMER2_ZONE_FREEMAP_B)
116 off = HAMMER2_ZONE_FREEMAP_C;
118 off = HAMMER2_ZONE_FREEMAP_B;
120 off = off * HAMMER2_PBUFSIZE;
123 * Calculate the block offset of the reserved block. This will
124 * point into the 4MB reserved area at the base of the appropriate
125 * 2GB zone, once added to the FREEMAP_x selection above.
127 switch(bref->keybits) {
128 /* case HAMMER2_FREEMAP_LEVEL5_RADIX: not applicable */
129 case HAMMER2_FREEMAP_LEVEL4_RADIX: /* 2EB */
130 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
131 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
132 off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL4_RADIX) +
133 HAMMER2_ZONEFM_LEVEL4 * HAMMER2_PBUFSIZE;
135 case HAMMER2_FREEMAP_LEVEL3_RADIX: /* 2PB */
136 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
137 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
138 off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL3_RADIX) +
139 HAMMER2_ZONEFM_LEVEL3 * HAMMER2_PBUFSIZE;
141 case HAMMER2_FREEMAP_LEVEL2_RADIX: /* 2TB */
142 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
143 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
144 off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL2_RADIX) +
145 HAMMER2_ZONEFM_LEVEL2 * HAMMER2_PBUFSIZE;
147 case HAMMER2_FREEMAP_LEVEL1_RADIX: /* 2GB */
148 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
149 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
150 off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
151 HAMMER2_ZONEFM_LEVEL1 * HAMMER2_PBUFSIZE;
153 case HAMMER2_FREEMAP_LEVEL0_RADIX: /* 2MB (256 byte bitmap) */
155 * Terminal bitmap, start with 2GB base, then offset by
156 * 256 bytes of device offset per 2MB of logical space
157 * (8 bits per byte, 1024 byte allocation chunking).
159 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
160 KKASSERT(bytes == HAMMER2_FREEMAP_LEVEL0_PSIZE);
161 off += H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
162 HAMMER2_ZONEFM_LEVEL0 * HAMMER2_PBUFSIZE;
164 off += ((bref->key >> HAMMER2_FREEMAP_LEVEL0_RADIX) &
165 ((1 << (HAMMER2_FREEMAP_LEVEL1_RADIX -
166 HAMMER2_FREEMAP_LEVEL0_RADIX)) - 1)) *
167 HAMMER2_FREEMAP_LEVEL0_PSIZE;
170 panic("freemap: bad radix(2) %p %d\n", bref, bref->keybits);
174 bref->data_off = off | radix;
179 * Allocate media space, returning a combined data offset and radix.
180 * THIS ROUTINE IS USE FOR DEBUGGING ONLY.
182 * This version of the routine is ONLY usable for testing and debug
183 * purposes and only if the filesystem never instantiated an actual
184 * freemap. It uses the initial allocation iterator that newfs_hammer2
185 * used to build the filesystem to allocate new space and is not capable
186 * of dealing with freed space.
192 hammer2_freemap_simple_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref,
195 hammer2_off_t data_off;
196 hammer2_off_t data_next;
197 hammer2_freecache_t *fc;
202 bytes = (size_t)(1 << radix);
203 KKASSERT(bytes >= HAMMER2_MIN_ALLOC &&
204 bytes <= HAMMER2_MAX_ALLOC);
207 * Must not be used if the filesystem is using a real freemap.
209 KKASSERT(hmp->voldata.freemap_blockset.blockref[0].data_off == 0);
212 case HAMMER2_BREF_TYPE_INODE:
213 fctype = HAMMER2_FREECACHE_INODE;
215 case HAMMER2_BREF_TYPE_INDIRECT:
216 fctype = HAMMER2_FREECACHE_INODE;
218 case HAMMER2_BREF_TYPE_DATA:
219 fctype = HAMMER2_FREECACHE_DATA;
222 fctype = HAMMER2_FREECACHE_DATA;
226 if (radix <= HAMMER2_MAX_RADIX)
227 fc = &hmp->freecache[fctype][radix];
231 lockmgr(&hmp->alloclk, LK_EXCLUSIVE);
232 if (fc && fc->single) {
234 * Allocate from our single-block cache.
236 data_off = fc->single;
238 } else if (fc && fc->bulk) {
240 * Allocate from our packing cache.
244 if ((fc->bulk & HAMMER2_SEGMASK) == 0)
248 * Allocate from the allocation iterator using a SEGSIZE
249 * aligned block and reload the packing cache if possible.
251 * Skip reserved areas at the beginning of each zone.
253 hammer2_voldata_lock(hmp);
254 data_off = hmp->voldata.allocator_beg;
255 data_off = (data_off + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
256 if ((data_off & HAMMER2_ZONE_MASK64) < HAMMER2_ZONE_SEG) {
257 KKASSERT((data_off & HAMMER2_ZONE_MASK64) == 0);
258 data_off += HAMMER2_ZONE_SEG64;
260 data_next = data_off + bytes;
262 if ((data_next & HAMMER2_SEGMASK) == 0) {
263 hmp->voldata.allocator_beg = data_next;
265 KKASSERT(radix <= HAMMER2_MAX_RADIX);
266 hmp->voldata.allocator_beg =
267 (data_next + HAMMER2_SEGMASK64) &
269 fc->bulk = data_next;
271 hammer2_voldata_unlock(hmp, 1);
273 lockmgr(&hmp->alloclk, LK_RELEASE);
275 bref->data_off = data_off | radix;
282 * Normal freemap allocator
284 * Use available hints to allocate space using the freemap. Create missing
285 * freemap infrastructure on-the-fly as needed (including marking initial
286 * allocations using the iterator as allocated, instantiating new 2GB zones,
287 * and dealing with the end-of-media edge case).
289 * ip and bpref are only used as a heuristic to determine locality of
290 * reference. bref->key may also be used heuristically.
293 hammer2_freemap_alloc(hammer2_trans_t *trans,
294 hammer2_blockref_t *bref, size_t bytes)
296 hammer2_mount_t *hmp = trans->hmp;
297 hammer2_chain_t *parent;
305 * Validate the allocation size. It must be a power of 2.
307 radix = hammer2_getradix(bytes);
308 KKASSERT((size_t)1 << radix == bytes);
311 * Freemap elements are assigned from the reserve area.
312 * Note that FREEMAP_LEVEL0_PSIZE is 256 bytes which is
313 * allowed for this case.
315 if (bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
316 bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
317 return(hammer2_freemap_reserve(hmp, bref, radix));
320 return (hammer2_freemap_simple_alloc(hmp, bref, radix));
323 KKASSERT(bytes >= HAMMER2_MIN_ALLOC &&
324 bytes <= HAMMER2_MAX_ALLOC);
327 * Calculate the starting point for our allocation search.
329 * Each freemap leaf is dedicated to a specific freemap_radix.
330 * The freemap_radix can be more fine-grained than the device buffer
331 * radix which results in inodes being grouped together in their
332 * own segment, terminal-data (16K or less) and initial indirect
333 * block being grouped together, and then full-indirect and full-data
334 * blocks (64K) being grouped together.
336 * The single most important aspect of this is the inode grouping
337 * because that is what allows 'find' and 'ls' and other filesystem
338 * topology operations to run fast.
340 freemap_radix = hammer2_freemapradix(radix);
342 if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
343 bpref = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
344 else if (trans->tmp_bpref)
345 bpref = trans->tmp_bpref;
346 else if (trans->tmp_ip)
347 bpref = trans->tmp_ip->chain->bref.data_off;
350 KKASSERT(radix >= 0 && radix <= HAMMER2_MAX_RADIX);
351 bpref = hmp->heur_freemap[freemap_radix];
354 * Make sure bpref is in-bounds. It's ok if bpref covers a zone's
355 * reserved area, the try code will iterate past it.
357 if (bpref > hmp->voldata.volu_size)
358 bpref = hmp->voldata.volu_size - 1;
361 * Iterate the freemap looking for free space before and after.
363 parent = &hmp->fchain;
364 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
368 while (error == EAGAIN) {
369 error = hammer2_freemap_try_alloc(trans, &parent, bref,
370 radix, bpref, &bnext);
372 hmp->heur_freemap[freemap_radix] = bnext;
373 hammer2_chain_unlock(parent);
379 #if !USE_SIMPLE_ALLOC
382 * Attempt to allocate (1 << radix) bytes from the freemap at bnext.
383 * Return 0 on success with the bref appropriately updated, non-zero
384 * on failure. Updates bnextp and returns EAGAIN to continue the
387 * This function will create missing freemap infrastructure as well as
388 * properly initialize reserved areas as already having been allocated.
392 countbits(uint64_t *data)
398 for (i = 0; i < 32; ++i) {
410 hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp,
411 hammer2_blockref_t *bref, int radix,
412 hammer2_off_t bpref, hammer2_off_t *bnextp)
414 hammer2_mount_t *hmp = trans->hmp;
415 hammer2_off_t l0mask;
416 hammer2_off_t l0size;
417 hammer2_chain_t *chain;
433 * Calculate the number of bytes being allocated, the number
434 * of contiguous bits of bitmap being allocated, and the bitmap
437 * WARNING! cpu hardware may mask bits == 64 -> 0 and blow up the
440 bytes = (size_t)1 << radix;
441 bits = 1 << (radix - HAMMER2_MIN_RADIX);
442 mask = (bits == 64) ? (uint64_t)-1 : (((uint64_t)1 << bits) - 1);
444 devblk_radix = hammer2_devblkradix(radix);
445 freemap_radix = hammer2_freemapradix(radix);
448 * Lookup the level0 freemap chain, creating and initializing one
449 * if necessary. Intermediate levels will be created automatically
450 * when necessary by hammer2_chain_create().
452 key = H2FMBASE(*bnextp, HAMMER2_FREEMAP_LEVEL0_RADIX);
453 l0mask = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX) - 1;
454 l0size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
456 chain = hammer2_chain_lookup(parentp, key, key + l0mask,
457 HAMMER2_LOOKUP_FREEMAP |
458 HAMMER2_LOOKUP_ALWAYS |
459 HAMMER2_LOOKUP_MATCHIND/*XXX*/);
462 * Create the missing leaf, be sure to initialize
463 * the auxillary freemap tracking information in
464 * the bref.check.freemap structure.
467 kprintf("freemap create L0 @ %016jx bpref %016jx\n",
470 error = hammer2_chain_create(trans, parentp, &chain,
471 key, HAMMER2_FREEMAP_LEVEL0_RADIX,
472 HAMMER2_BREF_TYPE_FREEMAP_LEAF,
473 HAMMER2_FREEMAP_LEVEL0_PSIZE);
475 hammer2_chain_modify(trans, &chain, 0);
476 bzero(chain->data->bmdata.array,
477 HAMMER2_FREEMAP_LEVEL0_PSIZE);
478 chain->bref.check.freemap.biggest =
479 HAMMER2_FREEMAP_LEVEL0_RADIX;
480 chain->bref.check.freemap.avail = l0size;
481 chain->bref.check.freemap.radix = freemap_radix;
484 * Preset bitmap for existing static allocations.
485 * 64-bit-align so we don't have to worry about
486 * endian for the memset().
488 tmp = (hmp->voldata.allocator_beg +
489 HAMMER2_MAX_ALLOC - 1) &
490 ~(hammer2_off_t)(HAMMER2_MAX_ALLOC - 1);
492 if (key + l0size <= tmp) {
493 memset(chain->data->bmdata.array, -1,
494 l0size / HAMMER2_MIN_ALLOC / 8);
495 chain->bref.check.freemap.avail = 0;
497 count = (tmp - key) / HAMMER2_MIN_ALLOC;
498 kprintf("Init L0 BASE %d\n", count);
499 memset(chain->data->bmdata.array, -1,
501 chain->bref.check.freemap.avail -=
502 count * HAMMER2_MIN_ALLOC;
507 * Preset bitmap for reserved area. Calculate
510 tmp = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX);
511 if (key - tmp < HAMMER2_ZONE_SEG) {
512 memset(chain->data->bmdata.array, -1,
513 l0size / HAMMER2_MIN_ALLOC / 8);
514 chain->bref.check.freemap.avail = 0;
518 * Preset bitmap for end of media
520 if (key >= trans->hmp->voldata.volu_size) {
521 memset(chain->data->bmdata.array, -1,
522 l0size / HAMMER2_MIN_ALLOC / 8);
523 chain->bref.check.freemap.avail = 0;
526 } else if (chain->bref.check.freemap.biggest < radix) {
528 * Already flagged as not having enough space
531 } else if (chain->bref.check.freemap.radix != freemap_radix) {
533 * Wrong cluster radix, cannot allocate from this leaf.
538 * Modify existing chain to setup for adjustment.
540 hammer2_chain_modify(trans, &chain, 0);
546 * Calculate mask and count. Each bit represents 1KB and (currently)
547 * the maximum allocation is 65536 bytes. Allocations are always
550 count = HAMMER2_FREEMAP_LEVEL0_PSIZE / sizeof(uint64_t); /* 32 */
551 data = &chain->data->bmdata.array[0];
553 tmp_mask = 0; /* avoid compiler warnings */
554 subindex = 0; /* avoid compiler warnings */
557 * Allocate data and meta-data from the beginning and inodes
560 for (index = 0; index < count; ++index) {
561 if (data[index] == (uint64_t)-1) /* all allocated */
563 tmp_mask = mask; /* iterate */
564 for (subindex = 0; subindex < 64; subindex += bits) {
565 if ((data[index] & tmp_mask) == 0)
569 if (subindex != 64) {
570 key += HAMMER2_MIN_ALLOC * 64 * index;
571 key += HAMMER2_MIN_ALLOC * subindex;
581 * Assert validity. Must be beyond the static allocator used
582 * by newfs_hammer2 (and thus also beyond the aux area),
583 * not go past the volume size, and must not be in the
584 * reserved segment area for a zone.
588 KKASSERT(key >= hmp->voldata.allocator_beg &&
589 key + bytes <= hmp->voldata.volu_size);
590 KKASSERT((key & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
593 * Modify the chain and set the bitmap appropriately.
595 * For smaller allocations try to avoid a read-before-write
596 * by priming the buffer cache buffer. The caller handles
597 * read-avoidance for larger allocations (or more properly,
598 * when the chain is locked).
601 hammer2_chain_modify(trans, &chain, 0);
602 data = &chain->data->bmdata.array[0];
603 if (radix != devblk_radix) {
605 int iobmradix = HAMMER2_MINIORADIX - HAMMER2_MIN_RADIX;
607 int iobmskip = 1 << iobmradix;
609 iomask = ((uint64_t)1 << iobmskip) - 1;
610 for (ioindex = 0; ioindex < 64; ioindex += iobmskip) {
611 if (tmp_mask & iomask) {
612 if ((data[index] & iomask) == 0)
620 KKASSERT((data[index] & tmp_mask) == 0);
621 data[index] |= tmp_mask;
624 * We return the allocated space in bref->data_off.
627 bref->data_off = key | radix;
635 csize = (hammer2_off_t)1 << devblk_radix;
639 bp = getblk(hmp->devvp, pbase, csize,
642 if ((bp->b_flags & B_CACHE) == 0)
649 kprintf("alloc cp=%p %016jx %016jx using %016jx chain->data %d\n",
651 bref->key, bref->data_off, chain->bref.data_off,
654 } else if (error == ENOSPC) {
656 * Return EAGAIN with next iteration in *bnextp, or
657 * return ENOSPC if the allocation map has been exhausted.
659 if (chain->bref.check.freemap.biggest > radix)
660 chain->bref.check.freemap.biggest = radix;
661 error = hammer2_freemap_iterate(trans, parentp, &chain,
669 hammer2_chain_unlock(chain);
674 hammer2_freemap_iterate(hammer2_trans_t *trans, hammer2_chain_t **parentp,
675 hammer2_chain_t **chainp,
676 hammer2_off_t bpref, hammer2_off_t *bnextp)
678 *bnextp += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
679 if (*bnextp >= trans->hmp->voldata.volu_size)
689 hammer2_freemap_free(hammer2_mount_t *hmp, hammer2_off_t data_off, int type)
691 hammer2_freecache_t *fc;
696 case HAMMER2_BREF_TYPE_INODE:
697 fctype = HAMMER2_FREECACHE_INODE;
699 case HAMMER2_BREF_TYPE_INDIRECT:
700 fctype = HAMMER2_FREECACHE_INODE;
702 case HAMMER2_BREF_TYPE_DATA:
703 fctype = HAMMER2_FREECACHE_DATA;
706 fctype = HAMMER2_FREECACHE_DATA;
709 radix = (int)data_off & HAMMER2_OFF_MASK_RADIX;
710 data_off &= ~HAMMER2_OFF_MASK_RADIX;
711 if (radix >= HAMMER2_MAX_RADIX)
714 fc = &hmp->freecache[fctype][radix];
715 if (fc->single == 0) {
716 lockmgr(&hmp->alloclk, LK_EXCLUSIVE);
717 fc->single = data_off;
718 lockmgr(&hmp->alloclk, LK_RELEASE);
726 * Allocate media space, returning a combined data offset and radix.
727 * Also return the related (device) buffer cache buffer.
730 hammer2_freemap_alloc_bp(hammer2_mount_t *hmp, size_t bytes, struct buf **bpp)