hammer2 - Correct allocator race and related corruption
[dragonfly.git] / sys / vfs / hammer2 / hammer2_freemap.c
CommitLineData
5c23d7f1 1/*
68b321c1 2 * Copyright (c) 2011-2018 The DragonFly Project. All rights reserved.
5c23d7f1
MD
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@dragonflybsd.org>
6 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7 *
8 * Redistribution and use in source and binary forms, with or without
9 * modification, are permitted provided that the following conditions
10 * are met:
11 *
12 * 1. Redistributions of source code must retain the above copyright
13 * notice, this list of conditions and the following disclaimer.
14 * 2. Redistributions in binary form must reproduce the above copyright
15 * notice, this list of conditions and the following disclaimer in
16 * the documentation and/or other materials provided with the
17 * distribution.
18 * 3. Neither the name of The DragonFly Project nor the names of its
19 * contributors may be used to endorse or promote products derived
20 * from this software without specific, prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35#include <sys/param.h>
36#include <sys/systm.h>
37#include <sys/kernel.h>
38#include <sys/fcntl.h>
39#include <sys/buf.h>
40#include <sys/proc.h>
41#include <sys/namei.h>
42#include <sys/mount.h>
43#include <sys/vnode.h>
44#include <sys/mountctl.h>
45
46#include "hammer2.h"
47
8138a154
MD
48#define FREEMAP_DEBUG 0
49
91abd410
MD
50struct hammer2_fiterate {
51 hammer2_off_t bpref;
52 hammer2_off_t bnext;
53 int loops;
65cacacf 54 int relaxed;
91abd410
MD
55};
56
57typedef struct hammer2_fiterate hammer2_fiterate_t;
58
c603b86b 59static int hammer2_freemap_try_alloc(hammer2_chain_t **parentp,
e2163f5b
MD
60 hammer2_blockref_t *bref, int radix,
61 hammer2_fiterate_t *iter, hammer2_tid_t mtid);
c603b86b 62static void hammer2_freemap_init(hammer2_dev_t *hmp,
a5913bdf 63 hammer2_key_t key, hammer2_chain_t *chain);
c603b86b 64static int hammer2_bmap_alloc(hammer2_dev_t *hmp,
91abd410 65 hammer2_bmap_data_t *bmap, uint16_t class,
aff72996 66 int n, int sub_key, int radix, hammer2_key_t *basep);
c603b86b
MD
67static int hammer2_freemap_iterate(hammer2_chain_t **parentp,
68 hammer2_chain_t **chainp,
91abd410 69 hammer2_fiterate_t *iter);
1a7cfe5a 70
a98aa0b0
MD
71static __inline
72int
73hammer2_freemapradix(int radix)
74{
75 return(radix);
76}
77
1a7cfe5a
MD
78/*
79 * Calculate the device offset for the specified FREEMAP_NODE or FREEMAP_LEAF
80 * bref. Return a combined media offset and physical size radix. Freemap
81 * chains use fixed storage offsets in the 4MB reserved area at the
82 * beginning of each 2GB zone
83 *
65cacacf
MD
84 * XXX I made a mistake and made the reserved area begin at each LEVEL1 zone,
85 * which is on a 1GB demark. This will eat a little more space but for
86 * now we retain compatibility and make FMZONEBASE every 1GB
87 *
88 * (see same thing in hammer2_bulkfree.c near the top, as well as in
89 * newfs_hammer2).
90 *
1a7cfe5a
MD
91 * Rotate between four possibilities. Theoretically this means we have three
92 * good freemaps in case of a crash which we can use as a base for the fixup
93 * scan at mount-time.
94 */
65cacacf 95#define H2FMZONEBASE(key) ((key) & ~HAMMER2_FREEMAP_LEVEL1_MASK)
1a7cfe5a
MD
96#define H2FMBASE(key, radix) ((key) & ~(((hammer2_off_t)1 << (radix)) - 1))
97#define H2FMSHIFT(radix) ((hammer2_off_t)1 << (radix))
98
99static
100int
c603b86b 101hammer2_freemap_reserve(hammer2_chain_t *chain, int radix)
1a7cfe5a 102{
623d43d4 103 hammer2_blockref_t *bref = &chain->bref;
1a7cfe5a 104 hammer2_off_t off;
a3fd5153 105 int index;
5cebbe36 106 int index_inc;
1a7cfe5a
MD
107 size_t bytes;
108
109 /*
5cebbe36 110 * Physical allocation size.
1a7cfe5a 111 */
5cebbe36 112 bytes = (size_t)1 << radix;
1a7cfe5a
MD
113
114 /*
a71db85d
MD
115 * Calculate block selection index 0..7 of current block. If this
116 * is the first allocation of the block (verses a modification of an
117 * existing block), we use index 0, otherwise we use the next rotating
118 * index.
1a7cfe5a 119 */
3148f677
MD
120 if ((bref->data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) {
121 index = 0;
1a7cfe5a 122 } else {
3148f677
MD
123 off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX &
124 (((hammer2_off_t)1 <<
125 HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
126 off = off / HAMMER2_PBUFSIZE;
127 KKASSERT(off >= HAMMER2_ZONE_FREEMAP_00 &&
128 off < HAMMER2_ZONE_FREEMAP_END);
5031809d
MD
129 index = (int)(off - HAMMER2_ZONE_FREEMAP_00) /
130 HAMMER2_ZONE_FREEMAP_INC;
3148f677
MD
131 KKASSERT(index >= 0 && index < HAMMER2_NFREEMAPS);
132 if (++index == HAMMER2_NFREEMAPS)
a71db85d 133 index = 0;
623d43d4 134 }
1a7cfe5a 135
1a7cfe5a
MD
136 /*
137 * Calculate the block offset of the reserved block. This will
138 * point into the 4MB reserved area at the base of the appropriate
139 * 2GB zone, once added to the FREEMAP_x selection above.
140 */
5cebbe36
MD
141 index_inc = index * HAMMER2_ZONE_FREEMAP_INC;
142
1a7cfe5a 143 switch(bref->keybits) {
5cebbe36
MD
144 /* case HAMMER2_FREEMAP_LEVEL6_RADIX: not applicable */
145 case HAMMER2_FREEMAP_LEVEL5_RADIX: /* 2EB */
146 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
147 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
148 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL5_RADIX) +
149 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
150 HAMMER2_ZONEFM_LEVEL5) * HAMMER2_PBUFSIZE;
151 break;
1a7cfe5a
MD
152 case HAMMER2_FREEMAP_LEVEL4_RADIX: /* 2EB */
153 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
154 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
a3fd5153 155 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL4_RADIX) +
5cebbe36 156 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
ba8a9be0 157 HAMMER2_ZONEFM_LEVEL4) * HAMMER2_PBUFSIZE;
1a7cfe5a
MD
158 break;
159 case HAMMER2_FREEMAP_LEVEL3_RADIX: /* 2PB */
160 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
161 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
a3fd5153 162 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL3_RADIX) +
5cebbe36 163 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
ba8a9be0 164 HAMMER2_ZONEFM_LEVEL3) * HAMMER2_PBUFSIZE;
1a7cfe5a
MD
165 break;
166 case HAMMER2_FREEMAP_LEVEL2_RADIX: /* 2TB */
167 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
168 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
a3fd5153 169 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL2_RADIX) +
5cebbe36 170 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
ba8a9be0 171 HAMMER2_ZONEFM_LEVEL2) * HAMMER2_PBUFSIZE;
1a7cfe5a
MD
172 break;
173 case HAMMER2_FREEMAP_LEVEL1_RADIX: /* 2GB */
93f3933a 174 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
1a7cfe5a 175 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
a3fd5153 176 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
5cebbe36 177 (index_inc + HAMMER2_ZONE_FREEMAP_00 +
ba8a9be0 178 HAMMER2_ZONEFM_LEVEL1) * HAMMER2_PBUFSIZE;
1a7cfe5a 179 break;
1a7cfe5a
MD
180 default:
181 panic("freemap: bad radix(2) %p %d\n", bref, bref->keybits);
182 /* NOT REACHED */
a3fd5153 183 off = (hammer2_off_t)-1;
1a7cfe5a
MD
184 break;
185 }
186 bref->data_off = off | radix;
8138a154
MD
187#if FREEMAP_DEBUG
188 kprintf("FREEMAP BLOCK TYPE %d %016jx/%d DATA_OFF=%016jx\n",
189 bref->type, bref->key, bref->keybits, bref->data_off);
623d43d4 190#endif
1a7cfe5a
MD
191 return (0);
192}
193
1a7cfe5a
MD
194/*
195 * Normal freemap allocator
196 *
197 * Use available hints to allocate space using the freemap. Create missing
198 * freemap infrastructure on-the-fly as needed (including marking initial
199 * allocations using the iterator as allocated, instantiating new 2GB zones,
200 * and dealing with the end-of-media edge case).
201 *
202 * ip and bpref are only used as a heuristic to determine locality of
203 * reference. bref->key may also be used heuristically.
da0cdd33
MD
204 *
205 * This function is a NOP if bytes is 0.
1a7cfe5a
MD
206 */
207int
c603b86b 208hammer2_freemap_alloc(hammer2_chain_t *chain, size_t bytes)
1a7cfe5a 209{
506bd6d1 210 hammer2_dev_t *hmp = chain->hmp;
623d43d4 211 hammer2_blockref_t *bref = &chain->bref;
1a7cfe5a 212 hammer2_chain_t *parent;
e2163f5b 213 hammer2_tid_t mtid;
1a7cfe5a
MD
214 int radix;
215 int error;
91abd410
MD
216 unsigned int hindex;
217 hammer2_fiterate_t iter;
1a7cfe5a 218
da0cdd33
MD
219 /*
220 * If allocating or downsizing to zero we just get rid of whatever
221 * data_off we had.
222 */
223 if (bytes == 0) {
224 chain->bref.data_off = 0;
225 return 0;
226 }
227
65c894ff 228 KKASSERT(hmp->spmp);
e2163f5b
MD
229 mtid = hammer2_trans_sub(hmp->spmp);
230
1a7cfe5a
MD
231 /*
232 * Validate the allocation size. It must be a power of 2.
93f3933a
MD
233 *
234 * For now require that the caller be aware of the minimum
235 * allocation (1K).
1a7cfe5a
MD
236 */
237 radix = hammer2_getradix(bytes);
238 KKASSERT((size_t)1 << radix == bytes);
239
1a7cfe5a
MD
240 if (bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
241 bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
50456506
MD
242 /*
243 * Freemap blocks themselves are assigned from the reserve
244 * area, not allocated from the freemap.
245 */
c603b86b 246 error = hammer2_freemap_reserve(chain, radix);
470dad14 247
50456506 248 return error;
1a7cfe5a 249 }
1a7cfe5a 250
50456506 251 KKASSERT(bytes >= HAMMER2_ALLOC_MIN && bytes <= HAMMER2_ALLOC_MAX);
91abd410 252
01eabad4 253 /*
a98aa0b0
MD
254 * Calculate the starting point for our allocation search.
255 *
256 * Each freemap leaf is dedicated to a specific freemap_radix.
257 * The freemap_radix can be more fine-grained than the device buffer
258 * radix which results in inodes being grouped together in their
259 * own segment, terminal-data (16K or less) and initial indirect
260 * block being grouped together, and then full-indirect and full-data
261 * blocks (64K) being grouped together.
262 *
263 * The single most important aspect of this is the inode grouping
264 * because that is what allows 'find' and 'ls' and other filesystem
265 * topology operations to run fast.
1a7cfe5a 266 */
a98aa0b0 267#if 0
1a7cfe5a
MD
268 if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
269 bpref = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
270 else if (trans->tmp_bpref)
271 bpref = trans->tmp_bpref;
272 else if (trans->tmp_ip)
273 bpref = trans->tmp_ip->chain->bref.data_off;
274 else
275#endif
91abd410
MD
276 /*
277 * Heuristic tracking index. We would like one for each distinct
278 * bref type if possible. heur_freemap[] has room for two classes
279 * for each type. At a minimum we have to break-up our heuristic
280 * by device block sizes.
281 */
282 hindex = hammer2_devblkradix(radix) - HAMMER2_MINIORADIX;
283 KKASSERT(hindex < HAMMER2_FREEMAP_HEUR_NRADIX);
284 hindex += bref->type * HAMMER2_FREEMAP_HEUR_NRADIX;
285 hindex &= HAMMER2_FREEMAP_HEUR_TYPES * HAMMER2_FREEMAP_HEUR_NRADIX - 1;
3f01ebaa 286 KKASSERT(hindex < HAMMER2_FREEMAP_HEUR_SIZE);
91abd410
MD
287
288 iter.bpref = hmp->heur_freemap[hindex];
65cacacf 289 iter.relaxed = hmp->freemap_relaxed;
1a7cfe5a
MD
290
291 /*
292 * Make sure bpref is in-bounds. It's ok if bpref covers a zone's
293 * reserved area, the try code will iterate past it.
294 */
91abd410
MD
295 if (iter.bpref > hmp->voldata.volu_size)
296 iter.bpref = hmp->voldata.volu_size - 1;
1a7cfe5a
MD
297
298 /*
299 * Iterate the freemap looking for free space before and after.
300 */
301 parent = &hmp->fchain;
e513e77e 302 hammer2_chain_ref(parent);
1a7cfe5a 303 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
65cacacf 304 error = HAMMER2_ERROR_EAGAIN;
91abd410
MD
305 iter.bnext = iter.bpref;
306 iter.loops = 0;
1a7cfe5a 307
65cacacf 308 while (error == HAMMER2_ERROR_EAGAIN) {
e2163f5b
MD
309 error = hammer2_freemap_try_alloc(&parent, bref, radix,
310 &iter, mtid);
1a7cfe5a 311 }
65cacacf 312 hmp->freemap_relaxed |= iter.relaxed; /* heuristical, SMP race ok */
91abd410 313 hmp->heur_freemap[hindex] = iter.bnext;
1a7cfe5a 314 hammer2_chain_unlock(parent);
e513e77e 315 hammer2_chain_drop(parent);
1a7cfe5a
MD
316
317 return (error);
1a7cfe5a
MD
318}
319
320static int
c603b86b 321hammer2_freemap_try_alloc(hammer2_chain_t **parentp,
1a7cfe5a 322 hammer2_blockref_t *bref, int radix,
e2163f5b 323 hammer2_fiterate_t *iter, hammer2_tid_t mtid)
1a7cfe5a 324{
506bd6d1 325 hammer2_dev_t *hmp = (*parentp)->hmp;
1a7cfe5a 326 hammer2_off_t l0size;
93f3933a
MD
327 hammer2_off_t l1size;
328 hammer2_off_t l1mask;
1897c66e 329 hammer2_key_t key_dummy;
1a7cfe5a
MD
330 hammer2_chain_t *chain;
331 hammer2_off_t key;
1a7cfe5a 332 size_t bytes;
91abd410 333 uint16_t class;
c8c0a18a 334 int error;
1a7cfe5a
MD
335
336 /*
337 * Calculate the number of bytes being allocated, the number
338 * of contiguous bits of bitmap being allocated, and the bitmap
339 * mask.
01eabad4 340 *
1a7cfe5a
MD
341 * WARNING! cpu hardware may mask bits == 64 -> 0 and blow up the
342 * mask calculation.
343 */
344 bytes = (size_t)1 << radix;
91abd410 345 class = (bref->type << 8) | hammer2_devblkradix(radix);
a98aa0b0 346
1a7cfe5a 347 /*
93f3933a 348 * Lookup the level1 freemap chain, creating and initializing one
1a7cfe5a
MD
349 * if necessary. Intermediate levels will be created automatically
350 * when necessary by hammer2_chain_create().
351 */
91abd410 352 key = H2FMBASE(iter->bnext, HAMMER2_FREEMAP_LEVEL1_RADIX);
1a7cfe5a 353 l0size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
93f3933a
MD
354 l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
355 l1mask = l1size - 1;
1a7cfe5a 356
1897c66e 357 chain = hammer2_chain_lookup(parentp, &key_dummy, key, key + l1mask,
c8c0a18a 358 &error,
1a7cfe5a 359 HAMMER2_LOOKUP_ALWAYS |
b8ba9690 360 HAMMER2_LOOKUP_MATCHIND);
623d43d4 361
1a7cfe5a
MD
362 if (chain == NULL) {
363 /*
364 * Create the missing leaf, be sure to initialize
365 * the auxillary freemap tracking information in
366 * the bref.check.freemap structure.
367 */
368#if 0
93f3933a 369 kprintf("freemap create L1 @ %016jx bpref %016jx\n",
91abd410 370 key, iter->bpref);
1a7cfe5a 371#endif
ecfe89b8
MD
372 error = hammer2_chain_create(parentp, &chain, NULL, hmp->spmp,
373 HAMMER2_METH_DEFAULT,
93f3933a 374 key, HAMMER2_FREEMAP_LEVEL1_RADIX,
1a7cfe5a 375 HAMMER2_BREF_TYPE_FREEMAP_LEAF,
b3659de2 376 HAMMER2_FREEMAP_LEVELN_PSIZE,
3f01ebaa 377 mtid, 0, 0);
0924b3f8 378 KKASSERT(error == 0);
1a7cfe5a 379 if (error == 0) {
3f01ebaa 380 hammer2_chain_modify(chain, mtid, 0, 0);
93f3933a
MD
381 bzero(&chain->data->bmdata[0],
382 HAMMER2_FREEMAP_LEVELN_PSIZE);
383 chain->bref.check.freemap.bigmask = (uint32_t)-1;
384 chain->bref.check.freemap.avail = l1size;
385 /* bref.methods should already be inherited */
1a7cfe5a 386
c603b86b 387 hammer2_freemap_init(hmp, key, chain);
1a7cfe5a 388 }
b93cc2e0
MD
389 } else if (chain->error) {
390 /*
391 * Error during lookup.
392 */
393 kprintf("hammer2_freemap_try_alloc: %016jx: error %s\n",
394 (intmax_t)bref->data_off,
395 hammer2_error_str(chain->error));
65cacacf 396 error = HAMMER2_ERROR_EIO;
5cebbe36
MD
397 } else if ((chain->bref.check.freemap.bigmask &
398 ((size_t)1 << radix)) == 0) {
1a7cfe5a
MD
399 /*
400 * Already flagged as not having enough space
401 */
65cacacf 402 error = HAMMER2_ERROR_ENOSPC;
1a7cfe5a
MD
403 } else {
404 /*
405 * Modify existing chain to setup for adjustment.
406 */
3f01ebaa 407 hammer2_chain_modify(chain, mtid, 0, 0);
1a7cfe5a 408 }
1a7cfe5a
MD
409
410 /*
30b0abf3 411 * Scan 4MB entries.
1a7cfe5a 412 */
93f3933a
MD
413 if (error == 0) {
414 hammer2_bmap_data_t *bmap;
415 hammer2_key_t base_key;
416 int count;
417 int start;
418 int n;
419
10136ab6 420 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
91abd410
MD
421 start = (int)((iter->bnext - key) >>
422 HAMMER2_FREEMAP_LEVEL0_RADIX);
93f3933a 423 KKASSERT(start >= 0 && start < HAMMER2_FREEMAP_COUNT);
3f01ebaa 424 hammer2_chain_modify(chain, mtid, 0, 0);
1a7cfe5a 425
65cacacf 426 error = HAMMER2_ERROR_ENOSPC;
93f3933a 427 for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) {
271568b1
MD
428 int availchk;
429
93f3933a
MD
430 if (start + count >= HAMMER2_FREEMAP_COUNT &&
431 start - count < 0) {
1a7cfe5a 432 break;
93f3933a 433 }
271568b1
MD
434
435 /*
65cacacf
MD
436 * Calculate bmap pointer from thart starting index
437 * forwards.
271568b1
MD
438 *
439 * NOTE: bmap pointer is invalid if n >= FREEMAP_COUNT.
440 */
93f3933a
MD
441 n = start + count;
442 bmap = &chain->data->bmdata[n];
271568b1
MD
443
444 if (n >= HAMMER2_FREEMAP_COUNT) {
445 availchk = 0;
446 } else if (bmap->avail) {
447 availchk = 1;
448 } else if (radix < HAMMER2_FREEMAP_BLOCK_RADIX &&
449 (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK)) {
450 availchk = 1;
451 } else {
452 availchk = 0;
453 }
454
65cacacf
MD
455 /*
456 * Try to allocate from a matching freemap class
457 * superblock. If we are in relaxed mode we allocate
458 * from any freemap class superblock.
459 */
271568b1 460 if (availchk &&
65cacacf
MD
461 (bmap->class == 0 || bmap->class == class ||
462 iter->relaxed)) {
93f3933a 463 base_key = key + n * l0size;
c603b86b 464 error = hammer2_bmap_alloc(hmp, bmap,
aff72996
MD
465 class, n,
466 (int)bref->key,
467 radix,
91abd410 468 &base_key);
65cacacf 469 if (error != HAMMER2_ERROR_ENOSPC) {
93f3933a
MD
470 key = base_key;
471 break;
472 }
473 }
271568b1
MD
474
475 /*
65cacacf
MD
476 * Calculate bmap pointer from thart starting index
477 * backwards (locality).
478 *
271568b1
MD
479 * Must recalculate after potentially having called
480 * hammer2_bmap_alloc() above in case chain was
481 * reallocated.
482 *
483 * NOTE: bmap pointer is invalid if n < 0.
484 */
93f3933a
MD
485 n = start - count;
486 bmap = &chain->data->bmdata[n];
271568b1
MD
487 if (n < 0) {
488 availchk = 0;
489 } else if (bmap->avail) {
490 availchk = 1;
491 } else if (radix < HAMMER2_FREEMAP_BLOCK_RADIX &&
492 (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK)) {
493 availchk = 1;
494 } else {
495 availchk = 0;
496 }
497
65cacacf
MD
498 /*
499 * Try to allocate from a matching freemap class
500 * superblock. If we are in relaxed mode we allocate
501 * from any freemap class superblock.
502 */
271568b1 503 if (availchk &&
65cacacf
MD
504 (bmap->class == 0 || bmap->class == class ||
505 iter->relaxed)) {
93f3933a 506 base_key = key + n * l0size;
c603b86b 507 error = hammer2_bmap_alloc(hmp, bmap,
aff72996
MD
508 class, n,
509 (int)bref->key,
510 radix,
91abd410 511 &base_key);
65cacacf 512 if (error != HAMMER2_ERROR_ENOSPC) {
93f3933a
MD
513 key = base_key;
514 break;
515 }
516 }
1a7cfe5a 517 }
65cacacf
MD
518
519 /*
520 * We only know for sure that we can clear the bitmap bit
521 * if we scanned the entire array (start == 0).
522 */
523 if (error == HAMMER2_ERROR_ENOSPC && start == 0) {
5cebbe36
MD
524 chain->bref.check.freemap.bigmask &=
525 (uint32_t)~((size_t)1 << radix);
526 }
93f3933a 527 /* XXX also scan down from original count */
1a7cfe5a
MD
528 }
529
1a7cfe5a
MD
530 if (error == 0) {
531 /*
532 * Assert validity. Must be beyond the static allocator used
533 * by newfs_hammer2 (and thus also beyond the aux area),
534 * not go past the volume size, and must not be in the
535 * reserved segment area for a zone.
536 */
537 KKASSERT(key >= hmp->voldata.allocator_beg &&
538 key + bytes <= hmp->voldata.volu_size);
539 KKASSERT((key & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
1a7cfe5a 540 bref->data_off = key | radix;
0b8efeb7
MD
541
542 /*
543 * Record dedupability. The dedup bits are cleared
544 * when bulkfree transitions the freemap from 11->10,
545 * and asserted to be clear on the 10->00 transition.
546 *
547 * We must record the bitmask with the chain locked
548 * at the time we set the allocation bits to avoid
549 * racing a bulkfree.
550 */
551 if (bref->type == HAMMER2_BREF_TYPE_DATA)
552 hammer2_io_dedup_set(hmp, bref);
1a7cfe5a 553#if 0
93f3933a 554 kprintf("alloc cp=%p %016jx %016jx using %016jx\n",
1a7cfe5a 555 chain,
93f3933a 556 bref->key, bref->data_off, chain->bref.data_off);
1a7cfe5a 557#endif
65cacacf 558 } else if (error == HAMMER2_ERROR_ENOSPC) {
1a7cfe5a 559 /*
91abd410 560 * Return EAGAIN with next iteration in iter->bnext, or
1a7cfe5a
MD
561 * return ENOSPC if the allocation map has been exhausted.
562 */
c603b86b 563 error = hammer2_freemap_iterate(parentp, &chain, iter);
1a7cfe5a
MD
564 }
565
566 /*
567 * Cleanup
568 */
e513e77e 569 if (chain) {
1a7cfe5a 570 hammer2_chain_unlock(chain);
e513e77e
MD
571 hammer2_chain_drop(chain);
572 }
1a7cfe5a
MD
573 return (error);
574}
575
93f3933a
MD
576/*
577 * Allocate (1<<radix) bytes from the bmap whos base data offset is (*basep).
578 *
579 * If the linear iterator is mid-block we use it directly (the bitmap should
aff72996
MD
580 * already be marked allocated), otherwise we search for a block in the
581 * bitmap that fits the allocation request.
93f3933a
MD
582 *
583 * A partial bitmap allocation sets the minimum bitmap granularity (16KB)
584 * to fully allocated and adjusts the linear allocator to allow the
585 * remaining space to be allocated.
aff72996
MD
586 *
587 * sub_key is the lower 32 bits of the chain->bref.key for the chain whos
588 * bref is being allocated. If the radix represents an allocation >= 16KB
589 * (aka HAMMER2_FREEMAP_BLOCK_RADIX) we try to use this key to select the
590 * blocks directly out of the bmap.
93f3933a
MD
591 */
592static
593int
c603b86b 594hammer2_bmap_alloc(hammer2_dev_t *hmp, hammer2_bmap_data_t *bmap,
aff72996
MD
595 uint16_t class, int n, int sub_key,
596 int radix, hammer2_key_t *basep)
93f3933a 597{
93f3933a 598 size_t size;
271568b1 599 size_t bgsize;
93f3933a 600 int bmradix;
5cebbe36 601 hammer2_bitmap_t bmmask;
93f3933a
MD
602 int offset;
603 int i;
604 int j;
605
606 /*
607 * Take into account 2-bits per block when calculating bmradix.
608 */
609 size = (size_t)1 << radix;
91abd410 610
93f3933a
MD
611 if (radix <= HAMMER2_FREEMAP_BLOCK_RADIX) {
612 bmradix = 2;
93f3933a
MD
613 /* (16K) 2 bits per allocation block */
614 } else {
5cebbe36
MD
615 bmradix = (hammer2_bitmap_t)2 <<
616 (radix - HAMMER2_FREEMAP_BLOCK_RADIX);
93f3933a
MD
617 /* (32K-256K) 4, 8, 16, 32 bits per allocation block */
618 }
619
620 /*
91abd410
MD
621 * Use the linear iterator to pack small allocations, otherwise
622 * fall-back to finding a free 16KB chunk. The linear iterator
623 * is only valid when *NOT* on a freemap chunking boundary (16KB).
624 * If it is the bitmap must be scanned. It can become invalid
625 * once we pack to the boundary. We adjust it after a bitmap
626 * allocation only for sub-16KB allocations (so the perfectly good
627 * previous value can still be used for fragments when 16KB+
b49f72c3 628 * allocations are made inbetween fragmentary allocations).
91abd410 629 *
5cebbe36 630 * Beware of hardware artifacts when bmradix == 64 (intermediate
91abd410 631 * result can wind up being '1' instead of '0' if hardware masks
b49f72c3 632 * bit-count & 63).
93f3933a
MD
633 *
634 * NOTE: j needs to be even in the j= calculation. As an artifact
635 * of the /2 division, our bitmask has to clear bit 0.
91abd410
MD
636 *
637 * NOTE: TODO this can leave little unallocatable fragments lying
638 * around.
93f3933a 639 */
91abd410
MD
640 if (((uint32_t)bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK) + size <=
641 HAMMER2_FREEMAP_BLOCK_SIZE &&
2c6f594d 642 (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK) &&
91abd410 643 bmap->linear < HAMMER2_SEGSIZE) {
aff72996
MD
644 /*
645 * Use linear iterator if it is not block-aligned to avoid
646 * wasting space.
83815ec6
MD
647 *
648 * Calculate the bitmapq[] index (i) and calculate the
649 * shift count within the 64-bit bitmapq[] entry.
650 *
651 * The freemap block size is 16KB, but each bitmap
652 * entry is two bits so use a little trick to get
653 * a (j) shift of 0, 2, 4, ... 62 in 16KB chunks.
aff72996 654 */
93f3933a 655 KKASSERT(bmap->linear >= 0 &&
91abd410 656 bmap->linear + size <= HAMMER2_SEGSIZE &&
50456506 657 (bmap->linear & (HAMMER2_ALLOC_MIN - 1)) == 0);
93f3933a 658 offset = bmap->linear;
83815ec6
MD
659 i = offset / (HAMMER2_SEGSIZE / HAMMER2_BMAP_ELEMENTS);
660 j = (offset / (HAMMER2_FREEMAP_BLOCK_SIZE / 2)) & 62;
5cebbe36
MD
661 bmmask = (bmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
662 HAMMER2_BMAP_ALLONES :
663 ((hammer2_bitmap_t)1 << bmradix) - 1;
93f3933a 664 bmmask <<= j;
91abd410 665 bmap->linear = offset + size;
93f3933a 666 } else {
aff72996
MD
667 /*
668 * Try to index a starting point based on sub_key. This
669 * attempts to restore sequential block ordering on-disk
670 * whenever possible, even if data is committed out of
671 * order.
672 *
673 * i - Index bitmapq[], full data range represented is
674 * HAMMER2_BMAP_SIZE.
675 *
676 * j - Index within bitmapq[i], full data range represented is
677 * HAMMER2_BMAP_INDEX_SIZE.
678 *
679 * WARNING!
680 */
681 i = -1;
682 j = -1;
683
684 switch(class >> 8) {
685 case HAMMER2_BREF_TYPE_DATA:
686 if (radix >= HAMMER2_FREEMAP_BLOCK_RADIX) {
687 i = (sub_key & HAMMER2_BMAP_MASK) /
688 (HAMMER2_BMAP_SIZE / HAMMER2_BMAP_ELEMENTS);
689 j = (sub_key & HAMMER2_BMAP_INDEX_MASK) /
690 (HAMMER2_BMAP_INDEX_SIZE /
691 HAMMER2_BMAP_BLOCKS_PER_ELEMENT);
692 j = j * 2;
693 }
694 break;
695 case HAMMER2_BREF_TYPE_INODE:
696 break;
697 default:
698 break;
699 }
700 if (i >= 0) {
701 KKASSERT(i < HAMMER2_BMAP_ELEMENTS &&
702 j < 2 * HAMMER2_BMAP_BLOCKS_PER_ELEMENT);
703 KKASSERT(j + bmradix <= HAMMER2_BMAP_BITS_PER_ELEMENT);
704 bmmask = (bmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
705 HAMMER2_BMAP_ALLONES :
706 ((hammer2_bitmap_t)1 << bmradix) - 1;
707 bmmask <<= j;
708
709 if ((bmap->bitmapq[i] & bmmask) == 0)
710 goto success;
711 }
712
713 /*
714 * General element scan.
715 *
716 * WARNING: (j) is iterating a bit index (by 2's)
717 */
5cebbe36
MD
718 for (i = 0; i < HAMMER2_BMAP_ELEMENTS; ++i) {
719 bmmask = (bmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
720 HAMMER2_BMAP_ALLONES :
721 ((hammer2_bitmap_t)1 << bmradix) - 1;
722 for (j = 0;
723 j < HAMMER2_BMAP_BITS_PER_ELEMENT;
724 j += bmradix) {
725 if ((bmap->bitmapq[i] & bmmask) == 0)
93f3933a
MD
726 goto success;
727 bmmask <<= bmradix;
728 }
729 }
91abd410
MD
730 /*fragments might remain*/
731 /*KKASSERT(bmap->avail == 0);*/
65cacacf 732 return (HAMMER2_ERROR_ENOSPC);
93f3933a 733success:
5cebbe36 734 offset = i * (HAMMER2_SEGSIZE / HAMMER2_BMAP_ELEMENTS) +
93f3933a 735 (j * (HAMMER2_FREEMAP_BLOCK_SIZE / 2));
91abd410
MD
736 if (size & HAMMER2_FREEMAP_BLOCK_MASK)
737 bmap->linear = offset + size;
93f3933a
MD
738 }
739
5cebbe36
MD
740 /* 8 x (64/2) -> 256 x 16K -> 4MB */
741 KKASSERT(i >= 0 && i < HAMMER2_BMAP_ELEMENTS);
93f3933a
MD
742
743 /*
744 * Optimize the buffer cache to avoid unnecessary read-before-write
745 * operations.
91abd410
MD
746 *
747 * The device block size could be larger than the allocation size
748 * so the actual bitmap test is somewhat more involved. We have
749 * to use a compatible buffer size for this operation.
93f3933a 750 */
5cebbe36 751 if ((bmap->bitmapq[i] & bmmask) == 0 &&
2c6f594d 752 hammer2_devblksize(size) != size) {
91abd410
MD
753 size_t psize = hammer2_devblksize(size);
754 hammer2_off_t pmask = (hammer2_off_t)psize - 1;
5cebbe36
MD
755 int pbmradix = (hammer2_bitmap_t)2 <<
756 (hammer2_devblkradix(radix) -
757 HAMMER2_FREEMAP_BLOCK_RADIX);
758 hammer2_bitmap_t pbmmask;
fdf62707 759 int pradix = hammer2_getradix(psize);
91abd410 760
5cebbe36
MD
761 pbmmask = (pbmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
762 HAMMER2_BMAP_ALLONES :
763 ((hammer2_bitmap_t)1 << pbmradix) - 1;
91abd410
MD
764 while ((pbmmask & bmmask) == 0)
765 pbmmask <<= pbmradix;
766
2c6f594d 767#if 0
5cebbe36
MD
768 kprintf("%016jx mask %016jx %016jx %016jx (%zd/%zd)\n",
769 *basep + offset, bmap->bitmapq[i],
2c6f594d
MD
770 pbmmask, bmmask, size, psize);
771#endif
772
5cebbe36 773 if ((bmap->bitmapq[i] & pbmmask) == 0) {
85b1f267
MD
774 hammer2_io_t *dio;
775
776 hammer2_io_newnz(hmp, class >> 8,
ba38182b 777 (*basep + (offset & ~pmask)) |
85b1f267
MD
778 pradix, psize, &dio);
779 hammer2_io_putblk(&dio);
93f3933a
MD
780 }
781 }
782
783#if 0
784 /*
785 * When initializing a new inode segment also attempt to initialize
786 * an adjacent segment. Be careful not to index beyond the array
787 * bounds.
788 *
789 * We do this to try to localize inode accesses to improve
790 * directory scan rates. XXX doesn't improve scan rates.
791 */
792 if (size == HAMMER2_INODE_BYTES) {
793 if (n & 1) {
794 if (bmap[-1].radix == 0 && bmap[-1].avail)
795 bmap[-1].radix = radix;
796 } else {
797 if (bmap[1].radix == 0 && bmap[1].avail)
798 bmap[1].radix = radix;
799 }
800 }
801#endif
271568b1
MD
802 /*
803 * Calculate the bitmap-granular change in bgsize for the volume
804 * header. We cannot use the fine-grained change here because
805 * the bulkfree code can't undo it. If the bitmap element is already
806 * marked allocated it has already been accounted for.
807 */
808 if (radix < HAMMER2_FREEMAP_BLOCK_RADIX) {
5cebbe36 809 if (bmap->bitmapq[i] & bmmask)
271568b1
MD
810 bgsize = 0;
811 else
812 bgsize = HAMMER2_FREEMAP_BLOCK_SIZE;
813 } else {
814 bgsize = size;
815 }
93f3933a
MD
816
817 /*
271568b1
MD
818 * Adjust the bitmap, set the class (it might have been 0),
819 * and available bytes, update the allocation offset (*basep)
820 * from the L0 base to the actual offset.
821 *
65cacacf
MD
822 * Do not override the class if doing a relaxed class allocation.
823 *
271568b1
MD
824 * avail must reflect the bitmap-granular availability. The allocator
825 * tests will also check the linear iterator.
93f3933a 826 */
5cebbe36 827 bmap->bitmapq[i] |= bmmask;
65cacacf
MD
828 if (bmap->class == 0)
829 bmap->class = class;
271568b1 830 bmap->avail -= bgsize;
93f3933a
MD
831 *basep += offset;
832
271568b1
MD
833 /*
834 * Adjust the volume header's allocator_free parameter. This
835 * parameter has to be fixed up by bulkfree which has no way to
836 * figure out sub-16K chunking, so it must be adjusted by the
837 * bitmap-granular size.
838 */
839 if (bgsize) {
840 hammer2_voldata_lock(hmp);
841 hammer2_voldata_modify(hmp);
842 hmp->voldata.allocator_free -= bgsize;
843 hammer2_voldata_unlock(hmp);
844 }
9f604b01 845
93f3933a
MD
846 return(0);
847}
848
65cacacf
MD
849/*
850 * Initialize a freemap for the storage area (in bytes) that begins at (key).
851 */
93f3933a
MD
852static
853void
c603b86b
MD
854hammer2_freemap_init(hammer2_dev_t *hmp, hammer2_key_t key,
855 hammer2_chain_t *chain)
93f3933a
MD
856{
857 hammer2_off_t l1size;
858 hammer2_off_t lokey;
859 hammer2_off_t hikey;
860 hammer2_bmap_data_t *bmap;
861 int count;
862
65cacacf
MD
863 /*
864 * LEVEL1 is 1GB, there are two level1 1GB freemaps per 2GB zone.
865 */
93f3933a
MD
866 l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
867
868 /*
65cacacf 869 * Calculate the portion of the 1GB map that should be initialized
93f3933a
MD
870 * as free. Portions below or after will be initialized as allocated.
871 * SEGMASK-align the areas so we don't have to worry about sub-scans
872 * or endianess when using memset.
873 *
93f3933a
MD
874 * WARNING! It is possible for lokey to be larger than hikey if the
875 * entire 2GB segment is within the static allocation.
876 */
65cacacf
MD
877 /*
878 * (1) Ensure that all statically allocated space from newfs_hammer2
879 * is marked allocated, and take it up to the level1 base for
880 * this key.
881 */
a5913bdf 882 lokey = (hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) &
93f3933a 883 ~HAMMER2_SEGMASK64;
65cacacf
MD
884 if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX))
885 lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX);
93f3933a 886
65cacacf
MD
887 /*
888 * (2) Ensure that the reserved area is marked allocated (typically
889 * the first 4MB of each 2GB area being represented). Since
890 * each LEAF represents 1GB of storage and the zone is 2GB, we
891 * have to adjust lowkey upward every other LEAF sequentially.
892 */
893 if (lokey < H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64)
894 lokey = H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64;
93f3933a 895
65cacacf
MD
896 /*
897 * (3) Ensure that any trailing space at the end-of-volume is marked
898 * allocated.
899 */
93f3933a 900 hikey = key + H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
a5913bdf
MD
901 if (hikey > hmp->voldata.volu_size) {
902 hikey = hmp->voldata.volu_size & ~HAMMER2_SEGMASK64;
93f3933a
MD
903 }
904
65cacacf
MD
905 /*
906 * Heuristic highest possible value
907 */
93f3933a
MD
908 chain->bref.check.freemap.avail =
909 H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
910 bmap = &chain->data->bmdata[0];
911
65cacacf
MD
912 /*
913 * Initialize bitmap (bzero'd by caller)
914 */
93f3933a
MD
915 for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) {
916 if (key < lokey || key >= hikey) {
5cebbe36
MD
917 memset(bmap->bitmapq, -1,
918 sizeof(bmap->bitmapq));
93f3933a 919 bmap->avail = 0;
2c6f594d 920 bmap->linear = HAMMER2_SEGSIZE;
93f3933a
MD
921 chain->bref.check.freemap.avail -=
922 H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
923 } else {
924 bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
925 }
926 key += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
927 ++bmap;
928 }
929}
930
931/*
932 * The current Level 1 freemap has been exhausted, iterate to the next
933 * one, return ENOSPC if no freemaps remain.
934 *
65cacacf
MD
935 * At least two loops are required. If we are not in relaxed mode and
936 * we run out of storage we enter relaxed mode and do a third loop.
937 * The relaxed mode is recorded back in the hmp so once we enter the mode
938 * we remain relaxed until stuff begins to get freed and only do 2 loops.
939 *
93f3933a
MD
940 * XXX this should rotate back to the beginning to handle freed-up space
941 * XXX or use intermediate entries to locate free space. TODO
942 */
1a7cfe5a 943static int
c603b86b
MD
944hammer2_freemap_iterate(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
945 hammer2_fiterate_t *iter)
1a7cfe5a 946{
506bd6d1 947 hammer2_dev_t *hmp = (*parentp)->hmp;
a5913bdf 948
91abd410
MD
949 iter->bnext &= ~(H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
950 iter->bnext += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
951 if (iter->bnext >= hmp->voldata.volu_size) {
952 iter->bnext = 0;
65cacacf
MD
953 if (++iter->loops >= 2) {
954 if (iter->relaxed == 0)
955 iter->relaxed = 1;
956 else
957 return (HAMMER2_ERROR_ENOSPC);
958 }
91abd410 959 }
65cacacf 960 return(HAMMER2_ERROR_EAGAIN);
5c23d7f1
MD
961}
962
91abd410 963/*
877eacb6
MD
964 * Adjust the bit-pattern for data in the freemap bitmap according to
965 * (how). This code is called from on-mount recovery to fixup (mark
966 * as allocated) blocks whos freemap upates might not have been committed
967 * in the last crash and is used by the bulk freemap scan to stage frees.
91abd410 968 *
da0cdd33
MD
969 * WARNING! Cannot be called with a empty-data bref (radix == 0).
970 *
91abd410
MD
971 * XXX currently disabled when how == 0 (the normal real-time case). At
972 * the moment we depend on the bulk freescan to actually free blocks. It
973 * will still call this routine with a non-zero how to stage possible frees
974 * and to do the actual free.
975 */
004f88b4 976void
e2163f5b
MD
977hammer2_freemap_adjust(hammer2_dev_t *hmp, hammer2_blockref_t *bref,
978 int how)
004f88b4 979{
91abd410
MD
980 hammer2_off_t data_off = bref->data_off;
981 hammer2_chain_t *chain;
982 hammer2_chain_t *parent;
983 hammer2_bmap_data_t *bmap;
984 hammer2_key_t key;
1897c66e 985 hammer2_key_t key_dummy;
91abd410
MD
986 hammer2_off_t l0size;
987 hammer2_off_t l1size;
988 hammer2_off_t l1mask;
e2163f5b 989 hammer2_tid_t mtid;
5cebbe36
MD
990 hammer2_bitmap_t *bitmap;
991 const hammer2_bitmap_t bmmask00 = 0;
992 hammer2_bitmap_t bmmask01;
993 hammer2_bitmap_t bmmask10;
994 hammer2_bitmap_t bmmask11;
91abd410
MD
995 size_t bytes;
996 uint16_t class;
004f88b4 997 int radix;
91abd410
MD
998 int start;
999 int count;
1000 int modified = 0;
10136ab6 1001 int error;
470dad14 1002 size_t bgsize = 0;
004f88b4 1003
271568b1
MD
1004 KKASSERT(how == HAMMER2_FREEMAP_DORECOVER);
1005
65c894ff 1006 KKASSERT(hmp->spmp);
e2163f5b
MD
1007 mtid = hammer2_trans_sub(hmp->spmp);
1008
004f88b4 1009 radix = (int)data_off & HAMMER2_OFF_MASK_RADIX;
da0cdd33 1010 KKASSERT(radix != 0);
004f88b4 1011 data_off &= ~HAMMER2_OFF_MASK_RADIX;
50456506 1012 KKASSERT(radix <= HAMMER2_RADIX_MAX);
91abd410 1013
da0cdd33
MD
1014 if (radix)
1015 bytes = (size_t)1 << radix;
1016 else
1017 bytes = 0;
91abd410
MD
1018 class = (bref->type << 8) | hammer2_devblkradix(radix);
1019
355d67fc 1020 /*
da0cdd33 1021 * We can't adjust the freemap for data allocations made by
10136ab6 1022 * newfs_hammer2.
355d67fc
MD
1023 */
1024 if (data_off < hmp->voldata.allocator_beg)
1025 return;
355d67fc 1026
10136ab6 1027 KKASSERT((data_off & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
a7720be7 1028
91abd410
MD
1029 /*
1030 * Lookup the level1 freemap chain. The chain must exist.
1031 */
1032 key = H2FMBASE(data_off, HAMMER2_FREEMAP_LEVEL1_RADIX);
1033 l0size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
1034 l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
1035 l1mask = l1size - 1;
1036
1037 parent = &hmp->fchain;
e513e77e 1038 hammer2_chain_ref(parent);
91abd410
MD
1039 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
1040
1897c66e 1041 chain = hammer2_chain_lookup(&parent, &key_dummy, key, key + l1mask,
c8c0a18a 1042 &error,
91abd410 1043 HAMMER2_LOOKUP_ALWAYS |
b8ba9690 1044 HAMMER2_LOOKUP_MATCHIND);
10136ab6
MD
1045
1046 /*
1047 * Stop early if we are trying to free something but no leaf exists.
1048 */
1049 if (chain == NULL && how != HAMMER2_FREEMAP_DORECOVER) {
1050 kprintf("hammer2_freemap_adjust: %016jx: no chain\n",
91abd410 1051 (intmax_t)bref->data_off);
623d43d4 1052 goto done;
91abd410 1053 }
b93cc2e0
MD
1054 if (chain->error) {
1055 kprintf("hammer2_freemap_adjust: %016jx: error %s\n",
1056 (intmax_t)bref->data_off,
1057 hammer2_error_str(chain->error));
1058 hammer2_chain_unlock(chain);
e513e77e 1059 hammer2_chain_drop(chain);
b93cc2e0
MD
1060 chain = NULL;
1061 goto done;
1062 }
91abd410
MD
1063
1064 /*
10136ab6
MD
1065 * Create any missing leaf(s) if we are doing a recovery (marking
1066 * the block(s) as being allocated instead of being freed). Be sure
1067 * to initialize the auxillary freemap tracking info in the
1068 * bref.check.freemap structure.
91abd410 1069 */
10136ab6 1070 if (chain == NULL && how == HAMMER2_FREEMAP_DORECOVER) {
ecfe89b8
MD
1071 error = hammer2_chain_create(&parent, &chain, NULL, hmp->spmp,
1072 HAMMER2_METH_DEFAULT,
10136ab6
MD
1073 key, HAMMER2_FREEMAP_LEVEL1_RADIX,
1074 HAMMER2_BREF_TYPE_FREEMAP_LEAF,
b3659de2 1075 HAMMER2_FREEMAP_LEVELN_PSIZE,
3f01ebaa 1076 mtid, 0, 0);
1fca819a
MD
1077
1078 if (hammer2_debug & 0x0040) {
1079 kprintf("fixup create chain %p %016jx:%d\n",
1080 chain, chain->bref.key, chain->bref.keybits);
1081 }
10136ab6
MD
1082
1083 if (error == 0) {
65cacacf
MD
1084 error = hammer2_chain_modify(chain, mtid, 0, 0);
1085 KKASSERT(error == 0);
10136ab6
MD
1086 bzero(&chain->data->bmdata[0],
1087 HAMMER2_FREEMAP_LEVELN_PSIZE);
1088 chain->bref.check.freemap.bigmask = (uint32_t)-1;
1089 chain->bref.check.freemap.avail = l1size;
1090 /* bref.methods should already be inherited */
91abd410 1091
c603b86b 1092 hammer2_freemap_init(hmp, key, chain);
10136ab6
MD
1093 }
1094 /* XXX handle error */
1095 }
1096
8138a154
MD
1097#if FREEMAP_DEBUG
1098 kprintf("FREEMAP ADJUST TYPE %d %016jx/%d DATA_OFF=%016jx\n",
1099 chain->bref.type, chain->bref.key,
1100 chain->bref.keybits, chain->bref.data_off);
1101#endif
1102
10136ab6
MD
1103 /*
1104 * Calculate the bitmask (runs in 2-bit pairs).
1105 */
91abd410 1106 start = ((int)(data_off >> HAMMER2_FREEMAP_BLOCK_RADIX) & 15) * 2;
5cebbe36
MD
1107 bmmask01 = (hammer2_bitmap_t)1 << start;
1108 bmmask10 = (hammer2_bitmap_t)2 << start;
1109 bmmask11 = (hammer2_bitmap_t)3 << start;
91abd410
MD
1110
1111 /*
10136ab6
MD
1112 * Fixup the bitmap. Partial blocks cannot be fully freed unless
1113 * a bulk scan is able to roll them up.
91abd410
MD
1114 */
1115 if (radix < HAMMER2_FREEMAP_BLOCK_RADIX) {
1116 count = 1;
10136ab6
MD
1117 if (how == HAMMER2_FREEMAP_DOREALFREE)
1118 how = HAMMER2_FREEMAP_DOMAYFREE;
91abd410
MD
1119 } else {
1120 count = 1 << (radix - HAMMER2_FREEMAP_BLOCK_RADIX);
1121 }
004f88b4 1122
10136ab6
MD
1123 /*
1124 * [re]load the bmap and bitmap pointers. Each bmap entry covers
30b0abf3 1125 * a 4MB swath. The bmap itself (LEVEL1) covers 2GB.
8138a154
MD
1126 *
1127 * Be sure to reset the linear iterator to ensure that the adjustment
1128 * is not ignored.
10136ab6
MD
1129 */
1130again:
1131 bmap = &chain->data->bmdata[(int)(data_off >> HAMMER2_SEGRADIX) &
1132 (HAMMER2_FREEMAP_COUNT - 1)];
5cebbe36 1133 bitmap = &bmap->bitmapq[(int)(data_off >> (HAMMER2_SEGRADIX - 3)) & 7];
a71db85d
MD
1134
1135 if (modified)
1136 bmap->linear = 0;
10136ab6 1137
91abd410
MD
1138 while (count) {
1139 KKASSERT(bmmask11);
10136ab6
MD
1140 if (how == HAMMER2_FREEMAP_DORECOVER) {
1141 /*
1142 * Recovery request, mark as allocated.
1143 */
1144 if ((*bitmap & bmmask11) != bmmask11) {
1145 if (modified == 0) {
3f01ebaa 1146 hammer2_chain_modify(chain, mtid, 0, 0);
10136ab6
MD
1147 modified = 1;
1148 goto again;
1149 }
271568b1
MD
1150 if ((*bitmap & bmmask11) == bmmask00) {
1151 bmap->avail -=
1152 HAMMER2_FREEMAP_BLOCK_SIZE;
470dad14 1153 bgsize += HAMMER2_FREEMAP_BLOCK_SIZE;
271568b1 1154 }
10136ab6
MD
1155 if (bmap->class == 0)
1156 bmap->class = class;
1157 *bitmap |= bmmask11;
1fca819a
MD
1158 if (hammer2_debug & 0x0040) {
1159 kprintf("hammer2_freemap_recover: "
1160 "fixup type=%02x "
1161 "block=%016jx/%zd\n",
1162 bref->type, data_off, bytes);
1163 }
10136ab6
MD
1164 } else {
1165 /*
1166 kprintf("hammer2_freemap_recover: good "
1167 "type=%02x block=%016jx/%zd\n",
1168 bref->type, data_off, bytes);
1169 */
1170 }
271568b1
MD
1171 }
1172#if 0
1173 /*
1174 * XXX this stuff doesn't work, avail is miscalculated and
1175 * code 10 means something else now.
1176 */
1177 else if ((*bitmap & bmmask11) == bmmask11) {
10136ab6
MD
1178 /*
1179 * Mayfree/Realfree request and bitmap is currently
1180 * marked as being fully allocated.
1181 */
91abd410 1182 if (!modified) {
c603b86b 1183 hammer2_chain_modify(chain, 0);
91abd410 1184 modified = 1;
10136ab6 1185 goto again;
91abd410 1186 }
10136ab6 1187 if (how == HAMMER2_FREEMAP_DOREALFREE)
91abd410
MD
1188 *bitmap &= ~bmmask11;
1189 else
1190 *bitmap = (*bitmap & ~bmmask11) | bmmask10;
1191 } else if ((*bitmap & bmmask11) == bmmask10) {
10136ab6
MD
1192 /*
1193 * Mayfree/Realfree request and bitmap is currently
1194 * marked as being possibly freeable.
1195 */
1196 if (how == HAMMER2_FREEMAP_DOREALFREE) {
91abd410 1197 if (!modified) {
c603b86b 1198 hammer2_chain_modify(chain, 0);
91abd410 1199 modified = 1;
10136ab6 1200 goto again;
91abd410
MD
1201 }
1202 *bitmap &= ~bmmask11;
1203 }
10136ab6
MD
1204 } else {
1205 /*
1206 * 01 - Not implemented, currently illegal state
1207 * 00 - Not allocated at all, illegal free.
1208 */
1209 panic("hammer2_freemap_adjust: "
1210 "Illegal state %08x(%08x)",
1211 *bitmap, *bitmap & bmmask11);
91abd410 1212 }
271568b1 1213#endif
91abd410
MD
1214 --count;
1215 bmmask01 <<= 2;
1216 bmmask10 <<= 2;
1217 bmmask11 <<= 2;
1218 }
5cebbe36
MD
1219#if HAMMER2_BMAP_ELEMENTS != 8
1220#error "hammer2_freemap.c: HAMMER2_BMAP_ELEMENTS expected to be 8"
1221#endif
10136ab6 1222 if (how == HAMMER2_FREEMAP_DOREALFREE && modified) {
91abd410
MD
1223 bmap->avail += 1 << radix;
1224 KKASSERT(bmap->avail <= HAMMER2_SEGSIZE);
1225 if (bmap->avail == HAMMER2_SEGSIZE &&
5cebbe36
MD
1226 bmap->bitmapq[0] == 0 &&
1227 bmap->bitmapq[1] == 0 &&
1228 bmap->bitmapq[2] == 0 &&
1229 bmap->bitmapq[3] == 0 &&
1230 bmap->bitmapq[4] == 0 &&
1231 bmap->bitmapq[5] == 0 &&
1232 bmap->bitmapq[6] == 0 &&
1233 bmap->bitmapq[7] == 0) {
91abd410
MD
1234 key = H2FMBASE(data_off, HAMMER2_FREEMAP_LEVEL0_RADIX);
1235 kprintf("Freeseg %016jx\n", (intmax_t)key);
1236 bmap->class = 0;
1237 }
004f88b4 1238 }
004f88b4 1239
91abd410
MD
1240 /*
1241 * chain->bref.check.freemap.bigmask (XXX)
10136ab6
MD
1242 *
1243 * Setting bigmask is a hint to the allocation code that there might
1244 * be something allocatable. We also set this in recovery... it
1245 * doesn't hurt and we might want to use the hint for other validation
1246 * operations later on.
65cacacf
MD
1247 *
1248 * We could calculate the largest possible allocation and set the
1249 * radii that could fit, but its easier just to set bigmask to -1.
91abd410 1250 */
65cacacf
MD
1251 if (modified) {
1252 chain->bref.check.freemap.bigmask = -1;
1253 hmp->freemap_relaxed = 0; /* reset heuristic */
1254 }
1a7cfe5a 1255
91abd410 1256 hammer2_chain_unlock(chain);
e513e77e 1257 hammer2_chain_drop(chain);
623d43d4 1258done:
91abd410 1259 hammer2_chain_unlock(parent);
e513e77e 1260 hammer2_chain_drop(parent);
470dad14
MD
1261
1262 if (bgsize) {
1263 hammer2_voldata_lock(hmp);
1264 hammer2_voldata_modify(hmp);
1265 hmp->voldata.allocator_free -= bgsize;
1266 hammer2_voldata_unlock(hmp);
1267 }
5c23d7f1 1268}
bca9f8e6
MD
1269
1270/*
1271 * Validate the freemap, in three stages.
1272 *
1273 * stage-1 ALLOCATED -> POSSIBLY FREE
1274 * POSSIBLY FREE -> POSSIBLY FREE (type corrected)
1275 *
1276 * This transitions bitmap entries from ALLOCATED to POSSIBLY FREE.
1277 * The POSSIBLY FREE state does not mean that a block is actually free
1278 * and may be transitioned back to ALLOCATED in stage-2.
1279 *
1280 * This is typically done during normal filesystem operations when
1281 * something is deleted or a block is replaced.
1282 *
1283 * This is done by bulkfree in-bulk after a memory-bounded meta-data
1284 * scan to try to determine what might be freeable.
1285 *
1286 * This can be done unconditionally through a freemap scan when the
1287 * intention is to brute-force recover the proper state of the freemap.
1288 *
1289 * stage-2 POSSIBLY FREE -> ALLOCATED (scan metadata topology)
1290 *
1291 * This is done by bulkfree during a meta-data scan to ensure that
1292 * all blocks still actually allocated by the filesystem are marked
1293 * as such.
1294 *
1295 * NOTE! Live filesystem transitions to POSSIBLY FREE can occur while
1296 * the bulkfree stage-2 and stage-3 is running. The live filesystem
1297 * will use the alternative POSSIBLY FREE type (2) to prevent
1298 * stage-3 from improperly transitioning unvetted possibly-free
1299 * blocks to FREE.
1300 *
1301 * stage-3 POSSIBLY FREE (type 1) -> FREE (scan freemap)
1302 *
1303 * This is done by bulkfree to finalize POSSIBLY FREE states.
1304 *
1305 */