hammer2 - Initial HARDLINK -> DIRENT replacement code
[dragonfly.git] / sys / vfs / hammer2 / hammer2_freemap.c
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/fcntl.h>
39 #include <sys/buf.h>
40 #include <sys/proc.h>
41 #include <sys/namei.h>
42 #include <sys/mount.h>
43 #include <sys/vnode.h>
44 #include <sys/mountctl.h>
45
46 #include "hammer2.h"
47
48 #define FREEMAP_DEBUG   0
49
50 struct hammer2_fiterate {
51         hammer2_off_t   bpref;
52         hammer2_off_t   bnext;
53         int             loops;
54 };
55
56 typedef struct hammer2_fiterate hammer2_fiterate_t;
57
/*
 * Forward declarations for the file-local helpers used by the allocator.
 */
58 static int hammer2_freemap_try_alloc(hammer2_chain_t **parentp,
59                         hammer2_blockref_t *bref, int radix,
60                         hammer2_fiterate_t *iter, hammer2_tid_t mtid);
61 static void hammer2_freemap_init(hammer2_dev_t *hmp,
62                         hammer2_key_t key, hammer2_chain_t *chain);
63 static int hammer2_bmap_alloc(hammer2_dev_t *hmp,
64                         hammer2_bmap_data_t *bmap, uint16_t class,
65                         int n, int radix, hammer2_key_t *basep);
66 static int hammer2_freemap_iterate(hammer2_chain_t **parentp,
67                         hammer2_chain_t **chainp,
68                         hammer2_fiterate_t *iter);
69
/*
 * Translate an allocation radix into the radix used by the freemap.
 * This is presently the identity mapping.
 */
static __inline int
hammer2_freemapradix(int radix)
{
        int fmradix = radix;

        return (fmradix);
}
76
77 /*
78  * Calculate the device offset for the specified FREEMAP_NODE or FREEMAP_LEAF
79  * bref.  Return a combined media offset and physical size radix.  Freemap
80  * chains use fixed storage offsets in the 4MB reserved area at the
81  * beginning of each 2GB zone
82  *
83  * Rotate between four possibilities.  Theoretically this means we have three
84  * good freemaps in case of a crash which we can use as a base for the fixup
85  * scan at mount-time.
86  */
87 #define H2FMBASE(key, radix)    ((key) & ~(((hammer2_off_t)1 << (radix)) - 1))
88 #define H2FMSHIFT(radix)        ((hammer2_off_t)1 << (radix))
89
90 static
91 int
92 hammer2_freemap_reserve(hammer2_chain_t *chain, int radix)
93 {
94         hammer2_blockref_t *bref = &chain->bref;
95         hammer2_off_t off;
96         int index;
97         int index_inc;
98         size_t bytes;
99
100         /*
101          * Physical allocation size.
102          */
103         bytes = (size_t)1 << radix;
104
105         /*
106          * Calculate block selection index 0..7 of current block.  If this
107          * is the first allocation of the block (verses a modification of an
108          * existing block), we use index 0, otherwise we use the next rotating
109          * index.
110          */
111         if ((bref->data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) {
112                 index = 0;
113         } else {
114                 off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX &
115                       (((hammer2_off_t)1 <<
116                         HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
117                 off = off / HAMMER2_PBUFSIZE;
118                 KKASSERT(off >= HAMMER2_ZONE_FREEMAP_00 &&
119                          off < HAMMER2_ZONE_FREEMAP_END);
120                 index = (int)(off - HAMMER2_ZONE_FREEMAP_00) /
121                         HAMMER2_ZONE_FREEMAP_INC;
122                 KKASSERT(index >= 0 && index < HAMMER2_NFREEMAPS);
123                 if (++index == HAMMER2_NFREEMAPS)
124                         index = 0;
125         }
126
127         /*
128          * Calculate the block offset of the reserved block.  This will
129          * point into the 4MB reserved area at the base of the appropriate
130          * 2GB zone, once added to the FREEMAP_x selection above.
131          */
132         index_inc = index * HAMMER2_ZONE_FREEMAP_INC;
133
134         switch(bref->keybits) {
135         /* case HAMMER2_FREEMAP_LEVEL6_RADIX: not applicable */
136         case HAMMER2_FREEMAP_LEVEL5_RADIX:      /* 2EB */
137                 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
138                 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
139                 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL5_RADIX) +
140                       (index_inc + HAMMER2_ZONE_FREEMAP_00 +
141                        HAMMER2_ZONEFM_LEVEL5) * HAMMER2_PBUFSIZE;
142                 break;
143         case HAMMER2_FREEMAP_LEVEL4_RADIX:      /* 2EB */
144                 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
145                 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
146                 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL4_RADIX) +
147                       (index_inc + HAMMER2_ZONE_FREEMAP_00 +
148                        HAMMER2_ZONEFM_LEVEL4) * HAMMER2_PBUFSIZE;
149                 break;
150         case HAMMER2_FREEMAP_LEVEL3_RADIX:      /* 2PB */
151                 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
152                 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
153                 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL3_RADIX) +
154                       (index_inc + HAMMER2_ZONE_FREEMAP_00 +
155                        HAMMER2_ZONEFM_LEVEL3) * HAMMER2_PBUFSIZE;
156                 break;
157         case HAMMER2_FREEMAP_LEVEL2_RADIX:      /* 2TB */
158                 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE);
159                 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
160                 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL2_RADIX) +
161                       (index_inc + HAMMER2_ZONE_FREEMAP_00 +
162                        HAMMER2_ZONEFM_LEVEL2) * HAMMER2_PBUFSIZE;
163                 break;
164         case HAMMER2_FREEMAP_LEVEL1_RADIX:      /* 2GB */
165                 KKASSERT(bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
166                 KKASSERT(bytes == HAMMER2_FREEMAP_LEVELN_PSIZE);
167                 off = H2FMBASE(bref->key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
168                       (index_inc + HAMMER2_ZONE_FREEMAP_00 +
169                        HAMMER2_ZONEFM_LEVEL1) * HAMMER2_PBUFSIZE;
170                 break;
171         default:
172                 panic("freemap: bad radix(2) %p %d\n", bref, bref->keybits);
173                 /* NOT REACHED */
174                 off = (hammer2_off_t)-1;
175                 break;
176         }
177         bref->data_off = off | radix;
178 #if FREEMAP_DEBUG
179         kprintf("FREEMAP BLOCK TYPE %d %016jx/%d DATA_OFF=%016jx\n",
180                 bref->type, bref->key, bref->keybits, bref->data_off);
181 #endif
182         return (0);
183 }
184
185 /*
186  * Normal freemap allocator
187  *
188  * Use available hints to allocate space using the freemap.  Create missing
189  * freemap infrastructure on-the-fly as needed (including marking initial
190  * allocations using the iterator as allocated, instantiating new 2GB zones,
191  * and dealing with the end-of-media edge case).
192  *
193  * ip and bpref are only used as a heuristic to determine locality of
194  * reference.  bref->key may also be used heuristically.
195  *
196  * This function is a NOP if bytes is 0.
197  */
198 int
199 hammer2_freemap_alloc(hammer2_chain_t *chain, size_t bytes)
200 {
201         hammer2_dev_t *hmp = chain->hmp;
202         hammer2_blockref_t *bref = &chain->bref;
203         hammer2_chain_t *parent;
204         hammer2_tid_t mtid;
205         int radix;
206         int error;
207         unsigned int hindex;
208         hammer2_fiterate_t iter;
209
210         /*
211          * If allocating or downsizing to zero we just get rid of whatever
212          * data_off we had.
213          */
214         if (bytes == 0) {
215                 chain->bref.data_off = 0;
216                 return 0;
217         }
218
219         mtid = hammer2_trans_sub(hmp->spmp);
220
221         /*
222          * Validate the allocation size.  It must be a power of 2.
223          *
224          * For now require that the caller be aware of the minimum
225          * allocation (1K).
226          */
227         radix = hammer2_getradix(bytes);
228         KKASSERT((size_t)1 << radix == bytes);
229
230         if (bref->type == HAMMER2_BREF_TYPE_FREEMAP_NODE ||
231             bref->type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) {
232                 /*
233                  * Freemap blocks themselves are assigned from the reserve
234                  * area, not allocated from the freemap.
235                  */
236                 error = hammer2_freemap_reserve(chain, radix);
237                 KKASSERT(error == 0);
238
239                 return error;
240         }
241
242         KKASSERT(bytes >= HAMMER2_ALLOC_MIN && bytes <= HAMMER2_ALLOC_MAX);
243
244         /*
245          * Calculate the starting point for our allocation search.
246          *
247          * Each freemap leaf is dedicated to a specific freemap_radix.
248          * The freemap_radix can be more fine-grained than the device buffer
249          * radix which results in inodes being grouped together in their
250          * own segment, terminal-data (16K or less) and initial indirect
251          * block being grouped together, and then full-indirect and full-data
252          * blocks (64K) being grouped together.
253          *
254          * The single most important aspect of this is the inode grouping
255          * because that is what allows 'find' and 'ls' and other filesystem
256          * topology operations to run fast.
257          */
258 #if 0
259         if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX)
260                 bpref = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
261         else if (trans->tmp_bpref)
262                 bpref = trans->tmp_bpref;
263         else if (trans->tmp_ip)
264                 bpref = trans->tmp_ip->chain->bref.data_off;
265         else
266 #endif
267         /*
268          * Heuristic tracking index.  We would like one for each distinct
269          * bref type if possible.  heur_freemap[] has room for two classes
270          * for each type.  At a minimum we have to break-up our heuristic
271          * by device block sizes.
272          */
273         hindex = hammer2_devblkradix(radix) - HAMMER2_MINIORADIX;
274         KKASSERT(hindex < HAMMER2_FREEMAP_HEUR_NRADIX);
275         hindex += bref->type * HAMMER2_FREEMAP_HEUR_NRADIX;
276         hindex &= HAMMER2_FREEMAP_HEUR_TYPES * HAMMER2_FREEMAP_HEUR_NRADIX - 1;
277         KKASSERT(hindex < HAMMER2_FREEMAP_HEUR_SIZE);
278
279         iter.bpref = hmp->heur_freemap[hindex];
280
281         /*
282          * Make sure bpref is in-bounds.  It's ok if bpref covers a zone's
283          * reserved area, the try code will iterate past it.
284          */
285         if (iter.bpref > hmp->voldata.volu_size)
286                 iter.bpref = hmp->voldata.volu_size - 1;
287
288         /*
289          * Iterate the freemap looking for free space before and after.
290          */
291         parent = &hmp->fchain;
292         hammer2_chain_ref(parent);
293         hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
294         error = EAGAIN;
295         iter.bnext = iter.bpref;
296         iter.loops = 0;
297
298         while (error == EAGAIN) {
299                 error = hammer2_freemap_try_alloc(&parent, bref, radix,
300                                                   &iter, mtid);
301         }
302         hmp->heur_freemap[hindex] = iter.bnext;
303         hammer2_chain_unlock(parent);
304         hammer2_chain_drop(parent);
305
306         KKASSERT(error == 0);
307
308         return (error);
309 }
310
311 static int
312 hammer2_freemap_try_alloc(hammer2_chain_t **parentp,
313                           hammer2_blockref_t *bref, int radix,
314                           hammer2_fiterate_t *iter, hammer2_tid_t mtid)
315 {
316         hammer2_dev_t *hmp = (*parentp)->hmp;
317         hammer2_off_t l0size;
318         hammer2_off_t l1size;
319         hammer2_off_t l1mask;
320         hammer2_key_t key_dummy;
321         hammer2_chain_t *chain;
322         hammer2_off_t key;
323         size_t bytes;
324         uint16_t class;
325         int error = 0;
326         int cache_index = -1;
327
328         /*
329          * Calculate the number of bytes being allocated, the number
330          * of contiguous bits of bitmap being allocated, and the bitmap
331          * mask.
332          *
333          * WARNING! cpu hardware may mask bits == 64 -> 0 and blow up the
334          *          mask calculation.
335          */
336         bytes = (size_t)1 << radix;
337         class = (bref->type << 8) | hammer2_devblkradix(radix);
338
339         /*
340          * Lookup the level1 freemap chain, creating and initializing one
341          * if necessary.  Intermediate levels will be created automatically
342          * when necessary by hammer2_chain_create().
343          */
344         key = H2FMBASE(iter->bnext, HAMMER2_FREEMAP_LEVEL1_RADIX);
345         l0size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
346         l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
347         l1mask = l1size - 1;
348
349         chain = hammer2_chain_lookup(parentp, &key_dummy, key, key + l1mask,
350                                      &cache_index,
351                                      HAMMER2_LOOKUP_ALWAYS |
352                                      HAMMER2_LOOKUP_MATCHIND);
353
354         if (chain == NULL) {
355                 /*
356                  * Create the missing leaf, be sure to initialize
357                  * the auxillary freemap tracking information in
358                  * the bref.check.freemap structure.
359                  */
360 #if 0
361                 kprintf("freemap create L1 @ %016jx bpref %016jx\n",
362                         key, iter->bpref);
363 #endif
364                 error = hammer2_chain_create(parentp, &chain,
365                                      hmp->spmp, HAMMER2_METH_DEFAULT,
366                                      key, HAMMER2_FREEMAP_LEVEL1_RADIX,
367                                      HAMMER2_BREF_TYPE_FREEMAP_LEAF,
368                                      HAMMER2_FREEMAP_LEVELN_PSIZE,
369                                      mtid, 0, 0);
370                 KKASSERT(error == 0);
371                 if (error == 0) {
372                         hammer2_chain_modify(chain, mtid, 0, 0);
373                         bzero(&chain->data->bmdata[0],
374                               HAMMER2_FREEMAP_LEVELN_PSIZE);
375                         chain->bref.check.freemap.bigmask = (uint32_t)-1;
376                         chain->bref.check.freemap.avail = l1size;
377                         /* bref.methods should already be inherited */
378
379                         hammer2_freemap_init(hmp, key, chain);
380                 }
381         } else if (chain->error) {
382                 /*
383                  * Error during lookup.
384                  */
385                 kprintf("hammer2_freemap_try_alloc: %016jx: error %s\n",
386                         (intmax_t)bref->data_off,
387                         hammer2_error_str(chain->error));
388                 error = EIO;
389         } else if ((chain->bref.check.freemap.bigmask &
390                    ((size_t)1 << radix)) == 0) {
391                 /*
392                  * Already flagged as not having enough space
393                  */
394                 error = ENOSPC;
395         } else {
396                 /*
397                  * Modify existing chain to setup for adjustment.
398                  */
399                 hammer2_chain_modify(chain, mtid, 0, 0);
400         }
401
402         /*
403          * Scan 2MB entries.
404          */
405         if (error == 0) {
406                 hammer2_bmap_data_t *bmap;
407                 hammer2_key_t base_key;
408                 int count;
409                 int start;
410                 int n;
411
412                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_FREEMAP_LEAF);
413                 start = (int)((iter->bnext - key) >>
414                               HAMMER2_FREEMAP_LEVEL0_RADIX);
415                 KKASSERT(start >= 0 && start < HAMMER2_FREEMAP_COUNT);
416                 hammer2_chain_modify(chain, mtid, 0, 0);
417
418                 error = ENOSPC;
419                 for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) {
420                         int availchk;
421
422                         if (start + count >= HAMMER2_FREEMAP_COUNT &&
423                             start - count < 0) {
424                                 break;
425                         }
426
427                         /*
428                          * Calculate bmap pointer
429                          *
430                          * NOTE: bmap pointer is invalid if n >= FREEMAP_COUNT.
431                          */
432                         n = start + count;
433                         bmap = &chain->data->bmdata[n];
434
435                         if (n >= HAMMER2_FREEMAP_COUNT) {
436                                 availchk = 0;
437                         } else if (bmap->avail) {
438                                 availchk = 1;
439                         } else if (radix < HAMMER2_FREEMAP_BLOCK_RADIX &&
440                                   (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK)) {
441                                 availchk = 1;
442                         } else {
443                                 availchk = 0;
444                         }
445
446                         if (availchk &&
447                             (bmap->class == 0 || bmap->class == class)) {
448                                 base_key = key + n * l0size;
449                                 error = hammer2_bmap_alloc(hmp, bmap,
450                                                            class, n, radix,
451                                                            &base_key);
452                                 if (error != ENOSPC) {
453                                         key = base_key;
454                                         break;
455                                 }
456                         }
457
458                         /*
459                          * Must recalculate after potentially having called
460                          * hammer2_bmap_alloc() above in case chain was
461                          * reallocated.
462                          *
463                          * NOTE: bmap pointer is invalid if n < 0.
464                          */
465                         n = start - count;
466                         bmap = &chain->data->bmdata[n];
467                         if (n < 0) {
468                                 availchk = 0;
469                         } else if (bmap->avail) {
470                                 availchk = 1;
471                         } else if (radix < HAMMER2_FREEMAP_BLOCK_RADIX &&
472                                   (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK)) {
473                                 availchk = 1;
474                         } else {
475                                 availchk = 0;
476                         }
477
478                         if (availchk &&
479                             (bmap->class == 0 || bmap->class == class)) {
480                                 base_key = key + n * l0size;
481                                 error = hammer2_bmap_alloc(hmp, bmap,
482                                                            class, n, radix,
483                                                            &base_key);
484                                 if (error != ENOSPC) {
485                                         key = base_key;
486                                         break;
487                                 }
488                         }
489                 }
490                 if (error == ENOSPC) {
491                         chain->bref.check.freemap.bigmask &=
492                                 (uint32_t)~((size_t)1 << radix);
493                 }
494                 /* XXX also scan down from original count */
495         }
496
497         if (error == 0) {
498                 /*
499                  * Assert validity.  Must be beyond the static allocator used
500                  * by newfs_hammer2 (and thus also beyond the aux area),
501                  * not go past the volume size, and must not be in the
502                  * reserved segment area for a zone.
503                  */
504                 KKASSERT(key >= hmp->voldata.allocator_beg &&
505                          key + bytes <= hmp->voldata.volu_size);
506                 KKASSERT((key & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
507                 bref->data_off = key | radix;
508 #if 0
509                 kprintf("alloc cp=%p %016jx %016jx using %016jx\n",
510                         chain,
511                         bref->key, bref->data_off, chain->bref.data_off);
512 #endif
513         } else if (error == ENOSPC) {
514                 /*
515                  * Return EAGAIN with next iteration in iter->bnext, or
516                  * return ENOSPC if the allocation map has been exhausted.
517                  */
518                 error = hammer2_freemap_iterate(parentp, &chain, iter);
519         }
520
521         /*
522          * Cleanup
523          */
524         if (chain) {
525                 hammer2_chain_unlock(chain);
526                 hammer2_chain_drop(chain);
527         }
528         return (error);
529 }
530
531 /*
532  * Allocate (1<<radix) bytes from the bmap whos base data offset is (*basep).
533  *
534  * If the linear iterator is mid-block we use it directly (the bitmap should
535  * already be marked allocated), otherwise we search for a block in the bitmap
536  * that fits the allocation request.
537  *
538  * A partial bitmap allocation sets the minimum bitmap granularity (16KB)
539  * to fully allocated and adjusts the linear allocator to allow the
540  * remaining space to be allocated.
541  */
542 static
543 int
544 hammer2_bmap_alloc(hammer2_dev_t *hmp, hammer2_bmap_data_t *bmap,
545                    uint16_t class, int n, int radix, hammer2_key_t *basep)
546 {
547         size_t size;
548         size_t bgsize;
549         int bmradix;
550         hammer2_bitmap_t bmmask;
551         int offset;
552         int i;
553         int j;
554
555         /*
556          * Take into account 2-bits per block when calculating bmradix.
557          */
558         size = (size_t)1 << radix;
559
560         if (radix <= HAMMER2_FREEMAP_BLOCK_RADIX) {
561                 bmradix = 2;
562                 /* (16K) 2 bits per allocation block */
563         } else {
564                 bmradix = (hammer2_bitmap_t)2 <<
565                           (radix - HAMMER2_FREEMAP_BLOCK_RADIX);
566                 /* (32K-256K) 4, 8, 16, 32 bits per allocation block */
567         }
568
569         /*
570          * Use the linear iterator to pack small allocations, otherwise
571          * fall-back to finding a free 16KB chunk.  The linear iterator
572          * is only valid when *NOT* on a freemap chunking boundary (16KB).
573          * If it is the bitmap must be scanned.  It can become invalid
574          * once we pack to the boundary.  We adjust it after a bitmap
575          * allocation only for sub-16KB allocations (so the perfectly good
576          * previous value can still be used for fragments when 16KB+
577          * allocations are made).
578          *
579          * Beware of hardware artifacts when bmradix == 64 (intermediate
580          * result can wind up being '1' instead of '0' if hardware masks
581          * bit-count & 31).
582          *
583          * NOTE: j needs to be even in the j= calculation.  As an artifact
584          *       of the /2 division, our bitmask has to clear bit 0.
585          *
586          * NOTE: TODO this can leave little unallocatable fragments lying
587          *       around.
588          */
589         if (((uint32_t)bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK) + size <=
590             HAMMER2_FREEMAP_BLOCK_SIZE &&
591             (bmap->linear & HAMMER2_FREEMAP_BLOCK_MASK) &&
592             bmap->linear < HAMMER2_SEGSIZE) {
593                 KKASSERT(bmap->linear >= 0 &&
594                          bmap->linear + size <= HAMMER2_SEGSIZE &&
595                          (bmap->linear & (HAMMER2_ALLOC_MIN - 1)) == 0);
596                 offset = bmap->linear;
597                 i = offset / (HAMMER2_SEGSIZE / 8);
598                 j = (offset / (HAMMER2_FREEMAP_BLOCK_SIZE / 2)) & 30;
599                 bmmask = (bmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
600                          HAMMER2_BMAP_ALLONES :
601                          ((hammer2_bitmap_t)1 << bmradix) - 1;
602                 bmmask <<= j;
603                 bmap->linear = offset + size;
604         } else {
605                 for (i = 0; i < HAMMER2_BMAP_ELEMENTS; ++i) {
606                         bmmask = (bmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
607                                  HAMMER2_BMAP_ALLONES :
608                                  ((hammer2_bitmap_t)1 << bmradix) - 1;
609                         for (j = 0;
610                              j < HAMMER2_BMAP_BITS_PER_ELEMENT;
611                              j += bmradix) {
612                                 if ((bmap->bitmapq[i] & bmmask) == 0)
613                                         goto success;
614                                 bmmask <<= bmradix;
615                         }
616                 }
617                 /*fragments might remain*/
618                 /*KKASSERT(bmap->avail == 0);*/
619                 return (ENOSPC);
620 success:
621                 offset = i * (HAMMER2_SEGSIZE / HAMMER2_BMAP_ELEMENTS) +
622                          (j * (HAMMER2_FREEMAP_BLOCK_SIZE / 2));
623                 if (size & HAMMER2_FREEMAP_BLOCK_MASK)
624                         bmap->linear = offset + size;
625         }
626
627         /* 8 x (64/2) -> 256 x 16K -> 4MB */
628         KKASSERT(i >= 0 && i < HAMMER2_BMAP_ELEMENTS);
629
630         /*
631          * Optimize the buffer cache to avoid unnecessary read-before-write
632          * operations.
633          *
634          * The device block size could be larger than the allocation size
635          * so the actual bitmap test is somewhat more involved.  We have
636          * to use a compatible buffer size for this operation.
637          */
638         if ((bmap->bitmapq[i] & bmmask) == 0 &&
639             hammer2_devblksize(size) != size) {
640                 size_t psize = hammer2_devblksize(size);
641                 hammer2_off_t pmask = (hammer2_off_t)psize - 1;
642                 int pbmradix = (hammer2_bitmap_t)2 <<
643                                         (hammer2_devblkradix(radix) -
644                                HAMMER2_FREEMAP_BLOCK_RADIX);
645                 hammer2_bitmap_t pbmmask;
646                 int pradix = hammer2_getradix(psize);
647
648                 pbmmask = (pbmradix == HAMMER2_BMAP_BITS_PER_ELEMENT) ?
649                         HAMMER2_BMAP_ALLONES :
650                         ((hammer2_bitmap_t)1 << pbmradix) - 1;
651                 while ((pbmmask & bmmask) == 0)
652                         pbmmask <<= pbmradix;
653
654 #if 0
655                 kprintf("%016jx mask %016jx %016jx %016jx (%zd/%zd)\n",
656                         *basep + offset, bmap->bitmapq[i],
657                         pbmmask, bmmask, size, psize);
658 #endif
659
660                 if ((bmap->bitmapq[i] & pbmmask) == 0) {
661                         hammer2_io_newq(hmp, HAMMER2_BREF_TYPE_FREEMAP_LEAF,
662                                         (*basep + (offset & ~pmask)) |
663                                         pradix, psize);
664                 }
665         }
666
667 #if 0
668         /*
669          * When initializing a new inode segment also attempt to initialize
670          * an adjacent segment.  Be careful not to index beyond the array
671          * bounds.
672          *
673          * We do this to try to localize inode accesses to improve
674          * directory scan rates.  XXX doesn't improve scan rates.
675          */
676         if (size == HAMMER2_INODE_BYTES) {
677                 if (n & 1) {
678                         if (bmap[-1].radix == 0 && bmap[-1].avail)
679                                 bmap[-1].radix = radix;
680                 } else {
681                         if (bmap[1].radix == 0 && bmap[1].avail)
682                                 bmap[1].radix = radix;
683                 }
684         }
685 #endif
686         /*
687          * Calculate the bitmap-granular change in bgsize for the volume
688          * header.  We cannot use the fine-grained change here because
689          * the bulkfree code can't undo it.  If the bitmap element is already
690          * marked allocated it has already been accounted for.
691          */
692         if (radix < HAMMER2_FREEMAP_BLOCK_RADIX) {
693                 if (bmap->bitmapq[i] & bmmask)
694                         bgsize = 0;
695                 else
696                         bgsize = HAMMER2_FREEMAP_BLOCK_SIZE;
697         } else {
698                 bgsize = size;
699         }
700
701         /*
702          * Adjust the bitmap, set the class (it might have been 0),
703          * and available bytes, update the allocation offset (*basep)
704          * from the L0 base to the actual offset.
705          *
706          * avail must reflect the bitmap-granular availability.  The allocator
707          * tests will also check the linear iterator.
708          */
709         bmap->bitmapq[i] |= bmmask;
710         bmap->class = class;
711         bmap->avail -= bgsize;
712         *basep += offset;
713
714         /*
715          * Adjust the volume header's allocator_free parameter.  This
716          * parameter has to be fixed up by bulkfree which has no way to
717          * figure out sub-16K chunking, so it must be adjusted by the
718          * bitmap-granular size.
719          */
720         if (bgsize) {
721                 hammer2_voldata_lock(hmp);
722                 hammer2_voldata_modify(hmp);
723                 hmp->voldata.allocator_free -= bgsize;
724                 hammer2_voldata_unlock(hmp);
725         }
726
727         return(0);
728 }
729
730 static
731 void
732 hammer2_freemap_init(hammer2_dev_t *hmp, hammer2_key_t key,
733                      hammer2_chain_t *chain)
734 {
735         hammer2_off_t l1size;
736         hammer2_off_t lokey;
737         hammer2_off_t hikey;
738         hammer2_bmap_data_t *bmap;
739         int count;
740
741         l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
742
743         /*
744          * Calculate the portion of the 2GB map that should be initialized
745          * as free.  Portions below or after will be initialized as allocated.
746          * SEGMASK-align the areas so we don't have to worry about sub-scans
747          * or endianess when using memset.
748          *
749          * (1) Ensure that all statically allocated space from newfs_hammer2
750          *     is marked allocated.
751          *
752          * (2) Ensure that the reserved area is marked allocated (typically
753          *     the first 4MB of the 2GB area being represented).
754          *
755          * (3) Ensure that any trailing space at the end-of-volume is marked
756          *     allocated.
757          *
758          * WARNING! It is possible for lokey to be larger than hikey if the
759          *          entire 2GB segment is within the static allocation.
760          */
761         lokey = (hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) &
762                 ~HAMMER2_SEGMASK64;
763
764         if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
765                   HAMMER2_ZONE_SEG64) {
766                 lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX) +
767                         HAMMER2_ZONE_SEG64;
768         }
769
770         hikey = key + H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
771         if (hikey > hmp->voldata.volu_size) {
772                 hikey = hmp->voldata.volu_size & ~HAMMER2_SEGMASK64;
773         }
774
775         chain->bref.check.freemap.avail =
776                 H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
777         bmap = &chain->data->bmdata[0];
778
779         for (count = 0; count < HAMMER2_FREEMAP_COUNT; ++count) {
780                 if (key < lokey || key >= hikey) {
781                         memset(bmap->bitmapq, -1,
782                                sizeof(bmap->bitmapq));
783                         bmap->avail = 0;
784                         bmap->linear = HAMMER2_SEGSIZE;
785                         chain->bref.check.freemap.avail -=
786                                 H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
787                 } else {
788                         bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
789                 }
790                 key += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
791                 ++bmap;
792         }
793 }
794
795 /*
796  * The current Level 1 freemap has been exhausted, iterate to the next
797  * one, return ENOSPC if no freemaps remain.
798  *
799  * XXX this should rotate back to the beginning to handle freed-up space
800  * XXX or use intermediate entries to locate free space. TODO
801  */
802 static int
803 hammer2_freemap_iterate(hammer2_chain_t **parentp, hammer2_chain_t **chainp,
804                         hammer2_fiterate_t *iter)
805 {
806         hammer2_dev_t *hmp = (*parentp)->hmp;
807
808         iter->bnext &= ~(H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX) - 1);
809         iter->bnext += H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
810         if (iter->bnext >= hmp->voldata.volu_size) {
811                 iter->bnext = 0;
812                 if (++iter->loops == 2)
813                         return (ENOSPC);
814         }
815         return(EAGAIN);
816 }
817
818 /*
819  * Adjust the bit-pattern for data in the freemap bitmap according to
820  * (how).  This code is called from on-mount recovery to fixup (mark
821  * as allocated) blocks whos freemap upates might not have been committed
822  * in the last crash and is used by the bulk freemap scan to stage frees.
823  *
824  * WARNING! Cannot be called with a empty-data bref (radix == 0).
825  *
826  * XXX currently disabled when how == 0 (the normal real-time case).  At
827  * the moment we depend on the bulk freescan to actually free blocks.  It
828  * will still call this routine with a non-zero how to stage possible frees
829  * and to do the actual free.
830  */
831 void
832 hammer2_freemap_adjust(hammer2_dev_t *hmp, hammer2_blockref_t *bref,
833                        int how)
834 {
835         hammer2_off_t data_off = bref->data_off;
836         hammer2_chain_t *chain;
837         hammer2_chain_t *parent;
838         hammer2_bmap_data_t *bmap;
839         hammer2_key_t key;
840         hammer2_key_t key_dummy;
841         hammer2_off_t l0size;
842         hammer2_off_t l1size;
843         hammer2_off_t l1mask;
844         hammer2_tid_t mtid;
845         hammer2_bitmap_t *bitmap;
846         const hammer2_bitmap_t bmmask00 = 0;
847         hammer2_bitmap_t bmmask01;
848         hammer2_bitmap_t bmmask10;
849         hammer2_bitmap_t bmmask11;
850         size_t bytes;
851         uint16_t class;
852         int radix;
853         int start;
854         int count;
855         int modified = 0;
856         int cache_index = -1;
857         int error;
858         size_t bgsize = 0;
859
860         KKASSERT(how == HAMMER2_FREEMAP_DORECOVER);
861
862         mtid = hammer2_trans_sub(hmp->spmp);
863
864         radix = (int)data_off & HAMMER2_OFF_MASK_RADIX;
865         KKASSERT(radix != 0);
866         data_off &= ~HAMMER2_OFF_MASK_RADIX;
867         KKASSERT(radix <= HAMMER2_RADIX_MAX);
868
869         if (radix)
870                 bytes = (size_t)1 << radix;
871         else
872                 bytes = 0;
873         class = (bref->type << 8) | hammer2_devblkradix(radix);
874
875         /*
876          * We can't adjust the freemap for data allocations made by
877          * newfs_hammer2.
878          */
879         if (data_off < hmp->voldata.allocator_beg)
880                 return;
881
882         KKASSERT((data_off & HAMMER2_ZONE_MASK64) >= HAMMER2_ZONE_SEG);
883
884         /*
885          * Lookup the level1 freemap chain.  The chain must exist.
886          */
887         key = H2FMBASE(data_off, HAMMER2_FREEMAP_LEVEL1_RADIX);
888         l0size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
889         l1size = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL1_RADIX);
890         l1mask = l1size - 1;
891
892         parent = &hmp->fchain;
893         hammer2_chain_ref(parent);
894         hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
895
896         chain = hammer2_chain_lookup(&parent, &key_dummy, key, key + l1mask,
897                                      &cache_index,
898                                      HAMMER2_LOOKUP_ALWAYS |
899                                      HAMMER2_LOOKUP_MATCHIND);
900
901         /*
902          * Stop early if we are trying to free something but no leaf exists.
903          */
904         if (chain == NULL && how != HAMMER2_FREEMAP_DORECOVER) {
905                 kprintf("hammer2_freemap_adjust: %016jx: no chain\n",
906                         (intmax_t)bref->data_off);
907                 goto done;
908         }
909         if (chain->error) {
910                 kprintf("hammer2_freemap_adjust: %016jx: error %s\n",
911                         (intmax_t)bref->data_off,
912                         hammer2_error_str(chain->error));
913                 hammer2_chain_unlock(chain);
914                 hammer2_chain_drop(chain);
915                 chain = NULL;
916                 goto done;
917         }
918
919         /*
920          * Create any missing leaf(s) if we are doing a recovery (marking
921          * the block(s) as being allocated instead of being freed).  Be sure
922          * to initialize the auxillary freemap tracking info in the
923          * bref.check.freemap structure.
924          */
925         if (chain == NULL && how == HAMMER2_FREEMAP_DORECOVER) {
926                 error = hammer2_chain_create(&parent, &chain,
927                                      hmp->spmp, HAMMER2_METH_DEFAULT,
928                                      key, HAMMER2_FREEMAP_LEVEL1_RADIX,
929                                      HAMMER2_BREF_TYPE_FREEMAP_LEAF,
930                                      HAMMER2_FREEMAP_LEVELN_PSIZE,
931                                      mtid, 0, 0);
932
933                 if (hammer2_debug & 0x0040) {
934                         kprintf("fixup create chain %p %016jx:%d\n",
935                                 chain, chain->bref.key, chain->bref.keybits);
936                 }
937
938                 if (error == 0) {
939                         hammer2_chain_modify(chain, mtid, 0, 0);
940                         bzero(&chain->data->bmdata[0],
941                               HAMMER2_FREEMAP_LEVELN_PSIZE);
942                         chain->bref.check.freemap.bigmask = (uint32_t)-1;
943                         chain->bref.check.freemap.avail = l1size;
944                         /* bref.methods should already be inherited */
945
946                         hammer2_freemap_init(hmp, key, chain);
947                 }
948                 /* XXX handle error */
949         }
950
951 #if FREEMAP_DEBUG
952         kprintf("FREEMAP ADJUST TYPE %d %016jx/%d DATA_OFF=%016jx\n",
953                 chain->bref.type, chain->bref.key,
954                 chain->bref.keybits, chain->bref.data_off);
955 #endif
956
957         /*
958          * Calculate the bitmask (runs in 2-bit pairs).
959          */
960         start = ((int)(data_off >> HAMMER2_FREEMAP_BLOCK_RADIX) & 15) * 2;
961         bmmask01 = (hammer2_bitmap_t)1 << start;
962         bmmask10 = (hammer2_bitmap_t)2 << start;
963         bmmask11 = (hammer2_bitmap_t)3 << start;
964
965         /*
966          * Fixup the bitmap.  Partial blocks cannot be fully freed unless
967          * a bulk scan is able to roll them up.
968          */
969         if (radix < HAMMER2_FREEMAP_BLOCK_RADIX) {
970                 count = 1;
971                 if (how == HAMMER2_FREEMAP_DOREALFREE)
972                         how = HAMMER2_FREEMAP_DOMAYFREE;
973         } else {
974                 count = 1 << (radix - HAMMER2_FREEMAP_BLOCK_RADIX);
975         }
976
977         /*
978          * [re]load the bmap and bitmap pointers.  Each bmap entry covers
979          * a 2MB swath.  The bmap itself (LEVEL1) covers 2GB.
980          *
981          * Be sure to reset the linear iterator to ensure that the adjustment
982          * is not ignored.
983          */
984 again:
985         bmap = &chain->data->bmdata[(int)(data_off >> HAMMER2_SEGRADIX) &
986                                     (HAMMER2_FREEMAP_COUNT - 1)];
987         bitmap = &bmap->bitmapq[(int)(data_off >> (HAMMER2_SEGRADIX - 3)) & 7];
988
989         if (modified)
990                 bmap->linear = 0;
991
992         while (count) {
993                 KKASSERT(bmmask11);
994                 if (how == HAMMER2_FREEMAP_DORECOVER) {
995                         /*
996                          * Recovery request, mark as allocated.
997                          */
998                         if ((*bitmap & bmmask11) != bmmask11) {
999                                 if (modified == 0) {
1000                                         hammer2_chain_modify(chain, mtid, 0, 0);
1001                                         modified = 1;
1002                                         goto again;
1003                                 }
1004                                 if ((*bitmap & bmmask11) == bmmask00) {
1005                                         bmap->avail -=
1006                                                 HAMMER2_FREEMAP_BLOCK_SIZE;
1007                                         bgsize += HAMMER2_FREEMAP_BLOCK_SIZE;
1008                                 }
1009                                 if (bmap->class == 0)
1010                                         bmap->class = class;
1011                                 *bitmap |= bmmask11;
1012                                 if (hammer2_debug & 0x0040) {
1013                                         kprintf("hammer2_freemap_recover: "
1014                                                 "fixup type=%02x "
1015                                                 "block=%016jx/%zd\n",
1016                                                 bref->type, data_off, bytes);
1017                                 }
1018                         } else {
1019                                 /*
1020                                 kprintf("hammer2_freemap_recover:  good "
1021                                         "type=%02x block=%016jx/%zd\n",
1022                                         bref->type, data_off, bytes);
1023                                 */
1024                         }
1025                 }
1026 #if 0
1027                 /*
1028                  * XXX this stuff doesn't work, avail is miscalculated and
1029                  * code 10 means something else now.
1030                  */
1031                 else if ((*bitmap & bmmask11) == bmmask11) {
1032                         /*
1033                          * Mayfree/Realfree request and bitmap is currently
1034                          * marked as being fully allocated.
1035                          */
1036                         if (!modified) {
1037                                 hammer2_chain_modify(chain, 0);
1038                                 modified = 1;
1039                                 goto again;
1040                         }
1041                         if (how == HAMMER2_FREEMAP_DOREALFREE)
1042                                 *bitmap &= ~bmmask11;
1043                         else
1044                                 *bitmap = (*bitmap & ~bmmask11) | bmmask10;
1045                 } else if ((*bitmap & bmmask11) == bmmask10) {
1046                         /*
1047                          * Mayfree/Realfree request and bitmap is currently
1048                          * marked as being possibly freeable.
1049                          */
1050                         if (how == HAMMER2_FREEMAP_DOREALFREE) {
1051                                 if (!modified) {
1052                                         hammer2_chain_modify(chain, 0);
1053                                         modified = 1;
1054                                         goto again;
1055                                 }
1056                                 *bitmap &= ~bmmask11;
1057                         }
1058                 } else {
1059                         /*
1060                          * 01 - Not implemented, currently illegal state
1061                          * 00 - Not allocated at all, illegal free.
1062                          */
1063                         panic("hammer2_freemap_adjust: "
1064                               "Illegal state %08x(%08x)",
1065                               *bitmap, *bitmap & bmmask11);
1066                 }
1067 #endif
1068                 --count;
1069                 bmmask01 <<= 2;
1070                 bmmask10 <<= 2;
1071                 bmmask11 <<= 2;
1072         }
1073 #if HAMMER2_BMAP_ELEMENTS != 8
1074 #error "hammer2_freemap.c: HAMMER2_BMAP_ELEMENTS expected to be 8"
1075 #endif
1076         if (how == HAMMER2_FREEMAP_DOREALFREE && modified) {
1077                 bmap->avail += 1 << radix;
1078                 KKASSERT(bmap->avail <= HAMMER2_SEGSIZE);
1079                 if (bmap->avail == HAMMER2_SEGSIZE &&
1080                     bmap->bitmapq[0] == 0 &&
1081                     bmap->bitmapq[1] == 0 &&
1082                     bmap->bitmapq[2] == 0 &&
1083                     bmap->bitmapq[3] == 0 &&
1084                     bmap->bitmapq[4] == 0 &&
1085                     bmap->bitmapq[5] == 0 &&
1086                     bmap->bitmapq[6] == 0 &&
1087                     bmap->bitmapq[7] == 0) {
1088                         key = H2FMBASE(data_off, HAMMER2_FREEMAP_LEVEL0_RADIX);
1089                         kprintf("Freeseg %016jx\n", (intmax_t)key);
1090                         bmap->class = 0;
1091                 }
1092         }
1093
1094         /*
1095          * chain->bref.check.freemap.bigmask (XXX)
1096          *
1097          * Setting bigmask is a hint to the allocation code that there might
1098          * be something allocatable.  We also set this in recovery... it
1099          * doesn't hurt and we might want to use the hint for other validation
1100          * operations later on.
1101          */
1102         if (modified)
1103                 chain->bref.check.freemap.bigmask |= 1 << radix;
1104
1105         hammer2_chain_unlock(chain);
1106         hammer2_chain_drop(chain);
1107 done:
1108         hammer2_chain_unlock(parent);
1109         hammer2_chain_drop(parent);
1110
1111         if (bgsize) {
1112                 hammer2_voldata_lock(hmp);
1113                 hammer2_voldata_modify(hmp);
1114                 hmp->voldata.allocator_free -= bgsize;
1115                 hammer2_voldata_unlock(hmp);
1116         }
1117 }
1118
1119 /*
1120  * Validate the freemap, in three stages.
1121  *
1122  * stage-1      ALLOCATED     -> POSSIBLY FREE
1123  *              POSSIBLY FREE -> POSSIBLY FREE (type corrected)
1124  *
1125  *      This transitions bitmap entries from ALLOCATED to POSSIBLY FREE.
1126  *      The POSSIBLY FREE state does not mean that a block is actually free
1127  *      and may be transitioned back to ALLOCATED in stage-2.
1128  *
1129  *      This is typically done during normal filesystem operations when
1130  *      something is deleted or a block is replaced.
1131  *
1132  *      This is done by bulkfree in-bulk after a memory-bounded meta-data
1133  *      scan to try to determine what might be freeable.
1134  *
1135  *      This can be done unconditionally through a freemap scan when the
1136  *      intention is to brute-force recover the proper state of the freemap.
1137  *
1138  * stage-2      POSSIBLY FREE -> ALLOCATED      (scan metadata topology)
1139  *
1140  *      This is done by bulkfree during a meta-data scan to ensure that
1141  *      all blocks still actually allocated by the filesystem are marked
1142  *      as such.
1143  *
1144  *      NOTE! Live filesystem transitions to POSSIBLY FREE can occur while
1145  *            the bulkfree stage-2 and stage-3 are running.  The live filesystem
1146  *            will use the alternative POSSIBLY FREE type (2) to prevent
1147  *            stage-3 from improperly transitioning unvetted possibly-free
1148  *            blocks to FREE.
1149  *
1150  * stage-3      POSSIBLY FREE (type 1) -> FREE  (scan freemap)
1151  *
1152  *      This is done by bulkfree to finalize POSSIBLY FREE states.
1153  *
1154  */