f6987cd742ad99fa8a463324ce366ec6358130c9
[dragonfly.git] / sys / vfs / hammer2 / hammer2_bulkfree.c
1 /*
2  * Copyright (c) 2013-2015 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 #include <sys/param.h>
35 #include <sys/systm.h>
36 #include <sys/kernel.h>
37 #include <sys/fcntl.h>
38 #include <sys/buf.h>
39 #include <sys/proc.h>
40 #include <sys/namei.h>
41 #include <sys/mount.h>
42 #include <sys/vnode.h>
43 #include <sys/mountctl.h>
44 #include <vm/vm_kern.h>
45 #include <vm/vm_extern.h>
46
47 #include "hammer2.h"
48
49 /*
50  * XXX I made a mistake and made the reserved area begin at each LEVEL1 zone,
51  *     which is on a 1GB demark.  This will eat a little more space but for
52  *     now we retain compatibility and make FMZONEBASE every 1GB
53  */
54 #define H2FMZONEBASE(key)         ((key) & ~HAMMER2_FREEMAP_LEVEL1_MASK)
55 #define H2FMBASE(key, radix)    ((key) & ~(((hammer2_off_t)1 << (radix)) - 1))
56 #define H2FMSHIFT(radix)        ((hammer2_off_t)1 << (radix))
57
58 /*
59  * breadth-first search
60  */
61 typedef struct hammer2_chain_save {
62         TAILQ_ENTRY(hammer2_chain_save) entry;
63         hammer2_chain_t *chain;
64         int pri;
65 } hammer2_chain_save_t;
66
67 TAILQ_HEAD(hammer2_chain_save_list, hammer2_chain_save);
68 typedef struct hammer2_chain_save_list hammer2_chain_save_list_t;
69
70 typedef struct hammer2_bulkfree_info {
71         hammer2_dev_t           *hmp;
72         kmem_anon_desc_t        kp;
73         hammer2_off_t           sbase;          /* sub-loop iteration */
74         hammer2_off_t           sstop;
75         hammer2_bmap_data_t     *bmap;
76         int                     depth;
77         long                    count_10_00;    /* staged->free      */
78         long                    count_11_10;    /* allocated->staged */
79         long                    count_00_11;    /* (should not happen) */
80         long                    count_01_11;    /* (should not happen) */
81         long                    count_10_11;    /* staged->allocated */
82         long                    count_l0cleans;
83         long                    count_linadjusts;
84         long                    count_inodes_scanned;
85         long                    count_dedup_factor;
86         long                    bytes_scanned;
87         hammer2_off_t           adj_free;
88         hammer2_tid_t           mtid;
89         hammer2_tid_t           saved_mirror_tid;
90         time_t                  save_time;
91         hammer2_chain_save_list_t list;
92         hammer2_dedup_t         *dedup;
93         int                     pri;
94 } hammer2_bulkfree_info_t;
95
96 static int h2_bulkfree_test(hammer2_bulkfree_info_t *info,
97                         hammer2_blockref_t *bref, int pri);
98
99 /*
100  * General bulk scan function with callback.  Called with a referenced
101  * but UNLOCKED parent.  The parent is returned in the same state.
102  */
103 static
104 int
105 hammer2_bulk_scan(hammer2_chain_t *parent,
106                   int (*func)(hammer2_bulkfree_info_t *info,
107                               hammer2_blockref_t *bref),
108                   hammer2_bulkfree_info_t *info)
109 {
110         hammer2_blockref_t bref;
111         hammer2_chain_t *chain;
112         int first = 1;
113         int rup_error;
114         int error;
115
116         ++info->pri;
117
118         hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
119                                    HAMMER2_RESOLVE_SHARED);
120         chain = NULL;
121         rup_error = 0;
122         error = 0;
123
124         /*
125          * Generally loop on the contents if we have not been flagged
126          * for abort.
127          *
128          * Remember that these chains are completely isolated from
129          * the frontend, so we can release locks temporarily without
130          * imploding.
131          */
132         for (;;) {
133                 error |= hammer2_chain_scan(parent, &chain, &bref, &first,
134                                             HAMMER2_LOOKUP_NODATA |
135                                             HAMMER2_LOOKUP_SHARED);
136
137                 /*
138                  * Handle EOF or other error at current level.  This stops
139                  * the bulkfree scan.
140                  */
141                 if (error)
142                         break;
143
144                 /*
145                  * Process bref, chain is only non-NULL if the bref
146                  * might be recursable (its possible that we sometimes get
147                  * a non-NULL chain where the bref cannot be recursed).
148                  */
149                 ++info->pri;
150                 if (h2_bulkfree_test(info, &bref, 1))
151                         continue;
152
153                 error |= func(info, &bref);
154                 if (error)
155                         break;
156
157                 /*
158                  * A non-null chain is always returned if it is
159                  * recursive, otherwise a non-null chain might be
160                  * returned but usually is not when not recursive.
161                  */
162                 if (chain == NULL)
163                         continue;
164
165                 /*
166                  * Else check type and setup depth-first scan.
167                  *
168                  * Account for bytes actually read.
169                  */
170                 info->bytes_scanned += chain->bytes;
171
172                 switch(chain->bref.type) {
173                 case HAMMER2_BREF_TYPE_INODE:
174                 case HAMMER2_BREF_TYPE_FREEMAP_NODE:
175                 case HAMMER2_BREF_TYPE_INDIRECT:
176                 case HAMMER2_BREF_TYPE_VOLUME:
177                 case HAMMER2_BREF_TYPE_FREEMAP:
178                         ++info->depth;
179                         if (info->depth > 16) {
180                                 hammer2_chain_save_t *save;
181                                 save = kmalloc(sizeof(*save), M_HAMMER2,
182                                                M_WAITOK | M_ZERO);
183                                 save->chain = chain;
184                                 hammer2_chain_ref(chain);
185                                 TAILQ_INSERT_TAIL(&info->list, save, entry);
186
187                                 /* guess */
188                                 info->pri += 10;
189                         } else {
190                                 int savepri = info->pri;
191
192                                 hammer2_chain_unlock(chain);
193                                 hammer2_chain_unlock(parent);
194                                 info->pri = 0;
195                                 rup_error |=
196                                         hammer2_bulk_scan(chain, func, info);
197                                 info->pri += savepri;
198                                 hammer2_chain_lock(parent,
199                                                    HAMMER2_RESOLVE_ALWAYS |
200                                                    HAMMER2_RESOLVE_SHARED);
201                                 hammer2_chain_lock(chain,
202                                                    HAMMER2_RESOLVE_ALWAYS |
203                                                    HAMMER2_RESOLVE_SHARED);
204                         }
205                         --info->depth;
206                         break;
207                 default:
208                         /* does not recurse */
209                         break;
210                 }
211                 if (rup_error & HAMMER2_ERROR_ABORTED)
212                         break;
213         }
214         if (chain) {
215                 hammer2_chain_unlock(chain);
216                 hammer2_chain_drop(chain);
217         }
218
219         /*
220          * Save with higher pri now that we know what it is.
221          */
222         h2_bulkfree_test(info, &parent->bref, info->pri + 1);
223
224         hammer2_chain_unlock(parent);
225
226         return ((error | rup_error) & ~HAMMER2_ERROR_EOF);
227 }
228
229 /*
230  * Bulkfree algorithm
231  *
232  * Repeat {
233  *      Chain flush (partial synchronization) XXX removed
234  *      Scan the whole topology - build in-memory freemap (mark 11)
235  *      Reconcile the in-memory freemap against the on-disk freemap.
236  *              ondisk xx -> ondisk 11 (if allocated)
237  *              ondisk 11 -> ondisk 10 (if free in-memory)
238  *              ondisk 10 -> ondisk 00 (if free in-memory) - on next pass
239  * }
240  *
241  * The topology scan may have to be performed multiple times to window
242  * freemaps which are too large to fit in kernel memory.
243  *
244  * Races are handled using a double-transition (11->10, 10->00).  The bulkfree
245  * scan snapshots the volume root's blockset and thus can run concurrent with
246  * normal operations, as long as a full flush is made between each pass to
247  * synchronize any modified chains (otherwise their blocks might be improperly
248  * freed).
249  *
250  * Temporary memory in multiples of 64KB is required to reconstruct the leaf
251  * hammer2_bmap_data blocks so they can later be compared against the live
252  * freemap.  Each 64KB block represents 128 x 16KB x 1024 = ~2 GB of storage.
253  * A 32MB save area thus represents around ~1 TB.  The temporary memory
254  * allocated can be specified.  If it is not sufficient multiple topology
255  * passes will be made.
256  */
257
258 /*
259  * Bulkfree callback info
260  */
261 static void hammer2_bulkfree_thread(void *arg __unused);
262 static void cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size);
263 static int h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo,
264                         hammer2_blockref_t *bref);
265 static int h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo);
266 static void h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
267                         hammer2_off_t data_off, hammer2_bmap_data_t *live,
268                         hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base);
269
270 void
271 hammer2_bulkfree_init(hammer2_dev_t *hmp)
272 {
273         hammer2_thr_create(&hmp->bfthr, NULL, hmp,
274                            hmp->devrepname, -1, -1,
275                            hammer2_bulkfree_thread);
276 }
277
278 void
279 hammer2_bulkfree_uninit(hammer2_dev_t *hmp)
280 {
281         hammer2_thr_delete(&hmp->bfthr);
282 }
283
284 static void
285 hammer2_bulkfree_thread(void *arg)
286 {
287         hammer2_thread_t *thr = arg;
288         hammer2_ioc_bulkfree_t bfi;
289         uint32_t flags;
290
291         for (;;) {
292                 hammer2_thr_wait_any(thr,
293                                      HAMMER2_THREAD_STOP |
294                                      HAMMER2_THREAD_FREEZE |
295                                      HAMMER2_THREAD_UNFREEZE |
296                                      HAMMER2_THREAD_REMASTER,
297                                      hz * 60);
298
299                 flags = thr->flags;
300                 cpu_ccfence();
301                 if (flags & HAMMER2_THREAD_STOP)
302                         break;
303                 if (flags & HAMMER2_THREAD_FREEZE) {
304                         hammer2_thr_signal2(thr, HAMMER2_THREAD_FROZEN,
305                                                  HAMMER2_THREAD_FREEZE);
306                         continue;
307                 }
308                 if (flags & HAMMER2_THREAD_UNFREEZE) {
309                         hammer2_thr_signal2(thr, 0,
310                                                  HAMMER2_THREAD_FROZEN |
311                                                  HAMMER2_THREAD_UNFREEZE);
312                         continue;
313                 }
314                 if (flags & HAMMER2_THREAD_FROZEN)
315                         continue;
316                 if (flags & HAMMER2_THREAD_REMASTER) {
317                         hammer2_thr_signal2(thr, 0, HAMMER2_THREAD_REMASTER);
318                         bzero(&bfi, sizeof(bfi));
319                         bfi.size = 8192 * 1024;
320                         /* hammer2_bulkfree_pass(thr->hmp, &bfi); */
321                 }
322         }
323         thr->td = NULL;
324         hammer2_thr_signal(thr, HAMMER2_THREAD_STOPPED);
325         /* structure can go invalid at this point */
326 }
327
328 int
329 hammer2_bulkfree_pass(hammer2_dev_t *hmp, hammer2_chain_t *vchain,
330                       hammer2_ioc_bulkfree_t *bfi)
331 {
332         hammer2_bulkfree_info_t cbinfo;
333         hammer2_chain_save_t *save;
334         hammer2_off_t incr;
335         size_t size;
336         int error;
337
338         /*
339          * We have to clear the live dedup cache as it might have entries
340          * that are freeable as of now.  Any new entries in the dedup cache
341          * made after this point, even if they become freeable, will have
342          * previously been fully allocated and will be protected by the
343          * 2-stage bulkfree.
344          */
345         hammer2_dedup_clear(hmp);
346
347         /*
348          * Setup for free pass
349          */
350         bzero(&cbinfo, sizeof(cbinfo));
351         size = (bfi->size + HAMMER2_FREEMAP_LEVELN_PSIZE - 1) &
352                ~(size_t)(HAMMER2_FREEMAP_LEVELN_PSIZE - 1);
353         cbinfo.hmp = hmp;
354         cbinfo.bmap = kmem_alloc_swapbacked(&cbinfo.kp, size, VM_SUBSYS_HAMMER);
355         cbinfo.saved_mirror_tid = hmp->voldata.mirror_tid;
356
357         cbinfo.dedup = kmalloc(sizeof(*cbinfo.dedup) * HAMMER2_DEDUP_HEUR_SIZE,
358                                M_HAMMER2, M_WAITOK | M_ZERO);
359
360         /*
361          * Normalize start point to a 2GB boundary.  We operate on a
362          * 64KB leaf bitmap boundary which represents 2GB of storage.
363          */
364         cbinfo.sbase = bfi->sbase;
365         if (cbinfo.sbase > hmp->voldata.volu_size)
366                 cbinfo.sbase = hmp->voldata.volu_size;
367         cbinfo.sbase &= ~HAMMER2_FREEMAP_LEVEL1_MASK;
368         TAILQ_INIT(&cbinfo.list);
369
370         /*
371          * Loop on a full meta-data scan as many times as required to
372          * get through all available storage.
373          */
374         error = 0;
375         while (cbinfo.sbase < hmp->voldata.volu_size) {
376                 /*
377                  * We have enough ram to represent (incr) bytes of storage.
378                  * Each 64KB of ram represents 2GB of storage.
379                  *
380                  * We must also clean out our de-duplication heuristic for
381                  * each (incr) bytes of storage, otherwise we wind up not
382                  * scanning meta-data for later areas of storage because
383                  * they had already been scanned in earlier areas of storage.
384                  * Since the ranging is different, we have to restart
385                  * the dedup heuristic too.
386                  */
387                 cbinfo_bmap_init(&cbinfo, size);
388                 bzero(cbinfo.dedup, sizeof(*cbinfo.dedup) *
389                                     HAMMER2_DEDUP_HEUR_SIZE);
390                 incr = size / HAMMER2_FREEMAP_LEVELN_PSIZE *
391                        HAMMER2_FREEMAP_LEVEL1_SIZE;
392                 if (hmp->voldata.volu_size - cbinfo.sbase < incr)
393                         cbinfo.sstop = hmp->voldata.volu_size;
394                 else
395                         cbinfo.sstop = cbinfo.sbase + incr;
396                 if (hammer2_debug & 1) {
397                         kprintf("bulkfree pass %016jx/%jdGB\n",
398                                 (intmax_t)cbinfo.sbase,
399                                 (intmax_t)incr / HAMMER2_FREEMAP_LEVEL1_SIZE);
400                 }
401
402                 /*
403                  * Scan topology for stuff inside this range.
404                  */
405                 hammer2_trans_init(hmp->spmp, 0);
406                 cbinfo.mtid = hammer2_trans_sub(hmp->spmp);
407                 cbinfo.pri = 0;
408                 error |= hammer2_bulk_scan(vchain, h2_bulkfree_callback,
409                                            &cbinfo);
410
411                 while ((save = TAILQ_FIRST(&cbinfo.list)) != NULL &&
412                        error == 0) {
413                         TAILQ_REMOVE(&cbinfo.list, save, entry);
414                         cbinfo.pri = 0;
415                         error |= hammer2_bulk_scan(save->chain,
416                                                      h2_bulkfree_callback,
417                                                      &cbinfo);
418                         hammer2_chain_drop(save->chain);
419                         kfree(save, M_HAMMER2);
420                 }
421                 while (save) {
422                         TAILQ_REMOVE(&cbinfo.list, save, entry);
423                         hammer2_chain_drop(save->chain);
424                         kfree(save, M_HAMMER2);
425                         save = TAILQ_FIRST(&cbinfo.list);
426                 }
427
428                 kprintf("bulkfree lastdrop %d %d error=0x%04x\n",
429                         vchain->refs, vchain->core.chain_count, error);
430
431                 /*
432                  * If complete scan succeeded we can synchronize our
433                  * in-memory freemap against live storage.  If an abort
434                  * did occur we cannot safely synchronize our partially
435                  * filled-out in-memory freemap.
436                  */
437                 if (error == 0) {
438                         error = h2_bulkfree_sync(&cbinfo);
439
440                         hammer2_voldata_lock(hmp);
441                         hammer2_voldata_modify(hmp);
442                         hmp->voldata.allocator_free += cbinfo.adj_free;
443                         hammer2_voldata_unlock(hmp);
444                 }
445
446                 /*
447                  * Cleanup for next loop.
448                  */
449                 hammer2_trans_done(hmp->spmp);
450                 if (error)
451                         break;
452                 cbinfo.sbase = cbinfo.sstop;
453                 cbinfo.adj_free = 0;
454         }
455         kmem_free_swapbacked(&cbinfo.kp);
456         kfree(cbinfo.dedup, M_HAMMER2);
457         cbinfo.dedup = NULL;
458
459         bfi->sstop = cbinfo.sbase;
460
461         incr = bfi->sstop / (hmp->voldata.volu_size / 10000);
462         if (incr > 10000)
463                 incr = 10000;
464
465         kprintf("bulkfree pass statistics (%d.%02d%% storage processed):\n",
466                 (int)incr / 100,
467                 (int)incr % 100);
468
469         if (error) {
470                 kprintf("    bulkfree was aborted\n");
471         } else {
472                 kprintf("    transition->free   %ld\n", cbinfo.count_10_00);
473                 kprintf("    transition->staged %ld\n", cbinfo.count_11_10);
474                 kprintf("    ERR(00)->allocated %ld\n", cbinfo.count_00_11);
475                 kprintf("    ERR(01)->allocated %ld\n", cbinfo.count_01_11);
476                 kprintf("    staged->allocated  %ld\n", cbinfo.count_10_11);
477                 kprintf("    ~2MB segs cleaned  %ld\n", cbinfo.count_l0cleans);
478                 kprintf("    linear adjusts     %ld\n",
479                         cbinfo.count_linadjusts);
480                 kprintf("    dedup factor       %ld\n",
481                         cbinfo.count_dedup_factor);
482         }
483
484         return error;
485 }
486
487 static void
488 cbinfo_bmap_init(hammer2_bulkfree_info_t *cbinfo, size_t size)
489 {
490         hammer2_bmap_data_t *bmap = cbinfo->bmap;
491         hammer2_key_t key = cbinfo->sbase;
492         hammer2_key_t lokey;
493         hammer2_key_t hikey;
494
495         lokey = (cbinfo->hmp->voldata.allocator_beg + HAMMER2_SEGMASK64) &
496                 ~HAMMER2_SEGMASK64;
497         hikey = cbinfo->hmp->voldata.volu_size & ~HAMMER2_SEGMASK64;
498
499         bzero(bmap, size);
500         while (size) {
501                 bzero(bmap, sizeof(*bmap));
502                 if (lokey < H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX))
503                         lokey = H2FMBASE(key, HAMMER2_FREEMAP_LEVEL1_RADIX);
504                 if (lokey < H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64)
505                         lokey = H2FMZONEBASE(key) + HAMMER2_ZONE_SEG64;
506                 if (key < lokey || key >= hikey) {
507                         memset(bmap->bitmapq, -1,
508                                sizeof(bmap->bitmapq));
509                         bmap->avail = 0;
510                         bmap->linear = HAMMER2_SEGSIZE;
511                 } else {
512                         bmap->avail = H2FMSHIFT(HAMMER2_FREEMAP_LEVEL0_RADIX);
513                 }
514                 size -= sizeof(*bmap);
515                 key += HAMMER2_FREEMAP_LEVEL0_SIZE;
516                 ++bmap;
517         }
518 }
519
520 static int
521 h2_bulkfree_callback(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref)
522 {
523         hammer2_bmap_data_t *bmap;
524         hammer2_off_t data_off;
525         uint16_t class;
526         size_t bytes;
527         int radix;
528
529         /*
530          * Check for signal and allow yield to userland during scan
531          */
532         if (hammer2_signal_check(&cbinfo->save_time))
533                 return HAMMER2_ERROR_ABORTED;
534
535         if (bref->type == HAMMER2_BREF_TYPE_INODE) {
536                 ++cbinfo->count_inodes_scanned;
537                 if ((cbinfo->count_inodes_scanned & 65535) == 0)
538                         kprintf(" inodes %6ld bytes %9ld\n",
539                                 cbinfo->count_inodes_scanned,
540                                 cbinfo->bytes_scanned);
541         }
542
543         /*
544          * Calculate the data offset and determine if it is within
545          * the current freemap range being gathered.
546          */
547         data_off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX;
548         if (data_off < cbinfo->sbase || data_off >= cbinfo->sstop)
549                 return 0;
550         if (data_off < cbinfo->hmp->voldata.allocator_beg)
551                 return 0;
552         if (data_off >= cbinfo->hmp->voldata.volu_size)
553                 return 0;
554
555         /*
556          * Calculate the information needed to generate the in-memory
557          * freemap record.
558          *
559          * Hammer2 does not allow allocations to cross the L1 (2GB) boundary,
560          * it's a problem if it does.  (Or L0 (2MB) for that matter).
561          */
562         radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
563         KKASSERT(radix != 0);
564         bytes = (size_t)1 << radix;
565         class = (bref->type << 8) | hammer2_devblkradix(radix);
566
567         if (data_off + bytes > cbinfo->sstop) {
568                 kprintf("hammer2_bulkfree_scan: illegal 2GB boundary "
569                         "%016jx %016jx/%d\n",
570                         (intmax_t)bref->data_off,
571                         (intmax_t)bref->key,
572                         bref->keybits);
573                 bytes = cbinfo->sstop - data_off;       /* XXX */
574         }
575
576         /*
577          * Convert to a storage offset relative to the beginning of the
578          * storage range we are collecting.  Then lookup the level0 bmap entry.
579          */
580         data_off -= cbinfo->sbase;
581         bmap = cbinfo->bmap + (data_off >> HAMMER2_FREEMAP_LEVEL0_RADIX);
582
583         /*
584          * Convert data_off to a bmap-relative value (~4MB storage range).
585          * Adjust linear, class, and avail.
586          *
587          * Hammer2 does not allow allocations to cross the L0 (4MB) boundary,
588          */
589         data_off &= HAMMER2_FREEMAP_LEVEL0_MASK;
590         if (data_off + bytes > HAMMER2_FREEMAP_LEVEL0_SIZE) {
591                 kprintf("hammer2_bulkfree_scan: illegal 4MB boundary "
592                         "%016jx %016jx/%d\n",
593                         (intmax_t)bref->data_off,
594                         (intmax_t)bref->key,
595                         bref->keybits);
596                 bytes = HAMMER2_FREEMAP_LEVEL0_SIZE - data_off;
597         }
598
599         if (bmap->class == 0) {
600                 bmap->class = class;
601                 bmap->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
602         }
603
604         /*
605          * NOTE: bmap->class does not have to match class.  Classification
606          *       is relaxed when free space is low, so some mixing can occur.
607          */
608 #if 0
609         /*
610          * XXX removed
611          */
612         if (bmap->class != class) {
613                 kprintf("hammer2_bulkfree_scan: illegal mixed class "
614                         "%016jx %016jx/%d (%04x vs %04x)\n",
615                         (intmax_t)bref->data_off,
616                         (intmax_t)bref->key,
617                         bref->keybits,
618                         class, bmap->class);
619         }
620 #endif
621
622         /*
623          * Just record the highest byte-granular offset for now.  Do not
624          * match against allocations which are in multiples of whole blocks.
625          *
626          * Make sure that any in-block linear offset at least covers the
627          * data range.  This can cause bmap->linear to become block-aligned.
628          */
629         if (bytes & HAMMER2_FREEMAP_BLOCK_MASK) {
630                 if (bmap->linear < (int32_t)data_off + (int32_t)bytes)
631                         bmap->linear = (int32_t)data_off + (int32_t)bytes;
632         } else if (bmap->linear >= (int32_t)data_off &&
633                    bmap->linear < (int32_t)data_off + (int32_t)bytes) {
634                 bmap->linear = (int32_t)data_off + (int32_t)bytes;
635         }
636
637         /*
638          * Adjust the hammer2_bitmap_t bitmap[HAMMER2_BMAP_ELEMENTS].
639          * 64-bit entries, 2 bits per entry, to code 11.
640          *
641          * NOTE: data_off mask to 524288, shift right by 14 (radix for 16384),
642          *       and multiply shift amount by 2 for sets of 2 bits.
643          *
644          * NOTE: The allocation can be smaller than HAMMER2_FREEMAP_BLOCK_SIZE.
645          *       also, data_off may not be FREEMAP_BLOCK_SIZE aligned.
646          */
647         while (bytes > 0) {
648                 hammer2_bitmap_t bmask;
649                 int bindex;
650
651                 bindex = (int)data_off >> (HAMMER2_FREEMAP_BLOCK_RADIX +
652                                            HAMMER2_BMAP_INDEX_RADIX);
653                 bmask = (hammer2_bitmap_t)3 <<
654                         ((((int)data_off & HAMMER2_BMAP_INDEX_MASK) >>
655                          HAMMER2_FREEMAP_BLOCK_RADIX) << 1);
656
657                 /*
658                  * NOTE! The (avail) calculation is bitmap-granular.  Multiple
659                  *       sub-granular records can wind up at the same bitmap
660                  *       position.
661                  */
662                 if ((bmap->bitmapq[bindex] & bmask) == 0) {
663                         if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE) {
664                                 bmap->avail -= HAMMER2_FREEMAP_BLOCK_SIZE;
665                         } else {
666                                 bmap->avail -= bytes;
667                         }
668                         bmap->bitmapq[bindex] |= bmask;
669                 }
670                 data_off += HAMMER2_FREEMAP_BLOCK_SIZE;
671                 if (bytes < HAMMER2_FREEMAP_BLOCK_SIZE)
672                         bytes = 0;
673                 else
674                         bytes -= HAMMER2_FREEMAP_BLOCK_SIZE;
675         }
676         return 0;
677 }
678
679 /*
680  * Synchronize the in-memory bitmap with the live freemap.  This is not a
681  * direct copy.  Instead the bitmaps must be compared:
682  *
683  *      In-memory       Live-freemap
684  *         00             11 -> 10      (do nothing if live modified)
685  *                        10 -> 00      (do nothing if live modified)
686  *         11             10 -> 11      handles race against live
687  *                        ** -> 11      nominally warn of corruption
688  * 
689  * We must also fixup the hints in HAMMER2_BREF_TYPE_FREEMAP_LEAF.
690  */
691 static int
692 h2_bulkfree_sync(hammer2_bulkfree_info_t *cbinfo)
693 {
694         hammer2_off_t data_off;
695         hammer2_key_t key;
696         hammer2_key_t key_dummy;
697         hammer2_bmap_data_t *bmap;
698         hammer2_bmap_data_t *live;
699         hammer2_chain_t *live_parent;
700         hammer2_chain_t *live_chain;
701         int bmapindex;
702         int error;
703
704         kprintf("hammer2_bulkfree - range ");
705
706         if (cbinfo->sbase < cbinfo->hmp->voldata.allocator_beg)
707                 kprintf("%016jx-",
708                         (intmax_t)cbinfo->hmp->voldata.allocator_beg);
709         else
710                 kprintf("%016jx-",
711                         (intmax_t)cbinfo->sbase);
712
713         if (cbinfo->sstop > cbinfo->hmp->voldata.volu_size)
714                 kprintf("%016jx\n",
715                         (intmax_t)cbinfo->hmp->voldata.volu_size);
716         else
717                 kprintf("%016jx\n",
718                         (intmax_t)cbinfo->sstop);
719                 
720         data_off = cbinfo->sbase;
721         bmap = cbinfo->bmap;
722
723         live_parent = &cbinfo->hmp->fchain;
724         hammer2_chain_ref(live_parent);
725         hammer2_chain_lock(live_parent, HAMMER2_RESOLVE_ALWAYS);
726         live_chain = NULL;
727         error = 0;
728
729         /*
730          * Iterate each hammer2_bmap_data_t line (128 bytes) managing
731          * 4MB of storage.
732          */
733         while (data_off < cbinfo->sstop) {
734                 /*
735                  * The freemap is not used below allocator_beg or beyond
736                  * volu_size.
737                  */
738
739                 if (data_off < cbinfo->hmp->voldata.allocator_beg)
740                         goto next;
741                 if (data_off >= cbinfo->hmp->voldata.volu_size)
742                         goto next;
743
744                 /*
745                  * Locate the freemap leaf on the live filesystem
746                  */
747                 key = (data_off & ~HAMMER2_FREEMAP_LEVEL1_MASK);
748
749                 if (live_chain == NULL || live_chain->bref.key != key) {
750                         if (live_chain) {
751                                 hammer2_chain_unlock(live_chain);
752                                 hammer2_chain_drop(live_chain);
753                         }
754                         live_chain = hammer2_chain_lookup(
755                                             &live_parent,
756                                             &key_dummy,
757                                             key,
758                                             key + HAMMER2_FREEMAP_LEVEL1_MASK,
759                                             &error,
760                                             HAMMER2_LOOKUP_ALWAYS);
761                         if (error) {
762                                 kprintf("hammer2_bulkfree: freemap lookup "
763                                         "error near %016jx, error %s\n",
764                                         (intmax_t)data_off,
765                                         hammer2_error_str(live_chain->error));
766                                 break;
767                         }
768                 }
769                 if (live_chain == NULL) {
770                         /*
771                          * XXX if we implement a full recovery mode we need
772                          * to create/recreate missing freemap chains if our
773                          * bmap has any allocated blocks.
774                          */
775                         if (bmap->class &&
776                             bmap->avail != HAMMER2_FREEMAP_LEVEL0_SIZE) {
777                                 kprintf("hammer2_bulkfree: cannot locate "
778                                         "live leaf for allocated data "
779                                         "near %016jx\n",
780                                         (intmax_t)data_off);
781                         }
782                         goto next;
783                 }
784                 if (live_chain->error) {
785                         kprintf("hammer2_bulkfree: unable to access freemap "
786                                 "near %016jx, error %s\n",
787                                 (intmax_t)data_off,
788                                 hammer2_error_str(live_chain->error));
789                         hammer2_chain_unlock(live_chain);
790                         hammer2_chain_drop(live_chain);
791                         live_chain = NULL;
792                         goto next;
793                 }
794
795                 bmapindex = (data_off & HAMMER2_FREEMAP_LEVEL1_MASK) >>
796                             HAMMER2_FREEMAP_LEVEL0_RADIX;
797                 live = &live_chain->data->bmdata[bmapindex];
798
799                 /*
800                  * Shortcut if the bitmaps match and the live linear
801                  * indicator is sane.  We can't do a perfect check of
802                  * live->linear because the only real requirement is that
803                  * if it is not block-aligned, that it not cover the space
804                  * within its current block which overlaps one of the data
805                  * ranges we scan.  We don't retain enough fine-grained
806                  * data in our scan to be able to set it exactly.
807                  *
808                  * TODO - we could shortcut this by testing that both
809                  * live->class and bmap->class are 0, and both avails are
810                  * set to HAMMER2_FREEMAP_LEVEL0_SIZE (4MB).
811                  */
812                 if (bcmp(live->bitmapq, bmap->bitmapq,
813                          sizeof(bmap->bitmapq)) == 0 &&
814                     live->linear >= bmap->linear) {
815                         goto next;
816                 }
817                 if (hammer2_debug & 1) {
818                         kprintf("live %016jx %04d.%04x (avail=%d)\n",
819                                 data_off, bmapindex, live->class, live->avail);
820                 }
821
822                 hammer2_chain_modify(live_chain, cbinfo->mtid, 0, 0);
823                 live_chain->bref.check.freemap.bigmask = -1;
824                 cbinfo->hmp->freemap_relaxed = 0;       /* reset heuristic */
825                 live = &live_chain->data->bmdata[bmapindex];
826
827                 h2_bulkfree_sync_adjust(cbinfo, data_off, live, bmap,
828                                         live_chain->bref.key +
829                                         bmapindex *
830                                         HAMMER2_FREEMAP_LEVEL0_SIZE);
831 next:
832                 data_off += HAMMER2_FREEMAP_LEVEL0_SIZE;
833                 ++bmap;
834         }
835         if (live_chain) {
836                 hammer2_chain_unlock(live_chain);
837                 hammer2_chain_drop(live_chain);
838         }
839         if (live_parent) {
840                 hammer2_chain_unlock(live_parent);
841                 hammer2_chain_drop(live_parent);
842         }
843         return error;
844 }
845
846 /*
847  * Merge the bulkfree bitmap against the existing bitmap.
848  */
849 static
850 void
851 h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo,
852                         hammer2_off_t data_off, hammer2_bmap_data_t *live,
853                         hammer2_bmap_data_t *bmap, hammer2_key_t alloc_base)
854 {
855         int bindex;
856         int scount;
857         hammer2_off_t tmp_off;
858         hammer2_bitmap_t lmask;
859         hammer2_bitmap_t mmask;
860
861         tmp_off = data_off;
862
863         for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) {
864                 lmask = live->bitmapq[bindex];  /* live */
865                 mmask = bmap->bitmapq[bindex];  /* snapshotted bulkfree */
866                 if (lmask == mmask) {
867                         tmp_off += HAMMER2_BMAP_INDEX_SIZE;
868                         continue;
869                 }
870
871                 for (scount = 0;
872                      scount < HAMMER2_BMAP_BITS_PER_ELEMENT;
873                      scount += 2) {
874                         if ((mmask & 3) == 0) {
875                                 /*
876                                  * in-memory 00         live 11 -> 10
877                                  *                      live 10 -> 00
878                                  *
879                                  * Storage might be marked allocated or
880                                  * staged and must be remarked staged or
881                                  * free.
882                                  */
883                                 switch (lmask & 3) {
884                                 case 0: /* 00 */
885                                         break;
886                                 case 1: /* 01 */
887                                         kprintf("hammer2_bulkfree: cannot "
888                                                 "transition m=00/l=01\n");
889                                         break;
890                                 case 2: /* 10 -> 00 */
891                                         live->bitmapq[bindex] &=
892                                             ~((hammer2_bitmap_t)2 << scount);
893                                         live->avail +=
894                                                 HAMMER2_FREEMAP_BLOCK_SIZE;
895                                         if (live->avail >
896                                             HAMMER2_FREEMAP_LEVEL0_SIZE) {
897                                                 live->avail =
898                                                     HAMMER2_FREEMAP_LEVEL0_SIZE;
899                                         }
900                                         cbinfo->adj_free +=
901                                                 HAMMER2_FREEMAP_BLOCK_SIZE;
902                                         ++cbinfo->count_10_00;
903                                         hammer2_io_dedup_assert(
904                                                 cbinfo->hmp,
905                                                 tmp_off |
906                                                 HAMMER2_FREEMAP_BLOCK_RADIX,
907                                                 HAMMER2_FREEMAP_BLOCK_SIZE);
908                                         break;
909                                 case 3: /* 11 -> 10 */
910                                         live->bitmapq[bindex] &=
911                                             ~((hammer2_bitmap_t)1 << scount);
912                                         ++cbinfo->count_11_10;
913                                         hammer2_io_dedup_delete(
914                                                 cbinfo->hmp,
915                                                 HAMMER2_BREF_TYPE_DATA,
916                                                 tmp_off |
917                                                 HAMMER2_FREEMAP_BLOCK_RADIX,
918                                                 HAMMER2_FREEMAP_BLOCK_SIZE);
919                                         break;
920                                 }
921                         } else if ((mmask & 3) == 3) {
922                                 /*
923                                  * in-memory 11         live 10 -> 11
924                                  *                      live ** -> 11
925                                  *
926                                  * Storage might be incorrectly marked free
927                                  * or staged and must be remarked fully
928                                  * allocated.
929                                  */
930                                 switch (lmask & 3) {
931                                 case 0: /* 00 */
932                                         ++cbinfo->count_00_11;
933                                         cbinfo->adj_free -=
934                                                 HAMMER2_FREEMAP_BLOCK_SIZE;
935                                         live->avail -=
936                                                 HAMMER2_FREEMAP_BLOCK_SIZE;
937                                         if ((int32_t)live->avail < 0)
938                                                 live->avail = 0;
939                                         break;
940                                 case 1: /* 01 */
941                                         ++cbinfo->count_01_11;
942                                         break;
943                                 case 2: /* 10 -> 11 */
944                                         ++cbinfo->count_10_11;
945                                         break;
946                                 case 3: /* 11 */
947                                         break;
948                                 }
949                                 live->bitmapq[bindex] |=
950                                         ((hammer2_bitmap_t)3 << scount);
951                         }
952                         mmask >>= 2;
953                         lmask >>= 2;
954                         tmp_off += HAMMER2_FREEMAP_BLOCK_SIZE;
955                 }
956         }
957
958         /*
959          * Determine if the live bitmap is completely free and reset its
960          * fields if so.  Otherwise check to see if we can reduce the linear
961          * offset.
962          */
963         for (bindex = HAMMER2_BMAP_ELEMENTS - 1; bindex >= 0; --bindex) {
964                 if (live->bitmapq[bindex] != 0)
965                         break;
966         }
967         if (bindex < 0) {
968                 /*
969                  * Completely empty, reset entire segment
970                  */
971 #if 0
972                 kprintf("hammer2: cleanseg %016jx.%04x (%d)\n",
973                         alloc_base, live->class, live->avail);
974 #endif
975                 live->avail = HAMMER2_FREEMAP_LEVEL0_SIZE;
976                 live->class = 0;
977                 live->linear = 0;
978                 ++cbinfo->count_l0cleans;
979         } else if (bindex < 7) {
980                 /*
981                  * Partially full, bitmapq[bindex] != 0.  The live->linear
982                  * offset can legitimately be just about anything, but
983                  * our bulkfree pass doesn't record enough information to
984                  * set it exactly.  Just make sure that it is set to a
985                  * safe value that also works in our match code above (the
986                  * bcmp and linear test).
987                  *
988                  * We cannot safely leave live->linear at a sub-block offset
989                  * unless it is already in the same block as bmap->linear.
990                  *
991                  * If it is not in the same block, we cannot assume that
992                  * we can set it to bmap->linear on a sub-block boundary,
993                  * because the live system could have bounced it around.
994                  * In that situation we satisfy our bcmp/skip requirement
995                  * above by setting it to the nearest higher block boundary.
996                  * This alignment effectively kills any partial allocation it
997                  * might have been tracking before.
998                  */
999                 if (live->linear < bmap->linear &&
1000                     ((live->linear ^ bmap->linear) &
1001                      ~HAMMER2_FREEMAP_BLOCK_MASK) == 0) {
1002                         live->linear = bmap->linear;
1003                         ++cbinfo->count_linadjusts;
1004                 } else {
1005                         live->linear =
1006                                 (bmap->linear + HAMMER2_FREEMAP_BLOCK_MASK) &
1007                                 ~HAMMER2_FREEMAP_BLOCK_MASK;
1008                         ++cbinfo->count_linadjusts;
1009                 }
1010         } else {
1011                 /*
1012                  * Completely full, effectively disable the linear iterator
1013                  */
1014                 live->linear = HAMMER2_SEGSIZE;
1015         }
1016
1017 #if 0
1018         if (bmap->class) {
1019                 kprintf("%016jx %04d.%04x (avail=%7d) "
1020                         "%08x %08x %08x %08x %08x %08x %08x %08x\n",
1021                         (intmax_t)data_off,
1022                         (int)((data_off &
1023                                HAMMER2_FREEMAP_LEVEL1_MASK) >>
1024                               HAMMER2_FREEMAP_LEVEL0_RADIX),
1025                         bmap->class,
1026                         bmap->avail,
1027                         bmap->bitmap[0], bmap->bitmap[1],
1028                         bmap->bitmap[2], bmap->bitmap[3],
1029                         bmap->bitmap[4], bmap->bitmap[5],
1030                         bmap->bitmap[6], bmap->bitmap[7]);
1031         }
1032 #endif
1033 }
1034
1035 /*
1036  * BULKFREE DEDUP HEURISTIC
1037  *
1038  * WARNING! This code is SMP safe but the heuristic allows SMP collisions.
1039  *          All fields must be loaded into locals and validated.
1040  */
1041 static
1042 int
1043 h2_bulkfree_test(hammer2_bulkfree_info_t *cbinfo, hammer2_blockref_t *bref,
1044                  int pri)
1045 {
1046         hammer2_dedup_t *dedup;
1047         int best;
1048         int n;
1049         int i;
1050
1051         n = hammer2_icrc32(&bref->data_off, sizeof(bref->data_off));
1052         dedup = cbinfo->dedup + (n & (HAMMER2_DEDUP_HEUR_MASK & ~7));
1053
1054         for (i = best = 0; i < 8; ++i) {
1055                 if (dedup[i].data_off == bref->data_off) {
1056                         if (dedup[i].ticks < pri)
1057                                 dedup[i].ticks = pri;
1058                         if (pri == 1)
1059                                 cbinfo->count_dedup_factor += dedup[i].ticks;
1060                         return 1;
1061                 }
1062                 if (dedup[i].ticks < dedup[best].ticks)
1063                         best = i;
1064         }
1065         dedup[best].data_off = bref->data_off;
1066         dedup[best].ticks = pri;
1067
1068         return 0;
1069 }