HAMMER 54B/Many: Performance tuning.
[dragonfly.git] sys/vfs/hammer/hammer_blockmap.c
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_blockmap.c,v 1.18 2008/06/12 00:16:10 dillon Exp $
35  */
36
37 /*
38  * HAMMER blockmap
39  */
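/*
 * Overview (derived from hammer_blockmap_lookup() below): a zone-X blockmap
 * offset is translated to a physical (zone-2) offset in two steps:
 *
 *   layer1 entry @ rootmap->phys_offset + HAMMER_BLOCKMAP_LAYER1_OFFSET(off)
 *   layer2 entry @ layer1->phys_offset  + HAMMER_BLOCKMAP_LAYER2_OFFSET(off)
 *   result       = layer2->u.phys_offset + (off & HAMMER_LARGEBLOCK_MASK64)
 *
 * Each layer2 entry describes a single large-block and tracks the free
 * space remaining in it via layer2->bytes_free.
 */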
40 #include "hammer.h"
41
42 static hammer_off_t hammer_find_hole(hammer_mount_t hmp,
43                                    hammer_holes_t holes, int bytes);
44 static void hammer_add_hole(hammer_mount_t hmp, hammer_holes_t holes,
45                                    hammer_off_t zone_offset, int bytes);
46 static void hammer_clean_holes(hammer_mount_t hmp, hammer_holes_t holes,
47                                    hammer_off_t base_offset);
48 static int hammer_res_rb_compare(hammer_reserve_t res1, hammer_reserve_t res2);
49
50 /*
51  * Reserved big-blocks red-black tree support
52  */
53 RB_GENERATE2(hammer_res_rb_tree, hammer_reserve, rb_node,
54              hammer_res_rb_compare, hammer_off_t, zone_offset);
55
56 static int
57 hammer_res_rb_compare(hammer_reserve_t res1, hammer_reserve_t res2)
58 {
59         if (res1->zone_offset < res2->zone_offset)
60                 return(-1);
61         if (res1->zone_offset > res2->zone_offset)
62                 return(1);
63         return(0);
64 }
65
66 /*
67  * Allocate a big-block from the freemap and stuff it into the blockmap
68  * at layer1/layer2.
69  */
70 static void
71 hammer_blockmap_llalloc(hammer_transaction_t trans,
72                 hammer_off_t zone_offset, int *errorp,
73                 hammer_buffer_t buffer1, hammer_blockmap_layer1_t layer1,
74                 hammer_buffer_t buffer2, hammer_blockmap_layer2_t layer2)
75 {
76         hammer_off_t zone2_offset;
77
78         zone2_offset = hammer_freemap_alloc(trans, zone_offset, errorp);
79         if (*errorp)
80                 return;
81         hammer_modify_buffer(trans, buffer1, layer1, sizeof(*layer1));
82         KKASSERT(layer1->blocks_free);
83         --layer1->blocks_free;
84         layer1->layer1_crc = crc32(layer1, HAMMER_LAYER1_CRCSIZE);
85         hammer_modify_buffer_done(buffer1);
86         hammer_modify_buffer(trans, buffer2, layer2, sizeof(*layer2));
87         bzero(layer2, sizeof(*layer2));
88         layer2->u.phys_offset = zone2_offset;
89         layer2->bytes_free = HAMMER_LARGEBLOCK_SIZE;
90         layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE);
91         hammer_modify_buffer_done(buffer2);
92 }
93
94
95 /*
96  * Allocate bytes from a zone
97  */
98 hammer_off_t
99 hammer_blockmap_alloc(hammer_transaction_t trans, int zone,
100                       int bytes, int *errorp)
101 {
102         hammer_mount_t hmp;
103         hammer_volume_t root_volume;
104         hammer_blockmap_t rootmap;
105         hammer_reserve_t resv;
106         struct hammer_blockmap_layer1 *layer1;
107         struct hammer_blockmap_layer2 *layer2;
108         hammer_buffer_t buffer1 = NULL;
109         hammer_buffer_t buffer2 = NULL;
110         hammer_buffer_t buffer3 = NULL;
111         hammer_off_t tmp_offset;
112         hammer_off_t next_offset;
113         hammer_off_t result_offset;
114         hammer_off_t layer1_offset;
115         hammer_off_t layer2_offset;
116         int loops = 0;
117         int skip_amount;
118
119         hmp = trans->hmp;
120
121         /*
122          * Deal with alignment and buffer-boundary issues.
123          *
124          * Be careful, certain primary alignments are used below to allocate
125          * new blockmap blocks.
126          */
127         bytes = (bytes + 15) & ~15;
128         KKASSERT(bytes > 0 && bytes <= HAMMER_BUFSIZE);
129         KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
130
131         /*
132          * Try to use a known-free hole.
133          */
134         result_offset = hammer_find_hole(hmp, &trans->hmp->holes[zone], bytes);
135         if (result_offset) {
136                 *errorp = 0;
137                 hammer_blockmap_free(trans, result_offset, -bytes);
138                 return(result_offset);
139         }
140
141         /*
142          * Otherwise scan for space
143          */
144         root_volume = hammer_get_root_volume(hmp, errorp);
145         if (*errorp)
146                 return(0);
147         rootmap = &hmp->blockmap[zone];
148         KKASSERT(rootmap->phys_offset != 0);
149         KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
150                  HAMMER_ZONE_RAW_BUFFER_INDEX);
151         KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
152         KKASSERT(HAMMER_ZONE_DECODE(rootmap->next_offset) == zone);
153
154         hammer_lock_ex(&hmp->blkmap_lock);
155         next_offset = rootmap->next_offset;
156 again:
157         /*
158          * The allocation request may not cross a buffer boundary.
159          */
160         tmp_offset = next_offset + bytes - 1;
161         if ((next_offset ^ tmp_offset) & ~HAMMER_BUFMASK64) {
162                 skip_amount = HAMMER_BUFSIZE - 
163                               ((int)next_offset & HAMMER_BUFMASK);
164                 hammer_add_hole(hmp, &hmp->holes[zone],
165                                 next_offset, skip_amount);
166                 next_offset = tmp_offset & ~HAMMER_BUFMASK64;
167         }
168
169         /*
170          * Dive layer 1.  If we are starting a new layer 1 entry,
171          * allocate a layer 2 block for it.
172          */
173         layer1_offset = rootmap->phys_offset +
174                         HAMMER_BLOCKMAP_LAYER1_OFFSET(next_offset);
175         layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer1);
176         KKASSERT(*errorp == 0);
177         KKASSERT(next_offset <= rootmap->alloc_offset);
178
179         /*
180          * Check CRC if not allocating into uninitialized space
181          */
182         if ((next_offset != rootmap->alloc_offset) ||
183             (next_offset & HAMMER_BLOCKMAP_LAYER2_MASK)) {
184                 if (layer1->layer1_crc != crc32(layer1,
185                                                 HAMMER_LAYER1_CRCSIZE)) {
186                         Debugger("CRC FAILED: LAYER1");
187                 }
188         }
189
190         /*
191          * Allocate layer2 backing store in layer1 if necessary.  next_offset
192          * can skip to a bigblock boundary but alloc_offset is at least
193          * bigblock-aligned so that's ok.
194          */
195         if ((next_offset == rootmap->alloc_offset &&
196             (next_offset & HAMMER_BLOCKMAP_LAYER2_MASK) == 0) ||
197             layer1->phys_offset == HAMMER_BLOCKMAP_FREE
198         ) {
199                 KKASSERT((next_offset & HAMMER_BLOCKMAP_LAYER2_MASK) == 0);
200                 hammer_modify_buffer(trans, buffer1, layer1, sizeof(*layer1));
201                 bzero(layer1, sizeof(*layer1));
202                 layer1->phys_offset =
203                         hammer_freemap_alloc(trans, next_offset, errorp);
204                 layer1->blocks_free = HAMMER_BLOCKMAP_RADIX2;
205                 layer1->layer1_crc = crc32(layer1, HAMMER_LAYER1_CRCSIZE);
206                 hammer_modify_buffer_done(buffer1);
207                 KKASSERT(*errorp == 0);
208         }
209         KKASSERT(layer1->phys_offset);
210
211         /*
212          * If layer1 indicates no free blocks in layer2 and our alloc_offset
213          * is not in layer2, skip layer2 entirely.
214          */
215         if (layer1->blocks_free == 0 &&
216             ((next_offset ^ rootmap->alloc_offset) & ~HAMMER_BLOCKMAP_LAYER2_MASK) != 0) {
217                 next_offset = (next_offset + HAMMER_BLOCKMAP_LAYER2_MASK) &
218                               ~HAMMER_BLOCKMAP_LAYER2_MASK;
219                 if (next_offset >= hmp->zone_limits[zone]) {
220                         hkprintf("blockmap wrap1\n");
221                         next_offset = HAMMER_ZONE_ENCODE(zone, 0);
222                         if (++loops == 2) {     /* XXX poor-man's */
223                                 result_offset = 0;
224                                 *errorp = ENOSPC;
225                                 goto done;
226                         }
227                 }
228                 goto again;
229         }
230
231         /*
232          * Dive layer 2, each entry represents a large-block.
233          */
234         layer2_offset = layer1->phys_offset +
235                         HAMMER_BLOCKMAP_LAYER2_OFFSET(next_offset);
236         layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer2);
237         KKASSERT(*errorp == 0);
238
239         /*
240          * Check CRC if not allocating into uninitialized space
241          */
242         if (next_offset != rootmap->alloc_offset ||
243             (next_offset & HAMMER_LARGEBLOCK_MASK64)) {
244                 if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
245                         Debugger("CRC FAILED: LAYER2");
246                 }
247         }
248
249         if ((next_offset & HAMMER_LARGEBLOCK_MASK64) == 0) {
250                 /*
251                  * We are at the beginning of a new bigblock
252                  */
253                 resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
254                                  next_offset & ~HAMMER_LARGEBLOCK_MASK64);
255
256                 if (resv) {
257                         goto skip;
258                 } else if (next_offset == rootmap->alloc_offset ||
259                            layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) {
260                         /*
261                          * Allocate the bigblock in layer2 if diving into
262                          * uninitialized space or if the block was previously
263                          * freed.
264                          */
265                         hammer_blockmap_llalloc(trans,
266                                                 next_offset, errorp,
267                                                 buffer1, layer1,
268                                                 buffer2, layer2);
269                         KKASSERT(layer2->u.phys_offset != HAMMER_BLOCKMAP_FREE);
270                 } else if (layer2->bytes_free != HAMMER_LARGEBLOCK_SIZE) {
271                         /*
272                          * We have encountered a block that is already
273                          * partially allocated.  We must skip this block.
274                          */
275 skip:
276                         next_offset += HAMMER_LARGEBLOCK_SIZE;
277                         if (next_offset >= trans->hmp->zone_limits[zone]) {
278                                 next_offset = HAMMER_ZONE_ENCODE(zone, 0);
279                                 hkprintf("blockmap wrap2\n");
280                                 if (++loops == 2) {     /* XXX poor-man's */
281                                         result_offset = 0;
282                                         *errorp = ENOSPC;
283                                         goto done;
284                                 }
285                         }
286                         goto again;
287                 }
288         } else {
289                 /*
290                  * We are appending within a bigblock.  It is possible that
291                  * the blockmap has been marked completely free via a prior
292                  * pruning operation.  We no longer reset the append index
293                  * for that case because it compromises the UNDO by allowing
294                  * data overwrites.
295                  */
296                 /*
297                 KKASSERT(layer2->u.phys_offset != HAMMER_BLOCKMAP_FREE);
298                 */
299         }
300
301         hammer_modify_buffer(trans, buffer2, layer2, sizeof(*layer2));
302         layer2->bytes_free -= bytes;
303         layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE);
304         hammer_modify_buffer_done(buffer2);
305         KKASSERT(layer2->bytes_free >= 0);
306
307         /*
308          * If we are allocating from the base of a new buffer we can avoid
309          * a disk read by calling hammer_bnew().
310          */
311         if ((next_offset & HAMMER_BUFMASK) == 0) {
312                 hammer_bnew(trans->hmp, next_offset, errorp, &buffer3);
313         }
314         result_offset = next_offset;
315
316         /*
317          * Process allocated result_offset
318          */
319 done:
320         hammer_modify_volume(NULL, root_volume, NULL, 0);
321         if (result_offset) {
322                 if (result_offset == next_offset) {
323                         rootmap->next_offset = next_offset + bytes;
324                 } else {
325                         rootmap->next_offset = next_offset;
326                 }
327         } else {
328                 rootmap->next_offset = next_offset;
329         }
330         if (rootmap->alloc_offset < rootmap->next_offset) {
331                 rootmap->alloc_offset =
332                     (rootmap->next_offset + HAMMER_LARGEBLOCK_MASK) &
333                     ~HAMMER_LARGEBLOCK_MASK64;
334         }
335         hammer_modify_volume_done(root_volume);
336         hammer_unlock(&hmp->blkmap_lock);
337
338         /*
339          * Cleanup
340          */
341         if (buffer1)
342                 hammer_rel_buffer(buffer1, 0);
343         if (buffer2)
344                 hammer_rel_buffer(buffer2, 0);
345         if (buffer3)
346                 hammer_rel_buffer(buffer3, 0);
347         hammer_rel_volume(root_volume, 0);
348
349         return(result_offset);
350 }
351
352 /*
353  * Front-end blockmap reservation
354  *
355  * This code reserves bytes out of a blockmap without committing to any
356  * meta-data modifications, allowing the front-end to issue disk write I/O
357  * for large blocks of data without having to queue the BIOs to the back-end.
358  * If the reservation winds up not being used, for example due to a crash,
359  * the reblocker should eventually come along and clean it up.
360  *
361  * This code will attempt to assign free big-blocks to the blockmap to
362  * accommodate the request.
363  *
364  * If we return NULL, a reservation was not possible and the caller must queue
365  * the I/O to the backend.
366  */
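/*
 * Note (derived from the code below): reservations are tracked per big-block
 * in hmp->rb_resv_root and are reference counted.  Cached holes created by
 * hammer_add_hole() also hold a reference on the covering big-block's
 * reservation, so the block cannot be freed or reused while holes into it
 * are cached.  References are dropped with hammer_blockmap_reserve_complete(),
 * which destroys the structure when the count reaches zero.
 */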
367 hammer_reserve_t
368 hammer_blockmap_reserve(hammer_mount_t hmp, int zone, int bytes,
369                         hammer_off_t *zone_offp, int *errorp)
370 {
371         hammer_volume_t root_volume;
372         hammer_blockmap_t rootmap;
373         struct hammer_blockmap_layer1 *layer1;
374         struct hammer_blockmap_layer2 *layer2;
375         hammer_buffer_t buffer1 = NULL;
376         hammer_buffer_t buffer2 = NULL;
377         hammer_buffer_t buffer3 = NULL;
378         hammer_off_t tmp_offset;
379         hammer_off_t next_offset;
380         hammer_off_t layer1_offset;
381         hammer_off_t layer2_offset;
382         hammer_reserve_t resv;
383         int loops = 0;
384         int skip_amount;
385
386         /*
387          * Setup
388          */
389         KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
390         root_volume = hammer_get_root_volume(hmp, errorp);
391         if (*errorp)
392                 return(NULL);
393         rootmap = &hmp->blockmap[zone];
394         KKASSERT(rootmap->phys_offset != 0);
395         KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
396                  HAMMER_ZONE_RAW_BUFFER_INDEX);
397         KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
398         KKASSERT(HAMMER_ZONE_DECODE(rootmap->next_offset) == zone);
399
400         /*
401          * Deal with alignment and buffer-boundary issues.
402          *
403          * Be careful, certain primary alignments are used below to allocate
404          * new blockmap blocks.
405          */
406         bytes = (bytes + 15) & ~15;
407         KKASSERT(bytes > 0 && bytes <= HAMMER_BUFSIZE);
408
409         hammer_lock_ex(&hmp->blkmap_lock);
410
411         /*
412          * Starting zoneX offset.  The reservation code always wraps at the
413          * alloc_offset (the allocation code is allowed to go through to the
414          * limit).
415          */
416         next_offset = rootmap->next_offset;
417 again:
418         resv = NULL;
419         if (next_offset >= rootmap->alloc_offset) {
420                 if (++loops == 2) {     /* XXX poor-man's */
421                         *errorp = ENOSPC;
422                         goto done;
423                 }
424                 next_offset = HAMMER_ZONE_ENCODE(zone, 0);
425         }
426
427         /*
428          * The allocation request may not cross a buffer boundary.
429          */
430         tmp_offset = next_offset + bytes - 1;
431         if ((next_offset ^ tmp_offset) & ~HAMMER_BUFMASK64) {
432                 skip_amount = HAMMER_BUFSIZE - 
433                               ((int)next_offset & HAMMER_BUFMASK);
434                 hammer_add_hole(hmp, &hmp->holes[zone],
435                                 next_offset, skip_amount);
436                 next_offset = tmp_offset & ~HAMMER_BUFMASK64;
437         }
438
439         /*
440          * Dive layer 1.
441          */
442         layer1_offset = rootmap->phys_offset +
443                         HAMMER_BLOCKMAP_LAYER1_OFFSET(next_offset);
444         layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer1);
445         KKASSERT(*errorp == 0);
446         KKASSERT(next_offset <= rootmap->alloc_offset);
447
448         /*
449          * Check CRC (we never dive into uninitialized space when reserving)
450          */
451         if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
452                 Debugger("CRC FAILED: LAYER1");
453         }
454         KKASSERT(layer1->phys_offset);
455
456         /*
457          * If layer1 indicates no free blocks in layer2 and our alloc_offset
458          * is not in layer2, skip layer2 entirely.
459          */
460         if (layer1->blocks_free == 0 &&
461             ((next_offset ^ rootmap->alloc_offset) & ~HAMMER_BLOCKMAP_LAYER2_MASK) != 0) {
462                 next_offset = (next_offset + HAMMER_BLOCKMAP_LAYER2_MASK) &
463                               ~HAMMER_BLOCKMAP_LAYER2_MASK;
464                 goto again;
465         }
466
467         /*
468          * Dive layer 2, each entry represents a large-block.
469          */
470         layer2_offset = layer1->phys_offset +
471                         HAMMER_BLOCKMAP_LAYER2_OFFSET(next_offset);
472         layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer2);
473         KKASSERT(*errorp == 0);
474
475         /*
476          * Check CRC if not allocating into uninitialized space (which we
477          * aren't when reserving space).
478          */
479         if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
480                 Debugger("CRC FAILED: LAYER2");
481         }
482
483         /*
484          * Acquire the related reservation structure.  If it exists we can
485          * only use the bigblock if our current next_offset is already in
486          * it.
487          */
488         resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
489                          next_offset & ~HAMMER_LARGEBLOCK_MASK64);
490
491         if ((next_offset & HAMMER_LARGEBLOCK_MASK64) == 0) {
492                 /*
493                  * We are at the beginning of a new bigblock.
494                  *
495                  * (1) If the bigblock has already been reserved do not
496                  *     try to use it, skip it.
497                  *
498                  * (2) If the bigblock has not been allocated then allocate
499                  *     it.
500                  *
501                  * (3) If the bigblock is not completely free we have no
502                  *     visibility into what portions may have been allocated,
503                  *     so skip it.
504                  */
505
506                 if (resv) {
507                         next_offset += HAMMER_LARGEBLOCK_SIZE;
508                         goto again;
509                 } else if (layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) {
510                         struct hammer_transaction trans;
511
512                         hammer_start_transaction(&trans, hmp);
513                         if (hammer_sync_lock_sh_try(&trans) == 0) {
514                                 hammer_blockmap_llalloc(&trans,
515                                                         next_offset, errorp,
516                                                         buffer1, layer1,
517                                                         buffer2, layer2);
518                                 hammer_sync_unlock(&trans);
519                         } else {
520                                 hammer_sync_lock_sh(&trans);
521                                 hammer_blockmap_llalloc(&trans,
522                                                         next_offset, errorp,
523                                                         buffer1, layer1,
524                                                         buffer2, layer2);
525                                 hammer_sync_unlock(&trans);
526                                 /* *errorp = EDEADLK; */
527                         }
528                         hammer_done_transaction(&trans);
529                         if (layer2->u.phys_offset == HAMMER_BLOCKMAP_FREE) {
530                                 resv = NULL;
531                                 goto done;
532                         }
533                 } else if (layer2->bytes_free != HAMMER_LARGEBLOCK_SIZE) {
534                         /*
535                          * We have encountered a block that is already
536                          * partially allocated.  We must skip this block.
537                          */
538                         next_offset += HAMMER_LARGEBLOCK_SIZE;
539                         goto again;
540                 }
541         } else {
542                 /*
543                  * We are appending within a bigblock.  It is possible that
544                  * the blockmap has been marked completely free via a prior
545                  * pruning operation.  We no longer reset the append index
546                  * for that case because it compromises the UNDO by allowing
547                  * data overwrites.
548                  */
549                 KKASSERT(layer2->u.phys_offset != HAMMER_BLOCKMAP_FREE);
550                 KKASSERT(layer2->bytes_free >= HAMMER_LARGEBLOCK_SIZE - (int)(next_offset & HAMMER_LARGEBLOCK_MASK64));
551         }
552
553         /*
554          * The reservation code does not modify layer2->bytes_free, it
555          * simply adjusts next_offset.
556          */
557         KKASSERT(layer2->bytes_free >= 0);
558
559         /*
560          * If we are not reserving a whole buffer but are at the start of
561          * a new block, call hammer_bnew() to avoid a disk read.
562          *
563          * If we are reserving a whole buffer the caller will probably use
564          * a direct read, so do nothing.
565          */
566         if (bytes < HAMMER_BUFSIZE && (next_offset & HAMMER_BUFMASK) == 0) {
567                 hammer_bnew(hmp, next_offset, errorp, &buffer3);
568         }
569
570         /*
571          * Make the reservation
572          */
573         if (resv) {
574                 ++resv->refs;
575         } else {
576                 resv = kmalloc(sizeof(*resv), M_HAMMER, M_WAITOK|M_ZERO);
577                 resv->refs = 1;
578                 resv->zone_offset = next_offset & ~HAMMER_LARGEBLOCK_MASK64;
579                 RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resv);
580                 ++hammer_count_reservations;
581         }
582
583         /*
584          * Adjust our iterator and alloc_offset.  The layer1 and layer2
585          * space beyond alloc_offset is uninitialized.  alloc_offset must
586          * be big-block aligned.
587          */
588 done:
589         if (resv) {
590                 hammer_modify_volume(NULL, root_volume, NULL, 0);
591                 rootmap->next_offset = next_offset + bytes;
592                 hammer_modify_volume_done(root_volume);
593         } else if (rootmap->next_offset != next_offset) {
594                 hammer_modify_volume(NULL, root_volume, NULL, 0);
595                 rootmap->next_offset = next_offset;
596                 hammer_modify_volume_done(root_volume);
597         }
598
599         if (buffer1)
600                 hammer_rel_buffer(buffer1, 0);
601         if (buffer2)
602                 hammer_rel_buffer(buffer2, 0);
603         if (buffer3)
604                 hammer_rel_buffer(buffer3, 0);
605         hammer_rel_volume(root_volume, 0);
606         hammer_unlock(&hmp->blkmap_lock);
607         *zone_offp = next_offset;
608
609         return(resv);
610 }
611
612 /*
613  * A record with a storage reservation calls this function when it is
614  * being freed.  The storage may or may not have actually been allocated.
615  */
616 void
617 hammer_blockmap_reserve_complete(hammer_mount_t hmp, hammer_reserve_t resv)
618 {
619         KKASSERT(resv->refs > 0);
620         if (--resv->refs == 0) {
621                 RB_REMOVE(hammer_res_rb_tree, &hmp->rb_resv_root, resv);
622                 kfree(resv, M_HAMMER);
623                 --hammer_count_reservations;
624         }
625 }
626
627 /*
628  * Free (offset,bytes) in a zone.
629  *
630  * If bytes is negative we are actually allocating previously reserved
631  * space in the zone.
632  */
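/*
 * For example, hammer_blockmap_alloc() calls this with a negative byte
 * count when it satisfies a request from a cached hole; the negative
 * count reduces layer2->bytes_free for the big-block containing the hole.
 */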
633 void
634 hammer_blockmap_free(hammer_transaction_t trans,
635                      hammer_off_t bmap_off, int bytes)
636 {
637         hammer_mount_t hmp;
638         hammer_volume_t root_volume;
639         hammer_reserve_t resv;
640         hammer_blockmap_t rootmap;
641         struct hammer_blockmap_layer1 *layer1;
642         struct hammer_blockmap_layer2 *layer2;
643         hammer_buffer_t buffer1 = NULL;
644         hammer_buffer_t buffer2 = NULL;
645         hammer_off_t layer1_offset;
646         hammer_off_t layer2_offset;
647         int error;
648         int zone;
649
650         hmp = trans->hmp;
651
652         if (bytes >= 0) {
653                 bytes = (bytes + 15) & ~15;
654                 KKASSERT(bytes <= HAMMER_BUFSIZE);
655                 KKASSERT(((bmap_off ^ (bmap_off + (bytes - 1))) & 
656                           ~HAMMER_LARGEBLOCK_MASK64) == 0);
657         } else {
658                 bytes = -((-bytes + 15) & ~15);
659                 KKASSERT(bytes >= -HAMMER_BUFSIZE);
660         }
661         zone = HAMMER_ZONE_DECODE(bmap_off);
662         KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
663         root_volume = hammer_get_root_volume(hmp, &error);
664         if (error)
665                 return;
666
667         hammer_lock_ex(&hmp->blkmap_lock);
668
669         rootmap = &hmp->blockmap[zone];
670         KKASSERT(rootmap->phys_offset != 0);
671         KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
672                  HAMMER_ZONE_RAW_BUFFER_INDEX);
673         KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
674
675         if (bmap_off >= rootmap->alloc_offset) {
676                 panic("hammer_blockmap_free: %016llx beyond EOF %016llx",
677                       bmap_off, rootmap->alloc_offset);
678                 goto done;
679         }
680
681         /*
682          * Dive layer 1.
683          */
684         layer1_offset = rootmap->phys_offset +
685                         HAMMER_BLOCKMAP_LAYER1_OFFSET(bmap_off);
686         layer1 = hammer_bread(hmp, layer1_offset, &error, &buffer1);
687         KKASSERT(error == 0);
688         KKASSERT(layer1->phys_offset);
689         if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
690                 Debugger("CRC FAILED: LAYER1");
691         }
692
693         /*
694          * Dive layer 2, each entry represents a large-block.
695          */
696         layer2_offset = layer1->phys_offset +
697                         HAMMER_BLOCKMAP_LAYER2_OFFSET(bmap_off);
698         layer2 = hammer_bread(hmp, layer2_offset, &error, &buffer2);
699         KKASSERT(error == 0);
700         KKASSERT(layer2->u.phys_offset);
701         if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
702                 Debugger("CRC FAILED: LAYER2");
703         }
704
705         hammer_modify_buffer(trans, buffer2, layer2, sizeof(*layer2));
706         layer2->bytes_free += bytes;
707         KKASSERT(layer2->bytes_free <= HAMMER_LARGEBLOCK_SIZE);
708
709         /*
710          * If the big-block is free, return it to the free pool.  The layer2
711          * infrastructure is left intact even if the entire layer2 becomes
712          * free.
713          *
714          * At the moment if our iterator is in a bigblock that becomes
715          * wholly free, we have to leave the block allocated and we cannot
716          * reset the iterator because there may be UNDOs on-disk that
717          * reference areas of that block and we cannot overwrite those areas.
718          */
719         if (layer2->bytes_free == HAMMER_LARGEBLOCK_SIZE) {
720                 hammer_off_t base_off;
721
722                 base_off = bmap_off & ~HAMMER_LARGEBLOCK_MASK64;
723                 resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root,
724                                  base_off);
725
726                 if (resv) {
727                         /*
728                          * Portions of this block have been reserved, do
729                          * not free it.
730                          */
731                 } else if ((rootmap->next_offset ^ bmap_off) &
732                             ~HAMMER_LARGEBLOCK_MASK64) {
733                         /*
734                          * Our iterator is not in the now-free big-block
735                          * and we can release it.
736                          */
737                         hammer_clean_holes(hmp, &trans->hmp->holes[zone],
738                                            base_off);
739                         hammer_del_buffers(hmp, base_off,
740                                            layer2->u.phys_offset,
741                                            HAMMER_LARGEBLOCK_SIZE);
742                         hammer_freemap_free(trans, layer2->u.phys_offset,
743                                             bmap_off, &error);
744                         layer2->u.phys_offset = HAMMER_BLOCKMAP_FREE;
745
746                         hammer_modify_buffer(trans, buffer1,
747                                              layer1, sizeof(*layer1));
748                         ++layer1->blocks_free;
749 #if 0
750                         /*
751                          * This commented out code would release the layer2
752                          * bigblock.  We do not want to do this, at least
753                          * not right now.
754                          *
755                          * This also may be incomplete.
756                          */
757                         if (layer1->blocks_free == HAMMER_BLOCKMAP_RADIX2) {
758                                 hammer_freemap_free(
759                                         trans, layer1->phys_offset,
760                                         bmap_off & ~HAMMER_BLOCKMAP_LAYER2_MASK,
761                                         &error);
762                                 layer1->phys_offset = HAMMER_BLOCKMAP_FREE;
763                         }
764 #endif
765                         layer1->layer1_crc = crc32(layer1,
766                                                    HAMMER_LAYER1_CRCSIZE);
767                         hammer_modify_buffer_done(buffer1);
768                 } else {
769 #if 0
770                         /*
771                          * This commented out code would reset the iterator,
772                          * which we cannot do at the moment as it could cause
773                          * new allocations to overwrite deleted data still
774                          * subject to undo on reboot.
775                          */
776                         hammer_modify_volume(trans, root_volume,
777                                              NULL, 0);
778                         rootmap->next_offset &= ~HAMMER_LARGEBLOCK_MASK64;
779                         hammer_modify_volume_done(root_volume);
780 #endif
781                 }
782         }
783         layer2->entry_crc = crc32(layer2, HAMMER_LAYER2_CRCSIZE);
784         hammer_modify_buffer_done(buffer2);
785 done:
786         hammer_unlock(&hmp->blkmap_lock);
787
788         if (buffer1)
789                 hammer_rel_buffer(buffer1, 0);
790         if (buffer2)
791                 hammer_rel_buffer(buffer2, 0);
792         hammer_rel_volume(root_volume, 0);
793 }
794
795 /*
796  * Return the number of free bytes in the big-block containing the
797  * specified blockmap offset.
798  */
799 int
800 hammer_blockmap_getfree(hammer_mount_t hmp, hammer_off_t bmap_off,
801                         int *curp, int *errorp)
802 {
803         hammer_volume_t root_volume;
804         hammer_blockmap_t rootmap;
805         struct hammer_blockmap_layer1 *layer1;
806         struct hammer_blockmap_layer2 *layer2;
807         hammer_buffer_t buffer = NULL;
808         hammer_off_t layer1_offset;
809         hammer_off_t layer2_offset;
810         int bytes;
811         int zone;
812
813         zone = HAMMER_ZONE_DECODE(bmap_off);
814         KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
815         root_volume = hammer_get_root_volume(hmp, errorp);
816         if (*errorp) {
817                 *curp = 0;
818                 return(0);
819         }
820         rootmap = &hmp->blockmap[zone];
821         KKASSERT(rootmap->phys_offset != 0);
822         KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
823                  HAMMER_ZONE_RAW_BUFFER_INDEX);
824         KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
825
826         if (bmap_off >= rootmap->alloc_offset) {
827                 panic("hammer_blockmap_getfree: %016llx beyond EOF %016llx",
828                       bmap_off, rootmap->alloc_offset);
829                 bytes = 0;
830                 *curp = 0;
831                 goto done;
832         }
833
834         /*
835          * Dive layer 1.
836          */
837         layer1_offset = rootmap->phys_offset +
838                         HAMMER_BLOCKMAP_LAYER1_OFFSET(bmap_off);
839         layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer);
840         KKASSERT(*errorp == 0);
841         KKASSERT(layer1->phys_offset);
842         if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
843                 Debugger("CRC FAILED: LAYER1");
844         }
845
846         /*
847          * Dive layer 2, each entry represents a large-block.
848          */
849         layer2_offset = layer1->phys_offset +
850                         HAMMER_BLOCKMAP_LAYER2_OFFSET(bmap_off);
851         layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer);
852         KKASSERT(*errorp == 0);
853         KKASSERT(layer2->u.phys_offset);
854         if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
855                 Debugger("CRC FAILED: LAYER2");
856         }
857
858         bytes = layer2->bytes_free;
859
860         if ((rootmap->next_offset ^ bmap_off) & ~HAMMER_LARGEBLOCK_MASK64)
861                 *curp = 0;
862         else
863                 *curp = 1;
864 done:
865         if (buffer)
866                 hammer_rel_buffer(buffer, 0);
867         hammer_rel_volume(root_volume, 0);
868         if (hammer_debug_general & 0x0800) {
869                 kprintf("hammer_blockmap_getfree: %016llx -> %d\n",
870                         bmap_off, bytes);
871         }
872         return(bytes);
873 }
874
875
876 /*
877  * Lookup a blockmap offset.
878  */
879 hammer_off_t
880 hammer_blockmap_lookup(hammer_mount_t hmp, hammer_off_t bmap_off, int *errorp)
881 {
882         hammer_volume_t root_volume;
883         hammer_blockmap_t rootmap;
884         struct hammer_blockmap_layer1 *layer1;
885         struct hammer_blockmap_layer2 *layer2;
886         hammer_buffer_t buffer = NULL;
887         hammer_off_t layer1_offset;
888         hammer_off_t layer2_offset;
889         hammer_off_t result_offset;
890         int zone;
891
892         zone = HAMMER_ZONE_DECODE(bmap_off);
893         KKASSERT(zone >= HAMMER_ZONE_BTREE_INDEX && zone < HAMMER_MAX_ZONES);
894         root_volume = hammer_get_root_volume(hmp, errorp);
895         if (*errorp)
896                 return(0);
897         rootmap = &hmp->blockmap[zone];
898         KKASSERT(rootmap->phys_offset != 0);
899         KKASSERT(HAMMER_ZONE_DECODE(rootmap->phys_offset) ==
900                  HAMMER_ZONE_RAW_BUFFER_INDEX);
901         KKASSERT(HAMMER_ZONE_DECODE(rootmap->alloc_offset) == zone);
902
903         if (bmap_off >= rootmap->alloc_offset) {
904                 panic("hammer_blockmap_lookup: %016llx beyond EOF %016llx",
905                       bmap_off, rootmap->alloc_offset);
906                 result_offset = 0;
907                 goto done;
908         }
909
910         /*
911          * Dive layer 1.
912          */
913         layer1_offset = rootmap->phys_offset +
914                         HAMMER_BLOCKMAP_LAYER1_OFFSET(bmap_off);
915         layer1 = hammer_bread(hmp, layer1_offset, errorp, &buffer);
916         KKASSERT(*errorp == 0);
917         KKASSERT(layer1->phys_offset);
918         if (layer1->layer1_crc != crc32(layer1, HAMMER_LAYER1_CRCSIZE)) {
919                 Debugger("CRC FAILED: LAYER1");
920         }
921
922         /*
923          * Dive layer 2, each entry represents a large-block.
924          */
925         layer2_offset = layer1->phys_offset +
926                         HAMMER_BLOCKMAP_LAYER2_OFFSET(bmap_off);
927         layer2 = hammer_bread(hmp, layer2_offset, errorp, &buffer);
928
929         KKASSERT(*errorp == 0);
930         KKASSERT(layer2->u.phys_offset);
931         if (layer2->entry_crc != crc32(layer2, HAMMER_LAYER2_CRCSIZE)) {
932                 Debugger("CRC FAILED: LAYER2");
933         }
934
935         result_offset = layer2->u.phys_offset +
936                         (bmap_off & HAMMER_LARGEBLOCK_MASK64);
937 done:
938         if (buffer)
939                 hammer_rel_buffer(buffer, 0);
940         hammer_rel_volume(root_volume, 0);
941         if (hammer_debug_general & 0x0800) {
942                 kprintf("hammer_blockmap_lookup: %016llx -> %016llx\n",
943                         bmap_off, result_offset);
944         }
945         return(result_offset);
946 }
947
948 /************************************************************************
949  *                  IN-CORE TRACKING OF ALLOCATION HOLES                *
950  ************************************************************************
951  *
952  * This is a temporary shim in need of a more permanent solution.
953  *
954  * As we allocate space, holes are created due to having to align to a new
955  * 16K buffer when an allocation would otherwise cross the buffer boundary.
956  * These holes are recorded here and used to fulfill smaller requests as
957  * much as possible.  Only a limited number of holes are recorded and these
958  * functions operate somewhat like a heuristic, where information is allowed
959  * to be thrown away.
960  */
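/*
 * Example (derived from the allocation path above): if next_offset sits
 * 4K short of a 16K buffer boundary and a 6K allocation is requested, the
 * allocation may not cross the boundary, so the trailing 4K is recorded
 * as a hole (holes of 128 bytes or less are discarded) and next_offset
 * advances to the start of the next buffer.  A later request of 4K or
 * less can then be satisfied from that hole by hammer_find_hole().
 */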
961
962 void
963 hammer_init_holes(hammer_mount_t hmp, hammer_holes_t holes)
964 {
965         TAILQ_INIT(&holes->list);
966         holes->count = 0;
967 }
968
969 void
970 hammer_free_holes(hammer_mount_t hmp, hammer_holes_t holes)
971 {
972         hammer_hole_t hole;
973
974         while ((hole = TAILQ_FIRST(&holes->list)) != NULL) {
975                 TAILQ_REMOVE(&holes->list, hole, entry);
976                 if (hole->resv) {
977                         hammer_blockmap_reserve_complete(hmp, hole->resv);
978                         hole->resv = NULL;
979                 }
980                 kfree(hole, M_HAMMER);
981                 --holes->count;
982         }
983 }
984
985 /*
986  * Attempt to locate a hole with sufficient free space to accommodate the
987  * requested allocation.  Return the offset or 0 if no hole could be found.
988  */
989 static hammer_off_t
990 hammer_find_hole(hammer_mount_t hmp, hammer_holes_t holes, int bytes)
991 {
992         hammer_hole_t hole;
993         hammer_off_t result_off = 0;
994
995         TAILQ_FOREACH(hole, &holes->list, entry) {
996                 if (bytes <= hole->bytes) {
997                         result_off = hole->zone_offset;
998                         hole->zone_offset += bytes;
999                         hole->bytes -= bytes;
1000                         break;
1001                 }
1002         }
1003         return(result_off);
1004 }
1005
1006 /*
1007  * If a newly created hole is reasonably sized then record it.  We only
1008  * keep track of a limited number of holes.  Lost holes are recovered by
1009  * reblocking.
1010  *
1011  * offset is a zone-N offset.
1012  */
1013 static void
1014 hammer_add_hole(hammer_mount_t hmp, hammer_holes_t holes,
1015                 hammer_off_t zone_offset, int bytes)
1016 {
1017         hammer_hole_t hole;
1018         hammer_reserve_t resv;
1019
1020         if (bytes <= 128)
1021                 return;
1022
1023         /*
1024          * Allocate or reuse a hole structure
1025          */
1026         if (holes->count < HAMMER_MAX_HOLES) {
1027                 hole = kmalloc(sizeof(*hole), M_HAMMER, M_WAITOK);
1028                 ++holes->count;
1029         } else {
1030                 hole = TAILQ_FIRST(&holes->list);
1031                 TAILQ_REMOVE(&holes->list, hole, entry);
1032                 if (hole->resv) {
1033                         hammer_blockmap_reserve_complete(hmp, hole->resv);
1034                         hole->resv = NULL;
1035                 }
1036         }
1037
1038         /*
1039          * Associate the structure with the appropriate reservation so the
1040          * bigblock does not get freed or reused while we have cached holes,
1041          * and install.
1042          */
1043         hole->zone_offset = zone_offset;
1044         hole->bytes = bytes;
1045
1046         zone_offset &= ~HAMMER_LARGEBLOCK_MASK64;
1047
1048         resv = RB_LOOKUP(hammer_res_rb_tree, &hmp->rb_resv_root, zone_offset);
1049         if (resv == NULL) {
1050                 resv = kmalloc(sizeof(*resv), M_HAMMER, M_WAITOK|M_ZERO);
1051                 resv->zone_offset = zone_offset;
1052                 resv->refs = 1;
1053                 RB_INSERT(hammer_res_rb_tree, &hmp->rb_resv_root, resv);
1054                 ++hammer_count_reservations;
1055         } else {
1056                 ++resv->refs;
1057         }
1058         hole->resv = resv;
1059         TAILQ_INSERT_TAIL(&holes->list, hole, entry);
1060 }
1061
1062 /*
1063  * Clean out any holes cached for the bigblock we are about to release back
1064  * to the free pool.
1065  */
1066 static void
1067 hammer_clean_holes(hammer_mount_t hmp, hammer_holes_t holes,
1068                    hammer_off_t base_offset)
1069 {
1070         hammer_hole_t hole;
1071
1072 restart:
1073         TAILQ_FOREACH(hole, &holes->list, entry) {
1074                 if ((hole->zone_offset & ~HAMMER_LARGEBLOCK_MASK64) == 
1075                     base_offset) {
1076                         TAILQ_REMOVE(&holes->list, hole, entry);
1077                         if (hole->resv) {
1078                                 hammer_blockmap_reserve_complete(hmp,
1079                                                                  hole->resv);
1080                                 hole->resv = NULL;
1081                         }
1082                         --holes->count;
1083                         kfree(hole, M_HAMMER);
1084                         goto restart;
1085                 }
1086         }
1087 }
1088