HAMMER 23/many: Recovery, B-Tree, spike, I/O work.
[dragonfly.git] / sys / vfs / hammer / hammer_ondisk.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.24 2008/01/24 02:14:45 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_supercl(hammer_supercl_t supercl,
51                         hammer_alloc_state_t isnew);
52 static int hammer_load_cluster(hammer_cluster_t cluster, int getflags);
53 static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type);
54 static int hammer_load_node(hammer_node_t node);
55 static void alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type,
56                         hammer_alist_t live,
57                         int32_t start, int *errorp,
58                         struct hammer_buffer **bufferp);
59 #if 0
60 static void readhammerbuf(hammer_volume_t vol, void *data,
61                         int64_t offset);
62 static void writehammerbuf(hammer_volume_t vol, const void *data,
63                         int64_t offset);
64 #endif
65 static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no);
66 static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no);
67 static int32_t hammer_alloc_master(hammer_cluster_t cluster, int nblks,
68                         int32_t start, int isfwd);
69 static void hammer_adjust_stats(hammer_cluster_t cluster,
70                         u_int64_t buf_type, int nblks);
71
72 struct hammer_alist_config Buf_alist_config;
73 struct hammer_alist_config Vol_normal_alist_config;
74 struct hammer_alist_config Vol_super_alist_config;
75 struct hammer_alist_config Supercl_alist_config;
76 struct hammer_alist_config Clu_master_alist_config;
77 struct hammer_alist_config Clu_slave_alist_config;
78
79 /*
80  * Red-Black tree support for various structures
81  */
82 static int
83 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
84 {
85         if (ip1->obj_id < ip2->obj_id)
86                 return(-1);
87         if (ip1->obj_id > ip2->obj_id)
88                 return(1);
89         if (ip1->obj_asof < ip2->obj_asof)
90                 return(-1);
91         if (ip1->obj_asof > ip2->obj_asof)
92                 return(1);
93         return(0);
94 }
95
96 static int
97 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
98 {
99         if (info->obj_id < ip->obj_id)
100                 return(-1);
101         if (info->obj_id > ip->obj_id)
102                 return(1);
103         if (info->obj_asof < ip->obj_asof)
104                 return(-1);
105         if (info->obj_asof > ip->obj_asof)
106                 return(1);
107         return(0);
108 }
109
110 static int
111 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
112 {
113         if (vol1->vol_no < vol2->vol_no)
114                 return(-1);
115         if (vol1->vol_no > vol2->vol_no)
116                 return(1);
117         return(0);
118 }
119
120 static int
121 hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2)
122 {
123         if (cl1->scl_no < cl2->scl_no)
124                 return(-1);
125         if (cl1->scl_no > cl2->scl_no)
126                 return(1);
127         return(0);
128 }
129
130 static int
131 hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2)
132 {
133         if (cl1->clu_no < cl2->clu_no)
134                 return(-1);
135         if (cl1->clu_no > cl2->clu_no)
136                 return(1);
137         return(0);
138 }
139
140 static int
141 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
142 {
143         if (buf1->buf_no < buf2->buf_no)
144                 return(-1);
145         if (buf1->buf_no > buf2->buf_no)
146                 return(1);
147         return(0);
148 }
149
150 static int
151 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
152 {
153         if (node1->node_offset < node2->node_offset)
154                 return(-1);
155         if (node1->node_offset > node2->node_offset)
156                 return(1);
157         return(0);
158 }
159
160 /*
161  * Note: The lookup function for hammer_ino_rb_tree winds up being named
162  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
163  * functions are normal, e.g. hammer_clu_rb_tree_RB_LOOKUP(root, clu_no).
164  */
165 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
166 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
167                 hammer_inode_info_cmp, hammer_inode_info_t);
168 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
169              hammer_vol_rb_compare, int32_t, vol_no);
170 RB_GENERATE2(hammer_scl_rb_tree, hammer_supercl, rb_node,
171              hammer_scl_rb_compare, int32_t, scl_no);
172 RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node,
173              hammer_clu_rb_compare, int32_t, clu_no);
174 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
175              hammer_buf_rb_compare, int32_t, buf_no);
176 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
177              hammer_nod_rb_compare, int32_t, node_offset);
178
179 /************************************************************************
180  *                              VOLUMES                                 *
181  ************************************************************************
182  *
183  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
184  * code on failure.  Volumes must be loaded at mount time, get_volume() will
185  * not load a new volume.
186  *
187  * Calls made to hammer_load_volume() or single-threaded
188  */
189 int
190 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
191 {
192         struct mount *mp;
193         hammer_volume_t volume;
194         struct hammer_volume_ondisk *ondisk;
195         struct nlookupdata nd;
196         struct buf *bp = NULL;
197         int error;
198         int ronly;
199
200         mp = hmp->mp;
201         ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
202
203         /*
204          * Allocate a volume structure
205          */
206         ++hammer_count_volumes;
207         volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
208         volume->vol_name = kstrdup(volname, M_HAMMER);
209         volume->hmp = hmp;
210         hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
211         volume->io.offset = 0LL;
212
213         /*
214          * Get the device vnode
215          */
216         error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
217         if (error == 0)
218                 error = nlookup(&nd);
219         if (error == 0)
220                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
221         nlookup_done(&nd);
222         if (error == 0) {
223                 vn_isdisk(volume->devvp, &error);
224         }
225         if (error == 0) {
226                 vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
227                 error = VOP_OPEN(volume->devvp, (ronly ? FREAD : FREAD|FWRITE),
228                                  FSCRED, NULL);
229                 vn_unlock(volume->devvp);
230         }
231         if (error) {
232                 hammer_free_volume(volume);
233                 return(error);
234         }
235
236         /*
237          * Extract the volume number from the volume header and do various
238          * sanity checks.
239          */
240         error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
241         if (error)
242                 goto late_failure;
243         ondisk = (void *)bp->b_data;
244         if (ondisk->head.buf_type != HAMMER_FSBUF_VOLUME) {
245                 kprintf("hammer_mount: volume %s has an invalid header\n",
246                         volume->vol_name);
247                 error = EFTYPE;
248                 goto late_failure;
249         }
250         volume->vol_no = ondisk->vol_no;
251         volume->cluster_base = ondisk->vol_clo_beg;
252         volume->vol_clsize = ondisk->vol_clsize;
253         volume->vol_flags = ondisk->vol_flags;
254         volume->nblocks = ondisk->vol_nblocks; 
255         RB_INIT(&volume->rb_clus_root);
256         RB_INIT(&volume->rb_scls_root);
257
258         hmp->mp->mnt_stat.f_blocks += volume->nblocks;
259
260         if (RB_EMPTY(&hmp->rb_vols_root)) {
261                 hmp->fsid = ondisk->vol_fsid;
262         } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
263                 kprintf("hammer_mount: volume %s's fsid does not match "
264                         "other volumes\n", volume->vol_name);
265                 error = EFTYPE;
266                 goto late_failure;
267         }
268
269         /*
270          * Insert the volume structure into the red-black tree.
271          */
272         if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
273                 kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
274                         volume->vol_name, volume->vol_no);
275                 error = EEXIST;
276         }
277
278         /*
279          * Set the root volume and load the root cluster.  HAMMER special
280          * cases rootvol and rootcl and will not deallocate the structures.
281          * We do not hold a ref because this would prevent related I/O
282          * from being flushed.
283          */
284         if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
285                 hmp->rootvol = volume;
286                 if (bp) {
287                         brelse(bp);
288                         bp = NULL;
289                 }
290                 hammer_ref_volume(volume);
291                 hmp->rootcl = hammer_get_cluster(volume,
292                                                  ondisk->vol0_root_clu_no,
293                                                  &error, 0);
294                 hammer_rel_cluster(hmp->rootcl, 0);
295                 hammer_rel_volume(volume, 0);
296                 hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
297         }
298 late_failure:
299         if (bp)
300                 brelse(bp);
301         if (error) {
302                 /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
303                 VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
304                 hammer_free_volume(volume);
305         }
306         return (error);
307 }
308
309 /*
310  * Unload and free a HAMMER volume.  Must return >= 0 to continue scan
311  * so returns -1 on failure.
312  */
313 int
314 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
315 {
316         struct hammer_mount *hmp = volume->hmp;
317         hammer_cluster_t rootcl;
318         int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
319
320         /*
321          * Sync clusters, sync volume
322          */
323
324         hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
325
326         /*
327          * Clean up the root cluster, which is held unlocked in the root
328          * volume.
329          */
330         if (hmp->rootvol == volume) {
331                 if ((rootcl = hmp->rootcl) != NULL)
332                         hmp->rootcl = NULL;
333                 hmp->rootvol = NULL;
334         }
335
336         /*
337          * Unload clusters and super-clusters.  Unloading a super-cluster
338          * also unloads related clusters, but the filesystem may not be
339          * using super-clusters so unload clusters anyway.
340          */
341         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
342                         hammer_unload_cluster, NULL);
343         RB_SCAN(hammer_scl_rb_tree, &volume->rb_scls_root, NULL,
344                         hammer_unload_supercl, NULL);
345         hammer_io_waitdep(&volume->io);
346
347         /*
348          * Release our buffer and flush anything left in the buffer cache.
349          */
350         hammer_io_release(&volume->io, 2);
351
352         /*
353          * There should be no references on the volume, no clusters, and
354          * no super-clusters.
355          */
356         KKASSERT(volume->io.lock.refs == 0);
357         KKASSERT(RB_EMPTY(&volume->rb_clus_root));
358         KKASSERT(RB_EMPTY(&volume->rb_scls_root));
359
360         volume->ondisk = NULL;
361         if (volume->devvp) {
362                 if (ronly) {
363                         vinvalbuf(volume->devvp, 0, 0, 0);
364                         VOP_CLOSE(volume->devvp, FREAD);
365                 } else {
366                         vinvalbuf(volume->devvp, V_SAVE, 0, 0);
367                         VOP_CLOSE(volume->devvp, FREAD|FWRITE);
368                 }
369         }
370
371         /*
372          * Destroy the structure
373          */
374         RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
375         hammer_free_volume(volume);
376         return(0);
377 }
378
379 static
380 void
381 hammer_free_volume(hammer_volume_t volume)
382 {
383         if (volume->vol_name) {
384                 kfree(volume->vol_name, M_HAMMER);
385                 volume->vol_name = NULL;
386         }
387         if (volume->devvp) {
388                 vrele(volume->devvp);
389                 volume->devvp = NULL;
390         }
391         --hammer_count_volumes;
392         kfree(volume, M_HAMMER);
393 }
394
395 /*
396  * Get a HAMMER volume.  The volume must already exist.
397  */
398 hammer_volume_t
399 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
400 {
401         struct hammer_volume *volume;
402
403         /*
404          * Locate the volume structure
405          */
406         volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
407         if (volume == NULL) {
408                 *errorp = ENOENT;
409                 return(NULL);
410         }
411         hammer_ref(&volume->io.lock);
412
413         /*
414          * Deal with on-disk info
415          */
416         if (volume->ondisk == NULL || volume->io.loading) {
417                 *errorp = hammer_load_volume(volume);
418                 if (*errorp) {
419                         hammer_rel_volume(volume, 1);
420                         volume = NULL;
421                 }
422         } else {
423                 *errorp = 0;
424         }
425         return(volume);
426 }
427
428 int
429 hammer_ref_volume(hammer_volume_t volume)
430 {
431         int error;
432
433         hammer_ref(&volume->io.lock);
434
435         /*
436          * Deal with on-disk info
437          */
438         if (volume->ondisk == NULL || volume->io.loading) {
439                 error = hammer_load_volume(volume);
440                 if (error)
441                         hammer_rel_volume(volume, 1);
442         } else {
443                 error = 0;
444         }
445         return (error);
446 }
447
448 hammer_volume_t
449 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
450 {
451         hammer_volume_t volume;
452
453         volume = hmp->rootvol;
454         KKASSERT(volume != NULL);
455         hammer_ref(&volume->io.lock);
456
457         /*
458          * Deal with on-disk info
459          */
460         if (volume->ondisk == NULL || volume->io.loading) {
461                 *errorp = hammer_load_volume(volume);
462                 if (*errorp) {
463                         hammer_rel_volume(volume, 1);
464                         volume = NULL;
465                 }
466         } else {
467                 *errorp = 0;
468         }
469         return (volume);
470 }
471
472 /*
473  * Load a volume's on-disk information.  The volume must be referenced and
474  * not locked.  We temporarily acquire an exclusive lock to interlock
475  * against releases or multiple get's.
476  */
477 static int
478 hammer_load_volume(hammer_volume_t volume)
479 {
480         struct hammer_volume_ondisk *ondisk;
481         int error;
482
483         hammer_lock_ex(&volume->io.lock);
484         KKASSERT(volume->io.loading == 0);
485         volume->io.loading = 1;
486
487         if (volume->ondisk == NULL) {
488                 error = hammer_io_read(volume->devvp, &volume->io);
489                 if (error) {
490                         volume->io.loading = 0;
491                         hammer_unlock(&volume->io.lock);
492                         return (error);
493                 }
494                 volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
495
496                 /*
497                  * Configure the volume's A-lists.  These are used to
498                  * allocate clusters.
499                  */
500                 if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
501                         volume->alist.config = &Vol_super_alist_config;
502                         volume->alist.meta = ondisk->vol_almeta.super;
503                         volume->alist.info = volume;
504                 } else {
505                         volume->alist.config = &Vol_normal_alist_config;
506                         volume->alist.meta = ondisk->vol_almeta.normal;
507                         volume->alist.info = NULL;
508                 }
509         } else {
510                 error = 0;
511         }
512         volume->io.loading = 0;
513         hammer_unlock(&volume->io.lock);
514         return(0);
515 }
516
517 /*
518  * Release a volume.  Call hammer_io_release on the last reference.  We have
519  * to acquire an exclusive lock to interlock against volume->ondisk tests
520  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
521  * lock to be held.
522  *
523  * Volumes are not unloaded from memory during normal operation.
524  */
525 void
526 hammer_rel_volume(hammer_volume_t volume, int flush)
527 {
528         if (volume->io.lock.refs == 1) {
529                 hammer_lock_ex(&volume->io.lock);
530                 if (volume->io.lock.refs == 1) {
531                         volume->ondisk = NULL;
532                         hammer_io_release(&volume->io, flush);
533                 } else if (flush) {
534                         hammer_io_flush(&volume->io);
535                 }
536                 hammer_unlock(&volume->io.lock);
537         }
538         hammer_unref(&volume->io.lock);
539 }
540
541 /************************************************************************
542  *                              SUPER-CLUSTERS                          *
543  ************************************************************************
544  *
545  * Manage super-clusters.  Note that a supercl holds a reference to its
546  * associated volume.
547  */
548 static int
549 hammer_find_supercl(hammer_volume_t volume, int32_t scl_no)
550 {
551         if (RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no))
552                 return(1);
553         return(0);
554 }
555
556 hammer_supercl_t
557 hammer_get_supercl(hammer_volume_t volume, int32_t scl_no,
558                    int *errorp, hammer_alloc_state_t isnew)
559 {
560         hammer_supercl_t supercl;
561
562         /*
563          * Locate and lock the super-cluster structure, creating one
564          * if necessary.
565          */
566 again:
567         supercl = RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no);
568         if (supercl == NULL) {
569                 ++hammer_count_supercls;
570                 supercl = kmalloc(sizeof(*supercl), M_HAMMER, M_WAITOK|M_ZERO);
571                 supercl->scl_no = scl_no;
572                 supercl->volume = volume;
573                 supercl->io.offset = calculate_supercl_offset(volume, scl_no);
574                 hammer_io_init(&supercl->io, HAMMER_STRUCTURE_SUPERCL);
575                 hammer_ref(&supercl->io.lock);
576
577                 /*
578                  * Insert the cluster into the RB tree and handle late
579                  * collisions.
580                  */
581                 if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) {
582                         hammer_unref(&supercl->io.lock);
583                         --hammer_count_supercls;
584                         kfree(supercl, M_HAMMER);
585                         goto again;
586                 }
587                 hammer_ref(&volume->io.lock);
588         } else {
589                 hammer_ref(&supercl->io.lock);
590         }
591
592         /*
593          * Deal with on-disk info
594          */
595         if (supercl->ondisk == NULL || isnew || supercl->io.loading) {
596                 *errorp = hammer_load_supercl(supercl, isnew);
597                 if (*errorp) {
598                         hammer_rel_supercl(supercl, 1);
599                         supercl = NULL;
600                 }
601         } else {
602                 *errorp = 0;
603         }
604         return(supercl);
605 }
606
607 static int
608 hammer_load_supercl(hammer_supercl_t supercl, hammer_alloc_state_t isnew)
609 {
610         struct hammer_supercl_ondisk *ondisk;
611         hammer_volume_t volume = supercl->volume;
612         int error;
613         int64_t nclusters;
614
615         hammer_lock_ex(&supercl->io.lock);
616         KKASSERT(supercl->io.loading == 0);
617         supercl->io.loading = 1;
618
619         if (supercl->ondisk == NULL) {
620                 if (isnew)
621                         error = hammer_io_new(volume->devvp, &supercl->io);
622                 else
623                         error = hammer_io_read(volume->devvp, &supercl->io);
624                 if (error) {
625                         supercl->io.loading = 0;
626                         hammer_unlock(&supercl->io.lock);
627                         return (error);
628                 }
629                 supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data;
630
631                 supercl->alist.config = &Supercl_alist_config;
632                 supercl->alist.meta = ondisk->scl_meta;
633                 supercl->alist.info = NULL;
634         } else if (isnew) {
635                 error = hammer_io_new(volume->devvp, &supercl->io);
636         } else {
637                 error = 0;
638         }
639         if (error == 0 && isnew) {
640                 /*
641                  * If this is a new super-cluster we have to initialize
642                  * various ondisk structural elements.  The caller is
643                  * responsible for the remainder.
644                  */
645                 struct hammer_alist_live dummy;
646
647                 hammer_modify_supercl(supercl);
648
649                 ondisk = supercl->ondisk;
650                 dummy.config = &Buf_alist_config;
651                 dummy.meta = ondisk->head.buf_almeta;
652                 dummy.info = NULL;
653                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL);
654
655                 nclusters = volume->ondisk->vol_nclusters -
656                             ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS);
657                 KKASSERT(nclusters > 0);
658                 if (nclusters > HAMMER_SCL_MAXCLUSTERS)
659                         nclusters = HAMMER_SCL_MAXCLUSTERS;
660                 hammer_alist_init(&supercl->alist, 0, (int32_t)nclusters,
661                                   isnew);
662         }
663         supercl->io.loading = 0;
664         hammer_unlock(&supercl->io.lock);
665         return (error);
666 }
667
668 /*
669  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
670  */
671 int
672 hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused)
673 {
674         KKASSERT(supercl->io.lock.refs == 0);
675         hammer_ref(&supercl->io.lock);
676         hammer_rel_supercl(supercl, 2);
677         return(0);
678 }
679
680 /*
681  * Release a super-cluster.  We have to deal with several places where
682  * another thread can ref the super-cluster.
683  *
684  * Only destroy the structure itself if the related buffer cache buffer
685  * was disassociated from it.  This ties the management of the structure
686  * to the buffer cache subsystem.
687  */
688 void
689 hammer_rel_supercl(hammer_supercl_t supercl, int flush)
690 {
691         hammer_volume_t volume;
692
693         if (supercl->io.lock.refs == 1) {
694                 hammer_lock_ex(&supercl->io.lock);
695                 if (supercl->io.lock.refs == 1) {
696                         hammer_io_release(&supercl->io, flush);
697                         if (supercl->io.bp == NULL &&
698                             supercl->io.lock.refs == 1) {
699                                 volume = supercl->volume;
700                                 RB_REMOVE(hammer_scl_rb_tree,
701                                           &volume->rb_scls_root, supercl);
702                                 supercl->volume = NULL; /* sanity */
703                                 --hammer_count_supercls;
704                                 kfree(supercl, M_HAMMER);
705                                 hammer_rel_volume(volume, 0);
706                                 return;
707                         }
708                 } else if (flush) {
709                         hammer_io_flush(&supercl->io);
710                 }
711                 hammer_unlock(&supercl->io.lock);
712         }
713         hammer_unref(&supercl->io.lock);
714 }
715
716 /************************************************************************
717  *                              CLUSTERS                                *
718  ************************************************************************
719  *
720  */
721 hammer_cluster_t
722 hammer_get_cluster(hammer_volume_t volume, int32_t clu_no,
723                    int *errorp, int getflags)
724 {
725         hammer_cluster_t cluster;
726
727 again:
728         cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no);
729         if (cluster == NULL) {
730                 ++hammer_count_clusters;
731                 cluster = kmalloc(sizeof(*cluster), M_HAMMER, M_WAITOK|M_ZERO);
732                 cluster->clu_no = clu_no;
733                 cluster->volume = volume;
734                 RB_INIT(&cluster->rb_bufs_root);
735                 RB_INIT(&cluster->rb_nods_root);
736                 hammer_io_init(&cluster->io, HAMMER_STRUCTURE_CLUSTER);
737                 cluster->io.offset = calculate_cluster_offset(volume, clu_no);
738                 hammer_ref(&cluster->io.lock);
739
740                 /*
741                  * Insert the cluster into the RB tree and handle late
742                  * collisions.
743                  */
744                 if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) {
745                         hammer_unref(&cluster->io.lock);
746                         --hammer_count_clusters;
747                         kfree(cluster, M_HAMMER);
748                         goto again;
749                 }
750                 hammer_ref(&volume->io.lock);
751         } else {
752                 hammer_ref(&cluster->io.lock);
753         }
754
755         /*
756          * Deal with on-disk info
757          */
758         if (cluster->ondisk == NULL || getflags || cluster->io.validated == 0) {
759                 *errorp = hammer_load_cluster(cluster, getflags);
760                 if (*errorp) {
761                         hammer_rel_cluster(cluster, 1);
762                         cluster = NULL;
763                 }
764         } else {
765                 *errorp = 0;
766         }
767         return (cluster);
768 }
769
770 hammer_cluster_t
771 hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp)
772 {
773         hammer_cluster_t cluster;
774
775         cluster = hmp->rootcl;
776         KKASSERT(cluster != NULL);
777         hammer_ref(&cluster->io.lock);
778
779         /*
780          * Deal with on-disk info
781          */
782         if (cluster->ondisk == NULL || cluster->io.validated == 0) {
783                 *errorp = hammer_load_cluster(cluster, 0);
784                 if (*errorp) {
785                         hammer_rel_cluster(cluster, 1);
786                         cluster = NULL;
787                 }
788         } else {
789                 *errorp = 0;
790         }
791         return (cluster);
792 }
793
794 static
795 int
796 hammer_load_cluster(hammer_cluster_t cluster, int getflags)
797 {
798         hammer_volume_t volume = cluster->volume;
799         struct hammer_cluster_ondisk *ondisk;
800         int error;
801
802         hammer_lock_ex(&cluster->io.lock);
803         KKASSERT(cluster->io.loading == 0);
804         cluster->io.loading = 1;
805
806         if (cluster->ondisk == NULL) {
807                 KKASSERT(TAILQ_EMPTY(&cluster->io.deplist));
808
809                 /*
810                  * Unmodified buffers may be present, indicating that we
811                  * had already validated the cluster even though we no longer
812                  * have its ondisk info.
813                  */
814                 if (RB_EMPTY(&cluster->rb_bufs_root))
815                         cluster->io.validated = 0;
816                 if (getflags & GET_CLUSTER_NEW)
817                         error = hammer_io_new(volume->devvp, &cluster->io);
818                 else
819                         error = hammer_io_read(volume->devvp, &cluster->io);
820                 if (error) {
821                         cluster->io.loading = 0;
822                         hammer_unlock(&cluster->io.lock);
823                         return (error);
824                 }
825                 cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data;
826
827                 cluster->alist_master.config = &Clu_master_alist_config;
828                 cluster->alist_master.meta = ondisk->clu_master_meta;
829                 cluster->alist_btree.config = &Clu_slave_alist_config;
830                 cluster->alist_btree.meta = ondisk->clu_btree_meta;
831                 cluster->alist_btree.info = cluster;
832                 cluster->alist_record.config = &Clu_slave_alist_config;
833                 cluster->alist_record.meta = ondisk->clu_record_meta;
834                 cluster->alist_record.info = cluster;
835                 cluster->alist_mdata.config = &Clu_slave_alist_config;
836                 cluster->alist_mdata.meta = ondisk->clu_mdata_meta;
837                 cluster->alist_mdata.info = cluster;
838
839                 if ((getflags & GET_CLUSTER_NEW) == 0) {
840                         /*
841                          * Load cluster range info for easy access
842                          */
843                         cluster->clu_btree_beg = ondisk->clu_btree_beg;
844                         cluster->clu_btree_end = ondisk->clu_btree_end;
845                 }
846         } else if (getflags & GET_CLUSTER_NEW) {
847                 error = hammer_io_new(volume->devvp, &cluster->io);
848         } else {
849                 error = 0;
850         }
851         if (error == 0 && (getflags & GET_CLUSTER_NEW)) {
852                 /*
853                  * If this is a new cluster we have to initialize
854                  * various ondisk structural elements.  The caller is
855                  * responsible for the remainder.
856                  */
857                 struct hammer_alist_live dummy;
858                 hammer_node_t croot;
859                 hammer_volume_ondisk_t voldisk;
860                 int32_t nbuffers;
861
862                 hammer_modify_cluster(cluster);
863                 ondisk = cluster->ondisk;
864                 voldisk = volume->ondisk;
865
866                 dummy.config = &Buf_alist_config;
867                 dummy.meta = ondisk->head.buf_almeta;
868                 dummy.info = NULL;
869                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER);
870
871                 ondisk->vol_fsid = voldisk->vol_fsid;
872                 ondisk->vol_fstype = voldisk->vol_fstype;
873                 ondisk->clu_gen = 1;
874                 ondisk->clu_id = 0;     /* XXX */
875                 ondisk->clu_no = cluster->clu_no;
876                 ondisk->clu_flags = 0;
877                 ondisk->clu_start = HAMMER_BUFSIZE;
878                 ondisk->synchronized_rec_id = 1; /* XXX timestamp */
879                 KKASSERT(voldisk->vol_clo_end > cluster->io.offset);
880                 if (voldisk->vol_clo_end - cluster->io.offset >
881                     voldisk->vol_clsize) {
882                         ondisk->clu_limit = voldisk->vol_clsize;
883                 } else {
884                         ondisk->clu_limit = (int32_t)(voldisk->vol_clo_end -
885                                                       cluster->io.offset);
886                 }
887                 nbuffers = ondisk->clu_limit / HAMMER_BUFSIZE;
888                 hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
889                                   HAMMER_ASTATE_FREE);
890                 hammer_alist_init(&cluster->alist_btree,
891                                   HAMMER_FSBUF_MAXBLKS,
892                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
893                                   HAMMER_ASTATE_ALLOC);
894                 hammer_alist_init(&cluster->alist_record,
895                                   HAMMER_FSBUF_MAXBLKS,
896                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
897                                   HAMMER_ASTATE_ALLOC);
898                 hammer_alist_init(&cluster->alist_mdata,
899                                   HAMMER_FSBUF_MAXBLKS,
900                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
901                                   HAMMER_ASTATE_ALLOC);
902
903                 ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
904                 ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
905                 ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;
906
907                 /*
908                  * Initialize the B-Tree.  We don't know what the caller
909                  * intends to do with the cluster so make sure it causes
910                  * an assertion if the caller makes no changes.
911                  */
912                 ondisk->clu_btree_parent_vol_no = -2;
913                 ondisk->clu_btree_parent_clu_no = -2;
914                 ondisk->clu_btree_parent_offset = -2;
915                 ondisk->clu_btree_parent_clu_gen = -2;
916
917                 croot = hammer_alloc_btree(cluster, &error);
918                 if (error == 0) {
919                         hammer_modify_node(croot);
920                         bzero(croot->ondisk, sizeof(*croot->ondisk));
921                         croot->ondisk->count = 0;
922                         croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
923                         hammer_modify_cluster(cluster);
924                         ondisk->clu_btree_root = croot->node_offset;
925                         hammer_rel_node(croot);
926                 }
927
928                 /*
929                  * We just formatted this cluster, don't try to recover it!
930                  */
931                 cluster->io.validated = 1;
932         }
933
934         /*
935          * If no error occured handle automatic cluster recovery unless
936          * the NORECOVER flag is passed.
937          *
938          * Setting hammer_debug_recover to 1 will force recovery on load
939          * whether or not the cluster is marked open.
940          *
941          * Setting hammer_debug_recover to -1 will force NO recovery
942          * regardless of state.
943          *
944          * io.validated can only be cleared if the buffer RB list is empty,
945          * preventing us from trying to recover an actively referenced
946          * cluster (which would blow the filesystem to smithereens).
947          */
948         if (error == 0 && cluster->io.validated == 0) {
949                 if ((getflags & GET_CLUSTER_NORECOVER) == 0) {
950                         if ((cluster->ondisk->clu_flags & HAMMER_CLUF_OPEN) ||
951                             hammer_debug_recover > 0) {
952                                 if (hammer_debug_recover >= 0)
953                                         hammer_recover(cluster);
954                         }
955                         cluster->io.validated = 1;
956                 }
957         }
958         cluster->io.loading = 0;
959         hammer_unlock(&cluster->io.lock);
960         return (error);
961 }
962
963 /*
964  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
965  */
966 int
967 hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused)
968 {
969         hammer_ref(&cluster->io.lock);
970         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
971                 hammer_unload_buffer, NULL);
972         hammer_io_waitdep(&cluster->io);
973         KKASSERT(cluster->io.lock.refs == 1);
974         hammer_rel_cluster(cluster, 2);
975         return(0);
976 }
977
978 /*
979  * Update the cluster's synchronization TID, which is used during cluster
980  * recovery.  NOTE: The cluster header is not written out until all related
981  * records have been written out.
982  */
983 u_int64_t
984 hammer_alloc_recid(hammer_cluster_t cluster)
985 {
986         u_int64_t recid;
987
988         hammer_modify_cluster(cluster);
989         recid = cluster->ondisk->synchronized_rec_id++;
990         return(recid);
991 }
992
993 #if 0
994 void
995 hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid)
996 {
997         hammer_modify_cluster(cluster);
998         if (cluster->ondisk->synchronized_tid < tid)
999                 cluster->ondisk->synchronized_tid = tid;
1000 }
1001 #endif
1002
1003 /*
1004  * Reference a cluster that is either already referenced or via a specially
1005  * handled pointer (aka rootcl).
1006  */
1007 int
1008 hammer_ref_cluster(hammer_cluster_t cluster)
1009 {
1010         int error;
1011
1012         KKASSERT(cluster != NULL);
1013         hammer_ref(&cluster->io.lock);
1014
1015         /*
1016          * Deal with on-disk info
1017          */
1018         if (cluster->ondisk == NULL || cluster->io.validated == 0) {
1019                 error = hammer_load_cluster(cluster, 0);
1020                 if (error)
1021                         hammer_rel_cluster(cluster, 1);
1022         } else {
1023                 error = 0;
1024         }
1025         return(error);
1026 }
1027
1028 /*
1029  * Release a cluster.  We have to deal with several places where
1030  * another thread can ref the cluster.
1031  *
1032  * Only destroy the structure itself if we no longer have an IO or any
1033  * hammer buffers associated with the structure.
1034  */
1035 void
1036 hammer_rel_cluster(hammer_cluster_t cluster, int flush)
1037 {
1038         hammer_volume_t volume;
1039
1040         if (cluster->io.lock.refs == 1) {
1041                 hammer_lock_ex(&cluster->io.lock);
1042                 if (cluster->io.lock.refs == 1) {
1043                         /*
1044                          * Release the I/O.  If we or the kernel wants to
1045                          * flush, this will release the bp.  Otherwise the
1046                          * bp may be written and flushed passively by the
1047                          * kernel later on.
1048                          */
1049                         hammer_io_release(&cluster->io, flush);
1050
1051                         /*
1052                          * Final cleanup
1053                          */
1054                         if (cluster != cluster->volume->hmp->rootcl &&
1055                             cluster->io.bp == NULL &&
1056                             cluster->io.lock.refs == 1 &&
1057                             RB_EMPTY(&cluster->rb_bufs_root)) {
1058                                 KKASSERT(RB_EMPTY(&cluster->rb_nods_root));
1059                                 volume = cluster->volume;
1060                                 RB_REMOVE(hammer_clu_rb_tree,
1061                                           &volume->rb_clus_root, cluster);
1062                                 cluster->volume = NULL; /* sanity */
1063                                 --hammer_count_clusters;
1064                                 kfree(cluster, M_HAMMER);
1065                                 hammer_rel_volume(volume, 0);
1066                                 return;
1067                         }
1068                 } else if (flush) {
1069                         hammer_io_flush(&cluster->io);
1070                 }
1071                 hammer_unlock(&cluster->io.lock);
1072         }
1073         hammer_unref(&cluster->io.lock);
1074 }
1075
1076 /************************************************************************
1077  *                              BUFFERS                                 *
1078  ************************************************************************
1079  *
1080  * Manage buffers.  Note that a buffer holds a reference to its associated
1081  * cluster, and its cluster will hold a reference to the cluster's volume.
1082  *
1083  * A non-zero buf_type indicates that a new buffer should be created and
1084  * zero'd.
1085  */
1086 hammer_buffer_t
1087 hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no,
1088                   u_int64_t buf_type, int *errorp)
1089 {
1090         hammer_buffer_t buffer;
1091
1092         /*
1093          * Find the buffer.  Note that buffer 0 corresponds to the cluster
1094          * header and should never be requested.
1095          */
1096         KKASSERT(buf_no >= cluster->ondisk->clu_start / HAMMER_BUFSIZE &&
1097                  buf_no < cluster->ondisk->clu_limit / HAMMER_BUFSIZE);
1098
1099         /*
1100          * Locate and lock the buffer structure, creating one if necessary.
1101          */
1102 again:
1103         buffer = RB_LOOKUP(hammer_buf_rb_tree, &cluster->rb_bufs_root, buf_no);
1104         if (buffer == NULL) {
1105                 ++hammer_count_buffers;
1106                 buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
1107                 buffer->buf_no = buf_no;
1108                 buffer->cluster = cluster;
1109                 buffer->volume = cluster->volume;
1110                 hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
1111                 buffer->io.offset = cluster->io.offset +
1112                                     (buf_no * HAMMER_BUFSIZE);
1113                 TAILQ_INIT(&buffer->clist);
1114                 hammer_ref(&buffer->io.lock);
1115
1116                 /*
1117                  * Insert the cluster into the RB tree and handle late
1118                  * collisions.
1119                  */
1120                 if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) {
1121                         hammer_unref(&buffer->io.lock);
1122                         --hammer_count_buffers;
1123                         kfree(buffer, M_HAMMER);
1124                         goto again;
1125                 }
1126                 hammer_ref(&cluster->io.lock);
1127         } else {
1128                 hammer_ref(&buffer->io.lock);
1129         }
1130
1131         /*
1132          * Deal with on-disk info
1133          */
1134         if (buffer->ondisk == NULL || buf_type || buffer->io.loading) {
1135                 *errorp = hammer_load_buffer(buffer, buf_type);
1136                 if (*errorp) {
1137                         hammer_rel_buffer(buffer, 1);
1138                         buffer = NULL;
1139                 }
1140         } else {
1141                 *errorp = 0;
1142         }
1143         return(buffer);
1144 }
1145
1146 static int
1147 hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type)
1148 {
1149         hammer_volume_t volume;
1150         hammer_fsbuf_ondisk_t ondisk;
1151         int error;
1152
1153         /*
1154          * Load the buffer's on-disk info
1155          */
1156         volume = buffer->volume;
1157         hammer_lock_ex(&buffer->io.lock);
1158         KKASSERT(buffer->io.loading == 0);
1159         buffer->io.loading = 1;
1160
1161         if (buffer->ondisk == NULL) {
1162                 if (buf_type) {
1163                         error = hammer_io_new(volume->devvp, &buffer->io);
1164                 } else {
1165                         error = hammer_io_read(volume->devvp, &buffer->io);
1166                 }
1167                 if (error) {
1168                         buffer->io.loading = 0;
1169                         hammer_unlock(&buffer->io.lock);
1170                         return (error);
1171                 }
1172                 buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
1173                 buffer->alist.config = &Buf_alist_config;
1174                 buffer->alist.meta = ondisk->head.buf_almeta;
1175                 buffer->buf_type = ondisk->head.buf_type;
1176         } else if (buf_type) {
1177                 error = hammer_io_new(volume->devvp, &buffer->io);
1178         } else {
1179                 error = 0;
1180         }
1181         if (error == 0 && buf_type) {
1182                 hammer_modify_buffer(buffer);
1183                 ondisk = buffer->ondisk;
1184                 hammer_initbuffer(&buffer->alist, &ondisk->head, buf_type);
1185                 buffer->buf_type = ondisk->head.buf_type;
1186         }
1187         buffer->io.loading = 0;
1188         hammer_unlock(&buffer->io.lock);
1189         return (error);
1190 }
1191
1192 /*
1193  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
1194  */
1195 int
1196 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
1197 {
1198         hammer_ref(&buffer->io.lock);
1199         hammer_flush_buffer_nodes(buffer);
1200         KKASSERT(buffer->io.lock.refs == 1);
1201         hammer_rel_buffer(buffer, 2);
1202         return(0);
1203 }
1204
1205 /*
1206  * Reference a buffer that is either already referenced or via a specially
1207  * handled pointer (aka cursor->buffer).
1208  */
1209 int
1210 hammer_ref_buffer(hammer_buffer_t buffer)
1211 {
1212         int error;
1213
1214         hammer_ref(&buffer->io.lock);
1215         if (buffer->ondisk == NULL || buffer->io.loading) {
1216                 error = hammer_load_buffer(buffer, 0);
1217                 if (error) {
1218                         hammer_rel_buffer(buffer, 1);
1219                         /*
1220                          * NOTE: buffer pointer can become stale after
1221                          * the above release.
1222                          */
1223                 } else {
1224                         KKASSERT(buffer->buf_type ==
1225                                  buffer->ondisk->head.buf_type);
1226                 }
1227         } else {
1228                 error = 0;
1229         }
1230         return(error);
1231 }
1232
1233 /*
1234  * Release a buffer.  We have to deal with several places where
1235  * another thread can ref the buffer.
1236  *
1237  * Only destroy the structure itself if the related buffer cache buffer
1238  * was disassociated from it.  This ties the management of the structure
1239  * to the buffer cache subsystem.  buffer->ondisk determines whether the
1240  * embedded io is referenced or not.
1241  */
1242 void
1243 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
1244 {
1245         hammer_cluster_t cluster;
1246
1247         if (buffer->io.lock.refs == 1) {
1248                 hammer_lock_ex(&buffer->io.lock);
1249                 if (buffer->io.lock.refs == 1) {
1250                         hammer_io_release(&buffer->io, flush);
1251
1252                         if (buffer->io.bp == NULL &&
1253                             buffer->io.lock.refs == 1) {
1254                                 hammer_flush_buffer_nodes(buffer);
1255                                 KKASSERT(TAILQ_EMPTY(&buffer->clist));
1256                                 cluster = buffer->cluster;
1257                                 RB_REMOVE(hammer_buf_rb_tree,
1258                                           &cluster->rb_bufs_root, buffer);
1259                                 buffer->cluster = NULL; /* sanity */
1260                                 --hammer_count_buffers;
1261                                 kfree(buffer, M_HAMMER);
1262                                 hammer_rel_cluster(cluster, 0);
1263                                 return;
1264                         }
1265                 } else if (flush) {
1266                         hammer_io_flush(&buffer->io);
1267                 }
1268                 hammer_unlock(&buffer->io.lock);
1269         }
1270         hammer_unref(&buffer->io.lock);
1271 }
1272
1273 /************************************************************************
1274  *                              NODES                                   *
1275  ************************************************************************
1276  *
1277  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
1278  * method used by the HAMMER filesystem.
1279  *
1280  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
1281  * associated with its buffer, and will only referenced the buffer while
1282  * the node itself is referenced.
1283  *
1284  * A hammer_node can also be passively associated with other HAMMER
1285  * structures, such as inodes, while retaining 0 references.  These
1286  * associations can be cleared backwards using a pointer-to-pointer in
1287  * the hammer_node.
1288  *
1289  * This allows the HAMMER implementation to cache hammer_nodes long-term
1290  * and short-cut a great deal of the infrastructure's complexity.  In
1291  * most cases a cached node can be reacquired without having to dip into
1292  * either the buffer or cluster management code.
1293  *
1294  * The caller must pass a referenced cluster on call and will retain
1295  * ownership of the reference on return.  The node will acquire its own
1296  * additional references, if necessary.
1297  */
1298 hammer_node_t
1299 hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp)
1300 {
1301         hammer_node_t node;
1302
1303         /*
1304          * Locate the structure, allocating one if necessary.
1305          */
1306 again:
1307         node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root,
1308                          node_offset);
1309         if (node == NULL) {
1310                 ++hammer_count_nodes;
1311                 node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
1312                 node->node_offset = node_offset;
1313                 node->cluster = cluster;
1314                 if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root,
1315                               node)) {
1316                         --hammer_count_nodes;
1317                         kfree(node, M_HAMMER);
1318                         goto again;
1319                 }
1320         }
1321         hammer_ref(&node->lock);
1322         *errorp = hammer_load_node(node);
1323         if (*errorp) {
1324                 hammer_rel_node(node);
1325                 node = NULL;
1326         }
1327         return(node);
1328 }
1329
1330 /*
1331  * Reference an already-referenced node.
1332  */
1333 int
1334 hammer_ref_node(hammer_node_t node)
1335 {
1336         int error;
1337
1338         KKASSERT(node->lock.refs > 0);
1339         hammer_ref(&node->lock);
1340         if ((error = hammer_load_node(node)) != 0)
1341                 hammer_rel_node(node);
1342         return(error);
1343 }
1344
1345 /*
1346  * Load a node's on-disk data reference.
1347  */
1348 static int
1349 hammer_load_node(hammer_node_t node)
1350 {
1351         hammer_buffer_t buffer;
1352         int32_t buf_no;
1353         int error;
1354
1355         if (node->ondisk)
1356                 return(0);
1357         error = 0;
1358         hammer_lock_ex(&node->lock);
1359         if (node->ondisk == NULL) {
1360                 /*
1361                  * This is a little confusing but the jist is that
1362                  * node->buffer determines whether the node is on
1363                  * the buffer's clist and node->ondisk determines
1364                  * whether the buffer is referenced.
1365                  */
1366                 if ((buffer = node->buffer) != NULL) {
1367                         error = hammer_ref_buffer(buffer);
1368                 } else {
1369                         buf_no = node->node_offset / HAMMER_BUFSIZE;
1370                         buffer = hammer_get_buffer(node->cluster,
1371                                                    buf_no, 0, &error);
1372                         if (buffer) {
1373                                 KKASSERT(error == 0);
1374                                 TAILQ_INSERT_TAIL(&buffer->clist,
1375                                                   node, entry);
1376                                 node->buffer = buffer;
1377                         }
1378                 }
1379                 if (error == 0) {
1380                         node->ondisk = (void *)((char *)buffer->ondisk +
1381                                (node->node_offset & HAMMER_BUFMASK));
1382                 }
1383         }
1384         hammer_unlock(&node->lock);
1385         return (error);
1386 }
1387
1388 /*
1389  * Safely reference a node, interlock against flushes via the IO subsystem.
1390  */
1391 hammer_node_t
1392 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
1393                      int *errorp)
1394 {
1395         hammer_node_t node;
1396
1397         if ((node = *cache) != NULL)
1398                 hammer_ref(&node->lock);
1399         if (node) {
1400                 *errorp = hammer_load_node(node);
1401                 if (*errorp) {
1402                         hammer_rel_node(node);
1403                         node = NULL;
1404                 }
1405         } else {
1406                 *errorp = ENOENT;
1407         }
1408         return(node);
1409 }
1410
1411 /*
1412  * Release a hammer_node.  On the last release the node dereferences
1413  * its underlying buffer and may or may not be destroyed.
1414  */
1415 void
1416 hammer_rel_node(hammer_node_t node)
1417 {
1418         hammer_cluster_t cluster;
1419         hammer_buffer_t buffer;
1420         int32_t node_offset;
1421         int flags;
1422
1423         /*
1424          * If this isn't the last ref just decrement the ref count and
1425          * return.
1426          */
1427         if (node->lock.refs > 1) {
1428                 hammer_unref(&node->lock);
1429                 return;
1430         }
1431
1432         /*
1433          * If there is no ondisk info or no buffer the node failed to load,
1434          * remove the last reference and destroy the node.
1435          */
1436         if (node->ondisk == NULL) {
1437                 hammer_unref(&node->lock);
1438                 hammer_flush_node(node);
1439                 /* node is stale now */
1440                 return;
1441         }
1442
1443         /*
1444          * Do final cleanups and then either destroy the node and leave it
1445          * passively cached.  The buffer reference is removed regardless.
1446          */
1447         buffer = node->buffer;
1448         node->ondisk = NULL;
1449
1450         if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
1451                 hammer_unref(&node->lock);
1452                 hammer_rel_buffer(buffer, 0);
1453                 return;
1454         }
1455
1456         /*
1457          * Destroy the node.  Record pertainant data because the node
1458          * becomes stale the instant we flush it.
1459          */
1460         flags = node->flags;
1461         node_offset = node->node_offset;
1462         hammer_unref(&node->lock);
1463         hammer_flush_node(node);
1464         /* node is stale */
1465
1466         cluster = buffer->cluster;
1467         if (flags & HAMMER_NODE_DELETED) {
1468                 hammer_free_btree(cluster, node_offset);
1469                 if (node_offset == cluster->ondisk->clu_btree_root) {
1470                         kprintf("FREE CLUSTER %d\n", cluster->clu_no);
1471                         hammer_free_cluster(cluster);
1472                         /*hammer_io_undirty(&cluster->io);*/
1473                 }
1474         }
1475         hammer_rel_buffer(buffer, 0);
1476 }
1477
1478 /*
1479  * Passively cache a referenced hammer_node in *cache.  The caller may
1480  * release the node on return.
1481  */
1482 void
1483 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
1484 {
1485         hammer_node_t old;
1486
1487         /*
1488          * If the node is being deleted, don't cache it!
1489          */
1490         if (node->flags & HAMMER_NODE_DELETED)
1491                 return;
1492
1493         /*
1494          * Cache the node.  If we previously cached a different node we
1495          * have to give HAMMER a chance to destroy it.
1496          */
1497 again:
1498         if (node->cache1 != cache) {
1499                 if (node->cache2 != cache) {
1500                         if ((old = *cache) != NULL) {
1501                                 KKASSERT(node->lock.refs != 0);
1502                                 hammer_uncache_node(cache);
1503                                 goto again;
1504                         }
1505                         if (node->cache2)
1506                                 *node->cache2 = NULL;
1507                         node->cache2 = node->cache1;
1508                         node->cache1 = cache;
1509                         *cache = node;
1510                 } else {
1511                         struct hammer_node **tmp;
1512                         tmp = node->cache1;
1513                         node->cache1 = node->cache2;
1514                         node->cache2 = tmp;
1515                 }
1516         }
1517 }
1518
1519 void
1520 hammer_uncache_node(struct hammer_node **cache)
1521 {
1522         hammer_node_t node;
1523
1524         if ((node = *cache) != NULL) {
1525                 *cache = NULL;
1526                 if (node->cache1 == cache) {
1527                         node->cache1 = node->cache2;
1528                         node->cache2 = NULL;
1529                 } else if (node->cache2 == cache) {
1530                         node->cache2 = NULL;
1531                 } else {
1532                         panic("hammer_uncache_node: missing cache linkage");
1533                 }
1534                 if (node->cache1 == NULL && node->cache2 == NULL)
1535                         hammer_flush_node(node);
1536         }
1537 }
1538
1539 /*
1540  * Remove a node's cache references and destroy the node if it has no
1541  * other references or backing store.
1542  */
1543 void
1544 hammer_flush_node(hammer_node_t node)
1545 {
1546         hammer_buffer_t buffer;
1547
1548         if (node->cache1)
1549                 *node->cache1 = NULL;
1550         if (node->cache2)
1551                 *node->cache2 = NULL;
1552         if (node->lock.refs == 0 && node->ondisk == NULL) {
1553                 RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root,
1554                           node);
1555                 if ((buffer = node->buffer) != NULL) {
1556                         node->buffer = NULL;
1557                         TAILQ_REMOVE(&buffer->clist, node, entry);
1558                         /* buffer is unreferenced because ondisk is NULL */
1559                 }
1560                 --hammer_count_nodes;
1561                 kfree(node, M_HAMMER);
1562         }
1563 }
1564
1565 /*
1566  * Flush passively cached B-Tree nodes associated with this buffer.
1567  * This is only called when the buffer is about to be destroyed, so
1568  * none of the nodes should have any references.
1569  */
1570 void
1571 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1572 {
1573         hammer_node_t node;
1574
1575         while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1576                 KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
1577                 hammer_ref(&node->lock);
1578                 node->flags |= HAMMER_NODE_FLUSH;
1579                 hammer_rel_node(node);
1580         }
1581 }
1582
1583 /************************************************************************
1584  *                              A-LIST ALLOCATORS                       *
1585  ************************************************************************/
1586
1587 /*
1588  * Allocate HAMMER clusters
1589  */
1590 hammer_cluster_t
1591 hammer_alloc_cluster(hammer_mount_t hmp, hammer_cluster_t cluster_hint,
1592                      int *errorp)
1593 {
1594         hammer_volume_t volume;
1595         hammer_cluster_t cluster;
1596         int32_t clu_no;
1597         int32_t clu_hint;
1598         int32_t vol_beg;
1599         int32_t vol_no;
1600
1601         /*
1602          * Figure out our starting volume and hint.
1603          */
1604         if (cluster_hint) {
1605                 vol_beg = cluster_hint->volume->vol_no;
1606                 clu_hint = cluster_hint->clu_no;
1607         } else {
1608                 vol_beg = hmp->volume_iterator;
1609                 clu_hint = -1;
1610         }
1611
1612         /*
1613          * Loop through volumes looking for a free cluster.  If allocating
1614          * a new cluster relative to an existing cluster try to find a free
1615          * cluster on either side (clu_hint >= 0), otherwise just do a
1616          * forwards iteration.
1617          */
1618         vol_no = vol_beg;
1619         do {
1620                 volume = hammer_get_volume(hmp, vol_no, errorp);
1621                 kprintf("VOLUME %p %d\n", volume, vol_no);
1622                 if (*errorp) {
1623                         clu_no = HAMMER_ALIST_BLOCK_NONE;
1624                         break;
1625                 }
1626                 hammer_modify_volume(volume);
1627                 if (clu_hint == -1) {
1628                         clu_hint = volume->clu_iterator;
1629                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1630                                                         clu_hint);
1631                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1632                                 clu_no = hammer_alist_alloc_fwd(&volume->alist,
1633                                                                 1, 0);
1634                         }
1635                 } else {
1636                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1637                                                         clu_hint);
1638                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1639                                 clu_no = hammer_alist_alloc_rev(&volume->alist,
1640                                                                 1, clu_hint);
1641                         }
1642                 }
1643                 if (clu_no != HAMMER_ALIST_BLOCK_NONE)
1644                         break;
1645                 hammer_rel_volume(volume, 0);
1646                 volume = NULL;
1647                 *errorp = ENOSPC;
1648                 vol_no = (vol_no + 1) % hmp->nvolumes;
1649                 clu_hint = -1;
1650         } while (vol_no != vol_beg);
1651
1652         /*
1653          * Acquire the cluster.  On success this will force *errorp to 0.
1654          */
1655         if (clu_no != HAMMER_ALIST_BLOCK_NONE) {
1656                 kprintf("ALLOC CLUSTER %d:%d\n", volume->vol_no, clu_no);
1657                 cluster = hammer_get_cluster(volume, clu_no, errorp,
1658                                              GET_CLUSTER_NEW);
1659                 volume->clu_iterator = clu_no;
1660                 hammer_rel_volume(volume, 0);
1661         } else {
1662                 cluster = NULL;
1663         }
1664         if (cluster)
1665                 hammer_lock_ex(&cluster->io.lock);
1666         return(cluster);
1667 }
1668
1669 void
1670 hammer_init_cluster(hammer_cluster_t cluster, hammer_base_elm_t left_bound, 
1671                     hammer_base_elm_t right_bound)
1672 {
1673         hammer_cluster_ondisk_t ondisk = cluster->ondisk;
1674
1675         hammer_modify_cluster(cluster);
1676         ondisk->clu_btree_beg = *left_bound;
1677         ondisk->clu_btree_end = *right_bound;
1678         cluster->clu_btree_beg = ondisk->clu_btree_beg;
1679         cluster->clu_btree_end = ondisk->clu_btree_end;
1680 }
1681
1682 /*
1683  * Deallocate a cluster
1684  */
1685 void
1686 hammer_free_cluster(hammer_cluster_t cluster)
1687 {
1688         hammer_modify_volume(cluster->volume);
1689         hammer_alist_free(&cluster->volume->alist, cluster->clu_no, 1);
1690 }
1691
1692 /*
1693  * Allocate HAMMER elements - btree nodes, data storage, and record elements
1694  *
1695  * The passed *bufferp should be initialized to NULL.  On successive calls
1696  * *bufferp caches the most recent buffer used until put away by the caller.
1697  * Note that previously returned pointers using the cached buffer become
1698  * invalid on successive calls which reuse *bufferp.
1699  *
1700  * All allocations first attempt to use the block found at the specified
1701  * iterator.  If that fails the first available block is used.  If that
1702  * fails a new buffer is allocated and associated with the buffer type
1703  * A-list and the element is allocated out of the new buffer.
1704  *
1705  * This function also ensures that the required minimum number of buffers is
1706  * reserved to guarantee that recovery operations succeed.
1707  */
1708
1709 hammer_node_t
1710 hammer_alloc_btree(hammer_cluster_t cluster, int *errorp)
1711 {
1712         hammer_buffer_t buffer;
1713         hammer_alist_t live;
1714         hammer_node_t node;
1715         int32_t elm_no;
1716         int32_t buf_no;
1717         int32_t node_offset;
1718         int32_t n;
1719
1720         hammer_modify_cluster(cluster);
1721         buffer = NULL;
1722         live = &cluster->alist_btree;
1723
1724         /*
1725          * If we aren't recovering then ensure the required minimum
1726          * reservation is met. XXX if the recovery code packs the B-Tree
1727          * we don't have to do this.
1728          *
1729          * Calculate the number of buffers needed to hold the B-Tree.
1730          */
1731         if (cluster->io.validated) {
1732                 n = (cluster->ondisk->stat_records * 3 / 
1733                     HAMMER_BTREE_INT_ELMS / HAMMER_BTREE_NODES) + 1;
1734                 if (cluster->ondisk->stat_idx_bufs < n) {
1735                         kprintf("hammer_alloc_btree: %d/%d buffers\n",
1736                                 cluster->ondisk->stat_idx_bufs, n);
1737                 }
1738                 while (cluster->ondisk->stat_idx_bufs < n) {
1739                         alloc_new_buffer(cluster, HAMMER_FSBUF_BTREE, live,
1740                                          cluster->ondisk->idx_index, errorp,
1741                                          &buffer);
1742                         if (*errorp) {
1743                                 if (buffer)
1744                                         hammer_rel_buffer(buffer, 0);
1745                                 return(NULL);
1746                         }
1747                 }
1748         }
1749
1750
1751         /*
1752          * Allocate a B-Tree element
1753          */
1754         elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index);
1755         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1756                 elm_no = hammer_alist_alloc_fwd(live, 1, 0);
1757         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1758                 alloc_new_buffer(cluster, HAMMER_FSBUF_BTREE, live,
1759                                  cluster->ondisk->idx_index, errorp, &buffer);
1760                 elm_no = hammer_alist_alloc(live, 1);
1761                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1762                         *errorp = ENOSPC;
1763                         if (buffer)
1764                                 hammer_rel_buffer(buffer, 0);
1765                         return(NULL);
1766                 }
1767         }
1768         cluster->ondisk->idx_index = elm_no;
1769         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_BTREE_NODES);
1770
1771         /*
1772          * Load and return the B-Tree element
1773          */
1774         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1775         node_offset = buf_no * HAMMER_BUFSIZE +
1776                       offsetof(union hammer_fsbuf_ondisk,
1777                                btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]);
1778         node = hammer_get_node(cluster, node_offset, errorp);
1779         if (node) {
1780                 hammer_modify_node(node);
1781                 bzero(node->ondisk, sizeof(*node->ondisk));
1782         } else {
1783                 hammer_alist_free(live, elm_no, 1);
1784                 hammer_rel_node(node);
1785                 node = NULL;
1786         }
1787         if (buffer)
1788                 hammer_rel_buffer(buffer, 0);
1789         return(node);
1790 }
1791
1792 void *
1793 hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes,
1794                   int *errorp, struct hammer_buffer **bufferp)
1795 {
1796         hammer_buffer_t buffer;
1797         hammer_alist_t live;
1798         int32_t elm_no;
1799         int32_t buf_no;
1800         int32_t nblks;
1801         void *item;
1802
1803         /*
1804          * Deal with large data blocks.  The blocksize is HAMMER_BUFSIZE
1805          * for these allocations.
1806          */
1807         hammer_modify_cluster(cluster);
1808         if ((bytes & HAMMER_BUFMASK) == 0) {
1809                 nblks = bytes / HAMMER_BUFSIZE;
1810                 /* only one block allowed for now (so buffer can hold it) */
1811                 KKASSERT(nblks == 1);
1812
1813                 buf_no = hammer_alloc_master(cluster, nblks,
1814                                              cluster->ondisk->idx_ldata, 1);
1815                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1816                         *errorp = ENOSPC;
1817                         return(NULL);
1818                 }
1819                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, nblks);
1820                 cluster->ondisk->idx_ldata = buf_no;
1821                 buffer = *bufferp;
1822                 *bufferp = hammer_get_buffer(cluster, buf_no, -1, errorp);
1823                 if (buffer)
1824                         hammer_rel_buffer(buffer, 0);
1825                 buffer = *bufferp;
1826                 return(buffer->ondisk);
1827         }
1828
1829         /*
1830          * Allocate a data element.  The block size is HAMMER_DATA_BLKSIZE
1831          * (64 bytes) for these allocations.
1832          */
1833         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1834         nblks /= HAMMER_DATA_BLKSIZE;
1835         live = &cluster->alist_mdata;
1836         elm_no = hammer_alist_alloc_fwd(live, nblks, cluster->ondisk->idx_data);
1837         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1838                 elm_no = hammer_alist_alloc_fwd(live, nblks, 0);
1839         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1840                 alloc_new_buffer(cluster, HAMMER_FSBUF_DATA, live,
1841                                  cluster->ondisk->idx_data, errorp, bufferp);
1842                 elm_no = hammer_alist_alloc(live, nblks);
1843                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1844                         *errorp = ENOSPC;
1845                         return(NULL);
1846                 }
1847         }
1848         cluster->ondisk->idx_index = elm_no;
1849
1850         /*
1851          * Load and return the B-Tree element
1852          */
1853         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1854         buffer = *bufferp;
1855         if (buffer == NULL || buffer->cluster != cluster ||
1856             buffer->buf_no != buf_no) {
1857                 if (buffer)
1858                         hammer_rel_buffer(buffer, 0);
1859                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1860                 *bufferp = buffer;
1861         }
1862         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_DATA);
1863         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_DATA_NODES);
1864         hammer_modify_buffer(buffer);
1865         item = &buffer->ondisk->data.data[elm_no & HAMMER_FSBUF_BLKMASK];
1866         bzero(item, nblks * HAMMER_DATA_BLKSIZE);
1867         *errorp = 0;
1868         return(item);
1869 }
1870
1871 void *
1872 hammer_alloc_record(hammer_cluster_t cluster, int *errorp,
1873                     u_int8_t rec_type, struct hammer_buffer **bufferp)
1874 {
1875         hammer_buffer_t buffer;
1876         hammer_alist_t live;
1877         int32_t elm_no;
1878         int32_t buf_no;
1879         void *item;
1880
1881         /*
1882          * Allocate a record element
1883          */
1884         hammer_modify_cluster(cluster);
1885         live = &cluster->alist_record;
1886         elm_no = hammer_alist_alloc_rev(live, 1, cluster->ondisk->idx_record);
1887         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1888                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1889         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1890                 alloc_new_buffer(cluster, HAMMER_FSBUF_RECORDS, live,
1891                                  cluster->ondisk->idx_record, errorp, bufferp);
1892                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1893                 kprintf("hammer_alloc_record elm again %08x\n", elm_no);
1894                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1895                         *errorp = ENOSPC;
1896                         return(NULL);
1897                 }
1898         }
1899         cluster->ondisk->idx_record = elm_no;
1900
1901         /*
1902          * Load and return the record element
1903          */
1904         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1905         buffer = *bufferp;
1906         if (buffer == NULL || buffer->cluster != cluster ||
1907             buffer->buf_no != buf_no) {
1908                 if (buffer)
1909                         hammer_rel_buffer(buffer, 0);
1910                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1911                 *bufferp = buffer;
1912         }
1913         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_RECORDS);
1914         KASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_RECORD_NODES,
1915                 ("elm_no %d (%d) out of bounds", elm_no, elm_no & HAMMER_FSBUF_BLKMASK));
1916         hammer_modify_buffer(buffer);
1917         item = &buffer->ondisk->record.recs[elm_no & HAMMER_FSBUF_BLKMASK];
1918         bzero(item, sizeof(union hammer_record_ondisk));
1919         *errorp = 0;
1920         ++cluster->ondisk->stat_records;
1921         if (rec_type == HAMMER_RECTYPE_CLUSTER)
1922                 ++cluster->ondisk->stat_records;
1923         return(item);
1924 }
1925
1926 void
1927 hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes)
1928 {
1929         int32_t elm_no;
1930         int32_t nblks;
1931         hammer_alist_t live;
1932
1933         hammer_modify_cluster(buffer->cluster);
1934         if ((bytes & HAMMER_BUFMASK) == 0) {
1935                 nblks = bytes / HAMMER_BUFSIZE;
1936                 KKASSERT(nblks == 1 && data == (void *)buffer->ondisk);
1937                 hammer_alist_free(&buffer->cluster->alist_master,
1938                                   buffer->buf_no, nblks);
1939                 hammer_adjust_stats(buffer->cluster, HAMMER_FSBUF_DATA, -nblks);
1940                 return;
1941         }
1942
1943         elm_no = ((char *)data - (char *)buffer->ondisk->data.data) /
1944                  HAMMER_DATA_BLKSIZE;
1945         KKASSERT(elm_no >= 0 && elm_no < HAMMER_DATA_NODES);
1946         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1947         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1948         nblks /= HAMMER_DATA_BLKSIZE;
1949         live = &buffer->cluster->alist_mdata;
1950         hammer_alist_free(live, elm_no, nblks);
1951 }
1952
1953 void
1954 hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec,
1955                         u_int8_t rec_type)
1956 {
1957         int32_t elm_no;
1958         hammer_alist_t live;
1959
1960         hammer_modify_cluster(buffer->cluster);
1961         elm_no = rec - &buffer->ondisk->record.recs[0];
1962         KKASSERT(elm_no >= 0 && elm_no < HAMMER_BTREE_NODES);
1963         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1964         live = &buffer->cluster->alist_record;
1965         hammer_alist_free(live, elm_no, 1);
1966         --buffer->cluster->ondisk->stat_records;
1967         if (rec_type == HAMMER_RECTYPE_CLUSTER)
1968                 --buffer->cluster->ondisk->stat_records;
1969 }
1970
1971 void
1972 hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset)
1973 {
1974         const int32_t blksize = sizeof(struct hammer_node_ondisk);
1975         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1976         hammer_alist_t live;
1977         int32_t elm_no;
1978
1979         hammer_modify_cluster(cluster);
1980         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1981         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, btree.nodes[0]);
1982         live = &cluster->alist_btree;
1983         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1984         elm_no += fsbuf_offset / blksize;
1985         hammer_alist_free(live, elm_no, 1);
1986 }
1987
1988 void
1989 hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes)
1990 {
1991         const int32_t blksize = HAMMER_DATA_BLKSIZE;
1992         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1993         hammer_alist_t live;
1994         int32_t elm_no;
1995         int32_t buf_no;
1996         int32_t nblks;
1997
1998         hammer_modify_cluster(cluster);
1999         if ((bytes & HAMMER_BUFMASK) == 0) {
2000                 nblks = bytes / HAMMER_BUFSIZE;
2001                 KKASSERT(nblks == 1 && (bclu_offset & HAMMER_BUFMASK) == 0);
2002                 buf_no = bclu_offset / HAMMER_BUFSIZE;
2003                 hammer_alist_free(&cluster->alist_master, buf_no, nblks);
2004                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, -nblks);
2005                 return;
2006         }
2007
2008         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2009         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, data.data[0][0]);
2010         live = &cluster->alist_mdata;
2011         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
2012         nblks /= HAMMER_DATA_BLKSIZE;
2013         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2014         elm_no += fsbuf_offset / blksize;
2015         hammer_alist_free(live, elm_no, nblks);
2016 }
2017
2018 void
2019 hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset,
2020                    u_int8_t rec_type)
2021 {
2022         const int32_t blksize = sizeof(union hammer_record_ondisk);
2023         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2024         hammer_alist_t live;
2025         int32_t elm_no;
2026
2027         hammer_modify_cluster(cluster);
2028         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2029         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, record.recs[0]);
2030         live = &cluster->alist_record;
2031         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2032         elm_no += fsbuf_offset / blksize;
2033         hammer_alist_free(live, elm_no, 1);
2034         --cluster->ondisk->stat_records;
2035         if (rec_type == HAMMER_RECTYPE_CLUSTER)
2036                 --cluster->ondisk->stat_records;
2037 }
2038
2039
2040 /*
2041  * Allocate a new filesystem buffer and assign it to the specified
2042  * filesystem buffer type.  The new buffer will be added to the
2043  * type-specific A-list and initialized.
2044  */
2045 static void
2046 alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, hammer_alist_t live,
2047                  int start, int *errorp, struct hammer_buffer **bufferp)
2048 {
2049         hammer_buffer_t buffer;
2050         int32_t buf_no;
2051         int32_t base_blk;
2052         int isfwd;
2053
2054         if (*bufferp)
2055                 hammer_rel_buffer(*bufferp, 0);
2056         *bufferp = NULL;
2057
2058         start = start / HAMMER_FSBUF_MAXBLKS;   /* convert to buf_no */
2059         isfwd = (type != HAMMER_FSBUF_RECORDS);
2060         buf_no = hammer_alloc_master(cluster, 1, start, isfwd);
2061         if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2062                 *errorp = ENOSPC;
2063                 return;
2064         }
2065
2066         /*
2067          * The new buffer must be initialized (type != 0) regardless of
2068          * whether we already have it cached or not, so don't try to
2069          * optimize the cached buffer check.  Just call hammer_get_buffer().
2070          */
2071         buffer = hammer_get_buffer(cluster, buf_no, type, errorp);
2072         *bufferp = buffer;
2073
2074         /*
2075          * Do a meta-free of the buffer's elements into the type-specific
2076          * A-list and update our statistics to reflect the allocation.
2077          */
2078         if (buffer) {
2079 #if 0
2080                 kprintf("alloc_new_buffer buf_no %d type %016llx nelms %d\n",
2081                         buf_no, type, nelements);
2082 #endif
2083                 hammer_modify_buffer(buffer);  /*XXX*/
2084                 hammer_adjust_stats(cluster, type, 1);
2085
2086                 /*
2087                  * Free the buffer to the appropriate slave list so the
2088                  * cluster-based allocator sees it.
2089                  */
2090                 base_blk = buf_no * HAMMER_FSBUF_MAXBLKS;
2091
2092                 switch(type) {
2093                 case HAMMER_FSBUF_BTREE:
2094                         hammer_alist_free(live, base_blk, HAMMER_BTREE_NODES);
2095                         break;
2096                 case HAMMER_FSBUF_DATA:
2097                         hammer_alist_free(live, base_blk, HAMMER_DATA_NODES);
2098                         break;
2099                 case HAMMER_FSBUF_RECORDS:
2100                         hammer_alist_free(live, base_blk, HAMMER_RECORD_NODES);
2101                         break;
2102                 }
2103         }
2104 }
2105
2106 /*
2107  * Sync dirty buffers to the media
2108  */
2109
2110 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
2111 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
2112
2113 int
2114 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
2115 {
2116         struct hammer_sync_info info;
2117
2118         info.error = 0;
2119         info.waitfor = waitfor;
2120
2121         vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
2122                       hammer_sync_scan1, hammer_sync_scan2, &info);
2123
2124         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
2125                 hammer_sync_volume, &info);
2126         return(info.error);
2127 }
2128
2129 static int
2130 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
2131 {
2132         struct hammer_inode *ip;
2133
2134         ip = VTOI(vp);
2135         if (vp->v_type == VNON || ip == NULL ||
2136             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2137              RB_EMPTY(&vp->v_rbdirty_tree))) {
2138                 return(-1);
2139         }
2140         return(0);
2141 }
2142
2143 static int
2144 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2145 {
2146         struct hammer_sync_info *info = data;
2147         struct hammer_inode *ip;
2148         int error;
2149
2150         ip = VTOI(vp);
2151         if (vp->v_type == VNON || vp->v_type == VBAD ||
2152             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2153              RB_EMPTY(&vp->v_rbdirty_tree))) {
2154                 return(0);
2155         }
2156         if (vp->v_type != VCHR) {
2157                 error = VOP_FSYNC(vp, info->waitfor);
2158                 if (error)
2159                         info->error = error;
2160         }
2161         return(0);
2162 }
2163
2164 int
2165 hammer_sync_volume(hammer_volume_t volume, void *data)
2166 {
2167         struct hammer_sync_info *info = data;
2168
2169         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
2170                 hammer_sync_cluster, info);
2171         if (hammer_ref_volume(volume) == 0)
2172                 hammer_rel_volume(volume, 1);
2173         return(0);
2174 }
2175
2176 int
2177 hammer_sync_cluster(hammer_cluster_t cluster, void *data)
2178 {
2179         struct hammer_sync_info *info = data;
2180
2181         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
2182                 hammer_sync_buffer, info);
2183         /*hammer_io_waitdep(&cluster->io);*/
2184         if (hammer_ref_cluster(cluster) == 0)
2185                 hammer_rel_cluster(cluster, 1);
2186         return(0);
2187 }
2188
2189 int
2190 hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
2191 {
2192         if (hammer_ref_buffer(buffer) == 0)
2193                 hammer_rel_buffer(buffer, 1);
2194         return(0);
2195 }
2196
2197 /*
2198  * Generic buffer initialization.  Initialize the A-list into an all-allocated
2199  * state with the free block limit properly set.
2200  *
2201  * Note that alloc_new_buffer() will free the appropriate block range via
2202  * the appropriate cluster alist, so the free count is properly propogated.
2203  */
2204 void
2205 hammer_initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type)
2206 {
2207         head->buf_type = type;
2208
2209         switch(type) {
2210         case HAMMER_FSBUF_BTREE:
2211                 hammer_alist_init(live, 0, HAMMER_BTREE_NODES,
2212                                   HAMMER_ASTATE_ALLOC);
2213                 break;
2214         case HAMMER_FSBUF_DATA:
2215                 hammer_alist_init(live, 0, HAMMER_DATA_NODES,
2216                                   HAMMER_ASTATE_ALLOC);
2217                 break;
2218         case HAMMER_FSBUF_RECORDS:
2219                 hammer_alist_init(live, 0, HAMMER_RECORD_NODES,
2220                                   HAMMER_ASTATE_ALLOC);
2221                 break;
2222         default:
2223                 hammer_alist_init(live, 0, 0, HAMMER_ASTATE_ALLOC);
2224                 break;
2225         }
2226 }
2227
2228 /*
2229  * Calculate the cluster's offset in the volume.  This calculation is
2230  * slightly more complex when using superclusters because superclusters
2231  * are grouped in blocks of 16, followed by 16 x N clusters where N
2232  * is the number of clusters a supercluster can manage.
2233  */
2234 static int64_t
2235 calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no)
2236 {
2237         int32_t scl_group;
2238         int64_t scl_group_size;
2239         int64_t off;
2240
2241         if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
2242                 scl_group = clu_no / HAMMER_VOL_SUPERCLUSTER_GROUP /
2243                             HAMMER_SCL_MAXCLUSTERS;
2244                 scl_group_size = 
2245                             ((int64_t)HAMMER_BUFSIZE *
2246                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2247                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2248                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2249                 scl_group_size += 
2250                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2251
2252                 off = volume->cluster_base +
2253                       scl_group * scl_group_size +
2254                       (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
2255                       ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS *
2256                        HAMMER_VOL_SUPERCLUSTER_GROUP))
2257                       * volume->vol_clsize;
2258         } else {
2259                 off = volume->cluster_base +
2260                       (int64_t)clu_no * volume->vol_clsize;
2261         }
2262         return(off);
2263 }
2264
2265 /*
2266  * Calculate a super-cluster's offset in the volume.
2267  */
2268 static int64_t
2269 calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no)
2270 {
2271         int64_t off;
2272         int32_t scl_group;
2273         int64_t scl_group_size;
2274
2275         KKASSERT (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL);
2276         scl_group = scl_no / HAMMER_VOL_SUPERCLUSTER_GROUP;
2277         if (scl_group) {
2278                 scl_group_size = 
2279                             ((int64_t)HAMMER_BUFSIZE *
2280                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2281                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2282                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2283                 scl_group_size += 
2284                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2285                 off = volume->cluster_base + (scl_group * scl_group_size) +
2286                       (scl_no % HAMMER_VOL_SUPERCLUSTER_GROUP) * HAMMER_BUFSIZE;
2287         } else {
2288                 off = volume->cluster_base + (scl_no * HAMMER_BUFSIZE);
2289         }
2290         return(off);
2291 }
2292
2293 /*
2294  * Allocate nblks buffers from the cluster's master alist.
2295  */
2296 static int32_t
2297 hammer_alloc_master(hammer_cluster_t cluster, int nblks,
2298                     int32_t start, int isfwd)
2299 {
2300         int32_t buf_no;
2301
2302         hammer_modify_cluster(cluster);
2303         if (isfwd) {
2304                 buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2305                                                 nblks, start);
2306                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2307                         buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2308                                                 nblks, 0);
2309                 }
2310         } else {
2311                 buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2312                                                 nblks, start);
2313                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2314                         buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2315                                                 nblks, HAMMER_ALIST_BLOCK_MAX);
2316                 }
2317         }
2318
2319         /*
2320          * Recover space from empty record, b-tree, and data a-lists.
2321          */
2322
2323         return(buf_no);
2324 }
2325
2326 /*
2327  * Adjust allocation statistics
2328  */
2329 static void
2330 hammer_adjust_stats(hammer_cluster_t cluster, u_int64_t buf_type, int nblks)
2331 {
2332         hammer_modify_cluster(cluster);
2333         hammer_modify_volume(cluster->volume);
2334         hammer_modify_volume(cluster->volume->hmp->rootvol);
2335
2336         switch(buf_type) {
2337         case HAMMER_FSBUF_BTREE:
2338                 cluster->ondisk->stat_idx_bufs += nblks;
2339                 cluster->volume->ondisk->vol_stat_idx_bufs += nblks;
2340                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_idx_bufs += nblks;
2341                 break;
2342         case HAMMER_FSBUF_DATA:
2343                 cluster->ondisk->stat_data_bufs += nblks;
2344                 cluster->volume->ondisk->vol_stat_data_bufs += nblks;
2345                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_data_bufs += nblks;
2346                 break;
2347         case HAMMER_FSBUF_RECORDS:
2348                 cluster->ondisk->stat_rec_bufs += nblks;
2349                 cluster->volume->ondisk->vol_stat_rec_bufs += nblks;
2350                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_rec_bufs += nblks;
2351                 break;
2352         }
2353 }
2354
2355 /*
2356  * A-LIST SUPPORT
2357  *
2358  * Setup the parameters for the various A-lists we use in hammer.  The
2359  * supercluster A-list must be chained to the cluster A-list and cluster
2360  * slave A-lists are chained to buffer A-lists.
2361  *
2362  * See hammer_init_alist_config() below.
2363  */
2364
2365 /*
2366  * A-LIST - cluster recursion into a filesystem buffer
2367  *
2368  * In the init case the buffer has already been initialized by
2369  * alloc_new_buffer() when it allocated the buffer out of the master
2370  * alist and marked it as free in the slave alist.
2371  *
2372  * Because we use a somewhat odd mechanism to assign buffers to slave
2373  * pools we can't actually free the buffer back to the master alist in
2374  * buffer_alist_destroy(), but instead must deal with that logic somewhere
2375  * else.
2376  */
2377 static int
2378 buffer_alist_init(void *info, int32_t blk, int32_t radix,
2379                   hammer_alloc_state_t state)
2380 {
2381         return(0);
2382 }
2383
2384 /*
2385  * Note: This routine is only called when freeing the last elements of
2386  * an initialized buffer.  Freeing all elements of the buffer when the
2387  * buffer was not previously initialized does not call this routine.
2388  */
2389 static int
2390 buffer_alist_destroy(void *info, int32_t blk, int32_t radix)
2391 {
2392         hammer_cluster_t cluster = info;
2393         int32_t buf_no;
2394
2395         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2396         kprintf("destroy buffer %d:%d:%d\n", cluster->volume->vol_no, cluster->clu_no, buf_no);
2397         return (0);
2398 }
2399
2400 /*
2401  * Note: atblk can be negative and atblk - blk can go negative.
2402  */
2403 static int
2404 buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2405                       int32_t count, int32_t atblk, int32_t *fullp)
2406 {
2407         hammer_cluster_t cluster = info;
2408         hammer_buffer_t buffer;
2409         int32_t buf_no;
2410         int32_t r;
2411         int error = 0;
2412
2413         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2414         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2415         if (buffer) {
2416                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2417
2418                 hammer_modify_buffer(buffer);
2419                 r = hammer_alist_alloc_fwd(&buffer->alist, count, atblk - blk);
2420                 if (r != HAMMER_ALIST_BLOCK_NONE)
2421                         r += blk;
2422                 *fullp = hammer_alist_isfull(&buffer->alist);
2423                 hammer_rel_buffer(buffer, 0);
2424         } else {
2425                 r = HAMMER_ALIST_BLOCK_NONE;
2426                 *fullp = 1;
2427         }
2428         return(r);
2429 }
2430
2431 static int
2432 buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2433                       int32_t count, int32_t atblk, int32_t *fullp)
2434 {
2435         hammer_cluster_t cluster = info;
2436         hammer_buffer_t buffer;
2437         int32_t buf_no;
2438         int32_t r;
2439         int error = 0;
2440
2441         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2442         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2443         if (buffer) {
2444                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2445                 hammer_modify_buffer(buffer);
2446                 r = hammer_alist_alloc_rev(&buffer->alist, count, atblk - blk);
2447                 if (r != HAMMER_ALIST_BLOCK_NONE)
2448                         r += blk;
2449                 *fullp = hammer_alist_isfull(&buffer->alist);
2450                 hammer_rel_buffer(buffer, 0);
2451         } else {
2452                 r = HAMMER_ALIST_BLOCK_NONE;
2453                 *fullp = 1;
2454         }
2455         return(r);
2456 }
2457
2458 static void
2459 buffer_alist_free(void *info, int32_t blk, int32_t radix,
2460                  int32_t base_blk, int32_t count, int32_t *emptyp)
2461 {
2462         hammer_cluster_t cluster = info;
2463         hammer_buffer_t buffer;
2464         int32_t buf_no;
2465         int error = 0;
2466
2467         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2468         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2469         if (buffer) {
2470                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2471                 hammer_modify_buffer(buffer);
2472                 hammer_alist_free(&buffer->alist, base_blk, count);
2473                 *emptyp = hammer_alist_isempty(&buffer->alist);
2474                 hammer_rel_buffer(buffer, 0);
2475         } else {
2476                 *emptyp = 0;
2477         }
2478 }
2479
2480 static int32_t
2481 buffer_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk,
2482                   int flags)
2483 {
2484         hammer_cluster_t cluster = info;
2485         hammer_buffer_t buffer;
2486         int32_t buf_no;
2487         int32_t maxblks;
2488         int error = 0;
2489
2490         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2491         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2492         if (buffer) {
2493                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2494                 switch(buffer->ondisk->head.buf_type) {
2495                 case HAMMER_FSBUF_RECORDS:
2496                         maxblks = HAMMER_RECORD_NODES;
2497                         break;
2498                 case HAMMER_FSBUF_BTREE:
2499                         maxblks = HAMMER_BTREE_NODES;
2500                         break;
2501                 case HAMMER_FSBUF_DATA:
2502                         maxblks = HAMMER_DATA_NODES;
2503                         break;
2504                 default:
2505                         panic("buffer_alist_find: unknown buffer type");
2506                         maxblks = 0;
2507                         break;
2508                 }
2509                 blk = hammer_alist_find(&buffer->alist, atblk - blk, maxblks,
2510                                         flags);
2511                 hammer_rel_buffer(buffer, 0);
2512         } else {
2513                 blk = HAMMER_ALIST_BLOCK_NONE;
2514         }
2515         return(blk);
2516 }
2517
2518 static void
2519 buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2520 {
2521 }
2522
2523 /*
2524  * A-LIST - super-cluster recursion into a cluster and cluster recursion
2525  * into a filesystem buffer.  A-List's are mostly self-contained entities,
2526  * but callbacks must be installed to recurse from one A-List to another.
2527  *
2528  * Implementing these callbacks allows us to operate a multi-layered A-List
2529  * as a single entity.
2530  */
2531
2532 /*
2533  * This occurs when allocating a cluster via the volume a-list and the
2534  * entry in the volume a-list indicated all-free.  The underlying supercl
2535  * has not yet been initialized.
2536  */
2537 static int
2538 super_alist_init(void *info, int32_t blk, int32_t radix,
2539                  hammer_alloc_state_t state)
2540 {
2541         hammer_volume_t volume = info;
2542         hammer_supercl_t supercl;
2543         int32_t scl_no;
2544         int error = 0;
2545
2546         /*
2547          * Calculate the super-cluster number containing the cluster (blk)
2548          * and obtain the super-cluster buffer.
2549          */
2550         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2551         supercl = hammer_get_supercl(volume, scl_no, &error, state);
2552         if (supercl)
2553                 hammer_rel_supercl(supercl, 0);
2554         return (error);
2555 }
2556
2557 static int
2558 super_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
2559 {
2560         hammer_volume_t volume = info;
2561         hammer_supercl_t supercl;
2562         int32_t scl_no;
2563         int error = 0;
2564
2565         /*
2566          * Calculate the super-cluster number containing the cluster (blk)
2567          * and obtain the super-cluster buffer.
2568          */
2569         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2570         supercl = hammer_get_supercl(volume, scl_no, &error,
2571                                      HAMMER_ASTATE_NONE);
2572         if (supercl) {
2573                 hammer_modify_supercl(supercl);
2574                 error = hammer_alist_recover(&supercl->alist, blk, 0, count);
2575                 /* free block count is returned if >= 0 */
2576                 hammer_rel_supercl(supercl, 0);
2577         } else {
2578                 error = -error;
2579         }
2580         return (error);
2581 }
2582
2583 /*
2584  * This occurs when freeing a cluster via the volume a-list and the
2585  * supercl is now 100% free.  We can destroy the supercl.
2586  *
2587  * What we actually do is just unset the modify bit so it doesn't get
2588  * written out.
2589  */
2590 static int
2591 super_alist_destroy(void *info, int32_t blk, int32_t radix)
2592 {
2593         hammer_volume_t volume = info;
2594         hammer_supercl_t supercl;
2595         int32_t scl_no;
2596         int error = 0;
2597
2598         /*
2599          * Calculate the super-cluster number containing the cluster (blk)
2600          * and obtain the super-cluster buffer.
2601          */
2602         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2603         if (hammer_find_supercl(volume, scl_no)) {
2604                 supercl = hammer_get_supercl(volume, scl_no, &error,
2605                                              HAMMER_ASTATE_FREE);
2606                                              /* XXX */
2607                 if (supercl) {
2608                         hammer_io_clear_modify(&supercl->io);
2609                         hammer_rel_supercl(supercl, 0);
2610                 }
2611         }
2612         return (error);
2613 }
2614
2615 static int
2616 super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2617                       int32_t count, int32_t atblk, int32_t *fullp)
2618 {
2619         hammer_volume_t volume = info;
2620         hammer_supercl_t supercl;
2621         int32_t scl_no;
2622         int32_t r;
2623         int error = 0;
2624
2625         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2626         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2627         if (supercl) {
2628                 hammer_modify_supercl(supercl);
2629                 r = hammer_alist_alloc_fwd(&supercl->alist, count, atblk - blk);
2630                 if (r != HAMMER_ALIST_BLOCK_NONE)
2631                         r += blk;
2632                 *fullp = hammer_alist_isfull(&supercl->alist);
2633                 hammer_rel_supercl(supercl, 0);
2634         } else {
2635                 r = HAMMER_ALIST_BLOCK_NONE;
2636                 *fullp = 1;
2637         }
2638         return(r);
2639 }
2640
2641 static int
2642 super_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2643                       int32_t count, int32_t atblk, int32_t *fullp)
2644 {
2645         hammer_volume_t volume = info;
2646         hammer_supercl_t supercl;
2647         int32_t scl_no;
2648         int32_t r;
2649         int error = 0;
2650
2651         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2652         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2653         if (supercl) {
2654                 hammer_modify_supercl(supercl);
2655                 r = hammer_alist_alloc_rev(&supercl->alist, count, atblk - blk);
2656                 if (r != HAMMER_ALIST_BLOCK_NONE)
2657                         r += blk;
2658                 *fullp = hammer_alist_isfull(&supercl->alist);
2659                 hammer_rel_supercl(supercl, 0);
2660         } else { 
2661                 r = HAMMER_ALIST_BLOCK_NONE;
2662                 *fullp = 1;
2663         }
2664         return(r);
2665 }
2666
2667 static void
2668 super_alist_free(void *info, int32_t blk, int32_t radix,
2669                  int32_t base_blk, int32_t count, int32_t *emptyp)
2670 {
2671         hammer_volume_t volume = info;
2672         hammer_supercl_t supercl;
2673         int32_t scl_no;
2674         int error = 0;
2675
2676         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2677         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2678         if (supercl) {
2679                 hammer_modify_supercl(supercl);
2680                 hammer_alist_free(&supercl->alist, base_blk, count);
2681                 *emptyp = hammer_alist_isempty(&supercl->alist);
2682                 hammer_rel_supercl(supercl, 0);
2683         } else {
2684                 *emptyp = 0;
2685         }
2686 }
2687
2688 static int32_t
2689 super_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk,
2690                   int flags)
2691 {
2692         hammer_volume_t volume = info;
2693         hammer_supercl_t supercl;
2694         int32_t scl_no;
2695         int32_t nclusters;
2696         int error = 0;
2697
2698         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2699         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2700         if (supercl) {
2701                 nclusters = supercl->volume->ondisk->vol_nclusters -
2702                             ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS);
2703                 KKASSERT(nclusters > 0);
2704                 if (nclusters > HAMMER_SCL_MAXCLUSTERS)
2705                         nclusters = HAMMER_SCL_MAXCLUSTERS;
2706                 blk = hammer_alist_find(&supercl->alist, atblk - blk,
2707                                         nclusters, flags);
2708                 hammer_rel_supercl(supercl, 0);
2709         } else {
2710                 blk = HAMMER_ALIST_BLOCK_NONE;
2711         }
2712         return(blk);
2713 }
2714
2715 static void
2716 super_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2717 {
2718 }
2719
2720 void
2721 hammer_init_alist_config(void)
2722 {
2723         hammer_alist_config_t config;
2724
2725         hammer_alist_template(&Buf_alist_config, HAMMER_FSBUF_MAXBLKS,
2726                               1, HAMMER_FSBUF_METAELMS, 0);
2727         hammer_alist_template(&Vol_normal_alist_config, HAMMER_VOL_MAXCLUSTERS,
2728                               1, HAMMER_VOL_METAELMS_1LYR, 0);
2729         hammer_alist_template(&Vol_super_alist_config,
2730                           HAMMER_VOL_MAXSUPERCLUSTERS * HAMMER_SCL_MAXCLUSTERS,
2731                               HAMMER_SCL_MAXCLUSTERS, HAMMER_VOL_METAELMS_2LYR,
2732                               0);
2733         hammer_alist_template(&Supercl_alist_config, HAMMER_VOL_MAXCLUSTERS,
2734                               1, HAMMER_SUPERCL_METAELMS, 0);
2735         hammer_alist_template(&Clu_master_alist_config, HAMMER_CLU_MAXBUFFERS,
2736                               1, HAMMER_CLU_MASTER_METAELMS, 0);
2737         hammer_alist_template(&Clu_slave_alist_config,
2738                               HAMMER_CLU_MAXBUFFERS * HAMMER_FSBUF_MAXBLKS,
2739                               HAMMER_FSBUF_MAXBLKS, HAMMER_CLU_SLAVE_METAELMS,
2740                               1);
2741
2742         config = &Vol_super_alist_config;
2743         config->bl_radix_init = super_alist_init;
2744         config->bl_radix_recover = super_alist_recover;
2745         config->bl_radix_destroy = super_alist_destroy;
2746         config->bl_radix_alloc_fwd = super_alist_alloc_fwd;
2747         config->bl_radix_alloc_rev = super_alist_alloc_rev;
2748         config->bl_radix_free = super_alist_free;
2749         config->bl_radix_find = super_alist_find;
2750         config->bl_radix_print = super_alist_print;
2751
2752         config = &Clu_slave_alist_config;
2753         config->bl_radix_init = buffer_alist_init;
2754         config->bl_radix_recover = buffer_alist_recover;
2755         config->bl_radix_destroy = buffer_alist_destroy;
2756         config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd;
2757         config->bl_radix_alloc_rev = buffer_alist_alloc_rev;
2758         config->bl_radix_free = buffer_alist_free;
2759         config->bl_radix_find = buffer_alist_find;
2760         config->bl_radix_print = buffer_alist_print;
2761 }
2762