HAMMER 25/many: get fsx (filesystem test) working, cleanup pass
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.26 2008/01/25 10:36:04 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_supercl(hammer_supercl_t supercl,
51                         hammer_alloc_state_t isnew);
52 static int hammer_load_cluster(hammer_cluster_t cluster, int getflags);
53 static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type);
54 static int hammer_load_node(hammer_node_t node);
55 static void alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type,
56                         hammer_alist_t live,
57                         int32_t start, int *errorp,
58                         struct hammer_buffer **bufferp);
59 #if 0
60 static void readhammerbuf(hammer_volume_t vol, void *data,
61                         int64_t offset);
62 static void writehammerbuf(hammer_volume_t vol, const void *data,
63                         int64_t offset);
64 #endif
65 static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no);
66 static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no);
67 static int32_t hammer_alloc_master(hammer_cluster_t cluster, int nblks,
68                         int32_t start, int isfwd);
69 static void hammer_adjust_stats(hammer_cluster_t cluster,
70                         u_int64_t buf_type, int nblks);
71
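/*
 * Shared A-list configuration templates.  Individual volumes,
 * super-clusters, clusters and buffers point their alist.config fields
 * at these rather than carrying private copies.
 */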
72 struct hammer_alist_config Buf_alist_config;
73 struct hammer_alist_config Vol_normal_alist_config;
74 struct hammer_alist_config Vol_super_alist_config;
75 struct hammer_alist_config Supercl_alist_config;
76 struct hammer_alist_config Clu_master_alist_config;
77 struct hammer_alist_config Clu_slave_alist_config;
78
79 /*
80  * Red-Black tree support for various structures
81  */
82 static int
83 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
84 {
85         if (ip1->obj_id < ip2->obj_id)
86                 return(-1);
87         if (ip1->obj_id > ip2->obj_id)
88                 return(1);
89         if (ip1->obj_asof < ip2->obj_asof)
90                 return(-1);
91         if (ip1->obj_asof > ip2->obj_asof)
92                 return(1);
93         return(0);
94 }
95
96 static int
97 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
98 {
99         if (info->obj_id < ip->obj_id)
100                 return(-1);
101         if (info->obj_id > ip->obj_id)
102                 return(1);
103         if (info->obj_asof < ip->obj_asof)
104                 return(-1);
105         if (info->obj_asof > ip->obj_asof)
106                 return(1);
107         return(0);
108 }
109
110 static int
111 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
112 {
113         if (vol1->vol_no < vol2->vol_no)
114                 return(-1);
115         if (vol1->vol_no > vol2->vol_no)
116                 return(1);
117         return(0);
118 }
119
120 static int
121 hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2)
122 {
123         if (cl1->scl_no < cl2->scl_no)
124                 return(-1);
125         if (cl1->scl_no > cl2->scl_no)
126                 return(1);
127         return(0);
128 }
129
130 static int
131 hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2)
132 {
133         if (cl1->clu_no < cl2->clu_no)
134                 return(-1);
135         if (cl1->clu_no > cl2->clu_no)
136                 return(1);
137         return(0);
138 }
139
140 static int
141 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
142 {
143         if (buf1->buf_no < buf2->buf_no)
144                 return(-1);
145         if (buf1->buf_no > buf2->buf_no)
146                 return(1);
147         return(0);
148 }
149
150 static int
151 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
152 {
153         if (node1->node_offset < node2->node_offset)
154                 return(-1);
155         if (node1->node_offset > node2->node_offset)
156                 return(1);
157         return(0);
158 }
159
160 /*
161  * Note: The lookup function for hammer_ino_rb_tree winds up being named
162  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
163  * functions are normal, e.g. hammer_clu_rb_tree_RB_LOOKUP(root, clu_no).
164  */
165 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
166 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
167                 hammer_inode_info_cmp, hammer_inode_info_t);
168 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
169              hammer_vol_rb_compare, int32_t, vol_no);
170 RB_GENERATE2(hammer_scl_rb_tree, hammer_supercl, rb_node,
171              hammer_scl_rb_compare, int32_t, scl_no);
172 RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node,
173              hammer_clu_rb_compare, int32_t, clu_no);
174 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
175              hammer_buf_rb_compare, int32_t, buf_no);
176 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
177              hammer_nod_rb_compare, int32_t, node_offset);
178
179 /************************************************************************
180  *                              VOLUMES                                 *
181  ************************************************************************
182  *
183  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
184  * code on failure.  Volumes must be loaded at mount time, get_volume() will
185  * not load a new volume.
186  *
187  * Calls made to hammer_load_volume() are single-threaded at this stage.
188  */
189 int
190 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
191 {
192         struct mount *mp;
193         hammer_volume_t volume;
194         struct hammer_volume_ondisk *ondisk;
195         struct nlookupdata nd;
196         struct buf *bp = NULL;
197         int error;
198         int ronly;
199
200         mp = hmp->mp;
201         ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
202
203         /*
204          * Allocate a volume structure
205          */
206         ++hammer_count_volumes;
207         volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
208         volume->vol_name = kstrdup(volname, M_HAMMER);
209         volume->hmp = hmp;
210         hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
211         volume->io.offset = 0LL;
212
213         /*
214          * Get the device vnode
215          */
216         error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
217         if (error == 0)
218                 error = nlookup(&nd);
219         if (error == 0)
220                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
221         nlookup_done(&nd);
222         if (error == 0) {
223                 vn_isdisk(volume->devvp, &error);
224         }
225         if (error == 0) {
226                 vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
227                 error = VOP_OPEN(volume->devvp, (ronly ? FREAD : FREAD|FWRITE),
228                                  FSCRED, NULL);
229                 vn_unlock(volume->devvp);
230         }
231         if (error) {
232                 hammer_free_volume(volume);
233                 return(error);
234         }
235
236         /*
237          * Extract the volume number from the volume header and do various
238          * sanity checks.
239          */
240         error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
241         if (error)
242                 goto late_failure;
243         ondisk = (void *)bp->b_data;
244         if (ondisk->head.buf_type != HAMMER_FSBUF_VOLUME) {
245                 kprintf("hammer_mount: volume %s has an invalid header\n",
246                         volume->vol_name);
247                 error = EFTYPE;
248                 goto late_failure;
249         }
250         volume->vol_no = ondisk->vol_no;
251         volume->cluster_base = ondisk->vol_clo_beg;
252         volume->vol_clsize = ondisk->vol_clsize;
253         volume->vol_flags = ondisk->vol_flags;
254         volume->nblocks = ondisk->vol_nblocks; 
255         RB_INIT(&volume->rb_clus_root);
256         RB_INIT(&volume->rb_scls_root);
257
258         hmp->mp->mnt_stat.f_blocks += volume->nblocks;
259
260         if (RB_EMPTY(&hmp->rb_vols_root)) {
261                 hmp->fsid = ondisk->vol_fsid;
262         } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
263                 kprintf("hammer_mount: volume %s's fsid does not match "
264                         "other volumes\n", volume->vol_name);
265                 error = EFTYPE;
266                 goto late_failure;
267         }
268
269         /*
270          * Insert the volume structure into the red-black tree.
271          */
272         if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
273                 kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
274                         volume->vol_name, volume->vol_no);
275                 error = EEXIST;
276         }
277
278         /*
279          * Set the root volume and load the root cluster.  HAMMER special
280          * cases rootvol and rootcl and will not deallocate the structures.
281          * We do not hold a ref because this would prevent related I/O
282          * from being flushed.
283          */
284         if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
285                 hmp->rootvol = volume;
286                 if (bp) {
287                         brelse(bp);
288                         bp = NULL;
289                 }
290                 hammer_ref_volume(volume);
291                 hmp->rootcl = hammer_get_cluster(volume,
292                                                  ondisk->vol0_root_clu_no,
293                                                  &error, GET_CLUSTER_NORECOVER);
294                 hammer_rel_cluster(hmp->rootcl, 0);
295                 hammer_rel_volume(volume, 0);
296                 hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
297         }
298 late_failure:
299         if (bp)
300                 brelse(bp);
301         if (error) {
302                 /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
303                 VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
304                 hammer_free_volume(volume);
305         }
306         return (error);
307 }
308
309 /*
310  * Unload and free a HAMMER volume.  Must return >= 0 for the RB_SCAN to
311  * continue, so errors cannot be reported through the return value.
312  */
313 int
314 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
315 {
316         struct hammer_mount *hmp = volume->hmp;
317         hammer_cluster_t rootcl;
318         int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
319
320         /*
321          * Sync clusters, sync volume
322          */
323
324         hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
325
326         /*
327          * Clean up the root cluster, which is held unlocked in the root
328          * volume.
329          */
330         if (hmp->rootvol == volume) {
331                 if ((rootcl = hmp->rootcl) != NULL)
332                         hmp->rootcl = NULL;
333                 hmp->rootvol = NULL;
334         }
335
336         /*
337          * Unload clusters and super-clusters.  Unloading a super-cluster
338          * also unloads related clusters, but the filesystem may not be
339          * using super-clusters so unload clusters anyway.
340          */
341         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
342                         hammer_unload_cluster, NULL);
343         RB_SCAN(hammer_scl_rb_tree, &volume->rb_scls_root, NULL,
344                         hammer_unload_supercl, NULL);
345         hammer_io_waitdep(&volume->io);
346
347         /*
348          * Release our buffer and flush anything left in the buffer cache.
349          */
350         hammer_io_release(&volume->io, 2);
351
352         /*
353          * There should be no references on the volume, no clusters, and
354          * no super-clusters.
355          */
356         KKASSERT(volume->io.lock.refs == 0);
357         KKASSERT(RB_EMPTY(&volume->rb_clus_root));
358         KKASSERT(RB_EMPTY(&volume->rb_scls_root));
359
360         volume->ondisk = NULL;
361         if (volume->devvp) {
362                 if (ronly) {
363                         vinvalbuf(volume->devvp, 0, 0, 0);
364                         VOP_CLOSE(volume->devvp, FREAD);
365                 } else {
366                         vinvalbuf(volume->devvp, V_SAVE, 0, 0);
367                         VOP_CLOSE(volume->devvp, FREAD|FWRITE);
368                 }
369         }
370
371         /*
372          * Destroy the structure
373          */
374         RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
375         hammer_free_volume(volume);
376         return(0);
377 }
378
379 static
380 void
381 hammer_free_volume(hammer_volume_t volume)
382 {
383         if (volume->vol_name) {
384                 kfree(volume->vol_name, M_HAMMER);
385                 volume->vol_name = NULL;
386         }
387         if (volume->devvp) {
388                 vrele(volume->devvp);
389                 volume->devvp = NULL;
390         }
391         --hammer_count_volumes;
392         kfree(volume, M_HAMMER);
393 }
394
395 /*
396  * Get a HAMMER volume.  The volume must already exist.
397  */
398 hammer_volume_t
399 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
400 {
401         struct hammer_volume *volume;
402
403         /*
404          * Locate the volume structure
405          */
406         volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
407         if (volume == NULL) {
408                 *errorp = ENOENT;
409                 return(NULL);
410         }
411         hammer_ref(&volume->io.lock);
412
413         /*
414          * Deal with on-disk info
415          */
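        /*
         * The io.loading test makes a caller that races an in-progress
         * load fall into hammer_load_volume(), where it blocks on the
         * exclusive lock until the loading thread finishes.
         */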
416         if (volume->ondisk == NULL || volume->io.loading) {
417                 *errorp = hammer_load_volume(volume);
418                 if (*errorp) {
419                         hammer_rel_volume(volume, 1);
420                         volume = NULL;
421                 }
422         } else {
423                 *errorp = 0;
424         }
425         return(volume);
426 }
427
428 int
429 hammer_ref_volume(hammer_volume_t volume)
430 {
431         int error;
432
433         hammer_ref(&volume->io.lock);
434
435         /*
436          * Deal with on-disk info
437          */
438         if (volume->ondisk == NULL || volume->io.loading) {
439                 error = hammer_load_volume(volume);
440                 if (error)
441                         hammer_rel_volume(volume, 1);
442         } else {
443                 error = 0;
444         }
445         return (error);
446 }
447
448 hammer_volume_t
449 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
450 {
451         hammer_volume_t volume;
452
453         volume = hmp->rootvol;
454         KKASSERT(volume != NULL);
455         hammer_ref(&volume->io.lock);
456
457         /*
458          * Deal with on-disk info
459          */
460         if (volume->ondisk == NULL || volume->io.loading) {
461                 *errorp = hammer_load_volume(volume);
462                 if (*errorp) {
463                         hammer_rel_volume(volume, 1);
464                         volume = NULL;
465                 }
466         } else {
467                 *errorp = 0;
468         }
469         return (volume);
470 }
471
472 /*
473  * Load a volume's on-disk information.  The volume must be referenced and
474  * not locked.  We temporarily acquire an exclusive lock to interlock
475  * against releases or multiple gets.
476  */
477 static int
478 hammer_load_volume(hammer_volume_t volume)
479 {
480         struct hammer_volume_ondisk *ondisk;
481         int error;
482
483         hammer_lock_ex(&volume->io.lock);
484         KKASSERT(volume->io.loading == 0);
485         volume->io.loading = 1;
486
487         if (volume->ondisk == NULL) {
488                 error = hammer_io_read(volume->devvp, &volume->io);
489                 if (error) {
490                         volume->io.loading = 0;
491                         hammer_unlock(&volume->io.lock);
492                         return (error);
493                 }
494                 volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
495
496                 /*
497                  * Configure the volume's A-lists.  These are used to
498                  * allocate clusters.
499                  */
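                /*
                 * The super-cluster variant is a layered A-list: its
                 * callbacks use alist.info (the volume pointer) to recurse
                 * into per-super-cluster meta data, while the normal
                 * variant manages the volume's meta array directly.
                 */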
500                 if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
501                         volume->alist.config = &Vol_super_alist_config;
502                         volume->alist.meta = ondisk->vol_almeta.super;
503                         volume->alist.info = volume;
504                 } else {
505                         volume->alist.config = &Vol_normal_alist_config;
506                         volume->alist.meta = ondisk->vol_almeta.normal;
507                         volume->alist.info = NULL;
508                 }
509         } else {
510                 error = 0;
511         }
512         volume->io.loading = 0;
513         hammer_unlock(&volume->io.lock);
514         return(error);
515 }
516
517 /*
518  * Release a volume.  Call hammer_io_release on the last reference.  We have
519  * to acquire an exclusive lock to interlock against volume->ondisk tests
520  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
521  * lock to be held.
522  *
523  * Volumes are not unloaded from memory during normal operation.
524  */
525 void
526 hammer_rel_volume(hammer_volume_t volume, int flush)
527 {
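        /*
         * Sample the ref count and re-check it after acquiring the
         * exclusive lock; another thread may have gained a reference
         * while we were blocked on the lock.
         */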
528         if (volume->io.lock.refs == 1) {
529                 hammer_lock_ex(&volume->io.lock);
530                 if (volume->io.lock.refs == 1) {
531                         volume->ondisk = NULL;
532                         hammer_io_release(&volume->io, flush);
533                 } else if (flush) {
534                         hammer_io_flush(&volume->io);
535                 }
536                 hammer_unlock(&volume->io.lock);
537         }
538         hammer_unref(&volume->io.lock);
539 }
540
541 /************************************************************************
542  *                              SUPER-CLUSTERS                          *
543  ************************************************************************
544  *
545  * Manage super-clusters.  Note that a supercl holds a reference to its
546  * associated volume.
547  */
548 static int
549 hammer_find_supercl(hammer_volume_t volume, int32_t scl_no)
550 {
551         if (RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no))
552                 return(1);
553         return(0);
554 }
555
556 hammer_supercl_t
557 hammer_get_supercl(hammer_volume_t volume, int32_t scl_no,
558                    int *errorp, hammer_alloc_state_t isnew)
559 {
560         hammer_supercl_t supercl;
561
562         /*
563          * Locate and lock the super-cluster structure, creating one
564          * if necessary.
565          */
566 again:
567         supercl = RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no);
568         if (supercl == NULL) {
569                 ++hammer_count_supercls;
570                 supercl = kmalloc(sizeof(*supercl), M_HAMMER, M_WAITOK|M_ZERO);
571                 supercl->scl_no = scl_no;
572                 supercl->volume = volume;
573                 supercl->io.offset = calculate_supercl_offset(volume, scl_no);
574                 hammer_io_init(&supercl->io, HAMMER_STRUCTURE_SUPERCL);
575                 hammer_ref(&supercl->io.lock);
576
577                 /*
578                  * Insert the super-cluster into the RB tree and handle late
579                  * collisions.
580                  */
581                 if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) {
582                         hammer_unref(&supercl->io.lock);
583                         --hammer_count_supercls;
584                         kfree(supercl, M_HAMMER);
585                         goto again;
586                 }
587                 hammer_ref(&volume->io.lock);
588         } else {
589                 hammer_ref(&supercl->io.lock);
590         }
591
592         /*
593          * Deal with on-disk info
594          */
595         if (supercl->ondisk == NULL || isnew || supercl->io.loading) {
596                 *errorp = hammer_load_supercl(supercl, isnew);
597                 if (*errorp) {
598                         hammer_rel_supercl(supercl, 1);
599                         supercl = NULL;
600                 }
601         } else {
602                 *errorp = 0;
603         }
604         return(supercl);
605 }
606
607 static int
608 hammer_load_supercl(hammer_supercl_t supercl, hammer_alloc_state_t isnew)
609 {
610         struct hammer_supercl_ondisk *ondisk;
611         hammer_volume_t volume = supercl->volume;
612         int error;
613         int64_t nclusters;
614
615         hammer_lock_ex(&supercl->io.lock);
616         KKASSERT(supercl->io.loading == 0);
617         supercl->io.loading = 1;
618
619         if (supercl->ondisk == NULL) {
620                 if (isnew)
621                         error = hammer_io_new(volume->devvp, &supercl->io);
622                 else
623                         error = hammer_io_read(volume->devvp, &supercl->io);
624                 if (error) {
625                         supercl->io.loading = 0;
626                         hammer_unlock(&supercl->io.lock);
627                         return (error);
628                 }
629                 supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data;
630
631                 supercl->alist.config = &Supercl_alist_config;
632                 supercl->alist.meta = ondisk->scl_meta;
633                 supercl->alist.info = NULL;
634         } else if (isnew) {
635                 error = hammer_io_new(volume->devvp, &supercl->io);
636         } else {
637                 error = 0;
638         }
639         if (error == 0 && isnew) {
640                 /*
641                  * If this is a new super-cluster we have to initialize
642                  * various ondisk structural elements.  The caller is
643                  * responsible for the remainder.
644                  */
645                 struct hammer_alist_live dummy;
646
647                 hammer_modify_supercl(supercl);
648
649                 ondisk = supercl->ondisk;
650                 dummy.config = &Buf_alist_config;
651                 dummy.meta = ondisk->head.buf_almeta;
652                 dummy.info = NULL;
653                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL);
654
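                /*
                 * The last super-cluster in a volume may cover fewer than
                 * HAMMER_SCL_MAXCLUSTERS clusters, so size its A-list to
                 * the clusters actually remaining.
                 */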
655                 nclusters = volume->ondisk->vol_nclusters -
656                             ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS);
657                 KKASSERT(nclusters > 0);
658                 if (nclusters > HAMMER_SCL_MAXCLUSTERS)
659                         nclusters = HAMMER_SCL_MAXCLUSTERS;
660                 hammer_alist_init(&supercl->alist, 0, (int32_t)nclusters,
661                                   isnew);
662         }
663         supercl->io.loading = 0;
664         hammer_unlock(&supercl->io.lock);
665         return (error);
666 }
667
668 /*
669  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
670  */
671 int
672 hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused)
673 {
674         KKASSERT(supercl->io.lock.refs == 0);
675         hammer_ref(&supercl->io.lock);
676         hammer_rel_supercl(supercl, 2);
677         return(0);
678 }
679
680 /*
681  * Release a super-cluster.  We have to deal with several places where
682  * another thread can ref the super-cluster.
683  *
684  * Only destroy the structure itself if the related buffer cache buffer
685  * was disassociated from it.  This ties the management of the structure
686  * to the buffer cache subsystem.
687  */
688 void
689 hammer_rel_supercl(hammer_supercl_t supercl, int flush)
690 {
691         hammer_volume_t volume;
692
693         if (supercl->io.lock.refs == 1) {
694                 hammer_lock_ex(&supercl->io.lock);
695                 if (supercl->io.lock.refs == 1) {
696                         hammer_io_release(&supercl->io, flush);
697                         if (supercl->io.bp == NULL &&
698                             supercl->io.lock.refs == 1) {
699                                 volume = supercl->volume;
700                                 RB_REMOVE(hammer_scl_rb_tree,
701                                           &volume->rb_scls_root, supercl);
702                                 supercl->volume = NULL; /* sanity */
703                                 --hammer_count_supercls;
704                                 kfree(supercl, M_HAMMER);
705                                 hammer_rel_volume(volume, 0);
706                                 return;
707                         }
708                 } else if (flush) {
709                         hammer_io_flush(&supercl->io);
710                 }
711                 hammer_unlock(&supercl->io.lock);
712         }
713         hammer_unref(&supercl->io.lock);
714 }
715
716 /************************************************************************
717  *                              CLUSTERS                                *
718  ************************************************************************
719  *
720  */
721 hammer_cluster_t
722 hammer_get_cluster(hammer_volume_t volume, int32_t clu_no,
723                    int *errorp, int getflags)
724 {
725         hammer_cluster_t cluster;
726
727 again:
728         cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no);
729         if (cluster == NULL) {
730                 ++hammer_count_clusters;
731                 cluster = kmalloc(sizeof(*cluster), M_HAMMER, M_WAITOK|M_ZERO);
732                 cluster->clu_no = clu_no;
733                 cluster->volume = volume;
734                 RB_INIT(&cluster->rb_bufs_root);
735                 RB_INIT(&cluster->rb_nods_root);
736                 hammer_io_init(&cluster->io, HAMMER_STRUCTURE_CLUSTER);
737                 cluster->io.offset = calculate_cluster_offset(volume, clu_no);
738                 hammer_ref(&cluster->io.lock);
739                 /* NOTE: cluster->io.validated expected to be 0 */
740
741                 /*
742                  * Insert the cluster into the RB tree and handle late
743                  * collisions.
744                  */
745                 if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) {
746                         hammer_unref(&cluster->io.lock);
747                         --hammer_count_clusters;
748                         kfree(cluster, M_HAMMER);
749                         goto again;
750                 }
751                 hammer_ref(&volume->io.lock);
752         } else {
753                 hammer_ref(&cluster->io.lock);
754         }
755
756         /*
757          * Deal with on-disk info
758          */
759         if (cluster->ondisk == NULL || getflags || cluster->io.validated == 0) {
760                 *errorp = hammer_load_cluster(cluster, getflags);
761                 if (*errorp) {
762                         hammer_rel_cluster(cluster, 1);
763                         cluster = NULL;
764                 }
765         } else {
766                 *errorp = 0;
767         }
768         return (cluster);
769 }
770
771 hammer_cluster_t
772 hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp)
773 {
774         hammer_cluster_t cluster;
775
776         cluster = hmp->rootcl;
777         KKASSERT(cluster != NULL);
778         hammer_ref(&cluster->io.lock);
779
780         /*
781          * Deal with on-disk info
782          */
783         if (cluster->ondisk == NULL || cluster->io.validated == 0) {
784                 *errorp = hammer_load_cluster(cluster, 0);
785                 if (*errorp) {
786                         hammer_rel_cluster(cluster, 1);
787                         cluster = NULL;
788                 }
789         } else {
790                 *errorp = 0;
791         }
792         return (cluster);
793 }
794
795 static
796 int
797 hammer_load_cluster(hammer_cluster_t cluster, int getflags)
798 {
799         hammer_volume_t volume = cluster->volume;
800         struct hammer_cluster_ondisk *ondisk;
801         int error;
802
803         hammer_lock_ex(&cluster->io.lock);
804         KKASSERT(cluster->io.loading == 0);
805         cluster->io.loading = 1;
806
807         if (cluster->ondisk == NULL) {
808                 KKASSERT(TAILQ_EMPTY(&cluster->io.deplist));
809
810                 /*
811                  * Unmodified buffers may be present, indicating that we
812                  * had already validated the cluster even though we no longer
813                  * have its ondisk info.
814                  */
815                 if (!RB_EMPTY(&cluster->rb_bufs_root))
816                         KKASSERT(cluster->io.validated);
817                 if (getflags & GET_CLUSTER_NEW)
818                         error = hammer_io_new(volume->devvp, &cluster->io);
819                 else
820                         error = hammer_io_read(volume->devvp, &cluster->io);
821                 if (error) {
822                         cluster->io.loading = 0;
823                         hammer_unlock(&cluster->io.lock);
824                         return (error);
825                 }
826                 cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data;
827
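                /*
                 * Hook up the cluster's A-lists.  The master list tracks
                 * whole buffers within the cluster; the btree, record and
                 * mdata lists use the slave config, whose callbacks use
                 * alist.info (the cluster pointer) to reach into the
                 * per-buffer A-lists.
                 */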
828                 cluster->alist_master.config = &Clu_master_alist_config;
829                 cluster->alist_master.meta = ondisk->clu_master_meta;
830                 cluster->alist_btree.config = &Clu_slave_alist_config;
831                 cluster->alist_btree.meta = ondisk->clu_btree_meta;
832                 cluster->alist_btree.info = cluster;
833                 cluster->alist_record.config = &Clu_slave_alist_config;
834                 cluster->alist_record.meta = ondisk->clu_record_meta;
835                 cluster->alist_record.info = cluster;
836                 cluster->alist_mdata.config = &Clu_slave_alist_config;
837                 cluster->alist_mdata.meta = ondisk->clu_mdata_meta;
838                 cluster->alist_mdata.info = cluster;
839
840                 if ((getflags & GET_CLUSTER_NEW) == 0) {
841                         /*
842                          * Load cluster range info for easy access
843                          */
844                         cluster->clu_btree_beg = ondisk->clu_btree_beg;
845                         cluster->clu_btree_end = ondisk->clu_btree_end;
846                 }
847         } else if (getflags & GET_CLUSTER_NEW) {
848                 error = hammer_io_new(volume->devvp, &cluster->io);
849         } else {
850                 error = 0;
851         }
852         if (error == 0 && (getflags & GET_CLUSTER_NEW)) {
853                 /*
854                  * If this is a new cluster we have to initialize
855                  * various ondisk structural elements.  The caller is
856                  * responsible for the remainder.
857                  */
858                 struct hammer_alist_live dummy;
859                 hammer_node_t croot;
860                 hammer_volume_ondisk_t voldisk;
861                 int32_t nbuffers;
862
863                 cluster->flags &= ~HAMMER_CLUSTER_DELETED;
864
865                 hammer_modify_cluster(cluster);
866                 ondisk = cluster->ondisk;
867                 voldisk = volume->ondisk;
868
869                 dummy.config = &Buf_alist_config;
870                 dummy.meta = ondisk->head.buf_almeta;
871                 dummy.info = NULL;
872                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER);
873
874                 ondisk->vol_fsid = voldisk->vol_fsid;
875                 ondisk->vol_fstype = voldisk->vol_fstype;
876                 ondisk->clu_gen = 1;
877                 ondisk->clu_id = 0;     /* XXX */
878                 ondisk->clu_no = cluster->clu_no;
879                 ondisk->clu_flags = 0;
880                 ondisk->clu_start = HAMMER_BUFSIZE;
881                 ondisk->synchronized_rec_id = 1; /* XXX timestamp */
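                /*
                 * clu_limit is normally the full cluster size, but the
                 * last cluster in the volume may be truncated by the end
                 * of the cluster area (vol_clo_end).
                 */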
882                 KKASSERT(voldisk->vol_clo_end > cluster->io.offset);
883                 if (voldisk->vol_clo_end - cluster->io.offset >
884                     voldisk->vol_clsize) {
885                         ondisk->clu_limit = voldisk->vol_clsize;
886                 } else {
887                         ondisk->clu_limit = (int32_t)(voldisk->vol_clo_end -
888                                                       cluster->io.offset);
889                 }
890                 nbuffers = ondisk->clu_limit / HAMMER_BUFSIZE;
891                 hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
892                                   HAMMER_ASTATE_FREE);
893                 hammer_alist_init(&cluster->alist_btree,
894                                   HAMMER_FSBUF_MAXBLKS,
895                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
896                                   HAMMER_ASTATE_ALLOC);
897                 hammer_alist_init(&cluster->alist_record,
898                                   HAMMER_FSBUF_MAXBLKS,
899                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
900                                   HAMMER_ASTATE_ALLOC);
901                 hammer_alist_init(&cluster->alist_mdata,
902                                   HAMMER_FSBUF_MAXBLKS,
903                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
904                                   HAMMER_ASTATE_ALLOC);
905
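                /*
                 * Allocation hints: idx_index and idx_data start at the
                 * front of the cluster, while idx_record starts at the top,
                 * reflecting that record allocations are intended to scan
                 * backwards (see the isfwd argument of hammer_alloc_master())
                 * while B-Tree and data allocations scan forwards.
                 */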
906                 ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
907                 ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
908                 ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;
909
910                 /*
911                  * Initialize the B-Tree.  We don't know what the caller
912                  * intends to do with the cluster so make sure it causes
913                  * an assertion if the caller makes no changes.
914                  */
915                 ondisk->clu_btree_parent_vol_no = -2;
916                 ondisk->clu_btree_parent_clu_no = -2;
917                 ondisk->clu_btree_parent_offset = -2;
918                 ondisk->clu_btree_parent_clu_gen = -2;
919
920                 croot = hammer_alloc_btree(cluster, &error);
921                 if (error == 0) {
922                         hammer_modify_node(croot);
923                         bzero(croot->ondisk, sizeof(*croot->ondisk));
924                         croot->ondisk->count = 0;
925                         croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
926                         hammer_modify_cluster(cluster);
927                         ondisk->clu_btree_root = croot->node_offset;
928                         hammer_rel_node(croot);
929                 }
930
931                 /*
932                  * We just formatted this cluster, don't try to recover it!
933                  */
934                 cluster->io.validated = 1;
935         }
936
937         /*
938  * If no error occurred, handle automatic cluster recovery unless
939          * the NORECOVER flag is passed (prevents recovery recursions) or
940          * the cluster has been flagged for deletion (prevents an attempt
941          * to recover a cluster which is no longer hooked into the tree).
942          *
943          * Setting hammer_debug_recover to 1 will force recovery on load
944          * whether or not the cluster is marked open.
945          *
946          * Setting hammer_debug_recover to -1 will force NO recovery
947          * regardless of state.
948          *
949          * io.validated can only be cleared if the buffer RB list is empty,
950          * preventing us from trying to recover an actively referenced
951          * cluster (which would blow the filesystem to smithereens).
952          */
953         if (error == 0 && cluster->io.validated == 0) {
954                 if ((getflags & GET_CLUSTER_NORECOVER) == 0 &&
955                     (cluster->flags & HAMMER_CLUSTER_DELETED) == 0) {
956                         if ((cluster->ondisk->clu_flags & HAMMER_CLUF_OPEN) ||
957                             hammer_debug_recover > 0) {
958                                 if (hammer_debug_recover >= 0)
959                                         hammer_recover(cluster);
960                         }
961                         cluster->io.validated = 1;
962                 } else if ((cluster->ondisk->clu_flags & HAMMER_CLUF_OPEN)==0) {
963                         cluster->io.validated = 1;
964                 }
965         }
966         cluster->io.loading = 0;
967         hammer_unlock(&cluster->io.lock);
968         return (error);
969 }
970
971 /*
972  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
973  */
974 int
975 hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused)
976 {
977         hammer_ref(&cluster->io.lock);
978         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
979                 hammer_unload_buffer, NULL);
980         hammer_io_waitdep(&cluster->io);
981         KKASSERT(cluster->io.lock.refs == 1);
982         hammer_rel_cluster(cluster, 2);
983         return(0);
984 }
985
986 /*
987  * Allocate a record id from the cluster's synchronized_rec_id field,
988  * which is used during cluster recovery.  NOTE: The cluster header is not
989  * written out until all related records have been written out.
990  */
991 u_int64_t
992 hammer_alloc_recid(hammer_cluster_t cluster)
993 {
994         u_int64_t recid;
995
996         hammer_modify_cluster(cluster);
997         recid = cluster->ondisk->synchronized_rec_id++;
998         return(recid);
999 }
1000
1001 #if 0
1002 void
1003 hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid)
1004 {
1005         hammer_modify_cluster(cluster);
1006         if (cluster->ondisk->synchronized_tid < tid)
1007                 cluster->ondisk->synchronized_tid = tid;
1008 }
1009 #endif
1010
1011 /*
1012  * Reference a cluster that is either already referenced or is reachable
1013  * via a specially handled pointer (aka rootcl).
1014  */
1015 int
1016 hammer_ref_cluster(hammer_cluster_t cluster)
1017 {
1018         int error;
1019
1020         KKASSERT(cluster != NULL);
1021         hammer_ref(&cluster->io.lock);
1022
1023         /*
1024          * Deal with on-disk info
1025          */
1026         if (cluster->ondisk == NULL || cluster->io.validated == 0) {
1027                 error = hammer_load_cluster(cluster, 0);
1028                 if (error)
1029                         hammer_rel_cluster(cluster, 1);
1030         } else {
1031                 error = 0;
1032         }
1033         return(error);
1034 }
1035
1036 /*
1037  * Release a cluster.  We have to deal with several places where
1038  * another thread can ref the cluster.
1039  *
1040  * Only destroy the structure itself if we no longer have an IO or any
1041  * hammer buffers associated with the structure.
1042  */
1043 void
1044 hammer_rel_cluster(hammer_cluster_t cluster, int flush)
1045 {
1046         hammer_volume_t volume;
1047
1048         /*
1049          * Free a deleted cluster back to the pool when its last
1050          * active reference is released.  This prevents the cluster
1051          * from being reallocated until all its prior references go away.
1052          *
1053  * XXX implement a discard dependency list which holds references
1054          * on clusters, preventing their deletion, until their parent cluster
1055          * has been flushed to disk.
1056          */
1057         if (cluster->io.lock.refs == 1) {
1058                 if (cluster->flags & HAMMER_CLUSTER_DELETED) {
1059                         cluster->flags &= ~HAMMER_CLUSTER_DELETED;
1060                         if (hammer_debug_general & 0x80)
1061                                 kprintf("FREE CLUSTER %d\n", cluster->clu_no);
1062                         if (cluster->ondisk->stat_records) {
1063                                 struct hammer_sync_info info;
1064
1065                                 info.error = 0;
1066                                 info.waitfor = MNT_WAIT;
1067                                 kprintf(" (still has %d records!)\n",
1068                                         cluster->ondisk->stat_records);
1069                                 Debugger("continue to recover cluster");
1070                                 hammer_recover(cluster);
1071                                 Debugger("continue to sync cluster");
1072                                 hammer_sync_cluster(cluster, &info);
1073                                 Debugger("now debug it");
1074                         }
1075
1076                         /*
1077                          * Clean up any statistics we left hanging in the
1078                          * cluster.
1079                          */
1080                         hammer_adjust_stats(cluster, HAMMER_FSBUF_BTREE,
1081                                             -cluster->ondisk->stat_idx_bufs);
1082                         hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA,
1083                                             -cluster->ondisk->stat_data_bufs);
1084                         hammer_adjust_stats(cluster, HAMMER_FSBUF_RECORDS,
1085                                             -cluster->ondisk->stat_rec_bufs);
1086                         /*
1087                          * hammer_discard_cluster(cluster) - throw away
1088                          * dirty backing store, recurse to any underlying
1089                          * buffers. XXX
1090                          */
1091                         hammer_free_cluster(cluster);
1092                 }
1093         }
1094
1095         if (cluster->io.lock.refs == 1) {
1096                 hammer_lock_ex(&cluster->io.lock);
1097                 if (cluster->io.lock.refs == 1) {
1098                         /*
1099                          * Release the I/O.  If we or the kernel wants to
1100                          * flush, this will release the bp.  Otherwise the
1101                          * bp may be written and flushed passively by the
1102                          * kernel later on.
1103                          */
1104                         hammer_io_release(&cluster->io, flush);
1105
1106                         /*
1107                          * Final cleanup
1108                          */
1109                         if (cluster != cluster->volume->hmp->rootcl &&
1110                             cluster->io.bp == NULL &&
1111                             cluster->io.lock.refs == 1 &&
1112                             RB_EMPTY(&cluster->rb_bufs_root)) {
1113                                 KKASSERT(RB_EMPTY(&cluster->rb_nods_root));
1114                                 volume = cluster->volume;
1115                                 RB_REMOVE(hammer_clu_rb_tree,
1116                                           &volume->rb_clus_root, cluster);
1117                                 cluster->volume = NULL; /* sanity */
1118                                 --hammer_count_clusters;
1119                                 kfree(cluster, M_HAMMER);
1120                                 hammer_rel_volume(volume, 0);
1121                                 return;
1122                         }
1123                 } else if (flush) {
1124                         hammer_io_flush(&cluster->io);
1125                 }
1126                 hammer_unlock(&cluster->io.lock);
1127         }
1128         hammer_unref(&cluster->io.lock);
1129 }
1130
1131 /************************************************************************
1132  *                              BUFFERS                                 *
1133  ************************************************************************
1134  *
1135  * Manage buffers.  Note that a buffer holds a reference to its associated
1136  * cluster, and its cluster will hold a reference to the cluster's volume.
1137  *
1138  * A non-zero buf_type indicates that a new buffer should be created and
1139  * zero'd.
1140  */
1141 hammer_buffer_t
1142 hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no,
1143                   u_int64_t buf_type, int *errorp)
1144 {
1145         hammer_buffer_t buffer;
1146
1147         /*
1148          * Find the buffer.  Note that buffer 0 corresponds to the cluster
1149          * header and should never be requested.
1150          */
1151         KKASSERT(buf_no >= cluster->ondisk->clu_start / HAMMER_BUFSIZE &&
1152                  buf_no < cluster->ondisk->clu_limit / HAMMER_BUFSIZE);
1153
1154         /*
1155          * Locate and lock the buffer structure, creating one if necessary.
1156          */
1157 again:
1158         buffer = RB_LOOKUP(hammer_buf_rb_tree, &cluster->rb_bufs_root, buf_no);
1159         if (buffer == NULL) {
1160                 ++hammer_count_buffers;
1161                 buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
1162                 buffer->buf_no = buf_no;
1163                 buffer->cluster = cluster;
1164                 buffer->volume = cluster->volume;
1165                 hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
1166                 buffer->io.offset = cluster->io.offset +
1167                                     (buf_no * HAMMER_BUFSIZE);
1168                 TAILQ_INIT(&buffer->clist);
1169                 hammer_ref(&buffer->io.lock);
1170
1171                 /*
1172                  * Insert the buffer into the RB tree and handle late
1173                  * collisions.
1174                  */
1175                 if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) {
1176                         hammer_unref(&buffer->io.lock);
1177                         --hammer_count_buffers;
1178                         kfree(buffer, M_HAMMER);
1179                         goto again;
1180                 }
1181                 hammer_ref(&cluster->io.lock);
1182         } else {
1183                 hammer_ref(&buffer->io.lock);
1184         }
1185
1186         /*
1187          * Deal with on-disk info
1188          */
1189         if (buffer->ondisk == NULL || buf_type || buffer->io.loading) {
1190                 *errorp = hammer_load_buffer(buffer, buf_type);
1191                 if (*errorp) {
1192                         hammer_rel_buffer(buffer, 1);
1193                         buffer = NULL;
1194                 }
1195         } else {
1196                 *errorp = 0;
1197         }
1198         return(buffer);
1199 }
1200
1201 static int
1202 hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type)
1203 {
1204         hammer_volume_t volume;
1205         hammer_fsbuf_ondisk_t ondisk;
1206         int error;
1207
1208         /*
1209          * Load the buffer's on-disk info
1210          */
1211         volume = buffer->volume;
1212         hammer_lock_ex(&buffer->io.lock);
1213         KKASSERT(buffer->io.loading == 0);
1214         buffer->io.loading = 1;
1215
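        /*
         * A non-zero buf_type means the caller is creating the buffer:
         * acquire fresh backing store with hammer_io_new() and format the
         * buffer header via hammer_initbuffer().  Otherwise read the
         * existing contents from the media.
         */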
1216         if (buffer->ondisk == NULL) {
1217                 if (buf_type) {
1218                         error = hammer_io_new(volume->devvp, &buffer->io);
1219                 } else {
1220                         error = hammer_io_read(volume->devvp, &buffer->io);
1221                 }
1222                 if (error) {
1223                         buffer->io.loading = 0;
1224                         hammer_unlock(&buffer->io.lock);
1225                         return (error);
1226                 }
1227                 buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
1228                 buffer->alist.config = &Buf_alist_config;
1229                 buffer->alist.meta = ondisk->head.buf_almeta;
1230                 buffer->buf_type = ondisk->head.buf_type;
1231         } else if (buf_type) {
1232                 error = hammer_io_new(volume->devvp, &buffer->io);
1233         } else {
1234                 error = 0;
1235         }
1236         if (error == 0 && buf_type) {
1237                 hammer_modify_buffer(buffer);
1238                 ondisk = buffer->ondisk;
1239                 hammer_initbuffer(&buffer->alist, &ondisk->head, buf_type);
1240                 buffer->buf_type = ondisk->head.buf_type;
1241         }
1242         buffer->io.loading = 0;
1243         hammer_unlock(&buffer->io.lock);
1244         return (error);
1245 }
1246
1247 /*
1248  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
1249  */
1250 int
1251 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
1252 {
1253         hammer_ref(&buffer->io.lock);
1254         hammer_flush_buffer_nodes(buffer);
1255         KKASSERT(buffer->io.lock.refs == 1);
1256         hammer_rel_buffer(buffer, 2);
1257         return(0);
1258 }
1259
1260 /*
1261  * Reference a buffer that is either already referenced or is reachable
1262  * via a specially handled pointer (aka cursor->buffer).
1263  */
1264 int
1265 hammer_ref_buffer(hammer_buffer_t buffer)
1266 {
1267         int error;
1268
1269         hammer_ref(&buffer->io.lock);
1270         if (buffer->ondisk == NULL || buffer->io.loading) {
1271                 error = hammer_load_buffer(buffer, 0);
1272                 if (error) {
1273                         hammer_rel_buffer(buffer, 1);
1274                         /*
1275                          * NOTE: buffer pointer can become stale after
1276                          * the above release.
1277                          */
1278                 } else {
1279                         KKASSERT(buffer->buf_type ==
1280                                  buffer->ondisk->head.buf_type);
1281                 }
1282         } else {
1283                 error = 0;
1284         }
1285         return(error);
1286 }
1287
1288 /*
1289  * Release a buffer.  We have to deal with several places where
1290  * another thread can ref the buffer.
1291  *
1292  * Only destroy the structure itself if the related buffer cache buffer
1293  * was disassociated from it.  This ties the management of the structure
1294  * to the buffer cache subsystem.  buffer->ondisk determines whether the
1295  * embedded io is referenced or not.
1296  */
1297 void
1298 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
1299 {
1300         hammer_cluster_t cluster;
1301
1302         if (buffer->io.lock.refs == 1) {
1303                 hammer_lock_ex(&buffer->io.lock);
1304                 if (buffer->io.lock.refs == 1) {
1305                         hammer_io_release(&buffer->io, flush);
1306
1307                         if (buffer->io.bp == NULL &&
1308                             buffer->io.lock.refs == 1) {
1309                                 hammer_flush_buffer_nodes(buffer);
1310                                 KKASSERT(TAILQ_EMPTY(&buffer->clist));
1311                                 cluster = buffer->cluster;
1312                                 RB_REMOVE(hammer_buf_rb_tree,
1313                                           &cluster->rb_bufs_root, buffer);
1314                                 buffer->cluster = NULL; /* sanity */
1315                                 --hammer_count_buffers;
1316                                 kfree(buffer, M_HAMMER);
1317                                 hammer_rel_cluster(cluster, 0);
1318                                 return;
1319                         }
1320                 } else if (flush) {
1321                         hammer_io_flush(&buffer->io);
1322                 }
1323                 hammer_unlock(&buffer->io.lock);
1324         }
1325         hammer_unref(&buffer->io.lock);
1326 }
1327
1328 /************************************************************************
1329  *                              NODES                                   *
1330  ************************************************************************
1331  *
1332  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
1333  * method used by the HAMMER filesystem.
1334  *
1335  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
1336  * associated with its buffer, and will only reference the buffer while
1337  * the node itself is referenced.
1338  *
1339  * A hammer_node can also be passively associated with other HAMMER
1340  * structures, such as inodes, while retaining 0 references.  These
1341  * associations can be cleared backwards using a pointer-to-pointer in
1342  * the hammer_node.
1343  *
1344  * This allows the HAMMER implementation to cache hammer_nodes long-term
1345  * and short-cut a great deal of the infrastructure's complexity.  In
1346  * most cases a cached node can be reacquired without having to dip into
1347  * either the buffer or cluster management code.
1348  *
1349  * The caller must pass a referenced cluster on call and will retain
1350  * ownership of the reference on return.  The node will acquire its own
1351  * additional references, if necessary.
1352  */
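/*
 * Illustrative sketch of the passive caching described above (field names
 * vary by caller): a structure such as an inode stores a hammer_node
 * pointer and later revalidates it with
 *
 *      node = hammer_ref_node_safe(hmp, &cached_node, &error);
 *
 * avoiding another walk of the cluster's node RB tree in the common case.
 */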
1353 hammer_node_t
1354 hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp)
1355 {
1356         hammer_node_t node;
1357
1358         /*
1359          * Locate the structure, allocating one if necessary.
1360          */
1361 again:
1362         node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root,
1363                          node_offset);
1364         if (node == NULL) {
1365                 ++hammer_count_nodes;
1366                 node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
1367                 node->node_offset = node_offset;
1368                 node->cluster = cluster;
1369                 if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root,
1370                               node)) {
1371                         --hammer_count_nodes;
1372                         kfree(node, M_HAMMER);
1373                         goto again;
1374                 }
1375         }
1376         hammer_ref(&node->lock);
1377         *errorp = hammer_load_node(node);
1378         if (*errorp) {
1379                 hammer_rel_node(node);
1380                 node = NULL;
1381         }
1382         return(node);
1383 }
1384
1385 /*
1386  * Reference an already-referenced node.
1387  */
1388 int
1389 hammer_ref_node(hammer_node_t node)
1390 {
1391         int error;
1392
1393         KKASSERT(node->lock.refs > 0);
1394         hammer_ref(&node->lock);
1395         if ((error = hammer_load_node(node)) != 0)
1396                 hammer_rel_node(node);
1397         return(error);
1398 }
1399
1400 /*
1401  * Load a node's on-disk data reference.
1402  */
1403 static int
1404 hammer_load_node(hammer_node_t node)
1405 {
1406         hammer_buffer_t buffer;
1407         int32_t buf_no;
1408         int error;
1409
1410         if (node->ondisk)
1411                 return(0);
1412         error = 0;
1413         hammer_lock_ex(&node->lock);
1414         if (node->ondisk == NULL) {
1415                 /*
1416                  * This is a little confusing but the gist is that
1417                  * node->buffer determines whether the node is on
1418                  * the buffer's clist and node->ondisk determines
1419                  * whether the buffer is referenced.
1420                  */
1421                 if ((buffer = node->buffer) != NULL) {
1422                         error = hammer_ref_buffer(buffer);
1423                 } else {
1424                         buf_no = node->node_offset / HAMMER_BUFSIZE;
1425                         buffer = hammer_get_buffer(node->cluster,
1426                                                    buf_no, 0, &error);
1427                         if (buffer) {
1428                                 KKASSERT(error == 0);
1429                                 TAILQ_INSERT_TAIL(&buffer->clist,
1430                                                   node, entry);
1431                                 node->buffer = buffer;
1432                         }
1433                 }
1434                 if (error == 0) {
1435                         node->ondisk = (void *)((char *)buffer->ondisk +
1436                                (node->node_offset & HAMMER_BUFMASK));
1437                 }
1438         }
1439         hammer_unlock(&node->lock);
1440         return (error);
1441 }
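
/*
 * Summary of the association states maintained by the code above
 * (descriptive only):
 *
 *      buffer == NULL, ondisk == NULL   not associated, not on any clist
 *      buffer != NULL, ondisk == NULL   passively associated; the node is
 *                                       on buffer->clist but holds no
 *                                       reference on the buffer
 *      buffer != NULL, ondisk != NULL   loaded; the node holds a reference
 *                                       on the buffer
 */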
1442
1443 /*
1444  * Safely reference a node, interlock against flushes via the IO subsystem.
1445  */
1446 hammer_node_t
1447 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
1448                      int *errorp)
1449 {
1450         hammer_node_t node;
1451
1452         if ((node = *cache) != NULL)
1453                 hammer_ref(&node->lock);
1454         if (node) {
1455                 *errorp = hammer_load_node(node);
1456                 if (*errorp) {
1457                         hammer_rel_node(node);
1458                         node = NULL;
1459                 }
1460         } else {
1461                 *errorp = ENOENT;
1462         }
1463         return(node);
1464 }
1465
1466 /*
1467  * Release a hammer_node.  On the last release the node dereferences
1468  * its underlying buffer and may or may not be destroyed.
1469  */
1470 void
1471 hammer_rel_node(hammer_node_t node)
1472 {
1473         hammer_cluster_t cluster;
1474         hammer_buffer_t buffer;
1475         int32_t node_offset;
1476         int flags;
1477
1478         /*
1479          * If this isn't the last ref just decrement the ref count and
1480          * return.
1481          */
1482         if (node->lock.refs > 1) {
1483                 hammer_unref(&node->lock);
1484                 return;
1485         }
1486
1487         /*
1488          * If there is no ondisk info the node was never loaded (or the
1489          * load failed).  Remove the last reference and destroy the node.
1490          */
1491         if (node->ondisk == NULL) {
1492                 hammer_unref(&node->lock);
1493                 hammer_flush_node(node);
1494                 /* node is stale now */
1495                 return;
1496         }
1497
1498         /*
1499          * Do final cleanups and then either destroy the node or leave it
1500          * passively cached.  The buffer reference is removed regardless.
1501          */
1502         buffer = node->buffer;
1503         node->ondisk = NULL;
1504
1505         if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
1506                 hammer_unref(&node->lock);
1507                 hammer_rel_buffer(buffer, 0);
1508                 return;
1509         }
1510
1511         /*
1512          * Destroy the node.  Record pertinent data because the node
1513          * becomes stale the instant we flush it.
1514          */
1515         flags = node->flags;
1516         node_offset = node->node_offset;
1517         hammer_unref(&node->lock);
1518         hammer_flush_node(node);
1519         /* node is stale */
1520
1521         cluster = buffer->cluster;
1522         if (flags & HAMMER_NODE_DELETED) {
1523                 if (node_offset == cluster->ondisk->clu_btree_root)
1524                         KKASSERT(cluster->flags & HAMMER_CLUSTER_DELETED);
1525                 hammer_free_btree(cluster, node_offset);
1526         }
1527         hammer_rel_buffer(buffer, 0);
1528 }
1529
1530 /*
1531  * Passively cache a referenced hammer_node in *cache.  The caller may
1532  * release the node on return.
1533  */
1534 void
1535 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
1536 {
1537         hammer_node_t old;
1538
1539         /*
1540          * If the node is being deleted, don't cache it!
1541          */
1542         if (node->flags & HAMMER_NODE_DELETED)
1543                 return;
1544
1545         /*
1546          * Cache the node.  If we previously cached a different node we
1547          * have to give HAMMER a chance to destroy it.
1548          */
1549 again:
1550         if (node->cache1 != cache) {
1551                 if (node->cache2 != cache) {
1552                         if ((old = *cache) != NULL) {
1553                                 KKASSERT(node->lock.refs != 0);
1554                                 hammer_uncache_node(cache);
1555                                 goto again;
1556                         }
1557                         if (node->cache2)
1558                                 *node->cache2 = NULL;
1559                         node->cache2 = node->cache1;
1560                         node->cache1 = cache;
1561                         *cache = node;
1562                 } else {
1563                         struct hammer_node **tmp;
1564                         tmp = node->cache1;
1565                         node->cache1 = node->cache2;
1566                         node->cache2 = tmp;
1567                 }
1568         }
1569 }
1570
1571 void
1572 hammer_uncache_node(struct hammer_node **cache)
1573 {
1574         hammer_node_t node;
1575
1576         if ((node = *cache) != NULL) {
1577                 *cache = NULL;
1578                 if (node->cache1 == cache) {
1579                         node->cache1 = node->cache2;
1580                         node->cache2 = NULL;
1581                 } else if (node->cache2 == cache) {
1582                         node->cache2 = NULL;
1583                 } else {
1584                         panic("hammer_uncache_node: missing cache linkage");
1585                 }
1586                 if (node->cache1 == NULL && node->cache2 == NULL)
1587                         hammer_flush_node(node);
1588         }
1589 }
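
/*
 * Illustrative sketch of the passive-cache pattern (not part of the build).
 * The cache pointer would normally live in a longer-lived structure such as
 * an inode; the local variable here is purely hypothetical.
 *
 *      struct hammer_node *cache = NULL;
 *      hammer_node_t node;
 *      int error;
 *
 *      hammer_cache_node(node, &cache);        (while holding a node ref)
 *      hammer_rel_node(node);                  (node remains cached)
 *      ...
 *      node = hammer_ref_node_safe(hmp, &cache, &error);
 *      if (node)
 *              hammer_rel_node(node);
 *      hammer_uncache_node(&cache);            (drop the association)
 */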
1590
1591 /*
1592  * Remove a node's cache references and destroy the node if it has no
1593  * other references or backing store.
1594  */
1595 void
1596 hammer_flush_node(hammer_node_t node)
1597 {
1598         hammer_buffer_t buffer;
1599
1600         if (node->cache1)
1601                 *node->cache1 = NULL;
1602         if (node->cache2)
1603                 *node->cache2 = NULL;
1604         if (node->lock.refs == 0 && node->ondisk == NULL) {
1605                 RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root,
1606                           node);
1607                 if ((buffer = node->buffer) != NULL) {
1608                         node->buffer = NULL;
1609                         TAILQ_REMOVE(&buffer->clist, node, entry);
1610                         /* buffer is unreferenced because ondisk is NULL */
1611                 }
1612                 --hammer_count_nodes;
1613                 kfree(node, M_HAMMER);
1614         }
1615 }
1616
1617 /*
1618  * Flush passively cached B-Tree nodes associated with this buffer.
1619  * This is only called when the buffer is about to be destroyed, so
1620  * none of the nodes should have any references.
1621  */
1622 void
1623 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1624 {
1625         hammer_node_t node;
1626
1627         while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1628                 KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
1629                 hammer_ref(&node->lock);
1630                 node->flags |= HAMMER_NODE_FLUSH;
1631                 hammer_rel_node(node);
1632         }
1633 }
1634
1635 /************************************************************************
1636  *                              A-LIST ALLOCATORS                       *
1637  ************************************************************************/
1638
1639 /*
1640  * Allocate HAMMER clusters
1641  */
1642 hammer_cluster_t
1643 hammer_alloc_cluster(hammer_mount_t hmp, hammer_cluster_t cluster_hint,
1644                      int *errorp)
1645 {
1646         hammer_volume_t volume;
1647         hammer_cluster_t cluster;
1648         int32_t clu_no;
1649         int32_t clu_hint;
1650         int32_t vol_beg;
1651         int32_t vol_no;
1652
1653         /*
1654          * Figure out our starting volume and hint.
1655          */
1656         if (cluster_hint) {
1657                 vol_beg = cluster_hint->volume->vol_no;
1658                 clu_hint = cluster_hint->clu_no;
1659         } else {
1660                 vol_beg = hmp->volume_iterator;
1661                 clu_hint = -1;
1662         }
1663
1664         /*
1665          * Loop through volumes looking for a free cluster.  If allocating
1666          * a new cluster relative to an existing cluster, try to find a free
1667          * cluster on either side (clu_hint >= 0); otherwise just do a
1668          * forward iteration.
1669          */
1670         vol_no = vol_beg;
1671         do {
1672                 volume = hammer_get_volume(hmp, vol_no, errorp);
1673                 if (*errorp) {
1674                         clu_no = HAMMER_ALIST_BLOCK_NONE;
1675                         break;
1676                 }
1677                 hammer_modify_volume(volume);
1678                 if (clu_hint == -1) {
1679                         clu_hint = volume->clu_iterator;
1680                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1681                                                         clu_hint);
1682                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1683                                 clu_no = hammer_alist_alloc_fwd(&volume->alist,
1684                                                                 1, 0);
1685                         }
1686                 } else {
1687                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1688                                                         clu_hint);
1689                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1690                                 clu_no = hammer_alist_alloc_rev(&volume->alist,
1691                                                                 1, clu_hint);
1692                         }
1693                 }
1694                 if (clu_no != HAMMER_ALIST_BLOCK_NONE)
1695                         break;
1696                 hammer_rel_volume(volume, 0);
1697                 volume = NULL;
1698                 *errorp = ENOSPC;
1699                 vol_no = (vol_no + 1) % hmp->nvolumes;
1700                 clu_hint = -1;
1701         } while (vol_no != vol_beg);
1702
1703         /*
1704          * Acquire the cluster.  On success this will force *errorp to 0.
1705          */
1706         if (clu_no != HAMMER_ALIST_BLOCK_NONE) {
1707                 if (hammer_debug_general & 0x40) {
1708                         kprintf("ALLOC CLUSTER %d:%d\n", 
1709                                 volume->vol_no, clu_no);
1710                 }
1711                 cluster = hammer_get_cluster(volume, clu_no, errorp,
1712                                              GET_CLUSTER_NEW);
1713                 volume->clu_iterator = clu_no;
1714                 hammer_rel_volume(volume, 0);
1715         } else {
1716                 cluster = NULL;
1717         }
1718         if (cluster)
1719                 hammer_lock_ex(&cluster->io.lock);
1720         return(cluster);
1721 }
1722
1723 void
1724 hammer_init_cluster(hammer_cluster_t cluster, hammer_base_elm_t left_bound, 
1725                     hammer_base_elm_t right_bound)
1726 {
1727         hammer_cluster_ondisk_t ondisk = cluster->ondisk;
1728
1729         hammer_modify_cluster(cluster);
1730         ondisk->clu_btree_beg = *left_bound;
1731         ondisk->clu_btree_end = *right_bound;
1732         cluster->clu_btree_beg = ondisk->clu_btree_beg;
1733         cluster->clu_btree_end = ondisk->clu_btree_end;
1734 }
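
/*
 * Illustrative sketch (not part of the build): hammer_alloc_cluster()
 * returns a referenced cluster with cluster->io.lock held exclusively.
 * A typical caller initializes the B-Tree bounds and then unlocks and
 * releases the cluster; left_bound and right_bound are assumed to be
 * supplied by the caller.
 *
 *      cluster = hammer_alloc_cluster(hmp, cluster_hint, &error);
 *      if (cluster) {
 *              hammer_init_cluster(cluster, left_bound, right_bound);
 *              ...
 *              hammer_unlock(&cluster->io.lock);
 *              hammer_rel_cluster(cluster, 0);
 *      }
 */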
1735
1736 /*
1737  * Deallocate a cluster
1738  */
1739 void
1740 hammer_free_cluster(hammer_cluster_t cluster)
1741 {
1742         hammer_modify_volume(cluster->volume);
1743         hammer_alist_free(&cluster->volume->alist, cluster->clu_no, 1);
1744 }
1745
1746 /*
1747  * Allocate HAMMER elements - btree nodes, data storage, and record elements
1748  *
1749  * The passed *bufferp should be initialized to NULL.  On successive calls
1750  * *bufferp caches the most recent buffer used until put away by the caller.
1751  * Note that previously returned pointers using the cached buffer become
1752  * invalid on successive calls which reuse *bufferp.
1753  *
1754  * All allocations first attempt to use the block found at the specified
1755  * iterator.  If that fails the first available block is used.  If that
1756  * fails a new buffer is allocated and associated with the buffer type
1757  * A-list and the element is allocated out of the new buffer.
1758  *
1759  * This function also ensures that the required minimum number of buffers is
1760  * reserved to guarantee that recovery operations succeed.
1761  */
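
/*
 * Illustrative sketch of the *bufferp convention described above (not part
 * of the build): the same buffer pointer is threaded through successive
 * allocations and released exactly once by the caller when it is done.
 *
 *      struct hammer_buffer *buffer = NULL;
 *      union hammer_record_ondisk *rec;
 *      int error;
 *
 *      rec = hammer_alloc_record(cluster, &error, rec_type, &buffer);
 *      ...
 *      rec = hammer_alloc_record(cluster, &error, rec_type, &buffer);
 *      ...
 *      if (buffer)
 *              hammer_rel_buffer(buffer, 0);
 */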
1762
1763 hammer_node_t
1764 hammer_alloc_btree(hammer_cluster_t cluster, int *errorp)
1765 {
1766         hammer_buffer_t buffer;
1767         hammer_alist_t live;
1768         hammer_node_t node;
1769         int32_t elm_no;
1770         int32_t buf_no;
1771         int32_t node_offset;
1772         int32_t n;
1773
1774         hammer_modify_cluster(cluster);
1775         buffer = NULL;
1776         live = &cluster->alist_btree;
1777
1778         /*
1779          * If we aren't recovering then ensure the required minimum
1780          * reservation is met. XXX if the recovery code packs the B-Tree
1781          * we don't have to do this.
1782          *
1783          * Calculate the number of buffers needed to hold the B-Tree.
1784          */
1785         if (cluster->io.validated) {
1786                 n = (cluster->ondisk->stat_records * 3 / 
1787                     HAMMER_BTREE_INT_ELMS / HAMMER_BTREE_NODES) + 1;
1788                 if (hammer_debug_general &&
1789                     cluster->ondisk->stat_idx_bufs < n) {
1790                         kprintf("hammer_alloc_btree: %d/%d buffers\n",
1791                                 cluster->ondisk->stat_idx_bufs, n);
1792                 }
1793                 while (cluster->ondisk->stat_idx_bufs < n) {
1794                         alloc_new_buffer(cluster, HAMMER_FSBUF_BTREE, live,
1795                                          cluster->ondisk->idx_index, errorp,
1796                                          &buffer);
1797                         if (*errorp) {
1798                                 if (buffer)
1799                                         hammer_rel_buffer(buffer, 0);
1800                                 return(NULL);
1801                         }
1802                 }
1803         }
1804
1805
1806         /*
1807          * Allocate a B-Tree element
1808          */
1809         elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index);
1810         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1811                 elm_no = hammer_alist_alloc_fwd(live, 1, 0);
1812         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1813                 alloc_new_buffer(cluster, HAMMER_FSBUF_BTREE, live,
1814                                  cluster->ondisk->idx_index, errorp, &buffer);
1815                 elm_no = hammer_alist_alloc(live, 1);
1816                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1817                         *errorp = ENOSPC;
1818                         if (buffer)
1819                                 hammer_rel_buffer(buffer, 0);
1820                         return(NULL);
1821                 }
1822         }
1823         cluster->ondisk->idx_index = elm_no;
1824         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_BTREE_NODES);
1825
1826         /*
1827          * Load and return the B-Tree element
1828          */
1829         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1830         node_offset = buf_no * HAMMER_BUFSIZE +
1831                       offsetof(union hammer_fsbuf_ondisk,
1832                                btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]);
1833         node = hammer_get_node(cluster, node_offset, errorp);
1834         if (node) {
1835                 hammer_modify_node(node);
1836                 bzero(node->ondisk, sizeof(*node->ondisk));
1837                 KKASSERT((node->flags & (HAMMER_NODE_DELETED)) == 0);
1838         } else {
1839                 hammer_alist_free(live, elm_no, 1);
1840                 /* node is already NULL from hammer_get_node() failure */
1841                 node = NULL;
1842         }
1843         if (buffer)
1844                 hammer_rel_buffer(buffer, 0);
1845         return(node);
1846 }
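
/*
 * Illustrative sketch (not part of the build): hammer_alloc_btree() returns
 * a referenced node whose ondisk content has been zeroed and marked
 * modified.  The caller fills it in and releases it; if the caller sets
 * HAMMER_NODE_DELETED before the final release, hammer_rel_node() returns
 * the element via hammer_free_btree().
 *
 *      node = hammer_alloc_btree(cluster, &error);
 *      if (node) {
 *              ... fill in node->ondisk ...
 *              hammer_rel_node(node);
 *      }
 */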
1847
1848 void *
1849 hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes,
1850                   int *errorp, struct hammer_buffer **bufferp)
1851 {
1852         hammer_buffer_t buffer;
1853         hammer_alist_t live;
1854         int32_t elm_no;
1855         int32_t buf_no;
1856         int32_t nblks;
1857         void *item;
1858
1859         /*
1860          * Deal with large data blocks.  The blocksize is HAMMER_BUFSIZE
1861          * for these allocations.
1862          */
1863         hammer_modify_cluster(cluster);
1864         if ((bytes & HAMMER_BUFMASK) == 0) {
1865                 nblks = bytes / HAMMER_BUFSIZE;
1866                 /* only one block allowed for now (so buffer can hold it) */
1867                 KKASSERT(nblks == 1);
1868
1869                 buf_no = hammer_alloc_master(cluster, nblks,
1870                                              cluster->ondisk->idx_ldata, 1);
1871                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1872                         *errorp = ENOSPC;
1873                         return(NULL);
1874                 }
1875                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, nblks);
1876                 cluster->ondisk->idx_ldata = buf_no;
1877                 buffer = *bufferp;
1878                 *bufferp = hammer_get_buffer(cluster, buf_no, -1, errorp);
1879                 if (buffer)
1880                         hammer_rel_buffer(buffer, 0);
1881                 buffer = *bufferp;
1882                 return(buffer->ondisk);
1883         }
1884
1885         /*
1886          * Allocate a data element.  The block size is HAMMER_DATA_BLKSIZE
1887          * (64 bytes) for these allocations.
1888          */
1889         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1890         nblks /= HAMMER_DATA_BLKSIZE;
1891         live = &cluster->alist_mdata;
1892         elm_no = hammer_alist_alloc_fwd(live, nblks, cluster->ondisk->idx_data);
1893         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1894                 elm_no = hammer_alist_alloc_fwd(live, nblks, 0);
1895         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1896                 alloc_new_buffer(cluster, HAMMER_FSBUF_DATA, live,
1897                                  cluster->ondisk->idx_data, errorp, bufferp);
1898                 elm_no = hammer_alist_alloc(live, nblks);
1899                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1900                         *errorp = ENOSPC;
1901                         return(NULL);
1902                 }
1903         }
1904         cluster->ondisk->idx_data = elm_no;
1905
1906         /*
1907          * Load and return the data element
1908          */
1909         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1910         buffer = *bufferp;
1911         if (buffer == NULL || buffer->cluster != cluster ||
1912             buffer->buf_no != buf_no) {
1913                 if (buffer)
1914                         hammer_rel_buffer(buffer, 0);
1915                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1916                 *bufferp = buffer;
1917         }
1918         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_DATA);
1919         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_DATA_NODES);
1920         hammer_modify_buffer(buffer);
1921         item = &buffer->ondisk->data.data[elm_no & HAMMER_FSBUF_BLKMASK];
1922         bzero(item, nblks * HAMMER_DATA_BLKSIZE);
1923         *errorp = 0;
1924         return(item);
1925 }
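
/*
 * Illustrative sketch (not part of the build): small allocations are
 * rounded up to HAMMER_DATA_BLKSIZE blocks, so hammer_free_data_ptr()
 * must be given the same byte count originally passed to
 * hammer_alloc_data() for the rounding to match.
 *
 *      struct hammer_buffer *buffer = NULL;
 *      void *data;
 *      int error;
 *
 *      data = hammer_alloc_data(cluster, bytes, &error, &buffer);
 *      if (data) {
 *              ...
 *              hammer_free_data_ptr(buffer, data, bytes);
 *      }
 *      if (buffer)
 *              hammer_rel_buffer(buffer, 0);
 */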
1926
1927 void *
1928 hammer_alloc_record(hammer_cluster_t cluster, int *errorp,
1929                     u_int8_t rec_type, struct hammer_buffer **bufferp)
1930 {
1931         hammer_buffer_t buffer;
1932         hammer_alist_t live;
1933         int32_t elm_no;
1934         int32_t buf_no;
1935         void *item;
1936
1937         /*
1938          * Allocate a record element
1939          */
1940         hammer_modify_cluster(cluster);
1941         live = &cluster->alist_record;
1942         elm_no = hammer_alist_alloc_rev(live, 1, cluster->ondisk->idx_record);
1943         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1944                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1945         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1946                 alloc_new_buffer(cluster, HAMMER_FSBUF_RECORDS, live,
1947                                  cluster->ondisk->idx_record, errorp, bufferp);
1948                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1949                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1950                         *errorp = ENOSPC;
1951                         return(NULL);
1952                 }
1953         }
1954         cluster->ondisk->idx_record = elm_no;
1955
1956         /*
1957          * Load and return the record element
1958          */
1959         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1960         buffer = *bufferp;
1961         if (buffer == NULL || buffer->cluster != cluster ||
1962             buffer->buf_no != buf_no) {
1963                 if (buffer)
1964                         hammer_rel_buffer(buffer, 0);
1965                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1966                 *bufferp = buffer;
1967         }
1968         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_RECORDS);
1969         KASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_RECORD_NODES,
1970                 ("elm_no %d (%d) out of bounds", elm_no, elm_no & HAMMER_FSBUF_BLKMASK));
1971         hammer_modify_buffer(buffer);
1972         item = &buffer->ondisk->record.recs[elm_no & HAMMER_FSBUF_BLKMASK];
1973         bzero(item, sizeof(union hammer_record_ondisk));
1974         *errorp = 0;
1975         ++cluster->ondisk->stat_records;
1976         if (rec_type == HAMMER_RECTYPE_CLUSTER)
1977                 ++cluster->ondisk->stat_records;
1978         return(item);
1979 }
1980
1981 void
1982 hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes)
1983 {
1984         int32_t elm_no;
1985         int32_t nblks;
1986         hammer_alist_t live;
1987
1988         hammer_modify_cluster(buffer->cluster);
1989         if ((bytes & HAMMER_BUFMASK) == 0) {
1990                 nblks = bytes / HAMMER_BUFSIZE;
1991                 KKASSERT(nblks == 1 && data == (void *)buffer->ondisk);
1992                 hammer_alist_free(&buffer->cluster->alist_master,
1993                                   buffer->buf_no, nblks);
1994                 hammer_adjust_stats(buffer->cluster, HAMMER_FSBUF_DATA, -nblks);
1995                 return;
1996         }
1997
1998         elm_no = ((char *)data - (char *)buffer->ondisk->data.data) /
1999                  HAMMER_DATA_BLKSIZE;
2000         KKASSERT(elm_no >= 0 && elm_no < HAMMER_DATA_NODES);
2001         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
2002         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
2003         nblks /= HAMMER_DATA_BLKSIZE;
2004         live = &buffer->cluster->alist_mdata;
2005         hammer_alist_free(live, elm_no, nblks);
2006 }
2007
2008 void
2009 hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec,
2010                         u_int8_t rec_type)
2011 {
2012         int32_t elm_no;
2013         hammer_alist_t live;
2014
2015         hammer_modify_cluster(buffer->cluster);
2016         elm_no = rec - &buffer->ondisk->record.recs[0];
2017         KKASSERT(elm_no >= 0 && elm_no < HAMMER_RECORD_NODES);
2018         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
2019         live = &buffer->cluster->alist_record;
2020         hammer_alist_free(live, elm_no, 1);
2021         --buffer->cluster->ondisk->stat_records;
2022         if (rec_type == HAMMER_RECTYPE_CLUSTER)
2023                 --buffer->cluster->ondisk->stat_records;
2024 }
2025
2026 void
2027 hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset)
2028 {
2029         const int32_t blksize = sizeof(struct hammer_node_ondisk);
2030         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2031         hammer_alist_t live;
2032         int32_t elm_no;
2033
2034         hammer_modify_cluster(cluster);
2035         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2036         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, btree.nodes[0]);
2037         live = &cluster->alist_btree;
2038         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2039         elm_no += fsbuf_offset / blksize;
2040         hammer_alist_free(live, elm_no, 1);
2041 }
2042
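/*
 * The bclu_offset decomposition used by hammer_free_btree() above and by
 * the routines below works as follows: bclu_offset / HAMMER_BUFSIZE selects
 * the buffer, each buffer contributes HAMMER_FSBUF_MAXBLKS slots to the
 * slave A-list, and the slot within the buffer is the byte offset past the
 * fsbuf header (via offsetof()) divided by the element size.
 */
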
2043 void
2044 hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes)
2045 {
2046         const int32_t blksize = HAMMER_DATA_BLKSIZE;
2047         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2048         hammer_alist_t live;
2049         int32_t elm_no;
2050         int32_t buf_no;
2051         int32_t nblks;
2052
2053         hammer_modify_cluster(cluster);
2054         if ((bytes & HAMMER_BUFMASK) == 0) {
2055                 nblks = bytes / HAMMER_BUFSIZE;
2056                 KKASSERT(nblks == 1 && (bclu_offset & HAMMER_BUFMASK) == 0);
2057                 buf_no = bclu_offset / HAMMER_BUFSIZE;
2058                 hammer_alist_free(&cluster->alist_master, buf_no, nblks);
2059                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, -nblks);
2060                 return;
2061         }
2062
2063         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2064         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, data.data[0][0]);
2065         live = &cluster->alist_mdata;
2066         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
2067         nblks /= HAMMER_DATA_BLKSIZE;
2068         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2069         elm_no += fsbuf_offset / blksize;
2070         hammer_alist_free(live, elm_no, nblks);
2071 }
2072
2073 void
2074 hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset,
2075                    u_int8_t rec_type)
2076 {
2077         const int32_t blksize = sizeof(union hammer_record_ondisk);
2078         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2079         hammer_alist_t live;
2080         int32_t elm_no;
2081
2082         hammer_modify_cluster(cluster);
2083         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2084         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, record.recs[0]);
2085         live = &cluster->alist_record;
2086         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2087         elm_no += fsbuf_offset / blksize;
2088         hammer_alist_free(live, elm_no, 1);
2089         --cluster->ondisk->stat_records;
2090         if (rec_type == HAMMER_RECTYPE_CLUSTER)
2091                 --cluster->ondisk->stat_records;
2092 }
2093
2094
2095 /*
2096  * Allocate a new filesystem buffer and assign it to the specified
2097  * filesystem buffer type.  The new buffer will be added to the
2098  * type-specific A-list and initialized.
2099  */
2100 static void
2101 alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, hammer_alist_t live,
2102                  int start, int *errorp, struct hammer_buffer **bufferp)
2103 {
2104         hammer_buffer_t buffer;
2105         int32_t buf_no;
2106         int32_t base_blk;
2107         int isfwd;
2108
2109         if (*bufferp)
2110                 hammer_rel_buffer(*bufferp, 0);
2111         *bufferp = NULL;
2112
2113         start = start / HAMMER_FSBUF_MAXBLKS;   /* convert to buf_no */
2114         isfwd = (type != HAMMER_FSBUF_RECORDS);
2115         buf_no = hammer_alloc_master(cluster, 1, start, isfwd);
2116         if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2117                 *errorp = ENOSPC;
2118                 return;
2119         }
2120
2121         /*
2122          * The new buffer must be initialized (type != 0) regardless of
2123          * whether we already have it cached or not, so don't try to
2124          * optimize the cached buffer check.  Just call hammer_get_buffer().
2125          */
2126         buffer = hammer_get_buffer(cluster, buf_no, type, errorp);
2127         *bufferp = buffer;
2128
2129         /*
2130          * Do a meta-free of the buffer's elements into the type-specific
2131          * A-list and update our statistics to reflect the allocation.
2132          */
2133         if (buffer) {
2134                 hammer_modify_buffer(buffer);  /*XXX*/
2135                 hammer_adjust_stats(cluster, type, 1);
2136
2137                 /*
2138                  * Free the buffer to the appropriate slave list so the
2139                  * cluster-based allocator sees it.
2140                  */
2141                 base_blk = buf_no * HAMMER_FSBUF_MAXBLKS;
2142
2143                 switch(type) {
2144                 case HAMMER_FSBUF_BTREE:
2145                         hammer_alist_free(live, base_blk, HAMMER_BTREE_NODES);
2146                         break;
2147                 case HAMMER_FSBUF_DATA:
2148                         hammer_alist_free(live, base_blk, HAMMER_DATA_NODES);
2149                         break;
2150                 case HAMMER_FSBUF_RECORDS:
2151                         hammer_alist_free(live, base_blk, HAMMER_RECORD_NODES);
2152                         break;
2153                 }
2154         }
2155 }
2156
2157 /*
2158  * Sync dirty buffers to the media
2159  */
2160
2161 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
2162 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
2163
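/*
 * hammer_sync_hmp() works in two stages: vmntvnodescan() uses
 * hammer_sync_scan1() as a cheap pre-filter (it returns -1 for vnodes with
 * no dirty state so they are skipped) and hammer_sync_scan2() to fsync the
 * vnodes that pass, then the volume RB tree is scanned, recursing through
 * clusters and buffers to push any remaining dirty media buffers.
 */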
2164 int
2165 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
2166 {
2167         struct hammer_sync_info info;
2168
2169         info.error = 0;
2170         info.waitfor = waitfor;
2171
2172         vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
2173                       hammer_sync_scan1, hammer_sync_scan2, &info);
2174
2175         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
2176                 hammer_sync_volume, &info);
2177         return(info.error);
2178 }
2179
2180 static int
2181 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
2182 {
2183         struct hammer_inode *ip;
2184
2185         ip = VTOI(vp);
2186         if (vp->v_type == VNON || ip == NULL ||
2187             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2188              RB_EMPTY(&vp->v_rbdirty_tree))) {
2189                 return(-1);
2190         }
2191         return(0);
2192 }
2193
2194 static int
2195 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2196 {
2197         struct hammer_sync_info *info = data;
2198         struct hammer_inode *ip;
2199         int error;
2200
2201         ip = VTOI(vp);
2202         if (vp->v_type == VNON || vp->v_type == VBAD ||
2203             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2204              RB_EMPTY(&vp->v_rbdirty_tree))) {
2205                 return(0);
2206         }
2207         error = VOP_FSYNC(vp, info->waitfor);
2208         if (error)
2209                 info->error = error;
2210         return(0);
2211 }
2212
2213 int
2214 hammer_sync_volume(hammer_volume_t volume, void *data)
2215 {
2216         struct hammer_sync_info *info = data;
2217
2218         hammer_ref(&volume->io.lock);
2219         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
2220                 hammer_sync_cluster, info);
2221         hammer_rel_volume(volume, 1);
2222         return(0);
2223 }
2224
2225 int
2226 hammer_sync_cluster(hammer_cluster_t cluster, void *data)
2227 {
2228         struct hammer_sync_info *info = data;
2229
2230         /*
2231          * XXX check if cluster deleted and don't bother to sync it?
2232          */
2233         hammer_ref(&cluster->io.lock);
2234         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
2235                 hammer_sync_buffer, info);
2236         /*hammer_io_waitdep(&cluster->io);*/
2237         hammer_rel_cluster(cluster, 1);
2238         return(0);
2239 }
2240
2241 int
2242 hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
2243 {
2244         hammer_ref(&buffer->io.lock);
2245         hammer_rel_buffer(buffer, 1);
2246         return(0);
2247 }
2248
2249 /*
2250  * Generic buffer initialization.  Initialize the A-list into an all-allocated
2251  * state with the free block limit properly set.
2252  *
2253  * Note that alloc_new_buffer() will free the appropriate block range via
2254  * the appropriate cluster alist, so the free count is properly propagated.
2255  */
2256 void
2257 hammer_initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type)
2258 {
2259         head->buf_type = type;
2260
2261         switch(type) {
2262         case HAMMER_FSBUF_BTREE:
2263                 hammer_alist_init(live, 0, HAMMER_BTREE_NODES,
2264                                   HAMMER_ASTATE_ALLOC);
2265                 break;
2266         case HAMMER_FSBUF_DATA:
2267                 hammer_alist_init(live, 0, HAMMER_DATA_NODES,
2268                                   HAMMER_ASTATE_ALLOC);
2269                 break;
2270         case HAMMER_FSBUF_RECORDS:
2271                 hammer_alist_init(live, 0, HAMMER_RECORD_NODES,
2272                                   HAMMER_ASTATE_ALLOC);
2273                 break;
2274         default:
2275                 hammer_alist_init(live, 0, 0, HAMMER_ASTATE_ALLOC);
2276                 break;
2277         }
2278 }
2279
2280 /*
2281  * Calculate the cluster's offset in the volume.  This calculation is
2282  * slightly more complex when using superclusters because superclusters
2283  * are grouped in blocks of 16, followed by 16 x N clusters where N
2284  * is the number of clusters a supercluster can manage.
2285  */
2286 static int64_t
2287 calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no)
2288 {
2289         int32_t scl_group;
2290         int64_t scl_group_size;
2291         int64_t off;
2292
2293         if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
2294                 scl_group = clu_no / HAMMER_VOL_SUPERCLUSTER_GROUP /
2295                             HAMMER_SCL_MAXCLUSTERS;
2296                 scl_group_size = 
2297                             ((int64_t)HAMMER_BUFSIZE *
2298                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2299                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2300                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2301                 scl_group_size += 
2302                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2303
2304                 off = volume->cluster_base +
2305                       scl_group * scl_group_size +
2306                       (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
2307                       ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS *
2308                        HAMMER_VOL_SUPERCLUSTER_GROUP))
2309                       * volume->vol_clsize;
2310         } else {
2311                 off = volume->cluster_base +
2312                       (int64_t)clu_no * volume->vol_clsize;
2313         }
2314         return(off);
2315 }
2316
2317 /*
2318  * Calculate a super-cluster's offset in the volume.
2319  */
2320 static int64_t
2321 calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no)
2322 {
2323         int64_t off;
2324         int32_t scl_group;
2325         int64_t scl_group_size;
2326
2327         KKASSERT (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL);
2328         scl_group = scl_no / HAMMER_VOL_SUPERCLUSTER_GROUP;
2329         if (scl_group) {
2330                 scl_group_size = 
2331                             ((int64_t)HAMMER_BUFSIZE *
2332                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2333                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2334                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2335                 scl_group_size += 
2336                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2337                 off = volume->cluster_base + (scl_group * scl_group_size) +
2338                       (scl_no % HAMMER_VOL_SUPERCLUSTER_GROUP) * HAMMER_BUFSIZE;
2339         } else {
2340                 off = volume->cluster_base + (scl_no * HAMMER_BUFSIZE);
2341         }
2342         return(off);
2343 }
2344
2345 /*
2346  * Allocate nblks buffers from the cluster's master alist.
2347  */
2348 static int32_t
2349 hammer_alloc_master(hammer_cluster_t cluster, int nblks,
2350                     int32_t start, int isfwd)
2351 {
2352         int32_t buf_no;
2353
2354         hammer_modify_cluster(cluster);
2355         if (isfwd) {
2356                 buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2357                                                 nblks, start);
2358                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2359                         buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2360                                                 nblks, 0);
2361                 }
2362         } else {
2363                 buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2364                                                 nblks, start);
2365                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2366                         buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2367                                                 nblks, HAMMER_ALIST_BLOCK_MAX);
2368                 }
2369         }
2370
2371         /*
2372          * Recover space from empty record, b-tree, and data a-lists.
2373          */
2374
2375         return(buf_no);
2376 }
2377
2378 /*
2379  * Adjust allocation statistics
2380  */
2381 static void
2382 hammer_adjust_stats(hammer_cluster_t cluster, u_int64_t buf_type, int nblks)
2383 {
2384         if (nblks == 0)
2385                 return;
2386
2387         hammer_modify_cluster(cluster);
2388         hammer_modify_volume(cluster->volume);
2389         hammer_modify_volume(cluster->volume->hmp->rootvol);
2390
2391         switch(buf_type) {
2392         case HAMMER_FSBUF_BTREE:
2393                 cluster->ondisk->stat_idx_bufs += nblks;
2394                 cluster->volume->ondisk->vol_stat_idx_bufs += nblks;
2395                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_idx_bufs += nblks;
2396                 break;
2397         case HAMMER_FSBUF_DATA:
2398                 cluster->ondisk->stat_data_bufs += nblks;
2399                 cluster->volume->ondisk->vol_stat_data_bufs += nblks;
2400                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_data_bufs += nblks;
2401                 break;
2402         case HAMMER_FSBUF_RECORDS:
2403                 cluster->ondisk->stat_rec_bufs += nblks;
2404                 cluster->volume->ondisk->vol_stat_rec_bufs += nblks;
2405                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_rec_bufs += nblks;
2406                 break;
2407         }
2408 }
2409
2410 /*
2411  * A-LIST SUPPORT
2412  *
2413  * Set up the parameters for the various A-lists we use in hammer.  The
2414  * supercluster A-list must be chained to the cluster A-list and cluster
2415  * slave A-lists are chained to buffer A-lists.
2416  *
2417  * See hammer_init_alist_config() below.
2418  */
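
/*
 * As a concrete illustration of the chaining (descriptive only): when an
 * allocation in the cluster's slave A-list reaches the per-buffer radix
 * level it invokes bl_radix_alloc_fwd, which is wired below to
 * buffer_alist_alloc_fwd().  That callback converts the A-list block number
 * into a buffer number (blk / HAMMER_FSBUF_MAXBLKS), brings the buffer in
 * with hammer_get_buffer(), and continues the allocation in the buffer's
 * own A-list.  The super_alist_* callbacks do the same thing one level up,
 * mapping volume A-list blocks onto super-cluster A-lists.
 */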
2419
2420 /*
2421  * A-LIST - cluster recursion into a filesystem buffer
2422  *
2423  * In the init case the buffer has already been initialized by
2424  * alloc_new_buffer() when it allocated the buffer out of the master
2425  * alist and marked it as free in the slave alist.
2426  *
2427  * Because we use a somewhat odd mechanism to assign buffers to slave
2428  * pools we can't actually free the buffer back to the master alist in
2429  * buffer_alist_destroy(), but instead must deal with that logic somewhere
2430  * else.
2431  */
2432 static int
2433 buffer_alist_init(void *info, int32_t blk, int32_t radix,
2434                   hammer_alloc_state_t state)
2435 {
2436         return(0);
2437 }
2438
2439 /*
2440  * Note: This routine is only called when freeing the last elements of
2441  * an initialized buffer.  Freeing all elements of the buffer when the
2442  * buffer was not previously initialized does not call this routine.
2443  */
2444 static int
2445 buffer_alist_destroy(void *info, int32_t blk, int32_t radix)
2446 {
2447         hammer_cluster_t cluster = info;
2448         int32_t buf_no;
2449
2450         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2451         if (hammer_debug_general & 0x80) {
2452                 kprintf("destroy buffer %d:%d:%d\n",
2453                         cluster->volume->vol_no, cluster->clu_no, buf_no);
2454         }
2455         return (0);
2456 }
2457
2458 /*
2459  * Note: atblk can be negative and atblk - blk can go negative.
2460  */
2461 static int
2462 buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2463                       int32_t count, int32_t atblk, int32_t *fullp)
2464 {
2465         hammer_cluster_t cluster = info;
2466         hammer_buffer_t buffer;
2467         int32_t buf_no;
2468         int32_t r;
2469         int error = 0;
2470
2471         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2472         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2473         if (buffer) {
2474                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2475
2476                 hammer_modify_buffer(buffer);
2477                 r = hammer_alist_alloc_fwd(&buffer->alist, count, atblk - blk);
2478                 if (r != HAMMER_ALIST_BLOCK_NONE)
2479                         r += blk;
2480                 *fullp = hammer_alist_isfull(&buffer->alist);
2481                 hammer_rel_buffer(buffer, 0);
2482         } else {
2483                 r = HAMMER_ALIST_BLOCK_NONE;
2484                 *fullp = 1;
2485         }
2486         return(r);
2487 }
2488
2489 static int
2490 buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2491                       int32_t count, int32_t atblk, int32_t *fullp)
2492 {
2493         hammer_cluster_t cluster = info;
2494         hammer_buffer_t buffer;
2495         int32_t buf_no;
2496         int32_t r;
2497         int error = 0;
2498
2499         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2500         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2501         if (buffer) {
2502                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2503                 hammer_modify_buffer(buffer);
2504                 r = hammer_alist_alloc_rev(&buffer->alist, count, atblk - blk);
2505                 if (r != HAMMER_ALIST_BLOCK_NONE)
2506                         r += blk;
2507                 *fullp = hammer_alist_isfull(&buffer->alist);
2508                 hammer_rel_buffer(buffer, 0);
2509         } else {
2510                 r = HAMMER_ALIST_BLOCK_NONE;
2511                 *fullp = 1;
2512         }
2513         return(r);
2514 }
2515
2516 static void
2517 buffer_alist_free(void *info, int32_t blk, int32_t radix,
2518                  int32_t base_blk, int32_t count, int32_t *emptyp)
2519 {
2520         hammer_cluster_t cluster = info;
2521         hammer_buffer_t buffer;
2522         int32_t buf_no;
2523         int error = 0;
2524
2525         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2526         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2527         if (buffer) {
2528                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2529                 hammer_modify_buffer(buffer);
2530                 hammer_alist_free(&buffer->alist, base_blk, count);
2531                 *emptyp = hammer_alist_isempty(&buffer->alist);
2532                 hammer_rel_buffer(buffer, 0);
2533         } else {
2534                 *emptyp = 0;
2535         }
2536 }
2537
2538 static int32_t
2539 buffer_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk,
2540                   int flags)
2541 {
2542         hammer_cluster_t cluster = info;
2543         hammer_buffer_t buffer;
2544         int32_t buf_no;
2545         int32_t maxblks;
2546         int error = 0;
2547
2548         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2549         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2550         if (buffer) {
2551                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2552                 switch(buffer->ondisk->head.buf_type) {
2553                 case HAMMER_FSBUF_RECORDS:
2554                         maxblks = HAMMER_RECORD_NODES;
2555                         break;
2556                 case HAMMER_FSBUF_BTREE:
2557                         maxblks = HAMMER_BTREE_NODES;
2558                         break;
2559                 case HAMMER_FSBUF_DATA:
2560                         maxblks = HAMMER_DATA_NODES;
2561                         break;
2562                 default:
2563                         panic("buffer_alist_find: unknown buffer type");
2564                         maxblks = 0;
2565                         break;
2566                 }
2567                 blk = hammer_alist_find(&buffer->alist, atblk - blk, maxblks,
2568                                         flags);
2569                 hammer_rel_buffer(buffer, 0);
2570         } else {
2571                 blk = HAMMER_ALIST_BLOCK_NONE;
2572         }
2573         return(blk);
2574 }
2575
2576 static void
2577 buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2578 {
2579 }
2580
2581 /*
2582  * A-LIST - super-cluster recursion into a cluster and cluster recursion
2583  * into a filesystem buffer.  A-List's are mostly self-contained entities,
2584  * but callbacks must be installed to recurse from one A-List to another.
2585  *
2586  * Implementing these callbacks allows us to operate a multi-layered A-List
2587  * as a single entity.
2588  */
2589
2590 /*
2591  * This occurs when allocating a cluster via the volume a-list and the
2592  * entry in the volume a-list indicated all-free.  The underlying supercl
2593  * has not yet been initialized.
2594  */
2595 static int
2596 super_alist_init(void *info, int32_t blk, int32_t radix,
2597                  hammer_alloc_state_t state)
2598 {
2599         hammer_volume_t volume = info;
2600         hammer_supercl_t supercl;
2601         int32_t scl_no;
2602         int error = 0;
2603
2604         /*
2605          * Calculate the super-cluster number containing the cluster (blk)
2606          * and obtain the super-cluster buffer.
2607          */
2608         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2609         supercl = hammer_get_supercl(volume, scl_no, &error, state);
2610         if (supercl)
2611                 hammer_rel_supercl(supercl, 0);
2612         return (error);
2613 }
2614
2615 static int
2616 super_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
2617 {
2618         hammer_volume_t volume = info;
2619         hammer_supercl_t supercl;
2620         int32_t scl_no;
2621         int error = 0;
2622
2623         /*
2624          * Calculate the super-cluster number containing the cluster (blk)
2625          * and obtain the super-cluster buffer.
2626          */
2627         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2628         supercl = hammer_get_supercl(volume, scl_no, &error,
2629                                      HAMMER_ASTATE_NONE);
2630         if (supercl) {
2631                 hammer_modify_supercl(supercl);
2632                 error = hammer_alist_recover(&supercl->alist, blk, 0, count);
2633                 /* free block count is returned if >= 0 */
2634                 hammer_rel_supercl(supercl, 0);
2635         } else {
2636                 error = -error;
2637         }
2638         return (error);
2639 }
2640
2641 /*
2642  * This occurs when freeing a cluster via the volume a-list and the
2643  * supercl is now 100% free.  We can destroy the supercl.
2644  *
2645  * What we actually do is just unset the modify bit so it doesn't get
2646  * written out.
2647  */
2648 static int
2649 super_alist_destroy(void *info, int32_t blk, int32_t radix)
2650 {
2651         hammer_volume_t volume = info;
2652         hammer_supercl_t supercl;
2653         int32_t scl_no;
2654         int error = 0;
2655
2656         /*
2657          * Calculate the super-cluster number containing the cluster (blk)
2658          * and obtain the super-cluster buffer.
2659          */
2660         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2661         if (hammer_find_supercl(volume, scl_no)) {
2662                 supercl = hammer_get_supercl(volume, scl_no, &error,
2663                                              HAMMER_ASTATE_FREE);
2664                                              /* XXX */
2665                 if (supercl) {
2666                         hammer_io_clear_modify(&supercl->io);
2667                         hammer_rel_supercl(supercl, 0);
2668                 }
2669         }
2670         return (error);
2671 }
2672
2673 static int
2674 super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2675                       int32_t count, int32_t atblk, int32_t *fullp)
2676 {
2677         hammer_volume_t volume = info;
2678         hammer_supercl_t supercl;
2679         int32_t scl_no;
2680         int32_t r;
2681         int error = 0;
2682
2683         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2684         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2685         if (supercl) {
2686                 hammer_modify_supercl(supercl);
2687                 r = hammer_alist_alloc_fwd(&supercl->alist, count, atblk - blk);
2688                 if (r != HAMMER_ALIST_BLOCK_NONE)
2689                         r += blk;
2690                 *fullp = hammer_alist_isfull(&supercl->alist);
2691                 hammer_rel_supercl(supercl, 0);
2692         } else {
2693                 r = HAMMER_ALIST_BLOCK_NONE;
2694                 *fullp = 1;
2695         }
2696         return(r);
2697 }
2698
2699 static int
2700 super_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2701                       int32_t count, int32_t atblk, int32_t *fullp)
2702 {
2703         hammer_volume_t volume = info;
2704         hammer_supercl_t supercl;
2705         int32_t scl_no;
2706         int32_t r;
2707         int error = 0;
2708
2709         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2710         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2711         if (supercl) {
2712                 hammer_modify_supercl(supercl);
2713                 r = hammer_alist_alloc_rev(&supercl->alist, count, atblk - blk);
2714                 if (r != HAMMER_ALIST_BLOCK_NONE)
2715                         r += blk;
2716                 *fullp = hammer_alist_isfull(&supercl->alist);
2717                 hammer_rel_supercl(supercl, 0);
2718         } else { 
2719                 r = HAMMER_ALIST_BLOCK_NONE;
2720                 *fullp = 1;
2721         }
2722         return(r);
2723 }
2724
2725 static void
2726 super_alist_free(void *info, int32_t blk, int32_t radix,
2727                  int32_t base_blk, int32_t count, int32_t *emptyp)
2728 {
2729         hammer_volume_t volume = info;
2730         hammer_supercl_t supercl;
2731         int32_t scl_no;
2732         int error = 0;
2733
2734         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2735         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2736         if (supercl) {
2737                 hammer_modify_supercl(supercl);
2738                 hammer_alist_free(&supercl->alist, base_blk, count);
2739                 *emptyp = hammer_alist_isempty(&supercl->alist);
2740                 hammer_rel_supercl(supercl, 0);
2741         } else {
2742                 *emptyp = 0;
2743         }
2744 }
2745
2746 static int32_t
2747 super_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk,
2748                   int flags)
2749 {
2750         hammer_volume_t volume = info;
2751         hammer_supercl_t supercl;
2752         int32_t scl_no;
2753         int32_t nclusters;
2754         int error = 0;
2755
2756         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2757         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2758         if (supercl) {
2759                 nclusters = supercl->volume->ondisk->vol_nclusters -
2760                             ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS);
2761                 KKASSERT(nclusters > 0);
2762                 if (nclusters > HAMMER_SCL_MAXCLUSTERS)
2763                         nclusters = HAMMER_SCL_MAXCLUSTERS;
2764                 blk = hammer_alist_find(&supercl->alist, atblk - blk,
2765                                         nclusters, flags);
2766                 hammer_rel_supercl(supercl, 0);
2767         } else {
2768                 blk = HAMMER_ALIST_BLOCK_NONE;
2769         }
2770         return(blk);
2771 }
2772
2773 static void
2774 super_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2775 {
2776 }
2777
2778 void
2779 hammer_init_alist_config(void)
2780 {
2781         hammer_alist_config_t config;
2782
2783         hammer_alist_template(&Buf_alist_config, HAMMER_FSBUF_MAXBLKS,
2784                               1, HAMMER_FSBUF_METAELMS, 0);
2785         hammer_alist_template(&Vol_normal_alist_config, HAMMER_VOL_MAXCLUSTERS,
2786                               1, HAMMER_VOL_METAELMS_1LYR, 0);
2787         hammer_alist_template(&Vol_super_alist_config,
2788                           HAMMER_VOL_MAXSUPERCLUSTERS * HAMMER_SCL_MAXCLUSTERS,
2789                               HAMMER_SCL_MAXCLUSTERS, HAMMER_VOL_METAELMS_2LYR,
2790                               0);
2791         hammer_alist_template(&Supercl_alist_config, HAMMER_VOL_MAXCLUSTERS,
2792                               1, HAMMER_SUPERCL_METAELMS, 0);
2793         hammer_alist_template(&Clu_master_alist_config, HAMMER_CLU_MAXBUFFERS,
2794                               1, HAMMER_CLU_MASTER_METAELMS, 0);
2795         hammer_alist_template(&Clu_slave_alist_config,
2796                               HAMMER_CLU_MAXBUFFERS * HAMMER_FSBUF_MAXBLKS,
2797                               HAMMER_FSBUF_MAXBLKS, HAMMER_CLU_SLAVE_METAELMS,
2798                               1);
2799
2800         config = &Vol_super_alist_config;
2801         config->bl_radix_init = super_alist_init;
2802         config->bl_radix_recover = super_alist_recover;
2803         config->bl_radix_destroy = super_alist_destroy;
2804         config->bl_radix_alloc_fwd = super_alist_alloc_fwd;
2805         config->bl_radix_alloc_rev = super_alist_alloc_rev;
2806         config->bl_radix_free = super_alist_free;
2807         config->bl_radix_find = super_alist_find;
2808         config->bl_radix_print = super_alist_print;
2809
2810         config = &Clu_slave_alist_config;
2811         config->bl_radix_init = buffer_alist_init;
2812         config->bl_radix_recover = buffer_alist_recover;
2813         config->bl_radix_destroy = buffer_alist_destroy;
2814         config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd;
2815         config->bl_radix_alloc_rev = buffer_alist_alloc_rev;
2816         config->bl_radix_free = buffer_alist_free;
2817         config->bl_radix_find = buffer_alist_find;
2818         config->bl_radix_print = buffer_alist_print;
2819 }
2820