HAMMER 22/many: Recovery and B-Tree work.
sys/vfs/hammer/hammer_ondisk.c (dragonfly.git)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.23 2008/01/21 00:00:19 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_supercl(hammer_supercl_t supercl,
51                         hammer_alloc_state_t isnew);
52 static int hammer_load_cluster(hammer_cluster_t cluster, int getflags);
53 static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type);
54 static int hammer_load_node(hammer_node_t node);
55 static void alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type,
56                         hammer_alist_t live,
57                         int32_t start, int *errorp,
58                         struct hammer_buffer **bufferp);
59 #if 0
60 static void readhammerbuf(hammer_volume_t vol, void *data,
61                         int64_t offset);
62 static void writehammerbuf(hammer_volume_t vol, const void *data,
63                         int64_t offset);
64 #endif
65 static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no);
66 static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no);
67 static int32_t hammer_alloc_master(hammer_cluster_t cluster, int nblks,
68                         int32_t start, int isfwd);
69 static void hammer_adjust_stats(hammer_cluster_t cluster,
70                         u_int64_t buf_type, int nblks);
71
72 struct hammer_alist_config Buf_alist_config;
73 struct hammer_alist_config Vol_normal_alist_config;
74 struct hammer_alist_config Vol_super_alist_config;
75 struct hammer_alist_config Supercl_alist_config;
76 struct hammer_alist_config Clu_master_alist_config;
77 struct hammer_alist_config Clu_slave_alist_config;
78
79 /*
80  * Red-Black tree support for various structures
81  */
82 static int
83 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
84 {
85         if (ip1->obj_id < ip2->obj_id)
86                 return(-1);
87         if (ip1->obj_id > ip2->obj_id)
88                 return(1);
89         if (ip1->obj_asof < ip2->obj_asof)
90                 return(-1);
91         if (ip1->obj_asof > ip2->obj_asof)
92                 return(1);
93         return(0);
94 }
95
96 static int
97 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
98 {
99         if (info->obj_id < ip->obj_id)
100                 return(-1);
101         if (info->obj_id > ip->obj_id)
102                 return(1);
103         if (info->obj_asof < ip->obj_asof)
104                 return(-1);
105         if (info->obj_asof > ip->obj_asof)
106                 return(1);
107         return(0);
108 }
109
110 static int
111 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
112 {
113         if (vol1->vol_no < vol2->vol_no)
114                 return(-1);
115         if (vol1->vol_no > vol2->vol_no)
116                 return(1);
117         return(0);
118 }
119
120 static int
121 hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2)
122 {
123         if (cl1->scl_no < cl2->scl_no)
124                 return(-1);
125         if (cl1->scl_no > cl2->scl_no)
126                 return(1);
127         return(0);
128 }
129
130 static int
131 hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2)
132 {
133         if (cl1->clu_no < cl2->clu_no)
134                 return(-1);
135         if (cl1->clu_no > cl2->clu_no)
136                 return(1);
137         return(0);
138 }
139
140 static int
141 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
142 {
143         if (buf1->buf_no < buf2->buf_no)
144                 return(-1);
145         if (buf1->buf_no > buf2->buf_no)
146                 return(1);
147         return(0);
148 }
149
150 static int
151 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
152 {
153         if (node1->node_offset < node2->node_offset)
154                 return(-1);
155         if (node1->node_offset > node2->node_offset)
156                 return(1);
157         return(0);
158 }
159
160 /*
161  * Note: The lookup function for hammer_ino_rb_tree winds up being named
162  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
163  * functions are normal, e.g. hammer_clu_rb_tree_RB_LOOKUP(root, clu_no).
164  */
165 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
166 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
167                 hammer_inode_info_cmp, hammer_inode_info_t);
168 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
169              hammer_vol_rb_compare, int32_t, vol_no);
170 RB_GENERATE2(hammer_scl_rb_tree, hammer_supercl, rb_node,
171              hammer_scl_rb_compare, int32_t, scl_no);
172 RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node,
173              hammer_clu_rb_compare, int32_t, clu_no);
174 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
175              hammer_buf_rb_compare, int32_t, buf_no);
176 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
177              hammer_nod_rb_compare, int32_t, node_offset);
178
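/*
 * Illustrative note (sketch drawn from usage later in this file): the
 * generated lookup functions are used like
 *
 *	volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
 *	node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root,
 *			 node_offset);
 *
 * RB_GENERATE2() keys its lookup on the named scalar field (vol_no,
 * clu_no, buf_no, ...), while the INFO variant generated for the inode
 * tree compares both obj_id and obj_asof via hammer_inode_info_cmp().
 */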
179 /************************************************************************
180  *                              VOLUMES                                 *
181  ************************************************************************
182  *
183  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
184  * code on failure.  Volumes must be loaded at mount time; get_volume() will
185  * not load a new volume.
186  *
187  * Calls made to hammer_load_volume() are single-threaded.
188  */
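/*
 * Typical call pattern (sketch only; the real callers live in the VFS
 * mount/unmount paths, e.g. hammer_vfsops.c):
 *
 *	error = hammer_install_volume(hmp, volname);
 *	...
 *	volume = hammer_get_volume(hmp, vol_no, &error);
 *	if (volume)
 *		hammer_rel_volume(volume, 0);
 */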
189 int
190 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
191 {
192         struct mount *mp;
193         hammer_volume_t volume;
194         struct hammer_volume_ondisk *ondisk;
195         struct nlookupdata nd;
196         struct buf *bp = NULL;
197         int error;
198         int ronly;
199
200         mp = hmp->mp;
201         ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
202
203         /*
204          * Allocate a volume structure
205          */
206         ++hammer_count_volumes;
207         volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
208         volume->vol_name = kstrdup(volname, M_HAMMER);
209         volume->hmp = hmp;
210         hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
211         volume->io.offset = 0LL;
212
213         /*
214          * Get the device vnode
215          */
216         error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
217         if (error == 0)
218                 error = nlookup(&nd);
219         if (error == 0)
220                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
221         nlookup_done(&nd);
222         if (error == 0) {
223                 vn_isdisk(volume->devvp, &error);
224         }
225         if (error == 0) {
226                 vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
227                 error = VOP_OPEN(volume->devvp, (ronly ? FREAD : FREAD|FWRITE),
228                                  FSCRED, NULL);
229                 vn_unlock(volume->devvp);
230         }
231         if (error) {
232                 hammer_free_volume(volume);
233                 return(error);
234         }
235
236         /*
237          * Extract the volume number from the volume header and do various
238          * sanity checks.
239          */
240         error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
241         if (error)
242                 goto late_failure;
243         ondisk = (void *)bp->b_data;
244         if (ondisk->head.buf_type != HAMMER_FSBUF_VOLUME) {
245                 kprintf("hammer_mount: volume %s has an invalid header\n",
246                         volume->vol_name);
247                 error = EFTYPE;
248                 goto late_failure;
249         }
250         volume->vol_no = ondisk->vol_no;
251         volume->cluster_base = ondisk->vol_clo_beg;
252         volume->vol_clsize = ondisk->vol_clsize;
253         volume->vol_flags = ondisk->vol_flags;
254         volume->nblocks = ondisk->vol_nblocks; 
255         RB_INIT(&volume->rb_clus_root);
256         RB_INIT(&volume->rb_scls_root);
257
258         hmp->mp->mnt_stat.f_blocks += volume->nblocks;
259
260         if (RB_EMPTY(&hmp->rb_vols_root)) {
261                 hmp->fsid = ondisk->vol_fsid;
262         } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
263                 kprintf("hammer_mount: volume %s's fsid does not match "
264                         "other volumes\n", volume->vol_name);
265                 error = EFTYPE;
266                 goto late_failure;
267         }
268
269         /*
270          * Insert the volume structure into the red-black tree.
271          */
272         if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
273                 kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
274                         volume->vol_name, volume->vol_no);
275                 error = EEXIST;
276         }
277
278         /*
279          * Set the root volume and load the root cluster.  HAMMER special
280          * cases rootvol and rootcl and will not deallocate the structures.
281          * We do not hold a ref because this would prevent related I/O
282          * from being flushed.
283          */
284         if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
285                 hmp->rootvol = volume;
286                 if (bp) {
287                         brelse(bp);
288                         bp = NULL;
289                 }
290                 hammer_ref_volume(volume);
291                 hmp->rootcl = hammer_get_cluster(volume,
292                                                  ondisk->vol0_root_clu_no,
293                                                  &error, 0);
294                 hammer_rel_cluster(hmp->rootcl, 0);
295                 hammer_rel_volume(volume, 0);
296                 hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
297         }
298 late_failure:
299         if (bp)
300                 brelse(bp);
301         if (error) {
302                 /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
303                 VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
304                 hammer_free_volume(volume);
305         }
306         return (error);
307 }
308
309 /*
310  * Unload and free a HAMMER volume.  Must return >= 0 for the RB_SCAN to
311  * continue; -1 is returned on failure.
312  */
313 int
314 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
315 {
316         struct hammer_mount *hmp = volume->hmp;
317         hammer_cluster_t rootcl;
318         int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
319
320         /*
321          * Sync clusters, sync volume
322          */
323
324         hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
325
326         /*
327          * Clean up the root cluster, which is held unlocked in the root
328          * volume.
329          */
330         if (hmp->rootvol == volume) {
331                 if ((rootcl = hmp->rootcl) != NULL)
332                         hmp->rootcl = NULL;
333                 hmp->rootvol = NULL;
334         }
335
336         /*
337          * Unload clusters and super-clusters.  Unloading a super-cluster
338          * also unloads related clusters, but the filesystem may not be
339          * using super-clusters so unload clusters anyway.
340          */
341         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
342                         hammer_unload_cluster, NULL);
343         RB_SCAN(hammer_scl_rb_tree, &volume->rb_scls_root, NULL,
344                         hammer_unload_supercl, NULL);
345         hammer_io_waitdep(&volume->io);
346
347         /*
348          * Release our buffer and flush anything left in the buffer cache.
349          */
350         hammer_io_release(&volume->io, 2);
351
352         /*
353          * There should be no references on the volume, no clusters, and
354          * no super-clusters.
355          */
356         KKASSERT(volume->io.lock.refs == 0);
357         KKASSERT(RB_EMPTY(&volume->rb_clus_root));
358         KKASSERT(RB_EMPTY(&volume->rb_scls_root));
359
360         volume->ondisk = NULL;
361         if (volume->devvp) {
362                 if (ronly) {
363                         vinvalbuf(volume->devvp, 0, 0, 0);
364                         VOP_CLOSE(volume->devvp, FREAD);
365                 } else {
366                         vinvalbuf(volume->devvp, V_SAVE, 0, 0);
367                         VOP_CLOSE(volume->devvp, FREAD|FWRITE);
368                 }
369         }
370
371         /*
372          * Destroy the structure
373          */
374         RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
375         hammer_free_volume(volume);
376         return(0);
377 }
378
379 static
380 void
381 hammer_free_volume(hammer_volume_t volume)
382 {
383         if (volume->vol_name) {
384                 kfree(volume->vol_name, M_HAMMER);
385                 volume->vol_name = NULL;
386         }
387         if (volume->devvp) {
388                 vrele(volume->devvp);
389                 volume->devvp = NULL;
390         }
391         --hammer_count_volumes;
392         kfree(volume, M_HAMMER);
393 }
394
395 /*
396  * Get a HAMMER volume.  The volume must already exist.
397  */
398 hammer_volume_t
399 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
400 {
401         struct hammer_volume *volume;
402
403         /*
404          * Locate the volume structure
405          */
406         volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
407         if (volume == NULL) {
408                 *errorp = ENOENT;
409                 return(NULL);
410         }
411         hammer_ref(&volume->io.lock);
412
413         /*
414          * Deal with on-disk info
415          */
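        /*
         * (The loading flag is set by hammer_load_volume() while it holds
         * the exclusive lock, so acquiring and immediately releasing the
         * lock here simply waits out a concurrent loader before
         * volume->ondisk is tested below.)
         */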
416         while (volume->io.loading) {
417                 hammer_lock_ex(&volume->io.lock);
418                 hammer_unlock(&volume->io.lock);
419         }
420         if (volume->ondisk == NULL) {
421                 *errorp = hammer_load_volume(volume);
422                 if (*errorp) {
423                         hammer_rel_volume(volume, 1);
424                         volume = NULL;
425                 }
426         } else {
427                 *errorp = 0;
428         }
429         return(volume);
430 }
431
432 int
433 hammer_ref_volume(hammer_volume_t volume)
434 {
435         int error;
436
437         hammer_ref(&volume->io.lock);
438
439         /*
440          * Deal with on-disk info
441          */
442         while (volume->io.loading) {
443                 hammer_lock_ex(&volume->io.lock);
444                 hammer_unlock(&volume->io.lock);
445         }
446         if (volume->ondisk == NULL) {
447                 error = hammer_load_volume(volume);
448                 if (error)
449                         hammer_rel_volume(volume, 1);
450         } else {
451                 error = 0;
452         }
453         return (error);
454 }
455
456 hammer_volume_t
457 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
458 {
459         hammer_volume_t volume;
460
461         volume = hmp->rootvol;
462         KKASSERT(volume != NULL);
463         hammer_ref(&volume->io.lock);
464
465         /*
466          * Deal with on-disk info
467          */
468         while (volume->io.loading) {
469                 hammer_lock_ex(&volume->io.lock);
470                 hammer_unlock(&volume->io.lock);
471         }
472         if (volume->ondisk == NULL) {
473                 *errorp = hammer_load_volume(volume);
474                 if (*errorp) {
475                         hammer_rel_volume(volume, 1);
476                         volume = NULL;
477                 }
478         } else {
479                 *errorp = 0;
480         }
481         return (volume);
482 }
483
484 /*
485  * Load a volume's on-disk information.  The volume must be referenced and
486  * not locked.  We temporarily acquire an exclusive lock to interlock
487  * against releases or multiple get's.
488  */
489 static int
490 hammer_load_volume(hammer_volume_t volume)
491 {
492         struct hammer_volume_ondisk *ondisk;
493         int error;
494
495         hammer_lock_ex(&volume->io.lock);
496         volume->io.loading = 1;
497         if (volume->ondisk == NULL) {
498                 error = hammer_io_read(volume->devvp, &volume->io);
499                 if (error) {
500                         volume->io.loading = 0;
501                         hammer_unlock(&volume->io.lock);
502                         return (error);
503                 }
504                 volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
505
506                 /*
507                  * Configure the volume's A-lists.  These are used to
508                  * allocate clusters.
509                  */
510                 if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
511                         volume->alist.config = &Vol_super_alist_config;
512                         volume->alist.meta = ondisk->vol_almeta.super;
513                         volume->alist.info = volume;
514                 } else {
515                         volume->alist.config = &Vol_normal_alist_config;
516                         volume->alist.meta = ondisk->vol_almeta.normal;
517                         volume->alist.info = NULL;
518                 }
519         } else {
520                 error = 0;
521         }
522         volume->io.loading = 0;
523         hammer_unlock(&volume->io.lock);
524         return(0);
525 }
526
527 /*
528  * Release a volume.  Call hammer_io_release on the last reference.  We have
529  * to acquire an exclusive lock to interlock against volume->ondisk tests
530  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
531  * lock to be held.
532  *
533  * Volumes are not unloaded from memory during normal operation.
534  */
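/*
 * (The refs == 1 test is repeated after acquiring the exclusive lock
 * because another thread may gain a reference between the unlocked test
 * and the lock acquisition.  hammer_rel_supercl(), hammer_rel_cluster()
 * and hammer_rel_buffer() below use the same pattern.)
 */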
535 void
536 hammer_rel_volume(hammer_volume_t volume, int flush)
537 {
538         if (volume->io.lock.refs == 1) {
539                 hammer_lock_ex(&volume->io.lock);
540                 if (volume->io.lock.refs == 1) {
541                         volume->ondisk = NULL;
542                         hammer_io_release(&volume->io, flush);
543                 } else if (flush) {
544                         hammer_io_flush(&volume->io);
545                 }
546                 hammer_unlock(&volume->io.lock);
547         }
548         hammer_unref(&volume->io.lock);
549 }
550
551 /************************************************************************
552  *                              SUPER-CLUSTERS                          *
553  ************************************************************************
554  *
555  * Manage super-clusters.  Note that a supercl holds a reference to its
556  * associated volume.
557  */
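/*
 * Usage sketch: an existing super-cluster is fetched with isnew == 0 so
 * hammer_load_supercl() reads it from disk; allocation paths pass a
 * non-zero hammer_alloc_state_t so the ondisk header and A-list are
 * initialized instead of read:
 *
 *	supercl = hammer_get_supercl(volume, scl_no, &error, 0);
 *	...
 *	hammer_rel_supercl(supercl, 0);
 */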
558 static int
559 hammer_find_supercl(hammer_volume_t volume, int32_t scl_no)
560 {
561         if (RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no))
562                 return(1);
563         return(0);
564 }
565
566 hammer_supercl_t
567 hammer_get_supercl(hammer_volume_t volume, int32_t scl_no,
568                    int *errorp, hammer_alloc_state_t isnew)
569 {
570         hammer_supercl_t supercl;
571
572         /*
573          * Locate and lock the super-cluster structure, creating one
574          * if necessary.
575          */
576 again:
577         supercl = RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no);
578         if (supercl == NULL) {
579                 ++hammer_count_supercls;
580                 supercl = kmalloc(sizeof(*supercl), M_HAMMER, M_WAITOK|M_ZERO);
581                 supercl->scl_no = scl_no;
582                 supercl->volume = volume;
583                 supercl->io.offset = calculate_supercl_offset(volume, scl_no);
584                 hammer_io_init(&supercl->io, HAMMER_STRUCTURE_SUPERCL);
585                 hammer_ref(&supercl->io.lock);
586
587                 /*
588                  * Insert the super-cluster into the RB tree and handle late
589                  * collisions.
590                  */
591                 if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) {
592                         hammer_unref(&supercl->io.lock);
593                         --hammer_count_supercls;
594                         kfree(supercl, M_HAMMER);
595                         goto again;
596                 }
597                 hammer_ref(&volume->io.lock);
598         } else {
599                 hammer_ref(&supercl->io.lock);
600         }
601
602         /*
603          * Deal with on-disk info
604          */
605         while (supercl->io.loading) {
606                 hammer_lock_ex(&supercl->io.lock);
607                 hammer_unlock(&supercl->io.lock);
608         }
609         if (supercl->ondisk == NULL || isnew) {
610                 *errorp = hammer_load_supercl(supercl, isnew);
611                 if (*errorp) {
612                         hammer_rel_supercl(supercl, 1);
613                         supercl = NULL;
614                 }
615         } else {
616                 *errorp = 0;
617         }
618         return(supercl);
619 }
620
621 static int
622 hammer_load_supercl(hammer_supercl_t supercl, hammer_alloc_state_t isnew)
623 {
624         struct hammer_supercl_ondisk *ondisk;
625         hammer_volume_t volume = supercl->volume;
626         int error;
627         int64_t nclusters;
628
629         hammer_lock_ex(&supercl->io.lock);
630         supercl->io.loading = 1;
631         if (supercl->ondisk == NULL) {
632                 if (isnew)
633                         error = hammer_io_new(volume->devvp, &supercl->io);
634                 else
635                         error = hammer_io_read(volume->devvp, &supercl->io);
636                 if (error) {
637                         supercl->io.loading = 0;
638                         hammer_unlock(&supercl->io.lock);
639                         return (error);
640                 }
641                 supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data;
642
643                 supercl->alist.config = &Supercl_alist_config;
644                 supercl->alist.meta = ondisk->scl_meta;
645                 supercl->alist.info = NULL;
646         } else if (isnew) {
647                 error = hammer_io_new(volume->devvp, &supercl->io);
648         } else {
649                 error = 0;
650         }
651         if (error == 0 && isnew) {
652                 /*
653                  * If this is a new super-cluster we have to initialize
654                  * various ondisk structural elements.  The caller is
655                  * responsible for the remainder.
656                  */
657                 struct hammer_alist_live dummy;
658
659                 hammer_modify_supercl(supercl);
660
661                 ondisk = supercl->ondisk;
662                 dummy.config = &Buf_alist_config;
663                 dummy.meta = ondisk->head.buf_almeta;
664                 dummy.info = NULL;
665                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL);
666
667                 nclusters = volume->ondisk->vol_nclusters -
668                             ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS);
669                 KKASSERT(nclusters > 0);
670                 if (nclusters > HAMMER_SCL_MAXCLUSTERS)
671                         nclusters = HAMMER_SCL_MAXCLUSTERS;
672                 hammer_alist_init(&supercl->alist, 0, (int32_t)nclusters,
673                                   isnew);
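                /*
                 * (The clamp above keeps the last super-cluster's A-list
                 * from covering cluster numbers beyond vol_nclusters;
                 * HAMMER_SCL_MAXCLUSTERS is the per-super-cluster
                 * capacity.)
                 */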
674         }
675         supercl->io.loading = 0;
676         hammer_unlock(&supercl->io.lock);
677         return (error);
678 }
679
680 /*
681  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
682  */
683 int
684 hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused)
685 {
686         KKASSERT(supercl->io.lock.refs == 0);
687         hammer_ref(&supercl->io.lock);
688         hammer_rel_supercl(supercl, 2);
689         return(0);
690 }
691
692 /*
693  * Release a super-cluster.  We have to deal with several places where
694  * another thread can ref the super-cluster.
695  *
696  * Only destroy the structure itself if the related buffer cache buffer
697  * was disassociated from it.  This ties the management of the structure
698  * to the buffer cache subsystem.
699  */
700 void
701 hammer_rel_supercl(hammer_supercl_t supercl, int flush)
702 {
703         hammer_volume_t volume;
704
705         if (supercl->io.lock.refs == 1) {
706                 hammer_lock_ex(&supercl->io.lock);
707                 if (supercl->io.lock.refs == 1) {
708                         hammer_io_release(&supercl->io, flush);
709                         if (supercl->io.bp == NULL &&
710                             supercl->io.lock.refs == 1) {
711                                 volume = supercl->volume;
712                                 RB_REMOVE(hammer_scl_rb_tree,
713                                           &volume->rb_scls_root, supercl);
714                                 supercl->volume = NULL; /* sanity */
715                                 --hammer_count_supercls;
716                                 kfree(supercl, M_HAMMER);
717                                 hammer_rel_volume(volume, 0);
718                                 return;
719                         }
720                 } else if (flush) {
721                         hammer_io_flush(&supercl->io);
722                 }
723                 hammer_unlock(&supercl->io.lock);
724         }
725         hammer_unref(&supercl->io.lock);
726 }
727
728 /************************************************************************
729  *                              CLUSTERS                                *
730  ************************************************************************
731  *
732  */
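/*
 * Note on getflags (flag values presumably defined in hammer.h):
 * GET_CLUSTER_NEW makes hammer_load_cluster() initialize a fresh ondisk
 * header instead of reading one, and GET_CLUSTER_NORECOVER suppresses
 * the hammer_recover() pass that is otherwise run when an existing
 * cluster is brought into memory.
 */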
733 hammer_cluster_t
734 hammer_get_cluster(hammer_volume_t volume, int32_t clu_no,
735                    int *errorp, int getflags)
736 {
737         hammer_cluster_t cluster;
738
739 again:
740         cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no);
741         if (cluster == NULL) {
742                 ++hammer_count_clusters;
743                 cluster = kmalloc(sizeof(*cluster), M_HAMMER, M_WAITOK|M_ZERO);
744                 cluster->clu_no = clu_no;
745                 cluster->volume = volume;
746                 RB_INIT(&cluster->rb_bufs_root);
747                 RB_INIT(&cluster->rb_nods_root);
748                 hammer_io_init(&cluster->io, HAMMER_STRUCTURE_CLUSTER);
749                 cluster->io.offset = calculate_cluster_offset(volume, clu_no);
750                 hammer_ref(&cluster->io.lock);
751
752                 /*
753                  * Insert the cluster into the RB tree and handle late
754                  * collisions.
755                  */
756                 if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) {
757                         hammer_unref(&cluster->io.lock);
758                         --hammer_count_clusters;
759                         kfree(cluster, M_HAMMER);
760                         goto again;
761                 }
762                 hammer_ref(&volume->io.lock);
763         } else {
764                 hammer_ref(&cluster->io.lock);
765         }
766
767         /*
768          * Deal with on-disk info
769          */
770         while (cluster->io.loading) {
771                 hammer_lock_ex(&cluster->io.lock);
772                 hammer_unlock(&cluster->io.lock);
773         }
774         if (cluster->ondisk == NULL || getflags) {
775                 *errorp = hammer_load_cluster(cluster, getflags);
776                 if (*errorp) {
777                         hammer_rel_cluster(cluster, 1);
778                         cluster = NULL;
779                 }
780         } else {
781                 *errorp = 0;
782         }
783         return (cluster);
784 }
785
786 hammer_cluster_t
787 hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp)
788 {
789         hammer_cluster_t cluster;
790
791         cluster = hmp->rootcl;
792         KKASSERT(cluster != NULL);
793         hammer_ref(&cluster->io.lock);
794
795         /*
796          * Deal with on-disk info
797          */
798         while (cluster->io.loading) {
799                 hammer_lock_ex(&cluster->io.lock);
800                 hammer_unlock(&cluster->io.lock);
801         }
802         if (cluster->ondisk == NULL) {
803                 *errorp = hammer_load_cluster(cluster, 0);
804                 if (*errorp) {
805                         hammer_rel_cluster(cluster, 1);
806                         cluster = NULL;
807                 }
808         } else {
809                 *errorp = 0;
810         }
811         return (cluster);
812 }
813
814 static
815 int
816 hammer_load_cluster(hammer_cluster_t cluster, int getflags)
817 {
818         hammer_volume_t volume = cluster->volume;
819         struct hammer_cluster_ondisk *ondisk;
820         int error;
821
822         /*
823          * Load the cluster's on-disk info
824          */
825         hammer_lock_ex(&cluster->io.lock);
826         cluster->io.loading = 1;
827         if (cluster->ondisk == NULL) {
828                 KKASSERT(TAILQ_EMPTY(&cluster->io.deplist));
829                 if (getflags & GET_CLUSTER_NEW)
830                         error = hammer_io_new(volume->devvp, &cluster->io);
831                 else
832                         error = hammer_io_read(volume->devvp, &cluster->io);
833                 if (error) {
834                         cluster->io.loading = 0;
835                         hammer_unlock(&cluster->io.lock);
836                         return (error);
837                 }
838                 cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data;
839
840                 cluster->alist_master.config = &Clu_master_alist_config;
841                 cluster->alist_master.meta = ondisk->clu_master_meta;
842                 cluster->alist_btree.config = &Clu_slave_alist_config;
843                 cluster->alist_btree.meta = ondisk->clu_btree_meta;
844                 cluster->alist_btree.info = cluster;
845                 cluster->alist_record.config = &Clu_slave_alist_config;
846                 cluster->alist_record.meta = ondisk->clu_record_meta;
847                 cluster->alist_record.info = cluster;
848                 cluster->alist_mdata.config = &Clu_slave_alist_config;
849                 cluster->alist_mdata.meta = ondisk->clu_mdata_meta;
850                 cluster->alist_mdata.info = cluster;
851
852                 if ((getflags & GET_CLUSTER_NEW) == 0) {
853                         /*
854                          * Load cluster range info for easy access
855                          */
856                         cluster->clu_btree_beg = ondisk->clu_btree_beg;
857                         cluster->clu_btree_end = ondisk->clu_btree_end;
858
859                         /*
860                          * Recover a cluster that was marked open.  This
861                          * can be rather involved and block for a hefty
862                          * chunk of time.
863                          */
864                         /*if (ondisk->clu_flags & HAMMER_CLUF_OPEN)*/
865                         if ((getflags & GET_CLUSTER_NORECOVER) == 0)
866                                 hammer_recover(cluster);
867                 }
868         } else if (getflags & GET_CLUSTER_NEW) {
869                 error = hammer_io_new(volume->devvp, &cluster->io);
870         } else {
871                 error = 0;
872         }
873         if (error == 0 && (getflags & GET_CLUSTER_NEW)) {
874                 /*
875                  * If this is a new cluster we have to initialize
876                  * various ondisk structural elements.  The caller is
877                  * responsible for the remainder.
878                  */
879                 struct hammer_alist_live dummy;
880                 hammer_node_t croot;
881                 hammer_volume_ondisk_t voldisk;
882                 int32_t nbuffers;
883
884                 hammer_modify_cluster(cluster);
885                 ondisk = cluster->ondisk;
886                 voldisk = volume->ondisk;
887
888                 dummy.config = &Buf_alist_config;
889                 dummy.meta = ondisk->head.buf_almeta;
890                 dummy.info = NULL;
891                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER);
892
893                 ondisk->vol_fsid = voldisk->vol_fsid;
894                 ondisk->vol_fstype = voldisk->vol_fstype;
895                 ondisk->clu_gen = 1;
896                 ondisk->clu_id = 0;     /* XXX */
897                 ondisk->clu_no = cluster->clu_no;
898                 ondisk->clu_flags = 0;
899                 ondisk->clu_start = HAMMER_BUFSIZE;
900                 ondisk->synchronized_rec_id = 1; /* XXX timestamp */
901                 KKASSERT(voldisk->vol_clo_end > cluster->io.offset);
902                 if (voldisk->vol_clo_end - cluster->io.offset >
903                     voldisk->vol_clsize) {
904                         ondisk->clu_limit = voldisk->vol_clsize;
905                 } else {
906                         ondisk->clu_limit = (int32_t)(voldisk->vol_clo_end -
907                                                       cluster->io.offset);
908                 }
909                 nbuffers = ondisk->clu_limit / HAMMER_BUFSIZE;
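                /*
                 * (Buffer 0 is the cluster header itself, so the master
                 * A-list below manages buffers 1..nbuffers-1 and the
                 * slave A-lists start their block numbering at
                 * HAMMER_FSBUF_MAXBLKS, i.e. at buffer 1.)
                 */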
910                 hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
911                                   HAMMER_ASTATE_FREE);
912                 hammer_alist_init(&cluster->alist_btree,
913                                   HAMMER_FSBUF_MAXBLKS,
914                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
915                                   HAMMER_ASTATE_ALLOC);
916                 hammer_alist_init(&cluster->alist_record,
917                                   HAMMER_FSBUF_MAXBLKS,
918                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
919                                   HAMMER_ASTATE_ALLOC);
920                 hammer_alist_init(&cluster->alist_mdata,
921                                   HAMMER_FSBUF_MAXBLKS,
922                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
923                                   HAMMER_ASTATE_ALLOC);
924
925                 ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
926                 ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
927                 ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;
928
929                 /*
930                  * Initialize the B-Tree.  We don't know what the caller
931                  * intends to do with the cluster so make sure it causes
932                  * an assertion if the caller makes no changes.
933                  */
934                 ondisk->clu_btree_parent_vol_no = -2;
935                 ondisk->clu_btree_parent_clu_no = -2;
936                 ondisk->clu_btree_parent_offset = -2;
937                 ondisk->clu_btree_parent_clu_gen = -2;
938
939                 croot = hammer_alloc_btree(cluster, &error);
940                 if (error == 0) {
941                         hammer_modify_node(croot);
942                         bzero(croot->ondisk, sizeof(*croot->ondisk));
943                         croot->ondisk->count = 0;
944                         croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
945                         hammer_modify_cluster(cluster);
946                         ondisk->clu_btree_root = croot->node_offset;
947                         hammer_rel_node(croot);
948                 }
949         }
950         cluster->io.loading = 0;
951         hammer_unlock(&cluster->io.lock);
952         return (error);
953 }
954
955 /*
956  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
957  */
958 int
959 hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused)
960 {
961         hammer_ref(&cluster->io.lock);
962         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
963                 hammer_unload_buffer, NULL);
964         hammer_io_waitdep(&cluster->io);
965         KKASSERT(cluster->io.lock.refs == 1);
966         hammer_rel_cluster(cluster, 2);
967         return(0);
968 }
969
970 /*
971  * Allocate a new record id.  The cluster's synchronized_rec_id is used
972  * during cluster recovery.  NOTE: The cluster header is not written out
973  * until all related records have been written out.
974  */
975 u_int64_t
976 hammer_alloc_recid(hammer_cluster_t cluster)
977 {
978         u_int64_t recid;
979
980         hammer_modify_cluster(cluster);
981         recid = cluster->ondisk->synchronized_rec_id++;
982         return(recid);
983 }
984
985 #if 0
986 void
987 hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid)
988 {
989         hammer_modify_cluster(cluster);
990         if (cluster->ondisk->synchronized_tid < tid)
991                 cluster->ondisk->synchronized_tid = tid;
992 }
993 #endif
994
995 /*
996  * Reference a cluster that is either already referenced or via a specially
997  * handled pointer (aka rootcl).
998  */
999 int
1000 hammer_ref_cluster(hammer_cluster_t cluster)
1001 {
1002         int error;
1003
1004         KKASSERT(cluster != NULL);
1005         hammer_ref(&cluster->io.lock);
1006
1007         /*
1008          * Deal with on-disk info
1009          */
1010         while (cluster->io.loading) {
1011                 hammer_lock_ex(&cluster->io.lock);
1012                 hammer_unlock(&cluster->io.lock);
1013         }
1014         if (cluster->ondisk == NULL) {
1015                 error = hammer_load_cluster(cluster, 0);
1016                 if (error)
1017                         hammer_rel_cluster(cluster, 1);
1018         } else {
1019                 error = 0;
1020         }
1021         return(error);
1022 }
1023
1024 /*
1025  * Release a cluster.  We have to deal with several places where
1026  * another thread can ref the cluster.
1027  *
1028  * Only destroy the structure itself if we no longer have an IO or any
1029  * hammer buffers associated with the structure.
1030  */
1031 void
1032 hammer_rel_cluster(hammer_cluster_t cluster, int flush)
1033 {
1034         hammer_volume_t volume;
1035
1036         if (cluster->io.lock.refs == 1) {
1037                 hammer_lock_ex(&cluster->io.lock);
1038                 if (cluster->io.lock.refs == 1) {
1039                         /*
1040                          * Release the I/O.  If we or the kernel wants to
1041                          * flush, this will release the bp.  Otherwise the
1042                          * bp may be written and flushed passively by the
1043                          * kernel later on.
1044                          */
1045                         hammer_io_release(&cluster->io, flush);
1046
1047                         /*
1048                          * Final cleanup
1049                          */
1050                         if (cluster != cluster->volume->hmp->rootcl &&
1051                             cluster->io.bp == NULL &&
1052                             cluster->io.lock.refs == 1 &&
1053                             RB_EMPTY(&cluster->rb_bufs_root)) {
1054                                 KKASSERT(RB_EMPTY(&cluster->rb_nods_root));
1055                                 volume = cluster->volume;
1056                                 RB_REMOVE(hammer_clu_rb_tree,
1057                                           &volume->rb_clus_root, cluster);
1058                                 cluster->volume = NULL; /* sanity */
1059                                 --hammer_count_clusters;
1060                                 kfree(cluster, M_HAMMER);
1061                                 hammer_rel_volume(volume, 0);
1062                                 return;
1063                         }
1064                 } else if (flush) {
1065                         hammer_io_flush(&cluster->io);
1066                 }
1067                 hammer_unlock(&cluster->io.lock);
1068         }
1069         hammer_unref(&cluster->io.lock);
1070 }
1071
1072 /************************************************************************
1073  *                              BUFFERS                                 *
1074  ************************************************************************
1075  *
1076  * Manage buffers.  Note that a buffer holds a reference to its associated
1077  * cluster, and its cluster will hold a reference to the cluster's volume.
1078  *
1079  * A non-zero buf_type indicates that a new buffer should be created and
1080  * zero'd.
1081  */
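/*
 * Sketch of typical use, drawn from the routines below: readers pass a
 * buf_type of 0, allocation paths pass one of the HAMMER_FSBUF_* types
 * so the buffer is created and initialized:
 *
 *	buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
 *	...
 *	hammer_rel_buffer(buffer, 0);
 */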
1082 hammer_buffer_t
1083 hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no,
1084                   u_int64_t buf_type, int *errorp)
1085 {
1086         hammer_buffer_t buffer;
1087
1088         /*
1089          * Find the buffer.  Note that buffer 0 corresponds to the cluster
1090          * header and should never be requested.
1091          */
1092         KKASSERT(buf_no >= cluster->ondisk->clu_start / HAMMER_BUFSIZE &&
1093                  buf_no < cluster->ondisk->clu_limit / HAMMER_BUFSIZE);
1094
1095         /*
1096          * Locate and lock the buffer structure, creating one if necessary.
1097          */
1098 again:
1099         buffer = RB_LOOKUP(hammer_buf_rb_tree, &cluster->rb_bufs_root, buf_no);
1100         if (buffer == NULL) {
1101                 ++hammer_count_buffers;
1102                 buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
1103                 buffer->buf_no = buf_no;
1104                 buffer->cluster = cluster;
1105                 buffer->volume = cluster->volume;
1106                 hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
1107                 buffer->io.offset = cluster->io.offset +
1108                                     (buf_no * HAMMER_BUFSIZE);
1109                 TAILQ_INIT(&buffer->clist);
1110                 hammer_ref(&buffer->io.lock);
1111
1112                 /*
1113                  * Insert the buffer into the RB tree and handle late
1114                  * collisions.
1115                  */
1116                 if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) {
1117                         hammer_unref(&buffer->io.lock);
1118                         --hammer_count_buffers;
1119                         kfree(buffer, M_HAMMER);
1120                         goto again;
1121                 }
1122                 hammer_ref(&cluster->io.lock);
1123         } else {
1124                 hammer_ref(&buffer->io.lock);
1125         }
1126
1127         /*
1128          * Deal with on-disk info
1129          */
1130         while (buffer->io.loading) {
1131                 hammer_lock_ex(&buffer->io.lock);
1132                 hammer_unlock(&buffer->io.lock);
1133         }
1134         if (buffer->ondisk == NULL || buf_type) {
1135                 *errorp = hammer_load_buffer(buffer, buf_type);
1136                 if (*errorp) {
1137                         hammer_rel_buffer(buffer, 1);
1138                         buffer = NULL;
1139                 }
1140         } else {
1141                 *errorp = 0;
1142         }
1143         return(buffer);
1144 }
1145
1146 static int
1147 hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type)
1148 {
1149         hammer_volume_t volume;
1150         hammer_fsbuf_ondisk_t ondisk;
1151         int error;
1152
1153         /*
1154          * Load the buffer's on-disk info
1155          */
1156         volume = buffer->volume;
1157         hammer_lock_ex(&buffer->io.lock);
1158         buffer->io.loading = 1;
1159         if (buffer->ondisk == NULL) {
1160                 if (buf_type) {
1161                         error = hammer_io_new(volume->devvp, &buffer->io);
1162                 } else {
1163                         error = hammer_io_read(volume->devvp, &buffer->io);
1164                 }
1165                 if (error) {
1166                         buffer->io.loading = 0;
1167                         hammer_unlock(&buffer->io.lock);
1168                         return (error);
1169                 }
1170                 buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
1171                 buffer->alist.config = &Buf_alist_config;
1172                 buffer->alist.meta = ondisk->head.buf_almeta;
1173                 buffer->buf_type = ondisk->head.buf_type;
1174         } else if (buf_type) {
1175                 error = hammer_io_new(volume->devvp, &buffer->io);
1176         } else {
1177                 error = 0;
1178         }
1179         if (error == 0 && buf_type) {
1180                 hammer_modify_buffer(buffer);
1181                 ondisk = buffer->ondisk;
1182                 hammer_initbuffer(&buffer->alist, &ondisk->head, buf_type);
1183                 buffer->buf_type = ondisk->head.buf_type;
1184         }
1185         buffer->io.loading = 0;
1186         hammer_unlock(&buffer->io.lock);
1187         return (error);
1188 }
1189
1190 /*
1191  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
1192  */
1193 int
1194 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
1195 {
1196         hammer_ref(&buffer->io.lock);
1197         hammer_flush_buffer_nodes(buffer);
1198         KKASSERT(buffer->io.lock.refs == 1);
1199         hammer_rel_buffer(buffer, 2);
1200         return(0);
1201 }
1202
1203 /*
1204  * Reference a buffer that is either already referenced or via a specially
1205  * handled pointer (aka cursor->buffer).
1206  */
1207 int
1208 hammer_ref_buffer(hammer_buffer_t buffer)
1209 {
1210         int error;
1211
1212         hammer_ref(&buffer->io.lock);
1213         while (buffer->io.loading) {
1214                 hammer_lock_ex(&buffer->io.lock);
1215                 hammer_unlock(&buffer->io.lock);
1216         }
1217         if (buffer->ondisk == NULL) {
1218                 error = hammer_load_buffer(buffer, 0);
1219                 if (error) {
1220                         hammer_rel_buffer(buffer, 1);
1221                         /*
1222                          * NOTE: buffer pointer can become stale after
1223                          * the above release.
1224                          */
1225                 } else {
1226                         KKASSERT(buffer->buf_type ==
1227                                  buffer->ondisk->head.buf_type);
1228                 }
1229         } else {
1230                 error = 0;
1231         }
1232         return(error);
1233 }
1234
1235 /*
1236  * Release a buffer.  We have to deal with several places where
1237  * another thread can ref the buffer.
1238  *
1239  * Only destroy the structure itself if the related buffer cache buffer
1240  * was disassociated from it.  This ties the management of the structure
1241  * to the buffer cache subsystem.  buffer->ondisk determines whether the
1242  * embedded io is referenced or not.
1243  */
1244 void
1245 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
1246 {
1247         hammer_cluster_t cluster;
1248
1249         if (buffer->io.lock.refs == 1) {
1250                 hammer_lock_ex(&buffer->io.lock);
1251                 if (buffer->io.lock.refs == 1) {
1252                         hammer_io_release(&buffer->io, flush);
1253
1254                         if (buffer->io.bp == NULL &&
1255                             buffer->io.lock.refs == 1) {
1256                                 hammer_flush_buffer_nodes(buffer);
1257                                 KKASSERT(TAILQ_EMPTY(&buffer->clist));
1258                                 cluster = buffer->cluster;
1259                                 RB_REMOVE(hammer_buf_rb_tree,
1260                                           &cluster->rb_bufs_root, buffer);
1261                                 buffer->cluster = NULL; /* sanity */
1262                                 --hammer_count_buffers;
1263                                 kfree(buffer, M_HAMMER);
1264                                 hammer_rel_cluster(cluster, 0);
1265                                 return;
1266                         }
1267                 } else if (flush) {
1268                         hammer_io_flush(&buffer->io);
1269                 }
1270                 hammer_unlock(&buffer->io.lock);
1271         }
1272         hammer_unref(&buffer->io.lock);
1273 }
1274
1275 /************************************************************************
1276  *                              NODES                                   *
1277  ************************************************************************
1278  *
1279  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
1280  * method used by the HAMMER filesystem.
1281  *
1282  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
1283  * associated with its buffer, and will only reference the buffer while
1284  * the node itself is referenced.
1285  *
1286  * A hammer_node can also be passively associated with other HAMMER
1287  * structures, such as inodes, while retaining 0 references.  These
1288  * associations can be cleared backwards using a pointer-to-pointer in
1289  * the hammer_node.
1290  *
1291  * This allows the HAMMER implementation to cache hammer_nodes long-term
1292  * and short-cut a great deal of the infrastructure's complexity.  In
1293  * most cases a cached node can be reacquired without having to dip into
1294  * either the buffer or cluster management code.
1295  *
1296  * The caller must pass a referenced cluster on call and will retain
1297  * ownership of the reference on return.  The node will acquire its own
1298  * additional references, if necessary.
1299  */
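/*
 * Sketch of the passive-cache pattern implemented by hammer_cache_node()
 * and hammer_ref_node_safe() below.  "cachep" is an illustrative name for
 * any struct hammer_node ** owned by the caller (e.g. embedded in an
 * in-memory inode):
 *
 *	hammer_cache_node(node, cachep);
 *	...
 *	node = hammer_ref_node_safe(hmp, cachep, &error);
 */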
1300 hammer_node_t
1301 hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp)
1302 {
1303         hammer_node_t node;
1304
1305         /*
1306          * Locate the structure, allocating one if necessary.
1307          */
1308 again:
1309         node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root,
1310                          node_offset);
1311         if (node == NULL) {
1312                 ++hammer_count_nodes;
1313                 node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
1314                 node->node_offset = node_offset;
1315                 node->cluster = cluster;
1316                 if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root,
1317                               node)) {
1318                         --hammer_count_nodes;
1319                         kfree(node, M_HAMMER);
1320                         goto again;
1321                 }
1322         }
1323         hammer_ref(&node->lock);
1324         *errorp = hammer_load_node(node);
1325         if (*errorp) {
1326                 hammer_rel_node(node);
1327                 node = NULL;
1328         }
1329         return(node);
1330 }
1331
1332 /*
1333  * Reference an already-referenced node.
1334  */
1335 int
1336 hammer_ref_node(hammer_node_t node)
1337 {
1338         int error;
1339
1340         KKASSERT(node->lock.refs > 0);
1341         hammer_ref(&node->lock);
1342         if ((error = hammer_load_node(node)) != 0)
1343                 hammer_rel_node(node);
1344         return(error);
1345 }
1346
1347 /*
1348  * Load a node's on-disk data reference.
1349  */
1350 static int
1351 hammer_load_node(hammer_node_t node)
1352 {
1353         hammer_buffer_t buffer;
1354         int32_t buf_no;
1355         int error;
1356
1357         if (node->ondisk)
1358                 return(0);
1359         error = 0;
1360         hammer_lock_ex(&node->lock);
1361         if (node->ondisk == NULL) {
1362                 /*
1363                  * This is a little confusing but the gist is that
1364                  * node->buffer determines whether the node is on
1365                  * the buffer's clist and node->ondisk determines
1366                  * whether the buffer is referenced.
1367                  */
1368                 if ((buffer = node->buffer) != NULL) {
1369                         error = hammer_ref_buffer(buffer);
1370                 } else {
1371                         buf_no = node->node_offset / HAMMER_BUFSIZE;
1372                         buffer = hammer_get_buffer(node->cluster,
1373                                                    buf_no, 0, &error);
1374                         if (buffer) {
1375                                 KKASSERT(error == 0);
1376                                 TAILQ_INSERT_TAIL(&buffer->clist,
1377                                                   node, entry);
1378                                 node->buffer = buffer;
1379                         }
1380                 }
1381                 if (error == 0) {
1382                         node->ondisk = (void *)((char *)buffer->ondisk +
1383                                (node->node_offset & HAMMER_BUFMASK));
1384                 }
1385         }
1386         hammer_unlock(&node->lock);
1387         return (error);
1388 }
1389
1390 /*
1391  * Safely reference a node, interlock against flushes via the IO subsystem.
1392  */
1393 hammer_node_t
1394 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
1395                      int *errorp)
1396 {
1397         hammer_node_t node;
1398
1399         if ((node = *cache) != NULL)
1400                 hammer_ref(&node->lock);
1401         if (node) {
1402                 *errorp = hammer_load_node(node);
1403                 if (*errorp) {
1404                         hammer_rel_node(node);
1405                         node = NULL;
1406                 }
1407         } else {
1408                 *errorp = ENOENT;
1409         }
1410         return(node);
1411 }
1412
1413 /*
1414  * Release a hammer_node.  On the last release the node dereferences
1415  * its underlying buffer and may or may not be destroyed.
1416  */
1417 void
1418 hammer_rel_node(hammer_node_t node)
1419 {
1420         hammer_cluster_t cluster;
1421         hammer_buffer_t buffer;
1422         int32_t node_offset;
1423         int flags;
1424
1425         /*
1426          * If this isn't the last ref just decrement the ref count and
1427          * return.
1428          */
1429         if (node->lock.refs > 1) {
1430                 hammer_unref(&node->lock);
1431                 return;
1432         }
1433
1434         /*
1435          * If there is no ondisk info or no buffer, the node failed to load;
1436          * remove the last reference and destroy the node.
1437          */
1438         if (node->ondisk == NULL) {
1439                 hammer_unref(&node->lock);
1440                 hammer_flush_node(node);
1441                 /* node is stale now */
1442                 return;
1443         }
1444
1445         /*
1446          * Do final cleanups and then either destroy the node or leave it
1447          * passively cached.  The buffer reference is removed regardless.
1448          */
1449         buffer = node->buffer;
1450         node->ondisk = NULL;
1451
1452         if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
1453                 hammer_unref(&node->lock);
1454                 hammer_rel_buffer(buffer, 0);
1455                 return;
1456         }
1457
1458         /*
1459          * Destroy the node.  Record pertinent data because the node
1460          * becomes stale the instant we flush it.
1461          */
1462         flags = node->flags;
1463         node_offset = node->node_offset;
1464         hammer_unref(&node->lock);
1465         hammer_flush_node(node);
1466         /* node is stale */
1467
1468         cluster = buffer->cluster;
1469         if (flags & HAMMER_NODE_DELETED) {
1470                 hammer_free_btree(cluster, node_offset);
1471                 if (node_offset == cluster->ondisk->clu_btree_root) {
1472                         kprintf("FREE CLUSTER %d\n", cluster->clu_no);
1473                         hammer_free_cluster(cluster);
1474                         /*hammer_io_undirty(&cluster->io);*/
1475                 }
1476         }
1477         hammer_rel_buffer(buffer, 0);
1478 }
1479
1480 /*
1481  * Passively cache a referenced hammer_node in *cache.  The caller may
1482  * release the node on return.
1483  */
1484 void
1485 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
1486 {
1487         hammer_node_t old;
1488
1489         /*
1490          * If the node is being deleted, don't cache it!
1491          */
1492         if (node->flags & HAMMER_NODE_DELETED)
1493                 return;
1494
1495         /*
1496          * Cache the node.  If we previously cached a different node we
1497          * have to give HAMMER a chance to destroy it.
1498          */
1499 again:
1500         if (node->cache1 != cache) {
1501                 if (node->cache2 != cache) {
1502                         if ((old = *cache) != NULL) {
1503                                 KKASSERT(node->lock.refs != 0);
1504                                 hammer_uncache_node(cache);
1505                                 goto again;
1506                         }
1507                         if (node->cache2)
1508                                 *node->cache2 = NULL;
1509                         node->cache2 = node->cache1;
1510                         node->cache1 = cache;
1511                         *cache = node;
1512                 } else {
1513                         struct hammer_node **tmp;
1514                         tmp = node->cache1;
1515                         node->cache1 = node->cache2;
1516                         node->cache2 = tmp;
1517                 }
1518         }
1519 }
1520
1521 void
1522 hammer_uncache_node(struct hammer_node **cache)
1523 {
1524         hammer_node_t node;
1525
1526         if ((node = *cache) != NULL) {
1527                 *cache = NULL;
1528                 if (node->cache1 == cache) {
1529                         node->cache1 = node->cache2;
1530                         node->cache2 = NULL;
1531                 } else if (node->cache2 == cache) {
1532                         node->cache2 = NULL;
1533                 } else {
1534                         panic("hammer_uncache_node: missing cache linkage");
1535                 }
1536                 if (node->cache1 == NULL && node->cache2 == NULL)
1537                         hammer_flush_node(node);
1538         }
1539 }
1540
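/*
 * Illustrative sketch (not compiled) of how a caller might combine the
 * passive cache with hammer_cache_node(), hammer_ref_node_safe() and
 * hammer_uncache_node().  The helper and its arguments are hypothetical;
 * the sketch assumes 'node' is passed in already referenced.
 */
#if 0
static void
example_node_cache_usage(struct hammer_mount *hmp, hammer_node_t node,
                         struct hammer_node **cache)
{
        hammer_node_t cached;
        int error;

        /* remember the node passively, then drop our own reference */
        hammer_cache_node(node, cache);
        hammer_rel_node(node);

        /* later: safely re-reference the cached node, if still present */
        cached = hammer_ref_node_safe(hmp, cache, &error);
        if (cached)
                hammer_rel_node(cached);

        /* teardown: clear the cache so the node can be destroyed */
        hammer_uncache_node(cache);
}
#endif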
1541 /*
1542  * Remove a node's cache references and destroy the node if it has no
1543  * other references or backing store.
1544  */
1545 void
1546 hammer_flush_node(hammer_node_t node)
1547 {
1548         hammer_buffer_t buffer;
1549
1550         if (node->cache1)
1551                 *node->cache1 = NULL;
1552         if (node->cache2)
1553                 *node->cache2 = NULL;
1554         if (node->lock.refs == 0 && node->ondisk == NULL) {
1555                 RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root,
1556                           node);
1557                 if ((buffer = node->buffer) != NULL) {
1558                         node->buffer = NULL;
1559                         TAILQ_REMOVE(&buffer->clist, node, entry);
1560                         /* buffer is unreferenced because ondisk is NULL */
1561                 }
1562                 --hammer_count_nodes;
1563                 kfree(node, M_HAMMER);
1564         }
1565 }
1566
1567 /*
1568  * Flush passively cached B-Tree nodes associated with this buffer.
1569  * This is only called when the buffer is about to be destroyed, so
1570  * none of the nodes should have any references.
1571  */
1572 void
1573 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1574 {
1575         hammer_node_t node;
1576
1577         while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1578                 KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
1579                 hammer_ref(&node->lock);
1580                 node->flags |= HAMMER_NODE_FLUSH;
1581                 hammer_rel_node(node);
1582         }
1583 }
1584
1585 /************************************************************************
1586  *                              A-LIST ALLOCATORS                       *
1587  ************************************************************************/
1588
1589 /*
1590  * Allocate HAMMER clusters
1591  */
1592 hammer_cluster_t
1593 hammer_alloc_cluster(hammer_mount_t hmp, hammer_cluster_t cluster_hint,
1594                      int *errorp)
1595 {
1596         hammer_volume_t volume;
1597         hammer_cluster_t cluster;
1598         int32_t clu_no;
1599         int32_t clu_hint;
1600         int32_t vol_beg;
1601         int32_t vol_no;
1602
1603         /*
1604          * Figure out our starting volume and hint.
1605          */
1606         if (cluster_hint) {
1607                 vol_beg = cluster_hint->volume->vol_no;
1608                 clu_hint = cluster_hint->clu_no;
1609         } else {
1610                 vol_beg = hmp->volume_iterator;
1611                 clu_hint = -1;
1612         }
1613
1614         /*
1615          * Loop through volumes looking for a free cluster.  If allocating
1616          * a new cluster relative to an existing cluster try to find a free
1617          * cluster on either side (clu_hint >= 0), otherwise just do a
1618          * forwards iteration.
1619          */
1620         vol_no = vol_beg;
1621         do {
1622                 volume = hammer_get_volume(hmp, vol_no, errorp);
1623                 kprintf("VOLUME %p %d\n", volume, vol_no);
1624                 if (*errorp) {
1625                         clu_no = HAMMER_ALIST_BLOCK_NONE;
1626                         break;
1627                 }
1628                 hammer_modify_volume(volume);
1629                 if (clu_hint == -1) {
1630                         clu_hint = volume->clu_iterator;
1631                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1632                                                         clu_hint);
1633                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1634                                 clu_no = hammer_alist_alloc_fwd(&volume->alist,
1635                                                                 1, 0);
1636                         }
1637                 } else {
1638                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1639                                                         clu_hint);
1640                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1641                                 clu_no = hammer_alist_alloc_rev(&volume->alist,
1642                                                                 1, clu_hint);
1643                         }
1644                 }
1645                 if (clu_no != HAMMER_ALIST_BLOCK_NONE)
1646                         break;
1647                 hammer_rel_volume(volume, 0);
1648                 volume = NULL;
1649                 *errorp = ENOSPC;
1650                 vol_no = (vol_no + 1) % hmp->nvolumes;
1651                 clu_hint = -1;
1652         } while (vol_no != vol_beg);
1653
1654         /*
1655          * Acquire the cluster.  On success this will force *errorp to 0.
1656          */
1657         if (clu_no != HAMMER_ALIST_BLOCK_NONE) {
1658                 kprintf("ALLOC CLUSTER %d:%d\n", volume->vol_no, clu_no);
1659                 cluster = hammer_get_cluster(volume, clu_no, errorp,
1660                                              GET_CLUSTER_NEW);
1661                 volume->clu_iterator = clu_no;
1662                 hammer_rel_volume(volume, 0);
1663         } else {
1664                 cluster = NULL;
1665         }
1666         if (cluster)
1667                 hammer_lock_ex(&cluster->io.lock);
1668         return(cluster);
1669 }
1670
1671 void
1672 hammer_init_cluster(hammer_cluster_t cluster, hammer_base_elm_t left_bound, 
1673                     hammer_base_elm_t right_bound)
1674 {
1675         hammer_cluster_ondisk_t ondisk = cluster->ondisk;
1676
1677         hammer_modify_cluster(cluster);
1678         ondisk->clu_btree_beg = *left_bound;
1679         ondisk->clu_btree_end = *right_bound;
1680         cluster->clu_btree_beg = ondisk->clu_btree_beg;
1681         cluster->clu_btree_end = ondisk->clu_btree_end;
1682 }
1683
1684 /*
1685  * Deallocate a cluster
1686  */
1687 void
1688 hammer_free_cluster(hammer_cluster_t cluster)
1689 {
1690         hammer_modify_cluster(cluster);
1691         hammer_alist_free(&cluster->volume->alist, cluster->clu_no, 1);
1692 }
1693
1694 /*
1695  * Allocate HAMMER elements - btree nodes, data storage, and record elements
1696  *
1697  * The passed *bufferp should be initialized to NULL.  On successive calls
1698  * *bufferp caches the most recent buffer used until put away by the caller.
1699  * Note that previously returned pointers using the cached buffer become
1700  * invalid on successive calls which reuse *bufferp.
1701  *
1702  * All allocations first attempt to use the block found at the specified
1703  * iterator.  If that fails the first available block is used.  If that
1704  * fails a new buffer is allocated and associated with the buffer type
1705  * A-list and the element is allocated out of the new buffer.
1706  */
1707
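/*
 * Illustrative sketch (not compiled) of the *bufferp caching convention
 * described above, from the caller's side.  The helper, the sizes and the
 * local names are hypothetical; error handling is omitted for brevity.
 */
#if 0
static void
example_alloc_data_usage(hammer_cluster_t cluster)
{
        struct hammer_buffer *buffer = NULL;    /* must start out NULL */
        void *data1;
        void *data2;
        int error;

        data1 = hammer_alloc_data(cluster, 128, &error, &buffer);
        data2 = hammer_alloc_data(cluster, 512, &error, &buffer);
        /*
         * data1 may now be stale if the second call cycled *bufferp,
         * so only data2 can be assumed valid at this point.
         */
        if (buffer)
                hammer_rel_buffer(buffer, 0);   /* put the cache away */
}
#endif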
1708 hammer_node_t
1709 hammer_alloc_btree(hammer_cluster_t cluster, int *errorp)
1710 {
1711         hammer_buffer_t buffer;
1712         hammer_alist_t live;
1713         hammer_node_t node;
1714         int32_t elm_no;
1715         int32_t buf_no;
1716         int32_t node_offset;
1717
1718         /*
1719          * Allocate a B-Tree element
1720          */
1721         hammer_modify_cluster(cluster);
1722         buffer = NULL;
1723         live = &cluster->alist_btree;
1724         elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index);
1725         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1726                 elm_no = hammer_alist_alloc_fwd(live, 1, 0);
1727         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1728                 alloc_new_buffer(cluster, HAMMER_FSBUF_BTREE, live,
1729                                  cluster->ondisk->idx_index, errorp, &buffer);
1730                 elm_no = hammer_alist_alloc(live, 1);
1731                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1732                         *errorp = ENOSPC;
1733                         if (buffer)
1734                                 hammer_rel_buffer(buffer, 0);
1735                         return(NULL);
1736                 }
1737         }
1738         cluster->ondisk->idx_index = elm_no;
1739         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_BTREE_NODES);
1740
1741         /*
1742          * Load and return the B-Tree element
1743          */
1744         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1745         node_offset = buf_no * HAMMER_BUFSIZE +
1746                       offsetof(union hammer_fsbuf_ondisk,
1747                                btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]);
1748         node = hammer_get_node(cluster, node_offset, errorp);
1749         if (node) {
1750                 hammer_modify_node(node);
1751                 bzero(node->ondisk, sizeof(*node->ondisk));
1752         } else {
1753                 hammer_alist_free(live, elm_no, 1);
1754                 /* node is NULL here; no node reference to release */
1755                 node = NULL;
1756         }
1757         if (buffer)
1758                 hammer_rel_buffer(buffer, 0);
1759         return(node);
1760 }
1761
1762 void *
1763 hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes,
1764                   int *errorp, struct hammer_buffer **bufferp)
1765 {
1766         hammer_buffer_t buffer;
1767         hammer_alist_t live;
1768         int32_t elm_no;
1769         int32_t buf_no;
1770         int32_t nblks;
1771         void *item;
1772
1773         /*
1774          * Deal with large data blocks.  The blocksize is HAMMER_BUFSIZE
1775          * for these allocations.
1776          */
1777         hammer_modify_cluster(cluster);
1778         if ((bytes & HAMMER_BUFMASK) == 0) {
1779                 nblks = bytes / HAMMER_BUFSIZE;
1780                 /* only one block allowed for now (so buffer can hold it) */
1781                 KKASSERT(nblks == 1);
1782
1783                 buf_no = hammer_alloc_master(cluster, nblks,
1784                                              cluster->ondisk->idx_ldata, 1);
1785                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1786                         *errorp = ENOSPC;
1787                         return(NULL);
1788                 }
1789                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, nblks);
1790                 cluster->ondisk->idx_ldata = buf_no;
1791                 buffer = *bufferp;
1792                 *bufferp = hammer_get_buffer(cluster, buf_no, -1, errorp);
1793                 if (buffer)
1794                         hammer_rel_buffer(buffer, 0);
1795                 buffer = *bufferp;
1796                 return(buffer ? buffer->ondisk : NULL);
1797         }
1798
1799         /*
1800          * Allocate a data element.  The block size is HAMMER_DATA_BLKSIZE
1801          * (64 bytes) for these allocations.
1802          */
1803         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1804         nblks /= HAMMER_DATA_BLKSIZE;
1805         live = &cluster->alist_mdata;
1806         elm_no = hammer_alist_alloc_fwd(live, nblks, cluster->ondisk->idx_data);
1807         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1808                 elm_no = hammer_alist_alloc_fwd(live, nblks, 0);
1809         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1810                 alloc_new_buffer(cluster, HAMMER_FSBUF_DATA, live,
1811                                  cluster->ondisk->idx_data, errorp, bufferp);
1812                 elm_no = hammer_alist_alloc(live, nblks);
1813                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1814                         *errorp = ENOSPC;
1815                         return(NULL);
1816                 }
1817         }
1818         cluster->ondisk->idx_data = elm_no;
1819
1820         /*
1821          * Load and return the data element
1822          */
1823         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1824         buffer = *bufferp;
1825         if (buffer == NULL || buffer->cluster != cluster ||
1826             buffer->buf_no != buf_no) {
1827                 if (buffer)
1828                         hammer_rel_buffer(buffer, 0);
1829                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1830                 *bufferp = buffer;
1831         }
1832         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_DATA);
1833         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_DATA_NODES);
1834         hammer_modify_buffer(buffer);
1835         item = &buffer->ondisk->data.data[elm_no & HAMMER_FSBUF_BLKMASK];
1836         bzero(item, nblks * HAMMER_DATA_BLKSIZE);
1837         *errorp = 0;
1838         return(item);
1839 }
1840
1841 void *
1842 hammer_alloc_record(hammer_cluster_t cluster,
1843                     int *errorp, struct hammer_buffer **bufferp)
1844 {
1845         hammer_buffer_t buffer;
1846         hammer_alist_t live;
1847         int32_t elm_no;
1848         int32_t buf_no;
1849         void *item;
1850
1851         /*
1852          * Allocate a record element
1853          */
1854         hammer_modify_cluster(cluster);
1855         live = &cluster->alist_record;
1856         elm_no = hammer_alist_alloc_rev(live, 1, cluster->ondisk->idx_record);
1857         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1858                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1859         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1860                 alloc_new_buffer(cluster, HAMMER_FSBUF_RECORDS, live,
1861                                  cluster->ondisk->idx_record, errorp, bufferp);
1862                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1863                 kprintf("hammer_alloc_record elm again %08x\n", elm_no);
1864                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1865                         *errorp = ENOSPC;
1866                         return(NULL);
1867                 }
1868         }
1869         cluster->ondisk->idx_record = elm_no;
1870
1871         /*
1872          * Load and return the record element
1873          */
1874         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1875         buffer = *bufferp;
1876         if (buffer == NULL || buffer->cluster != cluster ||
1877             buffer->buf_no != buf_no) {
1878                 if (buffer)
1879                         hammer_rel_buffer(buffer, 0);
1880                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1881                 *bufferp = buffer;
1882         }
1883         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_RECORDS);
1884         KASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_RECORD_NODES,
1885                 ("elm_no %d (%d) out of bounds", elm_no, elm_no & HAMMER_FSBUF_BLKMASK));
1886         hammer_modify_buffer(buffer);
1887         item = &buffer->ondisk->record.recs[elm_no & HAMMER_FSBUF_BLKMASK];
1888         bzero(item, sizeof(union hammer_record_ondisk));
1889         *errorp = 0;
1890         return(item);
1891 }
1892
1893 void
1894 hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes)
1895 {
1896         int32_t elm_no;
1897         int32_t nblks;
1898         hammer_alist_t live;
1899
1900         hammer_modify_cluster(buffer->cluster);
1901         if ((bytes & HAMMER_BUFMASK) == 0) {
1902                 nblks = bytes / HAMMER_BUFSIZE;
1903                 KKASSERT(nblks == 1 && data == (void *)buffer->ondisk);
1904                 hammer_alist_free(&buffer->cluster->alist_master,
1905                                   buffer->buf_no, nblks);
1906                 hammer_adjust_stats(buffer->cluster, HAMMER_FSBUF_DATA, -nblks);
1907                 return;
1908         }
1909
1910         elm_no = ((char *)data - (char *)buffer->ondisk->data.data) /
1911                  HAMMER_DATA_BLKSIZE;
1912         KKASSERT(elm_no >= 0 && elm_no < HAMMER_DATA_NODES);
1913         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1914         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1915         nblks /= HAMMER_DATA_BLKSIZE;
1916         live = &buffer->cluster->alist_mdata;
1917         hammer_alist_free(live, elm_no, nblks);
1918 }
1919
1920 void
1921 hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec)
1922 {
1923         int32_t elm_no;
1924         hammer_alist_t live;
1925
1926         hammer_modify_cluster(buffer->cluster);
1927         elm_no = rec - &buffer->ondisk->record.recs[0];
1928         KKASSERT(elm_no >= 0 && elm_no < HAMMER_RECORD_NODES);
1929         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1930         live = &buffer->cluster->alist_record;
1931         hammer_alist_free(live, elm_no, 1);
1932 }
1933
1934 void
1935 hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset)
1936 {
1937         const int32_t blksize = sizeof(struct hammer_node_ondisk);
1938         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1939         hammer_alist_t live;
1940         int32_t elm_no;
1941
1942         hammer_modify_cluster(cluster);
1943         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1944         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, btree.nodes[0]);
1945         live = &cluster->alist_btree;
1946         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1947         elm_no += fsbuf_offset / blksize;
1948         hammer_alist_free(live, elm_no, 1);
1949 }
1950
1951 void
1952 hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes)
1953 {
1954         const int32_t blksize = HAMMER_DATA_BLKSIZE;
1955         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1956         hammer_alist_t live;
1957         int32_t elm_no;
1958         int32_t buf_no;
1959         int32_t nblks;
1960
1961         hammer_modify_cluster(cluster);
1962         if ((bytes & HAMMER_BUFMASK) == 0) {
1963                 nblks = bytes / HAMMER_BUFSIZE;
1964                 KKASSERT(nblks == 1 && (bclu_offset & HAMMER_BUFMASK) == 0);
1965                 buf_no = bclu_offset / HAMMER_BUFSIZE;
1966                 hammer_alist_free(&cluster->alist_master, buf_no, nblks);
1967                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, -nblks);
1968                 return;
1969         }
1970
1971         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1972         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, data.data[0][0]);
1973         live = &cluster->alist_mdata;
1974         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1975         nblks /= HAMMER_DATA_BLKSIZE;
1976         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1977         elm_no += fsbuf_offset / blksize;
1978         hammer_alist_free(live, elm_no, nblks);
1979 }
1980
1981 void
1982 hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset)
1983 {
1984         const int32_t blksize = sizeof(union hammer_record_ondisk);
1985         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1986         hammer_alist_t live;
1987         int32_t elm_no;
1988
1989         hammer_modify_cluster(cluster);
1990         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1991         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, record.recs[0]);
1992         live = &cluster->alist_record;
1993         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1994         elm_no += fsbuf_offset / blksize;
1995         hammer_alist_free(live, elm_no, 1);
1996 }
1997
1998
1999 /*
2000  * Allocate a new filesystem buffer and assign it to the specified
2001  * filesystem buffer type.  The new buffer will be added to the
2002  * type-specific A-list and initialized.
2003  *
2004  * Buffers used for records are also added to the clu_record_buf_bitmap.
2005  */
2006 static void
2007 alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, hammer_alist_t live,
2008                  int start, int *errorp, struct hammer_buffer **bufferp)
2009 {
2010         hammer_buffer_t buffer;
2011         int32_t buf_no;
2012         int32_t base_blk;
2013         int isfwd;
2014
2015         if (*bufferp)
2016                 hammer_rel_buffer(*bufferp, 0);
2017         *bufferp = NULL;
2018
2019         start = start / HAMMER_FSBUF_MAXBLKS;   /* convert to buf_no */
2020         isfwd = (type != HAMMER_FSBUF_RECORDS);
2021         buf_no = hammer_alloc_master(cluster, 1, start, isfwd);
2022         if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2023                 *errorp = ENOSPC;
2024                 return;
2025         }
2026
2027         /*
2028          * The new buffer must be initialized (type != 0) regardless of
2029          * whether we already have it cached or not, so don't try to
2030          * optimize the cached buffer check.  Just call hammer_get_buffer().
2031          */
2032         buffer = hammer_get_buffer(cluster, buf_no, type, errorp);
2033         *bufferp = buffer;
2034
2035         /*
2036          * Do a meta-free of the buffer's elements into the type-specific
2037          * A-list and update our statistics to reflect the allocation.
2038          */
2039         if (buffer) {
2040 #if 0
2041                 kprintf("alloc_new_buffer buf_no %d type %016llx nelms %d\n",
2042                         buf_no, type, nelements);
2043 #endif
2044                 hammer_modify_buffer(buffer);  /*XXX*/
2045                 hammer_adjust_stats(cluster, type, 1);
2046
2047                 /*
2048                  * Free the buffer to the appropriate slave list so the
2049                  * cluster-based allocator sees it.
2050                  */
2051                 base_blk = buf_no * HAMMER_FSBUF_MAXBLKS;
2052
2053                 switch(type) {
2054                 case HAMMER_FSBUF_BTREE:
2055                         hammer_alist_free(live, base_blk, HAMMER_BTREE_NODES);
2056                         break;
2057                 case HAMMER_FSBUF_DATA:
2058                         hammer_alist_free(live, base_blk, HAMMER_DATA_NODES);
2059                         break;
2060                 case HAMMER_FSBUF_RECORDS:
2061                         hammer_alist_free(live, base_blk, HAMMER_RECORD_NODES);
2062                         break;
2063                 }
2064         }
2065
2066         /*
2067          * And, finally, update clu_record_buf_bitmap for record buffers.
2068          * Since buffers are synced to disk before their associated cluster
2069          * header, a recovery operation will only see synced record buffers
2070          * in the bitmap.  XXX We can't use alist_record for recovery due
2071          * to the way we currently manage it.
2072          */
2073         if (buffer && type == HAMMER_FSBUF_RECORDS) {
2074                 KKASSERT(buf_no >= 0 && buf_no < HAMMER_CLU_MAXBUFFERS);
2075                 hammer_modify_cluster(cluster);
2076                 cluster->ondisk->clu_record_buf_bitmap[buf_no >> 5] |=
2077                         (1 << (buf_no & 31));
2078         }
2079 }
2080
2081 /*
2082  * Sync dirty buffers to the media
2083  */
2084
2085 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
2086 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
2087
2088 int
2089 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
2090 {
2091         struct hammer_sync_info info;
2092
2093         info.error = 0;
2094         info.waitfor = waitfor;
2095
2096         kprintf("hammer_sync\n");
2097         vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
2098                       hammer_sync_scan1, hammer_sync_scan2, &info);
2099
2100         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
2101                 hammer_sync_volume, &info);
2102         return(info.error);
2103 }
2104
2105 static int
2106 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
2107 {
2108         struct hammer_inode *ip;
2109
2110         ip = VTOI(vp);
2111         if (vp->v_type == VNON || ip == NULL ||
2112             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2113              RB_EMPTY(&vp->v_rbdirty_tree))) {
2114                 return(-1);
2115         }
2116         return(0);
2117 }
2118
2119 static int
2120 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2121 {
2122         struct hammer_sync_info *info = data;
2123         struct hammer_inode *ip;
2124         int error;
2125
2126         ip = VTOI(vp);
2127         if (vp->v_type == VNON || vp->v_type == VBAD ||
2128             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2129              RB_EMPTY(&vp->v_rbdirty_tree))) {
2130                 return(0);
2131         }
2132         if (vp->v_type != VCHR) {
2133                 error = VOP_FSYNC(vp, info->waitfor);
2134                 if (error)
2135                         info->error = error;
2136         }
2137         return(0);
2138 }
2139
2140 int
2141 hammer_sync_volume(hammer_volume_t volume, void *data)
2142 {
2143         struct hammer_sync_info *info = data;
2144
2145         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
2146                 hammer_sync_cluster, info);
2147         if (hammer_ref_volume(volume) == 0)
2148                 hammer_rel_volume(volume, 1);
2149         return(0);
2150 }
2151
2152 int
2153 hammer_sync_cluster(hammer_cluster_t cluster, void *data)
2154 {
2155         struct hammer_sync_info *info = data;
2156
2157         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
2158                 hammer_sync_buffer, info);
2159         /*hammer_io_waitdep(&cluster->io);*/
2160         if (hammer_ref_cluster(cluster) == 0)
2161                 hammer_rel_cluster(cluster, 1);
2162         return(0);
2163 }
2164
2165 int
2166 hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
2167 {
2168         if (hammer_ref_buffer(buffer) == 0)
2169                 hammer_rel_buffer(buffer, 1);
2170         return(0);
2171 }
2172
2173 /*
2174  * Generic buffer initialization.  Initialize the A-list into an all-allocated
2175  * state with the free block limit properly set.
2176  *
2177  * Note that alloc_new_buffer() will free the appropriate block range via
2178  * the appropriate cluster alist, so the free count is properly propagated.
2179  */
2180 void
2181 hammer_initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type)
2182 {
2183         head->buf_type = type;
2184
2185         switch(type) {
2186         case HAMMER_FSBUF_BTREE:
2187                 hammer_alist_init(live, 0, HAMMER_BTREE_NODES,
2188                                   HAMMER_ASTATE_ALLOC);
2189                 break;
2190         case HAMMER_FSBUF_DATA:
2191                 hammer_alist_init(live, 0, HAMMER_DATA_NODES,
2192                                   HAMMER_ASTATE_ALLOC);
2193                 break;
2194         case HAMMER_FSBUF_RECORDS:
2195                 hammer_alist_init(live, 0, HAMMER_RECORD_NODES,
2196                                   HAMMER_ASTATE_ALLOC);
2197                 break;
2198         default:
2199                 hammer_alist_init(live, 0, 0, HAMMER_ASTATE_ALLOC);
2200                 break;
2201         }
2202 }
2203
2204 /*
2205  * Calculate the cluster's offset in the volume.  This calculation is
2206  * slightly more complex when using superclusters because superclusters
2207  * are grouped in blocks of 16, followed by 16 x N clusters where N
2208  * is the number of clusters a supercluster can manage.
2209  */
2210 static int64_t
2211 calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no)
2212 {
2213         int32_t scl_group;
2214         int64_t scl_group_size;
2215         int64_t off;
2216
2217         if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
2218                 scl_group = clu_no / HAMMER_VOL_SUPERCLUSTER_GROUP /
2219                             HAMMER_SCL_MAXCLUSTERS;
2220                 scl_group_size = 
2221                             ((int64_t)HAMMER_BUFSIZE *
2222                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2223                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2224                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2225                 scl_group_size += 
2226                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2227
2228                 off = volume->cluster_base +
2229                       scl_group * scl_group_size +
2230                       (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
2231                       ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS *
2232                        HAMMER_VOL_SUPERCLUSTER_GROUP))
2233                       * volume->vol_clsize;
2234         } else {
2235                 off = volume->cluster_base +
2236                       (int64_t)clu_no * volume->vol_clsize;
2237         }
2238         return(off);
2239 }
2240
2241 /*
2242  * Calculate a super-cluster's offset in the volume.
2243  */
2244 static int64_t
2245 calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no)
2246 {
2247         int64_t off;
2248         int32_t scl_group;
2249         int64_t scl_group_size;
2250
2251         KKASSERT (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL);
2252         scl_group = scl_no / HAMMER_VOL_SUPERCLUSTER_GROUP;
2253         if (scl_group) {
2254                 scl_group_size = 
2255                             ((int64_t)HAMMER_BUFSIZE *
2256                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2257                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2258                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2259                 scl_group_size += 
2260                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2261                 off = volume->cluster_base + (scl_group * scl_group_size) +
2262                       (scl_no % HAMMER_VOL_SUPERCLUSTER_GROUP) * HAMMER_BUFSIZE;
2263         } else {
2264                 off = volume->cluster_base + (scl_no * HAMMER_BUFSIZE);
2265         }
2266         return(off);
2267 }
2268
2269 /*
2270  * Allocate nblks buffers from the cluster's master alist.
2271  */
2272 static int32_t
2273 hammer_alloc_master(hammer_cluster_t cluster, int nblks,
2274                     int32_t start, int isfwd)
2275 {
2276         int32_t buf_no;
2277
2278         hammer_modify_cluster(cluster);
2279         if (isfwd) {
2280                 buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2281                                                 nblks, start);
2282                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2283                         buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2284                                                 nblks, 0);
2285                 }
2286         } else {
2287                 buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2288                                                 nblks, start);
2289                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2290                         buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2291                                                 nblks, HAMMER_ALIST_BLOCK_MAX);
2292                 }
2293         }
2294
2295         /*
2296          * XXX TODO: recover space from empty record, b-tree, and data a-lists.
2297          */
2298
2299         return(buf_no);
2300 }
2301
2302 /*
2303  * Adjust allocation statistics
2304  */
2305 static void
2306 hammer_adjust_stats(hammer_cluster_t cluster, u_int64_t buf_type, int nblks)
2307 {
2308         hammer_modify_cluster(cluster);
2309         hammer_modify_volume(cluster->volume);
2310         hammer_modify_volume(cluster->volume->hmp->rootvol);
2311
2312         switch(buf_type) {
2313         case HAMMER_FSBUF_BTREE:
2314                 cluster->ondisk->stat_idx_bufs += nblks;
2315                 cluster->volume->ondisk->vol_stat_idx_bufs += nblks;
2316                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_idx_bufs += nblks;
2317                 break;
2318         case HAMMER_FSBUF_DATA:
2319                 cluster->ondisk->stat_data_bufs += nblks;
2320                 cluster->volume->ondisk->vol_stat_data_bufs += nblks;
2321                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_data_bufs += nblks;
2322                 break;
2323         case HAMMER_FSBUF_RECORDS:
2324                 cluster->ondisk->stat_rec_bufs += nblks;
2325                 cluster->volume->ondisk->vol_stat_rec_bufs += nblks;
2326                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_rec_bufs += nblks;
2327                 break;
2328         }
2329 }
2330
2331 /*
2332  * A-LIST SUPPORT
2333  *
2334  * Setup the parameters for the various A-lists we use in hammer.  The
2335  * supercluster A-list must be chained to the cluster A-list and cluster
2336  * slave A-lists are chained to buffer A-lists.
2337  *
2338  * See hammer_init_alist_config() below.
2339  */
2340
2341 /*
2342  * A-LIST - cluster recursion into a filesystem buffer
2343  *
2344  * In the init case the buffer has already been initialized by
2345  * alloc_new_buffer() when it allocated the buffer out of the master
2346  * alist and marked it as free in the slave alist.
2347  *
2348  * Because we use a somewhat odd mechanism to assign buffers to slave
2349  * pools we can't actually free the buffer back to the master alist in
2350  * buffer_alist_destroy(), but instead must deal with that logic somewhere
2351  * else.
2352  */
2353 static int
2354 buffer_alist_init(void *info, int32_t blk, int32_t radix,
2355                   hammer_alloc_state_t state)
2356 {
2357         return(0);
2358 }
2359
2360 /*
2361  * Note: This routine is only called when freeing the last elements of
2362  * an initialized buffer.  Freeing all elements of the buffer when the
2363  * buffer was not previously initialized does not call this routine.
2364  */
2365 static int
2366 buffer_alist_destroy(void *info, int32_t blk, int32_t radix)
2367 {
2368         hammer_cluster_t cluster = info;
2369         int32_t buf_no;
2370
2371         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2372         kprintf("destroy buffer %d:%d:%d\n", cluster->volume->vol_no, cluster->clu_no, buf_no);
2373         return (0);
2374 }
2375
2376 /*
2377  * Note: atblk can be negative and atblk - blk can go negative.
2378  */
2379 static int
2380 buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2381                       int32_t count, int32_t atblk, int32_t *fullp)
2382 {
2383         hammer_cluster_t cluster = info;
2384         hammer_buffer_t buffer;
2385         int32_t buf_no;
2386         int32_t r;
2387         int error = 0;
2388
2389         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2390         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2391         if (buffer) {
2392                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2393
2394                 hammer_modify_buffer(buffer);
2395                 r = hammer_alist_alloc_fwd(&buffer->alist, count, atblk - blk);
2396                 if (r != HAMMER_ALIST_BLOCK_NONE)
2397                         r += blk;
2398                 *fullp = hammer_alist_isfull(&buffer->alist);
2399                 hammer_rel_buffer(buffer, 0);
2400         } else {
2401                 r = HAMMER_ALIST_BLOCK_NONE;
2402                 *fullp = 1;
2403         }
2404         return(r);
2405 }
2406
2407 static int
2408 buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2409                       int32_t count, int32_t atblk, int32_t *fullp)
2410 {
2411         hammer_cluster_t cluster = info;
2412         hammer_buffer_t buffer;
2413         int32_t buf_no;
2414         int32_t r;
2415         int error = 0;
2416
2417         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2418         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2419         if (buffer) {
2420                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2421                 hammer_modify_buffer(buffer);
2422                 r = hammer_alist_alloc_rev(&buffer->alist, count, atblk - blk);
2423                 if (r != HAMMER_ALIST_BLOCK_NONE)
2424                         r += blk;
2425                 *fullp = hammer_alist_isfull(&buffer->alist);
2426                 hammer_rel_buffer(buffer, 0);
2427         } else {
2428                 r = HAMMER_ALIST_BLOCK_NONE;
2429                 *fullp = 1;
2430         }
2431         return(r);
2432 }
2433
2434 static void
2435 buffer_alist_free(void *info, int32_t blk, int32_t radix,
2436                  int32_t base_blk, int32_t count, int32_t *emptyp)
2437 {
2438         hammer_cluster_t cluster = info;
2439         hammer_buffer_t buffer;
2440         int32_t buf_no;
2441         int error = 0;
2442
2443         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2444         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2445         if (buffer) {
2446                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2447                 hammer_modify_buffer(buffer);
2448                 hammer_alist_free(&buffer->alist, base_blk, count);
2449                 *emptyp = hammer_alist_isempty(&buffer->alist);
2450                 /* XXX don't bother updating if the buffer is completely empty? */
2451                 hammer_rel_buffer(buffer, 0);
2452         } else {
2453                 *emptyp = 0;
2454         }
2455 }
2456
2457 static int32_t
2458 buffer_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk)
2459 {
2460         hammer_cluster_t cluster = info;
2461         hammer_buffer_t buffer;
2462         int32_t buf_no;
2463         int error = 0;
2464
2465         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2466         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2467         if (buffer) {
2468                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2469                 blk = hammer_alist_find(&buffer->alist, atblk - blk, radix);
2470                 hammer_rel_buffer(buffer, 0);
2471         } else {
2472                 blk = HAMMER_ALIST_BLOCK_NONE;
2473         }
2474         return(blk);
2475 }
2476
2477 static void
2478 buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2479 {
2480 }
2481
2482 /*
2483  * A-LIST - super-cluster recursion into a cluster and cluster recursion
2484  * into a filesystem buffer.  A-List's are mostly self-contained entities,
2485  * but callbacks must be installed to recurse from one A-List to another.
2486  *
2487  * Implementing these callbacks allows us to operate a multi-layered A-List
2488  * as a single entity.
2489  */
2490
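/*
 * Illustrative sketch (not compiled): with these callbacks installed an
 * allocation against one of the cluster's slave A-lists (alist_btree,
 * alist_mdata, alist_record) recurses into the per-buffer A-list
 * transparently, so callers only ever deal with the top-level list.
 * The variable names below are hypothetical.
 */
#if 0
        elm_no = hammer_alist_alloc_fwd(&cluster->alist_btree, 1, atblk);
        /*
         * Internally the top-level A-list recurses through its
         * bl_radix_alloc_fwd callback (buffer_alist_alloc_fwd above),
         * which maps the covered block range to a buffer
         * (buf_no = blk / HAMMER_FSBUF_MAXBLKS), allocates out of that
         * buffer's own A-list and translates the result back into a
         * cluster-wide element number.
         */
#endif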
2491 /*
2492  * This occurs when allocating a cluster via the volume a-list and the
2493  * entry in the volume a-list indicated all-free.  The underlying supercl
2494  * has not yet been initialized.
2495  */
2496 static int
2497 super_alist_init(void *info, int32_t blk, int32_t radix,
2498                  hammer_alloc_state_t state)
2499 {
2500         hammer_volume_t volume = info;
2501         hammer_supercl_t supercl;
2502         int32_t scl_no;
2503         int error = 0;
2504
2505         /*
2506          * Calculate the super-cluster number containing the cluster (blk)
2507          * and obtain the super-cluster buffer.
2508          */
2509         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2510         supercl = hammer_get_supercl(volume, scl_no, &error, state);
2511         if (supercl)
2512                 hammer_rel_supercl(supercl, 0);
2513         return (error);
2514 }
2515
2516 static int
2517 super_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
2518 {
2519         hammer_volume_t volume = info;
2520         hammer_supercl_t supercl;
2521         int32_t scl_no;
2522         int error = 0;
2523
2524         /*
2525          * Calculate the super-cluster number containing the cluster (blk)
2526          * and obtain the super-cluster buffer.
2527          */
2528         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2529         supercl = hammer_get_supercl(volume, scl_no, &error,
2530                                      HAMMER_ASTATE_NONE);
2531         if (supercl) {
2532                 hammer_modify_supercl(supercl);
2533                 error = hammer_alist_recover(&supercl->alist, blk, 0, count);
2534                 /* free block count is returned if >= 0 */
2535                 hammer_rel_supercl(supercl, 0);
2536         } else {
2537                 error = -error;
2538         }
2539         return (error);
2540 }
2541
2542 /*
2543  * This occurs when freeing a cluster via the volume a-list and the
2544  * supercl is now 100% free.  We can destroy the supercl.
2545  *
2546  * What we actually do is just unset the modify bit so it doesn't get
2547  * written out.
2548  */
2549 static int
2550 super_alist_destroy(void *info, int32_t blk, int32_t radix)
2551 {
2552         hammer_volume_t volume = info;
2553         hammer_supercl_t supercl;
2554         int32_t scl_no;
2555         int error = 0;
2556
2557         /*
2558          * Calculate the super-cluster number containing the cluster (blk)
2559          * and obtain the super-cluster buffer.
2560          */
2561         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2562         if (hammer_find_supercl(volume, scl_no)) {
2563                 supercl = hammer_get_supercl(volume, scl_no, &error,
2564                                              HAMMER_ASTATE_FREE); /* XXX */
2565                 if (supercl) {
2566                         hammer_io_clear_modify(&supercl->io);
2567                         hammer_rel_supercl(supercl, 0);
2568                 }
2569         }
2570         return (error);
2571 }
2572
2573 static int
2574 super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2575                       int32_t count, int32_t atblk, int32_t *fullp)
2576 {
2577         hammer_volume_t volume = info;
2578         hammer_supercl_t supercl;
2579         int32_t scl_no;
2580         int32_t r;
2581         int error = 0;
2582
2583         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2584         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2585         if (supercl) {
2586                 hammer_modify_supercl(supercl);
2587                 r = hammer_alist_alloc_fwd(&supercl->alist, count, atblk - blk);
2588                 if (r != HAMMER_ALIST_BLOCK_NONE)
2589                         r += blk;
2590                 *fullp = hammer_alist_isfull(&supercl->alist);
2591                 hammer_rel_supercl(supercl, 0);
2592         } else {
2593                 r = HAMMER_ALIST_BLOCK_NONE;
2594                 *fullp = 1;
2595         }
2596         return(r);
2597 }
2598
2599 static int
2600 super_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2601                       int32_t count, int32_t atblk, int32_t *fullp)
2602 {
2603         hammer_volume_t volume = info;
2604         hammer_supercl_t supercl;
2605         int32_t scl_no;
2606         int32_t r;
2607         int error = 0;
2608
2609         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2610         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2611         if (supercl) {
2612                 hammer_modify_supercl(supercl);
2613                 r = hammer_alist_alloc_rev(&supercl->alist, count, atblk - blk);
2614                 if (r != HAMMER_ALIST_BLOCK_NONE)
2615                         r += blk;
2616                 *fullp = hammer_alist_isfull(&supercl->alist);
2617                 hammer_rel_supercl(supercl, 0);
2618         } else { 
2619                 r = HAMMER_ALIST_BLOCK_NONE;
2620                 *fullp = 1;
2621         }
2622         return(r);
2623 }
2624
2625 static void
2626 super_alist_free(void *info, int32_t blk, int32_t radix,
2627                  int32_t base_blk, int32_t count, int32_t *emptyp)
2628 {
2629         hammer_volume_t volume = info;
2630         hammer_supercl_t supercl;
2631         int32_t scl_no;
2632         int error = 0;
2633
2634         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2635         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2636         if (supercl) {
2637                 hammer_modify_supercl(supercl);
2638                 hammer_alist_free(&supercl->alist, base_blk, count);
2639                 *emptyp = hammer_alist_isempty(&supercl->alist);
2640                 hammer_rel_supercl(supercl, 0);
2641         } else {
2642                 *emptyp = 0;
2643         }
2644 }
2645
2646 static void
2647 super_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2648 {
2649 }
2650
2651 void
2652 hammer_init_alist_config(void)
2653 {
2654         hammer_alist_config_t config;
2655
2656         hammer_alist_template(&Buf_alist_config, HAMMER_FSBUF_MAXBLKS,
2657                               1, HAMMER_FSBUF_METAELMS);
2658         hammer_alist_template(&Vol_normal_alist_config, HAMMER_VOL_MAXCLUSTERS,
2659                               1, HAMMER_VOL_METAELMS_1LYR);
2660         hammer_alist_template(&Vol_super_alist_config,
2661                           HAMMER_VOL_MAXSUPERCLUSTERS * HAMMER_SCL_MAXCLUSTERS,
2662                               HAMMER_SCL_MAXCLUSTERS, HAMMER_VOL_METAELMS_2LYR);
2663         hammer_alist_template(&Supercl_alist_config, HAMMER_VOL_MAXCLUSTERS,
2664                               1, HAMMER_SUPERCL_METAELMS);
2665         hammer_alist_template(&Clu_master_alist_config, HAMMER_CLU_MAXBUFFERS,
2666                               1, HAMMER_CLU_MASTER_METAELMS);
2667         hammer_alist_template(&Clu_slave_alist_config,
2668                               HAMMER_CLU_MAXBUFFERS * HAMMER_FSBUF_MAXBLKS,
2669                               HAMMER_FSBUF_MAXBLKS, HAMMER_CLU_SLAVE_METAELMS);
2670
2671         config = &Vol_super_alist_config;
2672         config->bl_radix_init = super_alist_init;
2673         config->bl_radix_recover = super_alist_recover;
2674         config->bl_radix_destroy = super_alist_destroy;
2675         config->bl_radix_alloc_fwd = super_alist_alloc_fwd;
2676         config->bl_radix_alloc_rev = super_alist_alloc_rev;
2677         config->bl_radix_free = super_alist_free;
2678         config->bl_radix_print = super_alist_print;
2679
2680         config = &Clu_slave_alist_config;
2681         config->bl_radix_init = buffer_alist_init;
2682         config->bl_radix_recover = buffer_alist_recover;
2683         config->bl_radix_destroy = buffer_alist_destroy;
2684         config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd;
2685         config->bl_radix_alloc_rev = buffer_alist_alloc_rev;
2686         config->bl_radix_free = buffer_alist_free;
2687         config->bl_radix_find = buffer_alist_find;
2688         config->bl_radix_print = buffer_alist_print;
2689 }
2690