/*
 * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.10 2007/12/14 08:05:39 dillon Exp $
 */
/*
 * Manage HAMMER's on-disk structures.  These routines are primarily
 * responsible for interfacing with the kernel's I/O subsystem and for
 * managing in-memory structures.
 */

#include "hammer.h"
#include <sys/fcntl.h>
#include <sys/nlookup.h>
#include <sys/buf.h>
#include <sys/buf2.h>

static void hammer_free_volume(hammer_volume_t volume);
static int hammer_load_volume(hammer_volume_t volume);
static int hammer_load_supercl(hammer_supercl_t supercl, int isnew);
static int hammer_load_cluster(hammer_cluster_t cluster, int isnew);
static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type);
static void hammer_remove_node_clist(hammer_buffer_t buffer,
                        hammer_node_t node);
static void initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head,
                        u_int64_t type);
static void alloc_new_buffer(hammer_cluster_t cluster,
                        hammer_alist_t live, u_int64_t type, int32_t nelements,
                        int32_t start,
                        int *errorp, struct hammer_buffer **bufferp);
#if 0
static void readhammerbuf(hammer_volume_t vol, void *data,
                        int64_t offset);
static void writehammerbuf(hammer_volume_t vol, const void *data,
                        int64_t offset);
#endif
static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no);
static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no);
static int32_t hammer_alloc_master(hammer_cluster_t cluster, int nblks,
                        int32_t start, int isfwd);
static void hammer_adjust_stats(hammer_cluster_t cluster,
                        u_int64_t buf_type, int nblks);

struct hammer_alist_config Buf_alist_config;
struct hammer_alist_config Vol_normal_alist_config;
struct hammer_alist_config Vol_super_alist_config;
struct hammer_alist_config Supercl_alist_config;
struct hammer_alist_config Clu_master_alist_config;
struct hammer_alist_config Clu_slave_alist_config;

/*
 * Red-Black tree support for various structures
 */
static int
hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
{
        if (ip1->obj_id < ip2->obj_id)
                return(-1);
        if (ip1->obj_id > ip2->obj_id)
                return(1);
        if (ip1->obj_asof < ip2->obj_asof)
                return(-1);
        if (ip1->obj_asof > ip2->obj_asof)
                return(1);
        return(0);
}

static int
hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
{
        if (info->obj_id < ip->obj_id)
                return(-1);
        if (info->obj_id > ip->obj_id)
                return(1);
        if (info->obj_asof < ip->obj_asof)
                return(-1);
        if (info->obj_asof > ip->obj_asof)
                return(1);
        return(0);
}

static int
hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
{
        if (vol1->vol_no < vol2->vol_no)
                return(-1);
        if (vol1->vol_no > vol2->vol_no)
                return(1);
        return(0);
}

static int
hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2)
{
        if (cl1->scl_no < cl2->scl_no)
                return(-1);
        if (cl1->scl_no > cl2->scl_no)
                return(1);
        return(0);
}

static int
hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2)
{
        if (cl1->clu_no < cl2->clu_no)
                return(-1);
        if (cl1->clu_no > cl2->clu_no)
                return(1);
        return(0);
}

static int
hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
{
        if (buf1->buf_no < buf2->buf_no)
                return(-1);
        if (buf1->buf_no > buf2->buf_no)
                return(1);
        return(0);
}

static int
hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
{
        if (node1->node_offset < node2->node_offset)
                return(-1);
        if (node1->node_offset > node2->node_offset)
                return(1);
        return(0);
}

/*
 * Note: The lookup function for hammer_ino_rb_tree winds up being named
 * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
 * functions are normal, e.g. hammer_clu_rb_tree_RB_LOOKUP(root, clu_no).
 */
RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
                hammer_inode_info_cmp, hammer_inode_info_t);
RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
             hammer_vol_rb_compare, int32_t, vol_no);
RB_GENERATE2(hammer_scl_rb_tree, hammer_supercl, rb_node,
             hammer_scl_rb_compare, int32_t, scl_no);
RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node,
             hammer_clu_rb_compare, int32_t, clu_no);
RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
             hammer_buf_rb_compare, int32_t, buf_no);
RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
             hammer_nod_rb_compare, int32_t, node_offset);
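
/*
 * Illustrative only (not compiled): with the functions generated above,
 * a volume can be looked up by number and an inode by (obj_id, obj_asof).
 * The surrounding variable names are hypothetical, and the inode tree
 * root name is assumed from hammer.h.
 *
 *      struct hammer_inode_info info = { .obj_id = id, .obj_asof = asof };
 *      hammer_volume_t vol;
 *      hammer_inode_t ip;
 *
 *      vol = hammer_vol_rb_tree_RB_LOOKUP(&hmp->rb_vols_root, vol_no);
 *      ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &info);
 */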

/************************************************************************
 *                              VOLUMES                                 *
 ************************************************************************
 *
 * Load a HAMMER volume by name.  Returns 0 on success or a positive error
 * code on failure.  Volumes must be loaded at mount time, get_volume() will
 * not load a new volume.
 *
 * Calls made to hammer_load_volume() during mount are single-threaded.
 */
int
hammer_install_volume(struct hammer_mount *hmp, const char *volname)
{
        struct mount *mp;
        hammer_volume_t volume;
        struct hammer_volume_ondisk *ondisk;
        struct nlookupdata nd;
        struct buf *bp = NULL;
        int error;
        int ronly;

        mp = hmp->mp;
        ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

        /*
         * Allocate a volume structure
         */
        volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
        volume->vol_name = kstrdup(volname, M_HAMMER);
        volume->hmp = hmp;
        volume->io.type = HAMMER_STRUCTURE_VOLUME;
        volume->io.offset = 0LL;

        /*
         * Get the device vnode
         */
        error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
        if (error == 0)
                error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
        nlookup_done(&nd);
        if (error == 0) {
                vn_isdisk(volume->devvp, &error);
        }
        if (error == 0) {
                vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
                error = VOP_OPEN(volume->devvp, (ronly ? FREAD : FREAD|FWRITE),
                                 FSCRED, NULL);
                vn_unlock(volume->devvp);
        }
        if (error) {
                hammer_free_volume(volume);
                return(error);
        }

        /*
         * Extract the volume number from the volume header and do various
         * sanity checks.
         */
        error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
        if (error)
                goto late_failure;
        ondisk = (void *)bp->b_data;
        if (ondisk->head.buf_type != HAMMER_FSBUF_VOLUME) {
                kprintf("hammer_mount: volume %s has an invalid header\n",
                        volume->vol_name);
                error = EFTYPE;
                goto late_failure;
        }
        volume->vol_no = ondisk->vol_no;
        volume->cluster_base = ondisk->vol_clo_beg;
        volume->vol_clsize = ondisk->vol_clsize;
        volume->vol_flags = ondisk->vol_flags;
        volume->nblocks = ondisk->vol_nblocks;
        RB_INIT(&volume->rb_clus_root);
        RB_INIT(&volume->rb_scls_root);

        hmp->mp->mnt_stat.f_blocks += volume->nblocks;

        if (RB_EMPTY(&hmp->rb_vols_root)) {
                hmp->fsid = ondisk->vol_fsid;
        } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
                kprintf("hammer_mount: volume %s's fsid does not match "
                        "other volumes\n", volume->vol_name);
                error = EFTYPE;
                goto late_failure;
        }

        /*
         * Insert the volume structure into the red-black tree.
         */
        if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
                kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
                        volume->vol_name, volume->vol_no);
                error = EEXIST;
        }

        /*
         * Set the root volume and load the root cluster.  HAMMER special
         * cases rootvol and rootcl and will not deallocate the structures.
         * We do not hold a ref because this would prevent related I/O
         * from being flushed.  Guard against a failed cluster load, in
         * which case hammer_get_cluster() returns NULL with error set.
         */
        if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
                hmp->rootvol = volume;
                hmp->rootcl = hammer_get_cluster(volume,
                                                 ondisk->vol0_root_clu_no,
                                                 &error, 0);
                if (hmp->rootcl)
                        hammer_rel_cluster(hmp->rootcl, 0);
                hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
        }
late_failure:
        if (bp)
                brelse(bp);
        if (error) {
                /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
                VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
                hammer_free_volume(volume);
        }
        return (error);
}

/*
 * Unload and free a HAMMER volume.  Must return >= 0 to continue the
 * RB_SCAN, so -1 is returned on failure.
 */
int
hammer_unload_volume(hammer_volume_t volume, void *data __unused)
{
        struct hammer_mount *hmp = volume->hmp;
        hammer_cluster_t rootcl;
        int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

        /*
         * Sync clusters, sync volume
         */

        hmp->mp->mnt_stat.f_blocks -= volume->nblocks;

        /*
         * Clean up the root cluster, which is held unlocked in the root
         * volume.
         */
        if (hmp->rootvol == volume) {
                if ((rootcl = hmp->rootcl) != NULL)
                        hmp->rootcl = NULL;
                hmp->rootvol = NULL;
        }

        /*
         * Unload clusters and super-clusters.  Unloading a super-cluster
         * also unloads related clusters, but the filesystem may not be
         * using super-clusters so unload clusters anyway.
         */
        RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
                        hammer_unload_cluster, NULL);
        RB_SCAN(hammer_scl_rb_tree, &volume->rb_scls_root, NULL,
                        hammer_unload_supercl, NULL);

        /*
         * Release our buffer and flush anything left in the buffer cache.
         */
        hammer_io_release(&volume->io, 1);

        /*
         * There should be no references on the volume, no clusters, and
         * no super-clusters.
         */
        KKASSERT(volume->io.lock.refs == 0);
        KKASSERT(RB_EMPTY(&volume->rb_clus_root));
        KKASSERT(RB_EMPTY(&volume->rb_scls_root));

        volume->ondisk = NULL;
        if (volume->devvp) {
                if (ronly) {
                        vinvalbuf(volume->devvp, 0, 0, 0);
                        VOP_CLOSE(volume->devvp, FREAD);
                } else {
                        vinvalbuf(volume->devvp, V_SAVE, 0, 0);
                        VOP_CLOSE(volume->devvp, FREAD|FWRITE);
                }
        }

        /*
         * Destroy the structure
         */
        RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
        hammer_free_volume(volume);
        return(0);
}

static
void
hammer_free_volume(hammer_volume_t volume)
{
        if (volume->vol_name) {
                kfree(volume->vol_name, M_HAMMER);
                volume->vol_name = NULL;
        }
        if (volume->devvp) {
                vrele(volume->devvp);
                volume->devvp = NULL;
        }
        kfree(volume, M_HAMMER);
}

/*
 * Get a HAMMER volume.  The volume must already exist.
 */
hammer_volume_t
hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
{
        struct hammer_volume *volume;

        /*
         * Locate the volume structure
         */
        volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
        if (volume == NULL) {
                *errorp = ENOENT;
                return(NULL);
        }
        hammer_ref(&volume->io.lock);

        /*
         * Deal with on-disk info
         */
        if (volume->ondisk == NULL) {
                *errorp = hammer_load_volume(volume);
                if (*errorp) {
                        hammer_rel_volume(volume, 1);
                        volume = NULL;
                }
        } else {
                *errorp = 0;
        }
        return(volume);
}

int
hammer_ref_volume(hammer_volume_t volume)
{
        int error;

        hammer_ref(&volume->io.lock);

        /*
         * Deal with on-disk info
         */
        if (volume->ondisk == NULL) {
                error = hammer_load_volume(volume);
                if (error)
                        hammer_rel_volume(volume, 1);
        } else {
                error = 0;
        }
        return (error);
}

hammer_volume_t
hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
{
        hammer_volume_t volume;

        volume = hmp->rootvol;
        KKASSERT(volume != NULL);
        hammer_ref(&volume->io.lock);

        /*
         * Deal with on-disk info
         */
        if (volume->ondisk == NULL) {
                *errorp = hammer_load_volume(volume);
                if (*errorp) {
                        hammer_rel_volume(volume, 1);
                        volume = NULL;
                }
        } else {
                *errorp = 0;
        }
        return (volume);
}

/*
 * Load a volume's on-disk information.  The volume must be referenced and
 * not locked.  We temporarily acquire an exclusive lock to interlock
 * against releases or multiple get's.
 */
static int
hammer_load_volume(hammer_volume_t volume)
{
        struct hammer_volume_ondisk *ondisk;
        int error;

        hammer_lock_ex(&volume->io.lock);
        if (volume->ondisk == NULL) {
                error = hammer_io_read(volume->devvp, &volume->io);
                if (error) {
                        hammer_unlock(&volume->io.lock);
                        return (error);
                }
                volume->ondisk = ondisk = (void *)volume->io.bp->b_data;

                /*
                 * Configure the volume's A-lists.  These are used to
                 * allocate clusters.
                 */
                if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
                        volume->alist.config = &Vol_super_alist_config;
                        volume->alist.meta = ondisk->vol_almeta.super;
                        volume->alist.info = volume;
                } else {
                        volume->alist.config = &Vol_normal_alist_config;
                        volume->alist.meta = ondisk->vol_almeta.normal;
                        volume->alist.info = NULL;
                }
                hammer_alist_init(&volume->alist);
        } else {
                error = 0;
        }
        hammer_unlock(&volume->io.lock);
        return(0);
}

/*
 * Release a volume.  Call hammer_io_release on the last reference.  We have
 * to acquire an exclusive lock to interlock against volume->ondisk tests
 * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
 * lock to be held.
 *
 * Volumes are not unloaded from memory during normal operation.
 */
void
hammer_rel_volume(hammer_volume_t volume, int flush)
{
        if (volume->io.lock.refs == 1) {
                hammer_lock_ex(&volume->io.lock);
                if (volume->io.lock.refs == 1) {
                        volume->ondisk = NULL;
                        hammer_io_release(&volume->io, flush);
                }
                hammer_unlock(&volume->io.lock);
        }
        hammer_unref(&volume->io.lock);
}
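
/*
 * Illustrative only (not compiled): the usual access pattern pairs a
 * get with a release.  Variable names are hypothetical.
 *
 *      int error;
 *      hammer_volume_t volume;
 *
 *      volume = hammer_get_volume(hmp, vol_no, &error);
 *      if (volume) {
 *              ... use volume->ondisk ...
 *              hammer_rel_volume(volume, 0);
 *      }
 */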

/************************************************************************
 *                              SUPER-CLUSTERS                          *
 ************************************************************************
 *
 * Manage super-clusters.  Note that a supercl holds a reference to its
 * associated volume.
 */
hammer_supercl_t
hammer_get_supercl(hammer_volume_t volume, int32_t scl_no,
                   int *errorp, int isnew)
{
        hammer_supercl_t supercl;

        /*
         * Locate and lock the super-cluster structure, creating one
         * if necessary.
         */
again:
        supercl = RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no);
        if (supercl == NULL) {
                supercl = kmalloc(sizeof(*supercl), M_HAMMER, M_WAITOK|M_ZERO);
                supercl->scl_no = scl_no;
                supercl->volume = volume;
                supercl->io.offset = calculate_supercl_offset(volume, scl_no);
                supercl->io.type = HAMMER_STRUCTURE_SUPERCL;
                hammer_ref(&supercl->io.lock);

                /*
                 * Insert the super-cluster into the RB tree and handle
                 * late collisions.
                 */
                if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) {
                        hammer_unref(&supercl->io.lock);
                        kfree(supercl, M_HAMMER);
                        goto again;
                }
                hammer_ref(&volume->io.lock);
        } else {
                hammer_ref(&supercl->io.lock);
        }

        /*
         * Deal with on-disk info
         */
        if (supercl->ondisk == NULL || isnew) {
                *errorp = hammer_load_supercl(supercl, isnew);
                if (*errorp) {
                        hammer_rel_supercl(supercl, 1);
                        supercl = NULL;
                }
        } else {
                *errorp = 0;
        }
        return(supercl);
}

static int
hammer_load_supercl(hammer_supercl_t supercl, int isnew)
{
        struct hammer_supercl_ondisk *ondisk;
        hammer_volume_t volume = supercl->volume;
        int error;

        hammer_lock_ex(&supercl->io.lock);
        if (supercl->ondisk == NULL) {
                if (isnew)
                        error = hammer_io_new(volume->devvp, &supercl->io);
                else
                        error = hammer_io_read(volume->devvp, &supercl->io);
                if (error) {
                        hammer_unlock(&supercl->io.lock);
                        return (error);
                }
                supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data;

                supercl->alist.config = &Supercl_alist_config;
                supercl->alist.meta = ondisk->scl_meta;
                supercl->alist.info = NULL;
        } else if (isnew) {
                error = hammer_io_new(volume->devvp, &supercl->io);
        } else {
                error = 0;
        }
        if (error == 0 && isnew) {
                /*
                 * If this is a new super-cluster we have to initialize
                 * various ondisk structural elements.  The caller is
                 * responsible for the remainder.
                 */
                struct hammer_alist_live dummy;

                ondisk = supercl->ondisk;
                dummy.config = &Buf_alist_config;
                dummy.meta = ondisk->head.buf_almeta;
                dummy.info = NULL;
                initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL);
                hammer_alist_init(&supercl->alist);
        }
        hammer_unlock(&supercl->io.lock);
        return (error);
}

/*
 * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
 */
int
hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused)
{
        KKASSERT(supercl->io.lock.refs == 0);
        hammer_ref(&supercl->io.lock);
        hammer_rel_supercl(supercl, 1);
        return(0);
}

/*
 * Release a super-cluster.  We have to deal with several places where
 * another thread can ref the super-cluster.
 *
 * Only destroy the structure itself if the related buffer cache buffer
 * was disassociated from it.  This ties the management of the structure
 * to the buffer cache subsystem.
 */
void
hammer_rel_supercl(hammer_supercl_t supercl, int flush)
{
        hammer_volume_t volume;

        if (supercl->io.lock.refs == 1) {
                hammer_lock_ex(&supercl->io.lock);
                if (supercl->io.lock.refs == 1) {
                        hammer_io_release(&supercl->io, flush);
                        if (supercl->io.bp == NULL &&
                            supercl->io.lock.refs == 1) {
                                volume = supercl->volume;
                                RB_REMOVE(hammer_scl_rb_tree,
                                          &volume->rb_scls_root, supercl);
                                supercl->volume = NULL; /* sanity */
                                kfree(supercl, M_HAMMER);
                                hammer_rel_volume(volume, 0);
                                return;
                        }
                }
                hammer_unlock(&supercl->io.lock);
        }
        hammer_unref(&supercl->io.lock);
}

/************************************************************************
 *                              CLUSTERS                                *
 ************************************************************************
 *
 * Manage clusters.  Note that a cluster holds a reference to its
 * associated volume.
 */
hammer_cluster_t
hammer_get_cluster(hammer_volume_t volume, int32_t clu_no,
                   int *errorp, int isnew)
{
        hammer_cluster_t cluster;

again:
        cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no);
        if (cluster == NULL) {
                cluster = kmalloc(sizeof(*cluster), M_HAMMER, M_WAITOK|M_ZERO);
                cluster->clu_no = clu_no;
                cluster->volume = volume;
                cluster->io.offset = calculate_cluster_offset(volume, clu_no);
                cluster->state = HAMMER_CLUSTER_IDLE;
                RB_INIT(&cluster->rb_bufs_root);
                RB_INIT(&cluster->rb_nods_root);
                cluster->io.type = HAMMER_STRUCTURE_CLUSTER;
                hammer_ref(&cluster->io.lock);

                /*
                 * Insert the cluster into the RB tree and handle late
                 * collisions.
                 */
                if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) {
                        hammer_unref(&cluster->io.lock);
                        kfree(cluster, M_HAMMER);
                        goto again;
                }
                hammer_ref(&volume->io.lock);
        } else {
                hammer_ref(&cluster->io.lock);
        }

        /*
         * Deal with on-disk info
         */
        if (cluster->ondisk == NULL || isnew) {
                *errorp = hammer_load_cluster(cluster, isnew);
                if (*errorp) {
                        hammer_rel_cluster(cluster, 1);
                        cluster = NULL;
                }
        } else {
                *errorp = 0;
        }
        return (cluster);
}

hammer_cluster_t
hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp)
{
        hammer_cluster_t cluster;

        cluster = hmp->rootcl;
        KKASSERT(cluster != NULL);
        hammer_ref(&cluster->io.lock);

        /*
         * Deal with on-disk info
         */
        if (cluster->ondisk == NULL) {
                *errorp = hammer_load_cluster(cluster, 0);
                if (*errorp) {
                        hammer_rel_cluster(cluster, 1);
                        cluster = NULL;
                }
        } else {
                *errorp = 0;
        }
        return (cluster);
}

static
int
hammer_load_cluster(hammer_cluster_t cluster, int isnew)
{
        hammer_volume_t volume = cluster->volume;
        struct hammer_cluster_ondisk *ondisk;
        int error;

        /*
         * Load the cluster's on-disk info
         */
        hammer_lock_ex(&cluster->io.lock);
        if (cluster->ondisk == NULL) {
                if (isnew)
                        error = hammer_io_new(volume->devvp, &cluster->io);
                else
                        error = hammer_io_read(volume->devvp, &cluster->io);
                if (error) {
                        hammer_unlock(&cluster->io.lock);
                        return (error);
                }
                cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data;

                cluster->alist_master.config = &Clu_master_alist_config;
                cluster->alist_master.meta = ondisk->clu_master_meta;
                cluster->alist_btree.config = &Clu_slave_alist_config;
                cluster->alist_btree.meta = ondisk->clu_btree_meta;
                cluster->alist_btree.info = cluster;
                cluster->alist_record.config = &Clu_slave_alist_config;
                cluster->alist_record.meta = ondisk->clu_record_meta;
                cluster->alist_record.info = cluster;
                cluster->alist_mdata.config = &Clu_slave_alist_config;
                cluster->alist_mdata.meta = ondisk->clu_mdata_meta;
                cluster->alist_mdata.info = cluster;

                cluster->clu_btree_beg = ondisk->clu_btree_beg;
                cluster->clu_btree_end = ondisk->clu_btree_end;
        } else if (isnew) {
                error = hammer_io_new(volume->devvp, &cluster->io);
        } else {
                error = 0;
        }
        if (error == 0 && isnew) {
                /*
                 * If this is a new cluster we have to initialize
                 * various ondisk structural elements.  The caller is
                 * responsible for the remainder.
                 */
                struct hammer_alist_live dummy;

                ondisk = cluster->ondisk;

                dummy.config = &Buf_alist_config;
                dummy.meta = ondisk->head.buf_almeta;
                dummy.info = NULL;
                initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER);

                hammer_alist_init(&cluster->alist_master);
                hammer_alist_init(&cluster->alist_btree);
                hammer_alist_init(&cluster->alist_record);
                hammer_alist_init(&cluster->alist_mdata);
        }
        hammer_unlock(&cluster->io.lock);
        return (error);
}

/*
 * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
 */
int
hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused)
{
        hammer_ref(&cluster->io.lock);
        RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
                hammer_unload_buffer, NULL);
        KKASSERT(cluster->io.lock.refs == 1);
        hammer_rel_cluster(cluster, 1);
        return(0);
}

/*
 * Reference a cluster that is either already referenced or via a specially
 * handled pointer (aka rootcl).
 */
int
hammer_ref_cluster(hammer_cluster_t cluster)
{
        int error;

        KKASSERT(cluster != NULL);
        hammer_ref(&cluster->io.lock);

        /*
         * Deal with on-disk info
         */
        if (cluster->ondisk == NULL) {
                error = hammer_load_cluster(cluster, 0);
                if (error)
                        hammer_rel_cluster(cluster, 1);
        } else {
                error = 0;
        }
        return(error);
}

/*
 * Release a cluster.  We have to deal with several places where
 * another thread can ref the cluster.
 *
 * Only destroy the structure itself if the related buffer cache buffer
 * was disassociated from it.  This ties the management of the structure
 * to the buffer cache subsystem.
 */
void
hammer_rel_cluster(hammer_cluster_t cluster, int flush)
{
        hammer_node_t node;
        hammer_volume_t volume;

        if (cluster->io.lock.refs == 1) {
                hammer_lock_ex(&cluster->io.lock);
                if (cluster->io.lock.refs == 1) {
                        /*
                         * Release the I/O.  If we or the kernel wants to
                         * flush, this will release the bp.  Otherwise the
                         * bp may be written and flushed passively by the
                         * kernel later on.
                         */
                        hammer_io_release(&cluster->io, flush);

                        /*
                         * The B-Tree node cache is not counted in the
                         * cluster's reference count.  Clean out the
                         * cache.
                         *
                         * If the cluster acquires a new reference while we
                         * are trying to clean it out, abort the cleaning.
                         *
                         * Any actively referenced nodes will reference the
                         * related buffer and cluster, so a ref count check
                         * should be sufficient.
                         */
                        while (cluster->io.bp == NULL &&
                               cluster->io.lock.refs == 1 &&
                               (node = RB_ROOT(&cluster->rb_nods_root)) != NULL
                        ) {
                                KKASSERT(node->lock.refs == 0);
                                hammer_flush_node(node);
                        }

                        /*
                         * Final cleanup
                         */
                        if (cluster->io.bp == NULL &&
                            cluster->io.lock.refs == 1 &&
                            RB_EMPTY(&cluster->rb_nods_root)) {
                                KKASSERT(RB_EMPTY(&cluster->rb_bufs_root));
                                volume = cluster->volume;
                                RB_REMOVE(hammer_clu_rb_tree,
                                          &volume->rb_clus_root, cluster);
                                cluster->volume = NULL; /* sanity */
                                kfree(cluster, M_HAMMER);
                                hammer_rel_volume(volume, 0);
                                return;
                        }
                }
                hammer_unlock(&cluster->io.lock);
        }
        hammer_unref(&cluster->io.lock);
}

/************************************************************************
 *                              BUFFERS                                 *
 ************************************************************************
 *
 * Manage buffers.  Note that a buffer holds a reference to its associated
 * cluster, and its cluster will hold a reference to the cluster's volume.
 *
 * A non-zero buf_type indicates that a new buffer should be created and
 * zero'd.
 */
hammer_buffer_t
hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no,
                  u_int64_t buf_type, int *errorp)
{
        hammer_buffer_t buffer;

        /*
         * Find the buffer.  Note that buffer 0 corresponds to the cluster
         * header and should never be requested.
         */
        KKASSERT(buf_no >= cluster->ondisk->clu_start / HAMMER_BUFSIZE &&
                 buf_no < cluster->ondisk->clu_limit / HAMMER_BUFSIZE);

        /*
         * Locate and lock the buffer structure, creating one if necessary.
         */
again:
        buffer = RB_LOOKUP(hammer_buf_rb_tree, &cluster->rb_bufs_root, buf_no);
        if (buffer == NULL) {
                buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
                buffer->buf_no = buf_no;
                buffer->cluster = cluster;
                buffer->volume = cluster->volume;
                buffer->io.offset = cluster->io.offset +
                                    (buf_no * HAMMER_BUFSIZE);
                buffer->io.type = HAMMER_STRUCTURE_BUFFER;
                TAILQ_INIT(&buffer->clist);
                hammer_ref(&buffer->io.lock);

                /*
                 * Insert the buffer into the RB tree and handle late
                 * collisions.
                 */
                if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) {
                        hammer_unref(&buffer->io.lock);
                        kfree(buffer, M_HAMMER);
                        goto again;
                }
                hammer_ref(&cluster->io.lock);
        } else {
                hammer_ref(&buffer->io.lock);
        }

        /*
         * Deal with on-disk info
         */
        if (buffer->ondisk == NULL || buf_type) {
                *errorp = hammer_load_buffer(buffer, buf_type);
                if (*errorp) {
                        hammer_rel_buffer(buffer, 1);
                        buffer = NULL;
                }
        } else {
                *errorp = 0;
        }
        return(buffer);
}
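
/*
 * Illustrative only (not compiled): passing buf_type 0 acquires an
 * existing buffer, while a non-zero type such as HAMMER_FSBUF_BTREE
 * creates a new, zero'd buffer of that type.  Variable names are
 * hypothetical.
 *
 *      int error;
 *      hammer_buffer_t buffer;
 *
 *      buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
 *      if (buffer) {
 *              ... use buffer->ondisk ...
 *              hammer_rel_buffer(buffer, 0);
 *      }
 */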

static int
hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type)
{
        hammer_volume_t volume;
        hammer_fsbuf_ondisk_t ondisk;
        int error;

        /*
         * Load the buffer's on-disk info
         */
        volume = buffer->volume;
        hammer_lock_ex(&buffer->io.lock);
        if (buffer->ondisk == NULL) {
                if (buf_type) {
                        error = hammer_io_new(volume->devvp, &buffer->io);
                } else {
                        error = hammer_io_read(volume->devvp, &buffer->io);
                }
                if (error) {
                        hammer_unlock(&buffer->io.lock);
                        return (error);
                }
                buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
                buffer->alist.config = &Buf_alist_config;
                buffer->alist.meta = ondisk->head.buf_almeta;
                buffer->buf_type = ondisk->head.buf_type;
        } else if (buf_type) {
                error = hammer_io_new(volume->devvp, &buffer->io);
        } else {
                error = 0;
        }
        if (error == 0 && buf_type) {
                ondisk = buffer->ondisk;
                initbuffer(&buffer->alist, &ondisk->head, buf_type);
                buffer->buf_type = ondisk->head.buf_type;
        }
        hammer_unlock(&buffer->io.lock);
        return (error);
}

/*
 * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
 */
int
hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
{
        hammer_ref(&buffer->io.lock);
        hammer_flush_buffer_nodes(buffer);
        KKASSERT(buffer->io.lock.refs == 1);
        hammer_rel_buffer(buffer, 1);
        return(0);
}

/*
 * Reference a buffer that is either already referenced or via a specially
 * handled pointer (aka cursor->buffer).
 */
int
hammer_ref_buffer(hammer_buffer_t buffer)
{
        int error;

        hammer_ref(&buffer->io.lock);
        if (buffer->ondisk == NULL) {
                error = hammer_load_buffer(buffer, 0);
                if (error) {
                        hammer_rel_buffer(buffer, 1);
                        /*
                         * NOTE: buffer pointer can become stale after
                         * the above release.
                         */
                } else {
                        KKASSERT(buffer->buf_type ==
                                 buffer->ondisk->head.buf_type);
                }
        } else {
                error = 0;
        }
        return(error);
}

/*
 * Release a buffer.  We have to deal with several places where
 * another thread can ref the buffer.
 *
 * Only destroy the structure itself if the related buffer cache buffer
 * was disassociated from it.  This ties the management of the structure
 * to the buffer cache subsystem.  buffer->ondisk determines whether the
 * embedded io is referenced or not.
 */
void
hammer_rel_buffer(hammer_buffer_t buffer, int flush)
{
        hammer_cluster_t cluster;
        hammer_node_t node;

        if (buffer->io.lock.refs == 1) {
                hammer_lock_ex(&buffer->io.lock);
                if (buffer->io.lock.refs == 1) {
                        hammer_io_release(&buffer->io, flush);

                        /*
                         * Clean out the B-Tree node cache, if any, then
                         * clean up the cluster ref and free the buffer.
                         *
                         * If the buffer acquires a new reference while we
                         * are trying to clean it out, abort the cleaning.
                         */
                        while (buffer->io.bp == NULL &&
                               buffer->io.lock.refs == 1 &&
                               (node = TAILQ_FIRST(&buffer->clist)) != NULL
                        ) {
                                KKASSERT(node->lock.refs == 0);
                                hammer_flush_node(node);
                        }
                        if (buffer->io.bp == NULL &&
                            hammer_islastref(&buffer->io.lock)) {
                                cluster = buffer->cluster;
                                RB_REMOVE(hammer_buf_rb_tree,
                                          &cluster->rb_bufs_root, buffer);
                                buffer->cluster = NULL; /* sanity */
                                kfree(buffer, M_HAMMER);
                                hammer_rel_cluster(cluster, 0);
                                return;
                        }
                }
                hammer_unlock(&buffer->io.lock);
        }
        hammer_unref(&buffer->io.lock);
}

/*
 * Flush passively cached B-Tree nodes associated with this buffer.
 *
 * NOTE: The buffer is referenced and locked.
 */
void
hammer_flush_buffer_nodes(hammer_buffer_t buffer)
{
        hammer_node_t node;

        node = TAILQ_FIRST(&buffer->clist);
        while (node) {
                buffer->save_scan = TAILQ_NEXT(node, entry);
                if (node->lock.refs == 0)
                        hammer_flush_node(node);
                node = buffer->save_scan;
        }
}

/************************************************************************
 *                              NODES                                   *
 ************************************************************************
 *
 * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
 * method used by the HAMMER filesystem.
 *
 * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
 * associated with its buffer.  It can have an active buffer reference
 * even when the node itself has no references.  The node also passively
 * associates itself with its cluster without holding any cluster refs.
 * The cluster ref is indirectly maintained by the active buffer ref when
 * a node is acquired.
 *
 * A hammer_node can also be passively associated with other HAMMER
 * structures, such as inodes, while retaining 0 references.  These
 * associations can be cleared backwards using a pointer-to-pointer in
 * the hammer_node.
 *
 * This allows the HAMMER implementation to cache hammer_node's long-term
 * and short-cut a great deal of the infrastructure's complexity.  In
 * most cases a cached node can be reacquired without having to dip into
 * either the buffer or cluster management code.
 *
 * The caller must pass a referenced cluster on call and will retain
 * ownership of the reference on return.  The node will acquire its own
 * additional references, if necessary.
 */
hammer_node_t
hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp)
{
        hammer_node_t node;

        /*
         * Locate the structure, allocating one if necessary.
         */
again:
        node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root,
                         node_offset);
        if (node == NULL) {
                node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
                node->node_offset = node_offset;
                node->cluster = cluster;
                if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root,
                              node)) {
                        kfree(node, M_HAMMER);
                        goto again;
                }
        }
        *errorp = hammer_ref_node(node);
        if (*errorp) {
                /*
                 * NOTE: The node pointer may be stale on error return.
                 * In fact, it's probably been destroyed.
                 */
                node = NULL;
        }
        return(node);
}

/*
 * Reference the node to prevent disassociations, then associate and
 * load the related buffer.  This routine can also be called to reference
 * a node from a cache pointer.
 *
 * NOTE: Because the caller does not have a ref on the node, the caller's
 * node pointer will be stale if an error is returned.  We may also wind
 * up clearing the related cache pointers.
 *
 * NOTE: The cluster is indirectly referenced by our buffer ref.
 */
int
hammer_ref_node(hammer_node_t node)
{
        hammer_buffer_t buffer;
        int32_t buf_no;
        int error;

        hammer_ref(&node->lock);
        error = 0;
        if (node->ondisk == NULL) {
                hammer_lock_ex(&node->lock);
                if (node->ondisk == NULL) {
                        /*
                         * This is a little confusing but the gist is that
                         * node->buffer determines whether the node is on
                         * the buffer's clist and node->ondisk determines
                         * whether the buffer is referenced.
                         */
                        if ((buffer = node->buffer) != NULL) {
                                error = hammer_ref_buffer(buffer);
                        } else {
                                buf_no = node->node_offset / HAMMER_BUFSIZE;
                                buffer = hammer_get_buffer(node->cluster,
                                                           buf_no, 0, &error);
                                if (buffer) {
                                        KKASSERT(error == 0);
                                        TAILQ_INSERT_TAIL(&buffer->clist,
                                                          node, entry);
                                        node->buffer = buffer;
                                }
                        }
                        if (error == 0) {
                                node->ondisk = (void *)((char *)buffer->ondisk +
                                       (node->node_offset & HAMMER_BUFMASK));
                        }
                }
                hammer_unlock(&node->lock);
        }
        if (error)
                hammer_rel_node(node);
        return (error);
}

/*
 * Release a hammer_node.  The node retains a passive association with
 * its cluster, buffer and caches.
 *
 * However, to avoid cluttering up kernel memory with tons of B-Tree
 * node cache structures we destroy the node if no passive cache or
 * (instantiated) buffer references exist.
 */
void
hammer_rel_node(hammer_node_t node)
{
        hammer_cluster_t cluster;
        hammer_buffer_t buffer;

        if (hammer_islastref(&node->lock)) {
                cluster = node->cluster;
                /*
                 * Clutter control, this case only occurs after a failed
                 * load since otherwise ondisk will be non-NULL.
                 */
                if (node->cache1 == NULL && node->cache2 == NULL &&
                    node->ondisk == NULL) {
                        RB_REMOVE(hammer_nod_rb_tree, &cluster->rb_nods_root,
                                  node);
                        if ((buffer = node->buffer) != NULL) {
                                node->buffer = NULL;
                                hammer_remove_node_clist(buffer, node);
                        }
                        kfree(node, M_HAMMER);
                        return;
                }

                /*
                 * node->ondisk determines whether we have a buffer reference
                 * to get rid of or not.  Only get rid of the reference if
                 * the kernel tried to flush the buffer.
                 *
                 * NOTE: Once unref'd the node can be physically destroyed,
                 * so our node is stale afterwards.
                 *
                 * This case occurs if the node still has cache references.
                 * We could remove the references and free the structure
                 * but for now we allow them (and the node structure) to
                 * remain intact.
                 */
                if (node->ondisk && hammer_io_checkflush(&node->buffer->io)) {
                        buffer = node->buffer;
                        node->buffer = NULL;
                        node->ondisk = NULL;
                        hammer_remove_node_clist(buffer, node);
                        hammer_unref(&node->lock);
                        hammer_rel_buffer(buffer, 0);
                } else {
                        hammer_unref(&node->lock);
                }
        } else {
                hammer_unref(&node->lock);
        }
}

/*
 * Cache-and-release a hammer_node.  Kinda like catching and releasing a
 * fish, but keeping an eye on him.  The node is passively cached in *cache.
 *
 * NOTE!  HAMMER may NULL *cache at any time, even after you have
 * referenced the node!
 */
void
hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
{
        if (node->cache1 != cache) {
                if (node->cache2 == cache) {
                        struct hammer_node **tmp;
                        tmp = node->cache1;
                        node->cache1 = node->cache2;
                        node->cache2 = tmp;
                } else {
                        if (node->cache2)
                                *node->cache2 = NULL;
                        node->cache2 = node->cache1;
                        node->cache1 = cache;
                        *cache = node;
                }
        }
}
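
/*
 * Illustrative only (not compiled): a passive cache pointer, e.g. one
 * embedded in an inode, lets a node be reacquired cheaply.  HAMMER may
 * NULL the cache pointer at any time, so it is re-tested on reuse.
 * Variable names are hypothetical.
 *
 *      struct hammer_node *cache = NULL;
 *      hammer_node_t node;
 *      int error;
 *
 *      node = hammer_get_node(cluster, node_offset, &error);
 *      if (node) {
 *              hammer_cache_node(node, &cache);
 *              hammer_rel_node(node);
 *      }
 *      ...
 *      if ((node = cache) != NULL && hammer_ref_node(node) == 0) {
 *              ... fast path, no buffer/cluster lookup needed ...
 *              hammer_rel_node(node);
 *      }
 *      hammer_uncache_node(&cache);
 */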

void
hammer_uncache_node(struct hammer_node **cache)
{
        hammer_node_t node;

        if ((node = *cache) != NULL) {
                *cache = NULL;
                if (node->cache1 == cache) {
                        node->cache1 = node->cache2;
                        node->cache2 = NULL;
                } else if (node->cache2 == cache) {
                        node->cache2 = NULL;
                } else {
                        panic("hammer_uncache_node: missing cache linkage");
                }
                if (node->cache1 == NULL && node->cache2 == NULL &&
                    node->lock.refs == 0) {
                        hammer_flush_node(node);
                }
        }
}

/*
 * Remove a node's cache references and destroy the node if it has no
 * references.  This is typically called from the buffer handling code.
 *
 * The node may have an active buffer reference (ondisk != NULL) even
 * if the node itself has no references.
 *
 * Note that a caller iterating through nodes via a buffer must have its
 * own reference on the buffer or our hammer_rel_buffer() call below may
 * rip it out from under the caller.
 */
void
hammer_flush_node(hammer_node_t node)
{
        hammer_buffer_t buffer;

        if (node->cache1)
                *node->cache1 = NULL;
        if (node->cache2)
                *node->cache2 = NULL;
        if (node->lock.refs == 0) {
                RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root,
                          node);
                if ((buffer = node->buffer) != NULL) {
                        node->buffer = NULL;
                        hammer_remove_node_clist(buffer, node);
                        if (node->ondisk) {
                                node->ondisk = NULL;
                                hammer_rel_buffer(buffer, 0);
                        }
                }
                kfree(node, M_HAMMER);
        }
}

/*
 * Remove a node from the buffer's clist.  Adjust save_scan as appropriate.
 * This is in its own little routine to properly handle interactions with
 * save_scan, so it is possible to block while scanning a buffer's node list.
 */
static
void
hammer_remove_node_clist(hammer_buffer_t buffer, hammer_node_t node)
{
        if (buffer->save_scan == node)
                buffer->save_scan = TAILQ_NEXT(node, entry);
        TAILQ_REMOVE(&buffer->clist, node, entry);
}
1413
1414 /************************************************************************
1415  *                              A-LIST ALLOCATORS                       *
1416  ************************************************************************/
1417
/*
 * Allocate HAMMER elements - B-Tree nodes, data storage, and record elements
 *
 * The passed *bufferp should be initialized to NULL.  Across successive
 * calls *bufferp caches the most recently used buffer until the caller
 * releases it.  Note that pointers previously returned against the cached
 * buffer become invalid on successive calls which replace *bufferp.
 *
 * All allocations first attempt to use the block found at the specified
 * iterator.  If that fails the first available block is used.  If that
 * also fails a new buffer is allocated, associated with the A-list for
 * the buffer type, and the element is allocated out of the new buffer.
 */
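
/*
 * A minimal usage sketch (illustrative only, not compiled):
 *
 *	struct hammer_buffer *buffer = NULL;
 *	void *data;
 *	int error;
 *
 *	data = hammer_alloc_data(cluster, bytes, &error, &buffer);
 *	(use data before further calls which pass &buffer)
 *	if (buffer)
 *		hammer_rel_buffer(buffer, 0);
 */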
1431
1432 hammer_node_t
1433 hammer_alloc_btree(hammer_cluster_t cluster, int *errorp)
1434 {
1435         hammer_buffer_t buffer;
1436         hammer_alist_t live;
1437         hammer_node_t node;
1438         int32_t elm_no;
1439         int32_t buf_no;
1440         int32_t node_offset;
1441
1442         /*
1443          * Allocate a B-Tree element
1444          */
1445         buffer = NULL;
1446         live = &cluster->alist_btree;
1447         elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index);
1448         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1449                 elm_no = hammer_alist_alloc_fwd(live, 1, 0);
1450         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1451                 alloc_new_buffer(cluster, live,
1452                                  HAMMER_FSBUF_BTREE, HAMMER_BTREE_NODES,
1453                                  cluster->ondisk->idx_index, errorp, &buffer);
1454                 elm_no = hammer_alist_alloc(live, 1);
1455                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1456                         *errorp = ENOSPC;
1457                         if (buffer)
1458                                 hammer_rel_buffer(buffer, 0);
1459                         hammer_modify_cluster(cluster);
1460                         return(NULL);
1461                 }
1462         }
1463         cluster->ondisk->idx_index = elm_no;
1464         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_BTREE_NODES);
1465         hammer_modify_cluster(cluster);
1466
1467         /*
1468          * Load and return the B-Tree element
1469          */
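	/*
	 * elm_no is a cluster-wide element number: elm_no /
	 * HAMMER_FSBUF_MAXBLKS selects the buffer and elm_no &
	 * HAMMER_FSBUF_BLKMASK indexes the node within that buffer's
	 * nodes[] array.
	 */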
1470         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1471         node_offset = buf_no * HAMMER_BUFSIZE +
1472                       offsetof(union hammer_fsbuf_ondisk,
1473                                btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]);
1474         node = hammer_get_node(cluster, node_offset, errorp);
1475         if (node) {
1476                 bzero(node->ondisk, sizeof(*node->ondisk));
1477         } else {
1478                 hammer_alist_free(live, elm_no, 1);
		/* node is NULL here; there is nothing to release */
1481         }
1482         if (buffer)
1483                 hammer_rel_buffer(buffer, 0);
1484         return(node);
1485 }
1486
1487 void *
1488 hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes,
1489                   int *errorp, struct hammer_buffer **bufferp)
1490 {
1491         hammer_buffer_t buffer;
1492         hammer_alist_t live;
1493         int32_t elm_no;
1494         int32_t buf_no;
1495         int32_t nblks;
1496         void *item;
1497
1498         /*
1499          * Deal with large data blocks.  The blocksize is HAMMER_BUFSIZE
1500          * for these allocations.
1501          */
1502         if ((bytes & HAMMER_BUFMASK) == 0) {
1503                 nblks = bytes / HAMMER_BUFSIZE;
1504                 /* only one block allowed for now (so buffer can hold it) */
1505                 KKASSERT(nblks == 1);
1506
1507                 buf_no = hammer_alloc_master(cluster, nblks,
1508                                              cluster->ondisk->idx_ldata, 1);
1509                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1510                         *errorp = ENOSPC;
1511                         return(NULL);
1512                 }
1513                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, nblks);
1514                 cluster->ondisk->idx_ldata = buf_no;
1515                 buffer = *bufferp;
1516                 *bufferp = hammer_get_buffer(cluster, buf_no, -1, errorp);
1517                 if (buffer)
1518                         hammer_rel_buffer(buffer, 0);
		buffer = *bufferp;
		if (buffer == NULL)		/* allocation/IO error */
			return(NULL);
		return(buffer->ondisk);
1521         }
1522
1523         /*
1524          * Allocate a data element.  The block size is HAMMER_DATA_BLKSIZE
1525          * (64 bytes) for these allocations.
1526          */
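	/*
	 * Worked example: bytes = 100 rounds up to 128, i.e. nblks = 2
	 * 64-byte blocks allocated from the cluster's mdata A-list.
	 */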
1527         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1528         nblks /= HAMMER_DATA_BLKSIZE;
1529         live = &cluster->alist_mdata;
1530         elm_no = hammer_alist_alloc_fwd(live, nblks, cluster->ondisk->idx_data);
1531         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
		elm_no = hammer_alist_alloc_fwd(live, nblks, 0);
1533         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1534                 alloc_new_buffer(cluster, live,
1535                                  HAMMER_FSBUF_DATA, HAMMER_DATA_NODES,
1536                                  cluster->ondisk->idx_data, errorp, bufferp);
1537                 elm_no = hammer_alist_alloc(live, nblks);
1538                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1539                         *errorp = ENOSPC;
1540                         hammer_modify_cluster(cluster);
1541                         return(NULL);
1542                 }
1543         }
	cluster->ondisk->idx_data = elm_no;
1545         hammer_modify_cluster(cluster);
1546
1547         /*
	 * Load and return the data element
1549          */
1550         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1551         buffer = *bufferp;
1552         if (buffer == NULL || buffer->cluster != cluster ||
1553             buffer->buf_no != buf_no) {
1554                 if (buffer)
1555                         hammer_rel_buffer(buffer, 0);
1556                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1557                 *bufferp = buffer;
1558         }
1559         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_DATA);
1560         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_DATA_NODES);
1561         item = &buffer->ondisk->data.data[elm_no & HAMMER_FSBUF_BLKMASK];
1562         bzero(item, nblks * HAMMER_DATA_BLKSIZE);
1563         *errorp = 0;
1564         return(item);
1565 }
1566
1567 void *
1568 hammer_alloc_record(hammer_cluster_t cluster,
1569                     int *errorp, struct hammer_buffer **bufferp)
1570 {
1571         hammer_buffer_t buffer;
1572         hammer_alist_t live;
1573         int32_t elm_no;
1574         int32_t buf_no;
1575         void *item;
1576
1577         /*
1578          * Allocate a record element
1579          */
1580         live = &cluster->alist_record;
1581         elm_no = hammer_alist_alloc_rev(live, 1, cluster->ondisk->idx_record);
1582         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
		elm_no = hammer_alist_alloc_rev(live, 1, HAMMER_ALIST_BLOCK_MAX);
1584         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1585                 alloc_new_buffer(cluster, live,
1586                                  HAMMER_FSBUF_RECORDS, HAMMER_RECORD_NODES,
1587                                  cluster->ondisk->idx_record, errorp, bufferp);
		elm_no = hammer_alist_alloc_rev(live, 1, HAMMER_ALIST_BLOCK_MAX);
1589                 kprintf("hammer_alloc_record elm again %08x\n", elm_no);
1590                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1591                         *errorp = ENOSPC;
1592                         hammer_modify_cluster(cluster);
1593                         return(NULL);
1594                 }
1595         }
1596         cluster->ondisk->idx_record = elm_no;
1597         hammer_modify_cluster(cluster);
1598
1599         /*
	 * Load and return the record element
1601          */
1602         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1603         buffer = *bufferp;
1604         if (buffer == NULL || buffer->cluster != cluster ||
1605             buffer->buf_no != buf_no) {
1606                 if (buffer)
1607                         hammer_rel_buffer(buffer, 0);
1608                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1609                 *bufferp = buffer;
1610         }
1611         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_RECORDS);
1612         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_RECORD_NODES);
1613         item = &buffer->ondisk->record.recs[elm_no & HAMMER_FSBUF_BLKMASK];
1614         bzero(item, sizeof(union hammer_record_ondisk));
1615         *errorp = 0;
1616         return(item);
1617 }
1618
1619 void
1620 hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes)
1621 {
1622         int32_t elm_no;
1623         int32_t nblks;
1624         hammer_alist_t live;
1625
1626         if ((bytes & HAMMER_BUFMASK) == 0) {
1627                 nblks = bytes / HAMMER_BUFSIZE;
1628                 KKASSERT(nblks == 1 && data == (void *)buffer->ondisk);
1629                 hammer_alist_free(&buffer->cluster->alist_master,
1630                                   buffer->buf_no, nblks);
1631                 hammer_adjust_stats(buffer->cluster, HAMMER_FSBUF_DATA, -nblks);
1632                 hammer_modify_cluster(buffer->cluster);
1633                 return;
1634         }
1635
1636         elm_no = ((char *)data - (char *)buffer->ondisk->data.data) /
1637                  HAMMER_DATA_BLKSIZE;
1638         KKASSERT(elm_no >= 0 && elm_no < HAMMER_DATA_NODES);
1639         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1640         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1641         nblks /= HAMMER_DATA_BLKSIZE;
1642         live = &buffer->cluster->alist_mdata;
1643         hammer_alist_free(live, elm_no, nblks);
1644         hammer_modify_cluster(buffer->cluster);
1645 }
1646
1647 void
1648 hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec)
1649 {
1650         int32_t elm_no;
1651         hammer_alist_t live;
1652
1653         elm_no = rec - &buffer->ondisk->record.recs[0];
	KKASSERT(elm_no >= 0 && elm_no < HAMMER_RECORD_NODES);
1655         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1656         live = &buffer->cluster->alist_record;
1657         hammer_alist_free(live, elm_no, 1);
1658         hammer_modify_cluster(buffer->cluster);
1659 }
1660
1661 void
1662 hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset)
1663 {
1664         const int32_t blksize = sizeof(struct hammer_node_ondisk);
1665         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1666         hammer_alist_t live;
1667         int32_t elm_no;
1668
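	/*
	 * Convert the cluster-relative offset back into an A-list element
	 * number: each buffer spans HAMMER_FSBUF_MAXBLKS element slots and
	 * the byte offset within the fsbuf's nodes[] array selects the
	 * slot within the buffer.
	 */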
1669         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1670         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, btree.nodes[0]);
1671         live = &cluster->alist_btree;
1672         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1673         elm_no += fsbuf_offset / blksize;
1674         hammer_alist_free(live, elm_no, 1);
1675         hammer_modify_cluster(cluster);
1676 }
1677
1678 void
1679 hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes)
1680 {
1681         const int32_t blksize = HAMMER_DATA_BLKSIZE;
1682         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1683         hammer_alist_t live;
1684         int32_t elm_no;
1685         int32_t buf_no;
1686         int32_t nblks;
1687
1688         if ((bytes & HAMMER_BUFMASK) == 0) {
1689                 nblks = bytes / HAMMER_BUFSIZE;
1690                 KKASSERT(nblks == 1 && (bclu_offset & HAMMER_BUFMASK) == 0);
1691                 buf_no = bclu_offset / HAMMER_BUFSIZE;
1692                 hammer_alist_free(&cluster->alist_master, buf_no, nblks);
1693                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, -nblks);
1694                 hammer_modify_cluster(cluster);
1695                 return;
1696         }
1697
1698         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1699         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, data.data[0][0]);
1700         live = &cluster->alist_mdata;
1701         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1702         nblks /= HAMMER_DATA_BLKSIZE;
1703         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1704         elm_no += fsbuf_offset / blksize;
1705         hammer_alist_free(live, elm_no, nblks);
1706         hammer_modify_cluster(cluster);
1707 }
1708
1709 void
1710 hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset)
1711 {
1712         const int32_t blksize = sizeof(union hammer_record_ondisk);
1713         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1714         hammer_alist_t live;
1715         int32_t elm_no;
1716
1717         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1718         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, record.recs[0]);
1719         live = &cluster->alist_record;
1720         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1721         elm_no += fsbuf_offset / blksize;
1722         hammer_alist_free(live, elm_no, 1);
1723         hammer_modify_cluster(cluster);
1724 }
1725
1727 /*
1728  * Allocate a new filesystem buffer and assign it to the specified
1729  * filesystem buffer type.  The new buffer will be added to the
1730  * type-specific A-list and initialized.
1731  */
1732 static void
1733 alloc_new_buffer(hammer_cluster_t cluster, hammer_alist_t live,
1734                  u_int64_t type, int32_t nelements,
1735                  int start, int *errorp, struct hammer_buffer **bufferp)
1736 {
1737         hammer_buffer_t buffer;
1738         int32_t buf_no;
1739         int isfwd;
1740
1741         start = start / HAMMER_FSBUF_MAXBLKS;   /* convert to buf_no */
1742
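	/*
	 * Record buffers are allocated in the reverse direction, everything
	 * else in the forward direction, so the two allocation fronts grow
	 * towards each other within the cluster.
	 */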
1743         isfwd = (type != HAMMER_FSBUF_RECORDS);
1744         buf_no = hammer_alloc_master(cluster, 1, start, isfwd);
1745         KKASSERT(buf_no != HAMMER_ALIST_BLOCK_NONE); /* XXX */
1746         hammer_modify_cluster(cluster);
1747
1748         /*
1749          * The new buffer must be initialized (type != 0) regardless of
1750          * whether we already have it cached or not, so don't try to
1751          * optimize the cached buffer check.  Just call hammer_get_buffer().
1752          */
1753         buffer = hammer_get_buffer(cluster, buf_no, type, errorp);
1754         if (*bufferp)
1755                 hammer_rel_buffer(*bufferp, 0);
1756         *bufferp = buffer;
1757
1758         /*
1759          * Finally, do a meta-free of the buffer's elements into the
1760          * type-specific A-list and update our statistics to reflect
1761          * the allocation.
1762          */
1763         if (buffer) {
1764                 kprintf("alloc_new_buffer buf_no %d type %016llx nelms %d\n",
1765                         buf_no, type, nelements);
1766                 hammer_alist_free(live, buf_no * HAMMER_FSBUF_MAXBLKS,
1767                                   nelements);
1768                 hammer_adjust_stats(cluster, type, 1);
1769         }
1770 }
1771
1772 /*
1773  * Sync dirty buffers to the media
1774  */
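
/*
 * The sync path below is a three-level RB_SCAN: hammer_sync_hmp() scans
 * the volumes, hammer_sync_volume() scans each volume's clusters, and
 * hammer_sync_cluster() scans each cluster's buffers.  Each level flushes
 * its own hammer_io only after its children have been flushed.
 */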
1775
1776 /*
1777  * Sync the entire filesystem.
1778  */
1779 int
1780 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
1781 {
1782         struct hammer_sync_info info;
1783
1784         info.error = 0;
1785         info.waitfor = waitfor;
1786
1787         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1788                 hammer_sync_volume, &info);
1789         return(info.error);
1790 }
1791
1792 int
1793 hammer_sync_volume(hammer_volume_t volume, void *data)
1794 {
1795         struct hammer_sync_info *info = data;
1796
1797         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
1798                 hammer_sync_cluster, info);
1799         if (hammer_ref_volume(volume) == 0) {
1800                 hammer_io_flush(&volume->io, info);
1801                 hammer_rel_volume(volume, 0);
1802         }
1803         return(0);
1804 }
1805
1806 int
1807 hammer_sync_cluster(hammer_cluster_t cluster, void *data)
1808 {
1809         struct hammer_sync_info *info = data;
1810
1811         if (cluster->state != HAMMER_CLUSTER_IDLE) {
1812                 RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
1813                         hammer_sync_buffer, info);
1814                 if (hammer_ref_cluster(cluster) == 0) {
1815                         hammer_io_flush(&cluster->io, info);
1816                         hammer_rel_cluster(cluster, 0);
1817                 }
1818         }
1819         return(0);
1820 }
1821
1822 int
1823 hammer_sync_buffer(hammer_buffer_t buffer, void *data)
1824 {
1825         struct hammer_sync_info *info = data;
1826
1827         if (hammer_ref_buffer(buffer) == 0) {
1828                 hammer_lock_ex(&buffer->io.lock);
1829                 hammer_flush_buffer_nodes(buffer);
1830                 hammer_unlock(&buffer->io.lock);
1831                 hammer_io_flush(&buffer->io, info);
1832                 hammer_rel_buffer(buffer, 0);
1833         }
1834         return(0);
1835 }
1836
1837 #if 0
1838
/*
 * Flush various tracking structures to disk
 */
1846 void
1847 flush_all_volumes(void)
1848 {
1849         hammer_volume_t vol;
1850
1851         for (vol = VolBase; vol; vol = vol->next)
1852                 flush_volume(vol);
1853 }
1854
1855 void
1856 flush_volume(hammer_volume_t vol)
1857 {
1858         hammer_supercl_t supercl;
1859         hammer_cluster_t cl;
1860
1861         for (supercl = vol->supercl_base; supercl; supercl = supercl->next)
1862                 flush_supercl(supercl);
1863         for (cl = vol->cluster_base; cl; cl = cl->next)
1864                 flush_cluster(cl);
1865         writehammerbuf(vol, vol->ondisk, 0);
1866 }
1867
1868 void
1869 flush_supercl(hammer_supercl_t supercl)
1870 {
1871         int64_t supercl_offset;
1872
1873         supercl_offset = supercl->scl_offset;
1874         writehammerbuf(supercl->volume, supercl->ondisk, supercl_offset);
1875 }
1876
1877 void
1878 flush_cluster(hammer_cluster_t cl)
1879 {
1880         hammer_buffer_t buf;
1881         int64_t cluster_offset;
1882
1883         for (buf = cl->buffer_base; buf; buf = buf->next)
1884                 flush_buffer(buf);
1885         cluster_offset = cl->clu_offset;
1886         writehammerbuf(cl->volume, cl->ondisk, cluster_offset);
1887 }
1888
1889 void
1890 flush_buffer(hammer_buffer_t buf)
1891 {
1892         int64_t buffer_offset;
1893
1894         buffer_offset = buf->buf_offset + buf->cluster->clu_offset;
1895         writehammerbuf(buf->volume, buf->ondisk, buffer_offset);
1896 }
1897
1898 #endif
1899
1900 /*
1901  * Generic buffer initialization
1902  */
1903 static void
1904 initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type)
1905 {
1906         head->buf_type = type;
1907         hammer_alist_init(live);
1908 }
1909
1910 /*
1911  * Calculate the cluster's offset in the volume.  This calculation is
1912  * slightly more complex when using superclusters because superclusters
1913  * are grouped in blocks of 16, followed by 16 x N clusters where N
1914  * is the number of clusters a supercluster can manage.
1915  */
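/*
 * Rough layout of one supercluster group (sketch, where GRP is
 * HAMMER_VOL_SUPERCLUSTER_GROUP and N is HAMMER_SCL_MAXCLUSTERS):
 *
 *	+------------------+------------------------------------+
 *	| GRP supercl bufs | GRP x N clusters of vol_clsize     |
 *	+------------------+------------------------------------+
 *
 * (The group size computed below also reserves an additional GRP
 * buffers of space per group.)
 */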
1916 static int64_t
1917 calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no)
1918 {
1919         int32_t scl_group;
1920         int64_t scl_group_size;
1921         int64_t off;
1922
1923         if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
1924                 scl_group = clu_no / HAMMER_VOL_SUPERCLUSTER_GROUP /
1925                             HAMMER_SCL_MAXCLUSTERS;
1926                 scl_group_size = 
1927                             ((int64_t)HAMMER_BUFSIZE *
1928                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
1929                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
1930                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
1931                 scl_group_size += 
1932                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
1933
1934                 off = volume->cluster_base +
1935                       scl_group * scl_group_size +
1936                       (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
		      ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS *
		       HAMMER_VOL_SUPERCLUSTER_GROUP)) *
		      volume->vol_clsize;
1940         } else {
1941                 off = volume->cluster_base +
1942                       (int64_t)clu_no * volume->vol_clsize;
1943         }
1944         return(off);
1945 }
1946
1947 /*
1948  * Calculate a super-cluster's offset in the volume.
1949  */
1950 static int64_t
1951 calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no)
1952 {
1953         int64_t off;
1954         int32_t scl_group;
1955         int64_t scl_group_size;
1956
1957         KKASSERT (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL);
1958         scl_group = scl_no / HAMMER_VOL_SUPERCLUSTER_GROUP;
1959         if (scl_group) {
1960                 scl_group_size = 
1961                             ((int64_t)HAMMER_BUFSIZE *
1962                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
1963                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
1964                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
1965                 scl_group_size += 
1966                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
1967                 off = volume->cluster_base + (scl_group * scl_group_size) +
1968                       (scl_no % HAMMER_VOL_SUPERCLUSTER_GROUP) * HAMMER_BUFSIZE;
1969         } else {
1970                 off = volume->cluster_base + (scl_no * HAMMER_BUFSIZE);
1971         }
1972         return(off);
1973 }
1974
/*
 * Allocate buffers from the cluster's master A-list.  The scan starts at
 * the caller's iterator hint and wraps around to the far end if no space
 * is found there.  isfwd selects a forward or reverse scan; records are
 * allocated in reverse so they grow towards the other buffer types.
 */
1979 static int32_t
1980 hammer_alloc_master(hammer_cluster_t cluster, int nblks,
1981                     int32_t start, int isfwd)
1982 {
1983         int32_t buf_no;
1984
1985         if (isfwd) {
1986                 buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
1987                                                 nblks, start);
1988                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1989                         buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
1990                                                 nblks, 0);
1991                 }
1992         } else {
1993                 buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
1994                                                 nblks, start);
1995                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1996                         buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
1997                                                 nblks, HAMMER_ALIST_BLOCK_MAX);
1998                 }
1999         }
2000
	/*
	 * XXX TODO: recover space from empty record, b-tree, and data
	 * a-lists (not yet implemented).
	 */
2004
2005         return(buf_no);
2006 }
2007
/*
 * Adjust allocation statistics.  The counters are maintained redundantly
 * at the cluster, volume, and root volume (vol0) levels.
 */
2011 static void
2012 hammer_adjust_stats(hammer_cluster_t cluster, u_int64_t buf_type, int nblks)
2013 {
2014         switch(buf_type) {
2015         case HAMMER_FSBUF_BTREE:
2016                 cluster->ondisk->stat_idx_bufs += nblks;
2017                 cluster->volume->ondisk->vol_stat_idx_bufs += nblks;
2018                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_idx_bufs += nblks;
2019                 break;
2020         case HAMMER_FSBUF_DATA:
2021                 cluster->ondisk->stat_data_bufs += nblks;
2022                 cluster->volume->ondisk->vol_stat_data_bufs += nblks;
2023                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_data_bufs += nblks;
2024                 break;
2025         case HAMMER_FSBUF_RECORDS:
2026                 cluster->ondisk->stat_rec_bufs += nblks;
2027                 cluster->volume->ondisk->vol_stat_rec_bufs += nblks;
2028                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_rec_bufs += nblks;
2029                 break;
2030         }
2031         hammer_modify_cluster(cluster);
2032         hammer_modify_volume(cluster->volume);
2033         hammer_modify_volume(cluster->volume->hmp->rootvol);
2034 }
2035
2036 /*
2037  * A-LIST SUPPORT
2038  *
2039  * Setup the parameters for the various A-lists we use in hammer.  The
2040  * supercluster A-list must be chained to the cluster A-list and cluster
2041  * slave A-lists are chained to buffer A-lists.
2042  *
2043  * See hammer_init_alist_config() below.
2044  */
2045
2046 /*
2047  * A-LIST - cluster recursion into a filesystem buffer
2048  */
2049 static int
2050 buffer_alist_init(void *info, int32_t blk, int32_t radix)
2051 {
2052         return(0);
2053 #if 0
2054         hammer_cluster_t cluster = info;
2055         hammer_buffer_t buffer;
2056         int32_t buf_no;
2057         int error = 0;
2058
2059         /*
2060          * Calculate the buffer number, initialize based on the buffer type.
2061          * The buffer has already been allocated so assert that it has been
2062          * initialized.
2063          */
2064         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2065         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2066         if (buffer) {
2067                 hammer_adjust_stats(cluster, buffer->ondisk->head.buf_type, 1);
2068                 hammer_rel_buffer(buffer, 0);
2069         }
2070         return (error);
2071 #endif
2072 }
2073
2074 static int
2075 buffer_alist_destroy(void *info, int32_t blk, int32_t radix)
2076 {
2077         return(0);
2078 #if 0
2079         hammer_cluster_t cluster = info;
2080         hammer_buffer_t buffer;
2081         int32_t buf_no;
2082         int error = 0;
2083
2084         /*
2085          * Calculate the buffer number, initialize based on the buffer type.
2086          * The buffer has already been allocated so assert that it has been
2087          * initialized.
2088          */
2089         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2090         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2091         if (buffer) {
2092                 hammer_adjust_stats(cluster, buffer->ondisk->head.buf_type, -1);
2093                 hammer_rel_buffer(buffer, 0);
2094         }
2095         return (error);
2096 #endif
2097 }
2098
/*
 * Note: atblk can be negative and atblk - blk can go negative.  atblk is
 * an iterator hint in the slave A-list's global block space while blk is
 * the base block of this buffer's range, so the relative hint goes
 * negative whenever the hint points into an earlier buffer.
 */
2102 static int
2103 buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2104                       int32_t count, int32_t atblk, int32_t *fullp)
2105 {
2106         hammer_cluster_t cluster = info;
2107         hammer_buffer_t buffer;
2108         int32_t buf_no;
2109         int32_t r;
2110         int error = 0;
2111
2112         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2113         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2114         if (buffer) {
2115                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2116
2117                 r = hammer_alist_alloc_fwd(&buffer->alist, count, atblk - blk);
2118                 if (r != HAMMER_ALIST_BLOCK_NONE)
2119                         r += blk;
2120                 *fullp = hammer_alist_isfull(&buffer->alist);
2121                 hammer_modify_buffer(buffer);
2122                 hammer_rel_buffer(buffer, 0);
	} else {
		r = HAMMER_ALIST_BLOCK_NONE;
		*fullp = 0;
	}
2126         return(r);
2127 }
2128
2129 static int
2130 buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2131                       int32_t count, int32_t atblk, int32_t *fullp)
2132 {
2133         hammer_cluster_t cluster = info;
2134         hammer_buffer_t buffer;
2135         int32_t buf_no;
2136         int32_t r;
2137         int error = 0;
2138
2139         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2140         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2141         if (buffer) {
2142                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2143
2144                 r = hammer_alist_alloc_rev(&buffer->alist, count, atblk - blk);
2145                 if (r != HAMMER_ALIST_BLOCK_NONE)
2146                         r += blk;
2147                 *fullp = hammer_alist_isfull(&buffer->alist);
2148                 hammer_modify_buffer(buffer);
2149                 hammer_rel_buffer(buffer, 0);
2150         } else {
2151                 r = HAMMER_ALIST_BLOCK_NONE;
2152                 *fullp = 0;
2153         }
2154         return(r);
2155 }
2156
2157 static void
2158 buffer_alist_free(void *info, int32_t blk, int32_t radix,
2159                  int32_t base_blk, int32_t count, int32_t *emptyp)
2160 {
2161         hammer_cluster_t cluster = info;
2162         hammer_buffer_t buffer;
2163         int32_t buf_no;
2164         int error = 0;
2165
2166         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2167         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2168         if (buffer) {
2169                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2170                 hammer_alist_free(&buffer->alist, base_blk, count);
2171                 *emptyp = hammer_alist_isempty(&buffer->alist);
		/* XXX skip the update if the buffer is completely empty? */
2173                 hammer_modify_buffer(buffer);
2174                 hammer_rel_buffer(buffer, 0);
2175         } else {
2176                 *emptyp = 0;
2177         }
2178 }
2179
2180 static void
2181 buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2182 {
2183 }
2184
2185 /*
2186  * A-LIST - super-cluster recursion into a cluster and cluster recursion
2187  * into a filesystem buffer.  A-List's are mostly self-contained entities,
2188  * but callbacks must be installed to recurse from one A-List to another.
2189  *
2190  * Implementing these callbacks allows us to operate a multi-layered A-List
2191  * as a single entity.
2192  */
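/*
 * For example, an allocation against a cluster's slave A-list recurses
 * roughly as follows (sketch):
 *
 *	hammer_alist_alloc_fwd(&cluster->alist_btree, ...)
 *	    -> bl_radix_alloc_fwd callback == buffer_alist_alloc_fwd()
 *	        -> hammer_alist_alloc_fwd(&buffer->alist, ...)
 *
 * so a single top-level call transparently allocates out of the correct
 * per-buffer A-list.
 */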
2193 static int
2194 super_alist_init(void *info, int32_t blk, int32_t radix)
2195 {
2196         hammer_volume_t volume = info;
2197         hammer_supercl_t supercl;
2198         int32_t scl_no;
2199         int error = 0;
2200
2201         /*
2202          * Calculate the super-cluster number containing the cluster (blk)
2203          * and obtain the super-cluster buffer.
2204          */
2205         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2206         supercl = hammer_get_supercl(volume, scl_no, &error, 1);
2207         if (supercl)
2208                 hammer_rel_supercl(supercl, 0);
2209         return (error);
2210 }
2211
2212 static int
2213 super_alist_destroy(void *info, int32_t blk, int32_t radix)
2214 {
2215         return(0);
2216 }
2217
2218 static int
2219 super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2220                       int32_t count, int32_t atblk, int32_t *fullp)
2221 {
2222         hammer_volume_t volume = info;
2223         hammer_supercl_t supercl;
2224         int32_t scl_no;
2225         int32_t r;
2226         int error = 0;
2227
2228         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2229         supercl = hammer_get_supercl(volume, scl_no, &error, 1);
2230         if (supercl) {
2231                 r = hammer_alist_alloc_fwd(&supercl->alist, count, atblk - blk);
2232                 if (r != HAMMER_ALIST_BLOCK_NONE)
2233                         r += blk;
2234                 *fullp = hammer_alist_isfull(&supercl->alist);
2235                 hammer_modify_supercl(supercl);
2236                 hammer_rel_supercl(supercl, 0);
2237         } else {
2238                 r = HAMMER_ALIST_BLOCK_NONE;
2239                 *fullp = 0;
2240         }
2241         return(r);
2242 }
2243
2244 static int
2245 super_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2246                       int32_t count, int32_t atblk, int32_t *fullp)
2247 {
2248         hammer_volume_t volume = info;
2249         hammer_supercl_t supercl;
2250         int32_t scl_no;
2251         int32_t r;
2252         int error = 0;
2253
2254         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2255         supercl = hammer_get_supercl(volume, scl_no, &error, 1);
2256         if (supercl) {
2257                 r = hammer_alist_alloc_rev(&supercl->alist, count, atblk - blk);
2258                 if (r != HAMMER_ALIST_BLOCK_NONE)
2259                         r += blk;
2260                 *fullp = hammer_alist_isfull(&supercl->alist);
2261                 hammer_modify_supercl(supercl);
2262                 hammer_rel_supercl(supercl, 0);
2263         } else { 
2264                 r = HAMMER_ALIST_BLOCK_NONE;
2265                 *fullp = 0;
2266         }
2267         return(r);
2268 }
2269
2270 static void
2271 super_alist_free(void *info, int32_t blk, int32_t radix,
2272                  int32_t base_blk, int32_t count, int32_t *emptyp)
2273 {
2274         hammer_volume_t volume = info;
2275         hammer_supercl_t supercl;
2276         int32_t scl_no;
2277         int error = 0;
2278
2279         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2280         supercl = hammer_get_supercl(volume, scl_no, &error, 1);
2281         if (supercl) {
2282                 hammer_alist_free(&supercl->alist, base_blk, count);
2283                 *emptyp = hammer_alist_isempty(&supercl->alist);
2284                 hammer_modify_supercl(supercl);
2285                 hammer_rel_supercl(supercl, 0);
2286         } else {
2287                 *emptyp = 0;
2288         }
2289 }
2290
2291 static void
2292 super_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2293 {
2294 }
2295
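/*
 * Initialize the global A-list templates.  A sketch of the argument
 * convention, assuming hammer_alist_template(config, blocks, base_radix,
 * maxmeta): blocks is the total number of blocks the list manages,
 * base_radix is the number of blocks covered by one base-layer element
 * (1 for a self-contained list, or the chained list's block count for
 * layered lists), and maxmeta is the number of meta-elements reserved.
 */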
2296 void
2297 hammer_init_alist_config(void)
2298 {
2299         hammer_alist_config_t config;
2300
2301         hammer_alist_template(&Buf_alist_config, HAMMER_FSBUF_MAXBLKS,
2302                               1, HAMMER_FSBUF_METAELMS);
2303         hammer_alist_template(&Vol_normal_alist_config, HAMMER_VOL_MAXCLUSTERS,
2304                               1, HAMMER_VOL_METAELMS_1LYR);
2305         hammer_alist_template(&Vol_super_alist_config,
2306                           HAMMER_VOL_MAXSUPERCLUSTERS * HAMMER_SCL_MAXCLUSTERS,
2307                               HAMMER_SCL_MAXCLUSTERS, HAMMER_VOL_METAELMS_2LYR);
2308         hammer_alist_template(&Supercl_alist_config, HAMMER_VOL_MAXCLUSTERS,
2309                               1, HAMMER_SUPERCL_METAELMS);
2310         hammer_alist_template(&Clu_master_alist_config, HAMMER_CLU_MAXBUFFERS,
2311                               1, HAMMER_CLU_MASTER_METAELMS);
2312         hammer_alist_template(&Clu_slave_alist_config,
2313                               HAMMER_CLU_MAXBUFFERS * HAMMER_FSBUF_MAXBLKS,
2314                               HAMMER_FSBUF_MAXBLKS, HAMMER_CLU_SLAVE_METAELMS);
2315
2316         config = &Vol_super_alist_config;
2317         config->bl_radix_init = super_alist_init;
2318         config->bl_radix_destroy = super_alist_destroy;
2319         config->bl_radix_alloc_fwd = super_alist_alloc_fwd;
2320         config->bl_radix_alloc_rev = super_alist_alloc_rev;
2321         config->bl_radix_free = super_alist_free;
2322         config->bl_radix_print = super_alist_print;
2323
2324         config = &Clu_slave_alist_config;
2325         config->bl_radix_init = buffer_alist_init;
2326         config->bl_radix_destroy = buffer_alist_destroy;
2327         config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd;
2328         config->bl_radix_alloc_rev = buffer_alist_alloc_rev;
2329         config->bl_radix_free = buffer_alist_free;
2330         config->bl_radix_print = buffer_alist_print;
2331 }
2332