1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.27 2008/02/06 08:59:28 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_supercl(hammer_supercl_t supercl,
51                         hammer_alloc_state_t isnew);
52 static int hammer_load_cluster(hammer_cluster_t cluster, int getflags);
53 static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type);
54 static int hammer_load_node(hammer_node_t node);
55 static void alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type,
56                         hammer_alist_t live,
57                         int32_t start, int *errorp,
58                         struct hammer_buffer **bufferp);
59 #if 0
60 static void readhammerbuf(hammer_volume_t vol, void *data,
61                         int64_t offset);
62 static void writehammerbuf(hammer_volume_t vol, const void *data,
63                         int64_t offset);
64 #endif
65 static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no);
66 static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no);
67 static int32_t hammer_alloc_master(hammer_cluster_t cluster, int nblks,
68                         int32_t start, int isfwd);
69 static void hammer_adjust_stats(hammer_cluster_t cluster,
70                         u_int64_t buf_type, int nblks);
71
72 struct hammer_alist_config Buf_alist_config;
73 struct hammer_alist_config Vol_normal_alist_config;
74 struct hammer_alist_config Vol_super_alist_config;
75 struct hammer_alist_config Supercl_alist_config;
76 struct hammer_alist_config Clu_master_alist_config;
77 struct hammer_alist_config Clu_slave_alist_config;
78
79 /*
80  * Red-Black tree support for various structures
81  */
82 static int
83 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
84 {
85         if (ip1->obj_id < ip2->obj_id)
86                 return(-1);
87         if (ip1->obj_id > ip2->obj_id)
88                 return(1);
89         if (ip1->obj_asof < ip2->obj_asof)
90                 return(-1);
91         if (ip1->obj_asof > ip2->obj_asof)
92                 return(1);
93         return(0);
94 }
95
96 static int
97 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
98 {
99         if (info->obj_id < ip->obj_id)
100                 return(-1);
101         if (info->obj_id > ip->obj_id)
102                 return(1);
103         if (info->obj_asof < ip->obj_asof)
104                 return(-1);
105         if (info->obj_asof > ip->obj_asof)
106                 return(1);
107         return(0);
108 }
109
110 static int
111 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
112 {
113         if (vol1->vol_no < vol2->vol_no)
114                 return(-1);
115         if (vol1->vol_no > vol2->vol_no)
116                 return(1);
117         return(0);
118 }
119
120 static int
121 hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2)
122 {
123         if (cl1->scl_no < cl2->scl_no)
124                 return(-1);
125         if (cl1->scl_no > cl2->scl_no)
126                 return(1);
127         return(0);
128 }
129
130 static int
131 hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2)
132 {
133         if (cl1->clu_no < cl2->clu_no)
134                 return(-1);
135         if (cl1->clu_no > cl2->clu_no)
136                 return(1);
137         return(0);
138 }
139
140 static int
141 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
142 {
143         if (buf1->buf_no < buf2->buf_no)
144                 return(-1);
145         if (buf1->buf_no > buf2->buf_no)
146                 return(1);
147         return(0);
148 }
149
150 static int
151 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
152 {
153         if (node1->node_offset < node2->node_offset)
154                 return(-1);
155         if (node1->node_offset > node2->node_offset)
156                 return(1);
157         return(0);
158 }
159
160 /*
161  * Note: The lookup function for hammer_ino_rb_tree winds up being named
162  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
163  * functions are normal, e.g. hammer_clu_rb_tree_RB_LOOKUP(root, clu_no).
164  */
165 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
166 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
167                 hammer_inode_info_cmp, hammer_inode_info_t);
168 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
169              hammer_vol_rb_compare, int32_t, vol_no);
170 RB_GENERATE2(hammer_scl_rb_tree, hammer_supercl, rb_node,
171              hammer_scl_rb_compare, int32_t, scl_no);
172 RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node,
173              hammer_clu_rb_compare, int32_t, clu_no);
174 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
175              hammer_buf_rb_compare, int32_t, buf_no);
176 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
177              hammer_nod_rb_compare, int32_t, node_offset);
178
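/*
 * Illustrative lookups against the generated trees (a sketch only, not
 * called from this file; the rb_inos_root field name is assumed from
 * hammer.h):
 *
 *	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &info);
 *	volume = hammer_vol_rb_tree_RB_LOOKUP(&hmp->rb_vols_root, vol_no);
 *	cluster = hammer_clu_rb_tree_RB_LOOKUP(&volume->rb_clus_root, clu_no);
 */
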
179 /************************************************************************
180  *                              VOLUMES                                 *
181  ************************************************************************
182  *
183  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
184  * code on failure.  Volumes must be loaded at mount time; get_volume() will
185  * not load a new volume.
186  *
187  * Calls made to hammer_load_volume() are single-threaded or interlocked.
188  */
189 int
190 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
191 {
192         struct mount *mp;
193         hammer_volume_t volume;
194         struct hammer_volume_ondisk *ondisk;
195         struct nlookupdata nd;
196         struct buf *bp = NULL;
197         int error;
198         int ronly;
199
200         mp = hmp->mp;
201         ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
202
203         /*
204          * Allocate a volume structure
205          */
206         ++hammer_count_volumes;
207         volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
208         volume->vol_name = kstrdup(volname, M_HAMMER);
209         volume->hmp = hmp;
210         hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
211         volume->io.offset = 0LL;
212
213         /*
214          * Get the device vnode
215          */
216         error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
217         if (error == 0)
218                 error = nlookup(&nd);
219         if (error == 0)
220                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
221         nlookup_done(&nd);
222         if (error == 0) {
223                 if (vn_isdisk(volume->devvp, &error)) {
224                         error = vfs_mountedon(volume->devvp);
225                 }
226         }
227         if (error == 0 &&
228             count_udev(volume->devvp->v_umajor, volume->devvp->v_uminor) > 0) {
229                 error = EBUSY;
230         }
231         if (error == 0) {
232                 vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
233                 error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
234                 if (error == 0) {
235                         error = VOP_OPEN(volume->devvp, 
236                                          (ronly ? FREAD : FREAD|FWRITE),
237                                          FSCRED, NULL);
238                 }
239                 vn_unlock(volume->devvp);
240         }
241         if (error) {
242                 hammer_free_volume(volume);
243                 return(error);
244         }
245         volume->devvp->v_rdev->si_mountpoint = mp;
246
247         /*
248          * Extract the volume number from the volume header and do various
249          * sanity checks.
250          */
251         error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
252         if (error)
253                 goto late_failure;
254         ondisk = (void *)bp->b_data;
255         if (ondisk->head.buf_type != HAMMER_FSBUF_VOLUME) {
256                 kprintf("hammer_mount: volume %s has an invalid header\n",
257                         volume->vol_name);
258                 error = EFTYPE;
259                 goto late_failure;
260         }
261         volume->vol_no = ondisk->vol_no;
262         volume->cluster_base = ondisk->vol_clo_beg;
263         volume->vol_clsize = ondisk->vol_clsize;
264         volume->vol_flags = ondisk->vol_flags;
265         volume->nblocks = ondisk->vol_nblocks; 
266         RB_INIT(&volume->rb_clus_root);
267         RB_INIT(&volume->rb_scls_root);
268
269         hmp->mp->mnt_stat.f_blocks += volume->nblocks;
270
271         if (RB_EMPTY(&hmp->rb_vols_root)) {
272                 hmp->fsid = ondisk->vol_fsid;
273         } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
274                 kprintf("hammer_mount: volume %s's fsid does not match "
275                         "other volumes\n", volume->vol_name);
276                 error = EFTYPE;
277                 goto late_failure;
278         }
279
280         /*
281          * Insert the volume structure into the red-black tree.
282          */
283         if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
284                 kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
285                         volume->vol_name, volume->vol_no);
286                 error = EEXIST;
287         }
288
289         /*
290          * Set the root volume and load the root cluster.  HAMMER special
291          * cases rootvol and rootcl and will not deallocate the structures.
292          * We do not hold a ref because this would prevent related I/O
293          * from being flushed.
294          */
295         if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
296                 hmp->rootvol = volume;
297                 if (bp) {
298                         brelse(bp);
299                         bp = NULL;
300                 }
301                 hammer_ref_volume(volume);
302                 hmp->rootcl = hammer_get_cluster(volume,
303                                                  ondisk->vol0_root_clu_no,
304                                                  &error, GET_CLUSTER_NORECOVER);
305                 hammer_rel_cluster(hmp->rootcl, 0);
306                 hammer_rel_volume(volume, 0);
307                 hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
308         }
309 late_failure:
310         if (bp)
311                 brelse(bp);
312         if (error) {
313                 /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
314                 VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
315                 hammer_free_volume(volume);
316         }
317         return (error);
318 }
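
/*
 * Minimal mount-time usage sketch (hypothetical caller; the volume name
 * array and count below are illustrative and not defined in this file):
 *
 *	for (i = 0; error == 0 && i < nvolumes; ++i)
 *		error = hammer_install_volume(hmp, volnames[i]);
 */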
319
320 /*
321  * Unload and free a HAMMER volume.  Called from RB_SCAN; must return >= 0
322  * for the scan to continue.
323  */
324 int
325 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
326 {
327         struct hammer_mount *hmp = volume->hmp;
328         hammer_cluster_t rootcl;
329         int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
330
331         /*
332          * Sync clusters, sync volume
333          */
334
335         hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
336
337         /*
338          * Clean up the root cluster, which is held unlocked in the root
339          * volume.
340          */
341         if (hmp->rootvol == volume) {
342                 if ((rootcl = hmp->rootcl) != NULL)
343                         hmp->rootcl = NULL;
344                 hmp->rootvol = NULL;
345         }
346
347         /*
348          * Unload clusters and super-clusters.  Unloading a super-cluster
349          * also unloads related clusters, but the filesystem may not be
350          * using super-clusters so unload clusters anyway.
351          */
352         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
353                         hammer_unload_cluster, NULL);
354         RB_SCAN(hammer_scl_rb_tree, &volume->rb_scls_root, NULL,
355                         hammer_unload_supercl, NULL);
356         hammer_io_waitdep(&volume->io);
357
358         /*
359          * Release our buffer and flush anything left in the buffer cache.
360          */
361         hammer_io_release(&volume->io, 2);
362
363         /*
364          * There should be no references on the volume, no clusters, and
365          * no super-clusters.
366          */
367         KKASSERT(volume->io.lock.refs == 0);
368         KKASSERT(RB_EMPTY(&volume->rb_clus_root));
369         KKASSERT(RB_EMPTY(&volume->rb_scls_root));
370
371         volume->ondisk = NULL;
372         if (volume->devvp) {
373                 if (ronly) {
374                         vinvalbuf(volume->devvp, 0, 0, 0);
375                         VOP_CLOSE(volume->devvp, FREAD);
376                 } else {
377                         vinvalbuf(volume->devvp, V_SAVE, 0, 0);
378                         VOP_CLOSE(volume->devvp, FREAD|FWRITE);
379                 }
380         }
381
382         /*
383          * Destroy the structure
384          */
385         RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
386         hammer_free_volume(volume);
387         return(0);
388 }
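
/*
 * Sketch of how the unmount path is expected to drive hammer_unload_volume()
 * via the volume RB tree (the actual caller lives outside this file):
 *
 *	RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
 *		hammer_unload_volume, NULL);
 */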
389
390 static
391 void
392 hammer_free_volume(hammer_volume_t volume)
393 {
394         if (volume->vol_name) {
395                 kfree(volume->vol_name, M_HAMMER);
396                 volume->vol_name = NULL;
397         }
398         if (volume->devvp) {
399                 if (vn_isdisk(volume->devvp, NULL) &&
400                     volume->devvp->v_rdev &&
401                     volume->devvp->v_rdev->si_mountpoint == volume->hmp->mp
402                 ) {
403                         volume->devvp->v_rdev->si_mountpoint = NULL;
404                 }
405                 vrele(volume->devvp);
406                 volume->devvp = NULL;
407         }
408         --hammer_count_volumes;
409         kfree(volume, M_HAMMER);
410 }
411
412 /*
413  * Get a HAMMER volume.  The volume must already exist.
414  */
415 hammer_volume_t
416 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
417 {
418         struct hammer_volume *volume;
419
420         /*
421          * Locate the volume structure
422          */
423         volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
424         if (volume == NULL) {
425                 *errorp = ENOENT;
426                 return(NULL);
427         }
428         hammer_ref(&volume->io.lock);
429
430         /*
431          * Deal with on-disk info
432          */
433         if (volume->ondisk == NULL || volume->io.loading) {
434                 *errorp = hammer_load_volume(volume);
435                 if (*errorp) {
436                         hammer_rel_volume(volume, 1);
437                         volume = NULL;
438                 }
439         } else {
440                 *errorp = 0;
441         }
442         return(volume);
443 }
444
445 int
446 hammer_ref_volume(hammer_volume_t volume)
447 {
448         int error;
449
450         hammer_ref(&volume->io.lock);
451
452         /*
453          * Deal with on-disk info
454          */
455         if (volume->ondisk == NULL || volume->io.loading) {
456                 error = hammer_load_volume(volume);
457                 if (error)
458                         hammer_rel_volume(volume, 1);
459         } else {
460                 error = 0;
461         }
462         return (error);
463 }
464
465 hammer_volume_t
466 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
467 {
468         hammer_volume_t volume;
469
470         volume = hmp->rootvol;
471         KKASSERT(volume != NULL);
472         hammer_ref(&volume->io.lock);
473
474         /*
475          * Deal with on-disk info
476          */
477         if (volume->ondisk == NULL || volume->io.loading) {
478                 *errorp = hammer_load_volume(volume);
479                 if (*errorp) {
480                         hammer_rel_volume(volume, 1);
481                         volume = NULL;
482                 }
483         } else {
484                 *errorp = 0;
485         }
486         return (volume);
487 }
488
489 /*
490  * Load a volume's on-disk information.  The volume must be referenced and
491  * not locked.  We temporarily acquire an exclusive lock to interlock
492  * against releases or multiple gets.
493  */
494 static int
495 hammer_load_volume(hammer_volume_t volume)
496 {
497         struct hammer_volume_ondisk *ondisk;
498         int error;
499
500         hammer_lock_ex(&volume->io.lock);
501         KKASSERT(volume->io.loading == 0);
502         volume->io.loading = 1;
503
504         if (volume->ondisk == NULL) {
505                 error = hammer_io_read(volume->devvp, &volume->io);
506                 if (error) {
507                         volume->io.loading = 0;
508                         hammer_unlock(&volume->io.lock);
509                         return (error);
510                 }
511                 volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
512
513                 /*
514                  * Configure the volume's A-lists.  These are used to
515                  * allocate clusters.
516                  */
517                 if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
518                         volume->alist.config = &Vol_super_alist_config;
519                         volume->alist.meta = ondisk->vol_almeta.super;
520                         volume->alist.info = volume;
521                 } else {
522                         volume->alist.config = &Vol_normal_alist_config;
523                         volume->alist.meta = ondisk->vol_almeta.normal;
524                         volume->alist.info = NULL;
525                 }
526         } else {
527                 error = 0;
528         }
529         volume->io.loading = 0;
530         hammer_unlock(&volume->io.lock);
531         return(0);
532 }
533
534 /*
535  * Release a volume.  Call hammer_io_release on the last reference.  We have
536  * to acquire an exclusive lock to interlock against volume->ondisk tests
537  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
538  * lock to be held.
539  *
540  * Volumes are not unloaded from memory during normal operation.
541  */
542 void
543 hammer_rel_volume(hammer_volume_t volume, int flush)
544 {
545         if (volume->io.lock.refs == 1) {
546                 hammer_lock_ex(&volume->io.lock);
547                 if (volume->io.lock.refs == 1) {
548                         volume->ondisk = NULL;
549                         hammer_io_release(&volume->io, flush);
550                 } else if (flush) {
551                         hammer_io_flush(&volume->io);
552                 }
553                 hammer_unlock(&volume->io.lock);
554         }
555         hammer_unref(&volume->io.lock);
556 }
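
/*
 * Typical volume reference pattern (illustrative sketch):
 *
 *	volume = hammer_get_volume(hmp, vol_no, &error);
 *	if (volume) {
 *		...access volume->ondisk...
 *		hammer_rel_volume(volume, 0);
 *	}
 */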
557
558 /************************************************************************
559  *                              SUPER-CLUSTERS                          *
560  ************************************************************************
561  *
562  * Manage super-clusters.  Note that a supercl holds a reference to its
563  * associated volume.
564  */
565 static int
566 hammer_find_supercl(hammer_volume_t volume, int32_t scl_no)
567 {
568         if (RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no))
569                 return(1);
570         return(0);
571 }
572
573 hammer_supercl_t
574 hammer_get_supercl(hammer_volume_t volume, int32_t scl_no,
575                    int *errorp, hammer_alloc_state_t isnew)
576 {
577         hammer_supercl_t supercl;
578
579         /*
580          * Locate and lock the super-cluster structure, creating one
581          * if necessary.
582          */
583 again:
584         supercl = RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no);
585         if (supercl == NULL) {
586                 ++hammer_count_supercls;
587                 supercl = kmalloc(sizeof(*supercl), M_HAMMER, M_WAITOK|M_ZERO);
588                 supercl->scl_no = scl_no;
589                 supercl->volume = volume;
590                 supercl->io.offset = calculate_supercl_offset(volume, scl_no);
591                 hammer_io_init(&supercl->io, HAMMER_STRUCTURE_SUPERCL);
592                 hammer_ref(&supercl->io.lock);
593
594                 /*
595                  * Insert the super-cluster into the RB tree and handle late
596                  * collisions.
597                  */
598                 if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) {
599                         hammer_unref(&supercl->io.lock);
600                         --hammer_count_supercls;
601                         kfree(supercl, M_HAMMER);
602                         goto again;
603                 }
604                 hammer_ref(&volume->io.lock);
605         } else {
606                 hammer_ref(&supercl->io.lock);
607         }
608
609         /*
610          * Deal with on-disk info
611          */
612         if (supercl->ondisk == NULL || isnew || supercl->io.loading) {
613                 *errorp = hammer_load_supercl(supercl, isnew);
614                 if (*errorp) {
615                         hammer_rel_supercl(supercl, 1);
616                         supercl = NULL;
617                 }
618         } else {
619                 *errorp = 0;
620         }
621         return(supercl);
622 }
623
624 static int
625 hammer_load_supercl(hammer_supercl_t supercl, hammer_alloc_state_t isnew)
626 {
627         struct hammer_supercl_ondisk *ondisk;
628         hammer_volume_t volume = supercl->volume;
629         int error;
630         int64_t nclusters;
631
632         hammer_lock_ex(&supercl->io.lock);
633         KKASSERT(supercl->io.loading == 0);
634         supercl->io.loading = 1;
635
636         if (supercl->ondisk == NULL) {
637                 if (isnew)
638                         error = hammer_io_new(volume->devvp, &supercl->io);
639                 else
640                         error = hammer_io_read(volume->devvp, &supercl->io);
641                 if (error) {
642                         supercl->io.loading = 0;
643                         hammer_unlock(&supercl->io.lock);
644                         return (error);
645                 }
646                 supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data;
647
648                 supercl->alist.config = &Supercl_alist_config;
649                 supercl->alist.meta = ondisk->scl_meta;
650                 supercl->alist.info = NULL;
651         } else if (isnew) {
652                 error = hammer_io_new(volume->devvp, &supercl->io);
653         } else {
654                 error = 0;
655         }
656         if (error == 0 && isnew) {
657                 /*
658                  * If this is a new super-cluster we have to initialize
659                  * various ondisk structural elements.  The caller is
660                  * responsible for the remainder.
661                  */
662                 struct hammer_alist_live dummy;
663
664                 hammer_modify_supercl(supercl);
665
666                 ondisk = supercl->ondisk;
667                 dummy.config = &Buf_alist_config;
668                 dummy.meta = ondisk->head.buf_almeta;
669                 dummy.info = NULL;
670                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL);
671
672                 nclusters = volume->ondisk->vol_nclusters -
673                             ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS);
674                 KKASSERT(nclusters > 0);
675                 if (nclusters > HAMMER_SCL_MAXCLUSTERS)
676                         nclusters = HAMMER_SCL_MAXCLUSTERS;
677                 hammer_alist_init(&supercl->alist, 0, (int32_t)nclusters,
678                                   isnew);
679         }
680         supercl->io.loading = 0;
681         hammer_unlock(&supercl->io.lock);
682         return (error);
683 }
684
685 /*
686  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
687  */
688 int
689 hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused)
690 {
691         KKASSERT(supercl->io.lock.refs == 0);
692         hammer_ref(&supercl->io.lock);
693         hammer_rel_supercl(supercl, 2);
694         return(0);
695 }
696
697 /*
698  * Release a super-cluster.  We have to deal with several places where
699  * another thread can ref the super-cluster.
700  *
701  * Only destroy the structure itself if the related buffer cache buffer
702  * was disassociated from it.  This ties the management of the structure
703  * to the buffer cache subsystem.
704  */
705 void
706 hammer_rel_supercl(hammer_supercl_t supercl, int flush)
707 {
708         hammer_volume_t volume;
709
710         if (supercl->io.lock.refs == 1) {
711                 hammer_lock_ex(&supercl->io.lock);
712                 if (supercl->io.lock.refs == 1) {
713                         hammer_io_release(&supercl->io, flush);
714                         if (supercl->io.bp == NULL &&
715                             supercl->io.lock.refs == 1) {
716                                 volume = supercl->volume;
717                                 RB_REMOVE(hammer_scl_rb_tree,
718                                           &volume->rb_scls_root, supercl);
719                                 supercl->volume = NULL; /* sanity */
720                                 --hammer_count_supercls;
721                                 kfree(supercl, M_HAMMER);
722                                 hammer_rel_volume(volume, 0);
723                                 return;
724                         }
725                 } else if (flush) {
726                         hammer_io_flush(&supercl->io);
727                 }
728                 hammer_unlock(&supercl->io.lock);
729         }
730         hammer_unref(&supercl->io.lock);
731 }
732
733 /************************************************************************
734  *                              CLUSTERS                                *
735  ************************************************************************
736  *
737  */
738 hammer_cluster_t
739 hammer_get_cluster(hammer_volume_t volume, int32_t clu_no,
740                    int *errorp, int getflags)
741 {
742         hammer_cluster_t cluster;
743
744 again:
745         cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no);
746         if (cluster == NULL) {
747                 ++hammer_count_clusters;
748                 cluster = kmalloc(sizeof(*cluster), M_HAMMER, M_WAITOK|M_ZERO);
749                 cluster->clu_no = clu_no;
750                 cluster->volume = volume;
751                 RB_INIT(&cluster->rb_bufs_root);
752                 RB_INIT(&cluster->rb_nods_root);
753                 hammer_io_init(&cluster->io, HAMMER_STRUCTURE_CLUSTER);
754                 cluster->io.offset = calculate_cluster_offset(volume, clu_no);
755                 hammer_ref(&cluster->io.lock);
756                 /* NOTE: cluster->io.validated expected to be 0 */
757
758                 /*
759                  * Insert the cluster into the RB tree and handle late
760                  * collisions.
761                  */
762                 if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) {
763                         hammer_unref(&cluster->io.lock);
764                         --hammer_count_clusters;
765                         kfree(cluster, M_HAMMER);
766                         goto again;
767                 }
768                 hammer_ref(&volume->io.lock);
769         } else {
770                 hammer_ref(&cluster->io.lock);
771         }
772
773         /*
774          * Deal with on-disk info
775          */
776         if (cluster->ondisk == NULL || getflags || cluster->io.validated == 0) {
777                 *errorp = hammer_load_cluster(cluster, getflags);
778                 if (*errorp) {
779                         hammer_rel_cluster(cluster, 1);
780                         cluster = NULL;
781                 }
782         } else {
783                 *errorp = 0;
784         }
785         return (cluster);
786 }
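
/*
 * Illustrative getflags usage for hammer_get_cluster() (sketch only):
 *
 *	hammer_get_cluster(volume, clu_no, &error, 0)
 *		normal lookup; recovery may run if the cluster was left open.
 *	hammer_get_cluster(volume, clu_no, &error, GET_CLUSTER_NEW)
 *		format a newly allocated cluster instead of reading it.
 *	hammer_get_cluster(volume, clu_no, &error, GET_CLUSTER_NORECOVER)
 *		suppress automatic recovery, e.g. to avoid recursion.
 */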
787
788 hammer_cluster_t
789 hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp)
790 {
791         hammer_cluster_t cluster;
792
793         cluster = hmp->rootcl;
794         KKASSERT(cluster != NULL);
795         hammer_ref(&cluster->io.lock);
796
797         /*
798          * Deal with on-disk info
799          */
800         if (cluster->ondisk == NULL || cluster->io.validated == 0) {
801                 *errorp = hammer_load_cluster(cluster, 0);
802                 if (*errorp) {
803                         hammer_rel_cluster(cluster, 1);
804                         cluster = NULL;
805                 }
806         } else {
807                 *errorp = 0;
808         }
809         return (cluster);
810 }
811
812 static
813 int
814 hammer_load_cluster(hammer_cluster_t cluster, int getflags)
815 {
816         hammer_volume_t volume = cluster->volume;
817         struct hammer_cluster_ondisk *ondisk;
818         int error;
819
820         hammer_lock_ex(&cluster->io.lock);
821         KKASSERT(cluster->io.loading == 0);
822         cluster->io.loading = 1;
823
824         if (cluster->ondisk == NULL) {
825                 KKASSERT(TAILQ_EMPTY(&cluster->io.deplist));
826
827                 /*
828                  * Unmodified buffers may be present, indicating that we
829                  * had already validated the cluster even though we no longer
830                  * have its ondisk info.
831                  */
832                 if (!RB_EMPTY(&cluster->rb_bufs_root))
833                         KKASSERT(cluster->io.validated);
834                 if (getflags & GET_CLUSTER_NEW)
835                         error = hammer_io_new(volume->devvp, &cluster->io);
836                 else
837                         error = hammer_io_read(volume->devvp, &cluster->io);
838                 if (error) {
839                         cluster->io.loading = 0;
840                         hammer_unlock(&cluster->io.lock);
841                         return (error);
842                 }
843                 cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data;
844
845                 cluster->alist_master.config = &Clu_master_alist_config;
846                 cluster->alist_master.meta = ondisk->clu_master_meta;
847                 cluster->alist_btree.config = &Clu_slave_alist_config;
848                 cluster->alist_btree.meta = ondisk->clu_btree_meta;
849                 cluster->alist_btree.info = cluster;
850                 cluster->alist_record.config = &Clu_slave_alist_config;
851                 cluster->alist_record.meta = ondisk->clu_record_meta;
852                 cluster->alist_record.info = cluster;
853                 cluster->alist_mdata.config = &Clu_slave_alist_config;
854                 cluster->alist_mdata.meta = ondisk->clu_mdata_meta;
855                 cluster->alist_mdata.info = cluster;
856
857                 if ((getflags & GET_CLUSTER_NEW) == 0) {
858                         /*
859                          * Load cluster range info for easy access
860                          */
861                         cluster->clu_btree_beg = ondisk->clu_btree_beg;
862                         cluster->clu_btree_end = ondisk->clu_btree_end;
863                 }
864         } else if (getflags & GET_CLUSTER_NEW) {
865                 error = hammer_io_new(volume->devvp, &cluster->io);
866         } else {
867                 error = 0;
868         }
869         if (error == 0 && (getflags & GET_CLUSTER_NEW)) {
870                 /*
871                  * If this is a new cluster we have to initialize
872                  * various ondisk structural elements.  The caller is
873                  * responsible for the remainder.
874                  */
875                 struct hammer_alist_live dummy;
876                 hammer_node_t croot;
877                 hammer_volume_ondisk_t voldisk;
878                 int32_t nbuffers;
879
880                 cluster->flags &= ~HAMMER_CLUSTER_DELETED;
881
882                 hammer_modify_cluster(cluster);
883                 ondisk = cluster->ondisk;
884                 voldisk = volume->ondisk;
885
886                 dummy.config = &Buf_alist_config;
887                 dummy.meta = ondisk->head.buf_almeta;
888                 dummy.info = NULL;
889                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER);
890
891                 ondisk->vol_fsid = voldisk->vol_fsid;
892                 ondisk->vol_fstype = voldisk->vol_fstype;
893                 ondisk->clu_gen = 1;
894                 ondisk->clu_id = 0;     /* XXX */
895                 ondisk->clu_no = cluster->clu_no;
896                 ondisk->clu_flags = 0;
897                 ondisk->clu_start = HAMMER_BUFSIZE;
898                 ondisk->synchronized_rec_id = 1; /* XXX timestamp */
899                 KKASSERT(voldisk->vol_clo_end > cluster->io.offset);
900                 if (voldisk->vol_clo_end - cluster->io.offset >
901                     voldisk->vol_clsize) {
902                         ondisk->clu_limit = voldisk->vol_clsize;
903                 } else {
904                         ondisk->clu_limit = (int32_t)(voldisk->vol_clo_end -
905                                                       cluster->io.offset);
906                 }
907                 nbuffers = ondisk->clu_limit / HAMMER_BUFSIZE;
908                 hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
909                                   HAMMER_ASTATE_FREE);
910                 hammer_alist_init(&cluster->alist_btree,
911                                   HAMMER_FSBUF_MAXBLKS,
912                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
913                                   HAMMER_ASTATE_ALLOC);
914                 hammer_alist_init(&cluster->alist_record,
915                                   HAMMER_FSBUF_MAXBLKS,
916                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
917                                   HAMMER_ASTATE_ALLOC);
918                 hammer_alist_init(&cluster->alist_mdata,
919                                   HAMMER_FSBUF_MAXBLKS,
920                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
921                                   HAMMER_ASTATE_ALLOC);
922
923                 ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
924                 ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
925                 ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;
926
927                 /*
928                  * Initialize the B-Tree.  We don't know what the caller
929                  * intends to do with the cluster so make sure it causes
930                  * an assertion if the caller makes no changes.
931                  */
932                 ondisk->clu_btree_parent_vol_no = -2;
933                 ondisk->clu_btree_parent_clu_no = -2;
934                 ondisk->clu_btree_parent_offset = -2;
935                 ondisk->clu_btree_parent_clu_gen = -2;
936
937                 croot = hammer_alloc_btree(cluster, &error);
938                 if (error == 0) {
939                         hammer_modify_node(croot);
940                         bzero(croot->ondisk, sizeof(*croot->ondisk));
941                         croot->ondisk->count = 0;
942                         croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
943                         hammer_modify_cluster(cluster);
944                         ondisk->clu_btree_root = croot->node_offset;
945                         hammer_rel_node(croot);
946                 }
947
948                 /*
949                  * We just formatted this cluster, don't try to recover it!
950                  */
951                 cluster->io.validated = 1;
952         }
953
954         /*
955          * If no error occurred, handle automatic cluster recovery unless
956          * the NORECOVER flag is passed (prevents recovery recursions) or
957          * the cluster has been flagged for deletion (prevents an attempt
958          * to recover a cluster which is no longer hooked into the tree).
959          *
960          * Setting hammer_debug_recover to 1 will force recovery on load
961          * whether or not the cluster is marked open.
962          *
963          * Setting hammer_debug_recover to -1 will force NO recovery
964          * regardless of state.
965          *
966          * io.validated can only be cleared if the buffer RB list is empty,
967          * preventing us from trying to recover an actively referenced
968          * cluster (which would blow the filesystem to smithereens).
969          */
970         if (error == 0 && cluster->io.validated == 0) {
971                 if ((getflags & GET_CLUSTER_NORECOVER) == 0 &&
972                     (cluster->flags & HAMMER_CLUSTER_DELETED) == 0) {
973                         if ((cluster->ondisk->clu_flags & HAMMER_CLUF_OPEN) ||
974                             hammer_debug_recover > 0) {
975                                 if (hammer_debug_recover >= 0)
976                                         hammer_recover(cluster);
977                         }
978                         cluster->io.validated = 1;
979                 } else if ((cluster->ondisk->clu_flags & HAMMER_CLUF_OPEN)==0) {
980                         cluster->io.validated = 1;
981                 }
982         }
983         cluster->io.loading = 0;
984         hammer_unlock(&cluster->io.lock);
985         return (error);
986 }
987
988 /*
989  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
990  */
991 int
992 hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused)
993 {
994         hammer_ref(&cluster->io.lock);
995         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
996                 hammer_unload_buffer, NULL);
997         hammer_io_waitdep(&cluster->io);
998         KKASSERT(cluster->io.lock.refs == 1);
999         hammer_rel_cluster(cluster, 2);
1000         return(0);
1001 }
1002
1003 /*
1004  * Allocate the next record id from the cluster.  The synchronized record
1005  * id is used during cluster recovery.  NOTE: The cluster header is not
1006  * written out until all related records have been written out.
1007  */
1008 u_int64_t
1009 hammer_alloc_recid(hammer_cluster_t cluster)
1010 {
1011         u_int64_t recid;
1012
1013         hammer_modify_cluster(cluster);
1014         recid = cluster->ondisk->synchronized_rec_id++;
1015         return(recid);
1016 }
1017
1018 #if 0
1019 void
1020 hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid)
1021 {
1022         hammer_modify_cluster(cluster);
1023         if (cluster->ondisk->synchronized_tid < tid)
1024                 cluster->ondisk->synchronized_tid = tid;
1025 }
1026 #endif
1027
1028 /*
1029  * Reference a cluster that is either already referenced or reachable via a
1030  * specially handled pointer (aka rootcl).
1031  */
1032 int
1033 hammer_ref_cluster(hammer_cluster_t cluster)
1034 {
1035         int error;
1036
1037         KKASSERT(cluster != NULL);
1038         hammer_ref(&cluster->io.lock);
1039
1040         /*
1041          * Deal with on-disk info
1042          */
1043         if (cluster->ondisk == NULL || cluster->io.validated == 0) {
1044                 error = hammer_load_cluster(cluster, 0);
1045                 if (error)
1046                         hammer_rel_cluster(cluster, 1);
1047         } else {
1048                 error = 0;
1049         }
1050         return(error);
1051 }
1052
1053 /*
1054  * Release a cluster.  We have to deal with several places where
1055  * another thread can ref the cluster.
1056  *
1057  * Only destroy the structure itself if we no longer have an IO or any
1058  * hammer buffers associated with the structure.
1059  */
1060 void
1061 hammer_rel_cluster(hammer_cluster_t cluster, int flush)
1062 {
1063         hammer_volume_t volume;
1064
1065         /*
1066          * Free a deleted cluster back to the pool when its last
1067          * active reference is released.  This prevents the cluster
1068          * from being reallocated until all its prior references go away.
1069          *
1070          * XXX implement a discard dependency list which holds references
1071          * on clusters, preventing their deletion, until their parent cluster
1072          * has been flushed to disk.
1073          */
1074         if (cluster->io.lock.refs == 1) {
1075                 if (cluster->flags & HAMMER_CLUSTER_DELETED) {
1076                         cluster->flags &= ~HAMMER_CLUSTER_DELETED;
1077                         if (hammer_debug_general & 0x80)
1078                                 kprintf("FREE CLUSTER %d\n", cluster->clu_no);
1079                         if (cluster->ondisk->stat_records) {
1080                                 struct hammer_sync_info info;
1081
1082                                 info.error = 0;
1083                                 info.waitfor = MNT_WAIT;
1084                                 kprintf(" (still has %d records!)\n",
1085                                         cluster->ondisk->stat_records);
1086                                 Debugger("continue to recover cluster");
1087                                 hammer_recover(cluster);
1088                                 Debugger("continue to sync cluster");
1089                                 hammer_sync_cluster(cluster, &info);
1090                                 Debugger("now debug it");
1091                         }
1092
1093                         /*
1094                          * Clean up any statistics we left hanging in the
1095                          * cluster.
1096                          */
1097                         hammer_adjust_stats(cluster, HAMMER_FSBUF_BTREE,
1098                                             -cluster->ondisk->stat_idx_bufs);
1099                         hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA,
1100                                             -cluster->ondisk->stat_data_bufs);
1101                         hammer_adjust_stats(cluster, HAMMER_FSBUF_RECORDS,
1102                                             -cluster->ondisk->stat_rec_bufs);
1103                         /*
1104                          * hammer_discard_cluster(cluster) - throw away
1105                          * dirty backing store, recurse to any underlying
1106                          * buffers. XXX
1107                          */
1108                         hammer_free_cluster(cluster);
1109                 }
1110         }
1111
1112         if (cluster->io.lock.refs == 1) {
1113                 hammer_lock_ex(&cluster->io.lock);
1114                 if (cluster->io.lock.refs == 1) {
1115                         /*
1116                          * Release the I/O.  If we or the kernel wants to
1117                          * flush, this will release the bp.  Otherwise the
1118                          * bp may be written and flushed passively by the
1119                          * kernel later on.
1120                          */
1121                         hammer_io_release(&cluster->io, flush);
1122
1123                         /*
1124                          * Final cleanup
1125                          */
1126                         if (cluster != cluster->volume->hmp->rootcl &&
1127                             cluster->io.bp == NULL &&
1128                             cluster->io.lock.refs == 1 &&
1129                             RB_EMPTY(&cluster->rb_bufs_root)) {
1130                                 KKASSERT(RB_EMPTY(&cluster->rb_nods_root));
1131                                 volume = cluster->volume;
1132                                 RB_REMOVE(hammer_clu_rb_tree,
1133                                           &volume->rb_clus_root, cluster);
1134                                 cluster->volume = NULL; /* sanity */
1135                                 --hammer_count_clusters;
1136                                 kfree(cluster, M_HAMMER);
1137                                 hammer_rel_volume(volume, 0);
1138                                 return;
1139                         }
1140                 } else if (flush) {
1141                         hammer_io_flush(&cluster->io);
1142                 }
1143                 hammer_unlock(&cluster->io.lock);
1144         }
1145         hammer_unref(&cluster->io.lock);
1146 }
1147
1148 /************************************************************************
1149  *                              BUFFERS                                 *
1150  ************************************************************************
1151  *
1152  * Manage buffers.  Note that a buffer holds a reference to its associated
1153  * cluster, and its cluster will hold a reference to the cluster's volume.
1154  *
1155  * A non-zero buf_type indicates that a new buffer should be created and
1156  * zero'd.
1157  */
1158 hammer_buffer_t
1159 hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no,
1160                   u_int64_t buf_type, int *errorp)
1161 {
1162         hammer_buffer_t buffer;
1163
1164         /*
1165          * Find the buffer.  Note that buffer 0 corresponds to the cluster
1166          * header and should never be requested.
1167          */
1168         KKASSERT(buf_no >= cluster->ondisk->clu_start / HAMMER_BUFSIZE &&
1169                  buf_no < cluster->ondisk->clu_limit / HAMMER_BUFSIZE);
1170
1171         /*
1172          * Locate and lock the buffer structure, creating one if necessary.
1173          */
1174 again:
1175         buffer = RB_LOOKUP(hammer_buf_rb_tree, &cluster->rb_bufs_root, buf_no);
1176         if (buffer == NULL) {
1177                 ++hammer_count_buffers;
1178                 buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
1179                 buffer->buf_no = buf_no;
1180                 buffer->cluster = cluster;
1181                 buffer->volume = cluster->volume;
1182                 hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
1183                 buffer->io.offset = cluster->io.offset +
1184                                     (buf_no * HAMMER_BUFSIZE);
1185                 TAILQ_INIT(&buffer->clist);
1186                 hammer_ref(&buffer->io.lock);
1187
1188                 /*
1189                  * Insert the buffer into the RB tree and handle late
1190                  * collisions.
1191                  */
1192                 if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) {
1193                         hammer_unref(&buffer->io.lock);
1194                         --hammer_count_buffers;
1195                         kfree(buffer, M_HAMMER);
1196                         goto again;
1197                 }
1198                 hammer_ref(&cluster->io.lock);
1199         } else {
1200                 hammer_ref(&buffer->io.lock);
1201         }
1202
1203         /*
1204          * Deal with on-disk info
1205          */
1206         if (buffer->ondisk == NULL || buf_type || buffer->io.loading) {
1207                 *errorp = hammer_load_buffer(buffer, buf_type);
1208                 if (*errorp) {
1209                         hammer_rel_buffer(buffer, 1);
1210                         buffer = NULL;
1211                 }
1212         } else {
1213                 *errorp = 0;
1214         }
1215         return(buffer);
1216 }
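
/*
 * Illustrative buf_type usage (sketch; buf_no is hypothetical):
 *
 *	hammer_get_buffer(cluster, buf_no, 0, &error)
 *		reference an existing buffer, reading it from disk if needed.
 *	hammer_get_buffer(cluster, buf_no, HAMMER_FSBUF_DATA, &error)
 *		create and format a new zero'd data buffer.
 */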
1217
1218 static int
1219 hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type)
1220 {
1221         hammer_volume_t volume;
1222         hammer_fsbuf_ondisk_t ondisk;
1223         int error;
1224
1225         /*
1226          * Load the buffer's on-disk info
1227          */
1228         volume = buffer->volume;
1229         hammer_lock_ex(&buffer->io.lock);
1230         KKASSERT(buffer->io.loading == 0);
1231         buffer->io.loading = 1;
1232
1233         if (buffer->ondisk == NULL) {
1234                 if (buf_type) {
1235                         error = hammer_io_new(volume->devvp, &buffer->io);
1236                 } else {
1237                         error = hammer_io_read(volume->devvp, &buffer->io);
1238                 }
1239                 if (error) {
1240                         buffer->io.loading = 0;
1241                         hammer_unlock(&buffer->io.lock);
1242                         return (error);
1243                 }
1244                 buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
1245                 buffer->alist.config = &Buf_alist_config;
1246                 buffer->alist.meta = ondisk->head.buf_almeta;
1247                 buffer->buf_type = ondisk->head.buf_type;
1248         } else if (buf_type) {
1249                 error = hammer_io_new(volume->devvp, &buffer->io);
1250         } else {
1251                 error = 0;
1252         }
1253         if (error == 0 && buf_type) {
1254                 hammer_modify_buffer(buffer);
1255                 ondisk = buffer->ondisk;
1256                 hammer_initbuffer(&buffer->alist, &ondisk->head, buf_type);
1257                 buffer->buf_type = ondisk->head.buf_type;
1258         }
1259         buffer->io.loading = 0;
1260         hammer_unlock(&buffer->io.lock);
1261         return (error);
1262 }
1263
1264 /*
1265  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
1266  */
1267 int
1268 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
1269 {
1270         hammer_ref(&buffer->io.lock);
1271         hammer_flush_buffer_nodes(buffer);
1272         KKASSERT(buffer->io.lock.refs == 1);
1273         hammer_rel_buffer(buffer, 2);
1274         return(0);
1275 }
1276
1277 /*
1278  * Reference a buffer that is either already referenced or reachable via a
1279  * specially handled pointer (aka cursor->buffer).
1280  */
1281 int
1282 hammer_ref_buffer(hammer_buffer_t buffer)
1283 {
1284         int error;
1285
1286         hammer_ref(&buffer->io.lock);
1287         if (buffer->ondisk == NULL || buffer->io.loading) {
1288                 error = hammer_load_buffer(buffer, 0);
1289                 if (error) {
1290                         hammer_rel_buffer(buffer, 1);
1291                         /*
1292                          * NOTE: buffer pointer can become stale after
1293                          * the above release.
1294                          */
1295                 } else {
1296                         KKASSERT(buffer->buf_type ==
1297                                  buffer->ondisk->head.buf_type);
1298                 }
1299         } else {
1300                 error = 0;
1301         }
1302         return(error);
1303 }
1304
1305 /*
1306  * Release a buffer.  We have to deal with several places where
1307  * another thread can ref the buffer.
1308  *
1309  * Only destroy the structure itself if the related buffer cache buffer
1310  * was disassociated from it.  This ties the management of the structure
1311  * to the buffer cache subsystem.  buffer->ondisk determines whether the
1312  * embedded io is referenced or not.
1313  */
1314 void
1315 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
1316 {
1317         hammer_cluster_t cluster;
1318
1319         if (buffer->io.lock.refs == 1) {
1320                 hammer_lock_ex(&buffer->io.lock);
1321                 if (buffer->io.lock.refs == 1) {
1322                         hammer_io_release(&buffer->io, flush);
1323
1324                         if (buffer->io.bp == NULL &&
1325                             buffer->io.lock.refs == 1) {
1326                                 hammer_flush_buffer_nodes(buffer);
1327                                 KKASSERT(TAILQ_EMPTY(&buffer->clist));
1328                                 cluster = buffer->cluster;
1329                                 RB_REMOVE(hammer_buf_rb_tree,
1330                                           &cluster->rb_bufs_root, buffer);
1331                                 buffer->cluster = NULL; /* sanity */
1332                                 --hammer_count_buffers;
1333                                 kfree(buffer, M_HAMMER);
1334                                 hammer_rel_cluster(cluster, 0);
1335                                 return;
1336                         }
1337                 } else if (flush) {
1338                         hammer_io_flush(&buffer->io);
1339                 }
1340                 hammer_unlock(&buffer->io.lock);
1341         }
1342         hammer_unref(&buffer->io.lock);
1343 }
1344
1345 /************************************************************************
1346  *                              NODES                                   *
1347  ************************************************************************
1348  *
1349  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
1350  * method used by the HAMMER filesystem.
1351  *
1352  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
1353  * associated with its buffer, and will only reference the buffer while
1354  * the node itself is referenced.
1355  *
1356  * A hammer_node can also be passively associated with other HAMMER
1357  * structures, such as inodes, while retaining 0 references.  These
1358  * associations can be cleared backwards using a pointer-to-pointer in
1359  * the hammer_node.
1360  *
1361  * This allows the HAMMER implementation to cache hammer_nodes long-term
1362  * and short-cut a great deal of the infrastructure's complexity.  In
1363  * most cases a cached node can be reacquired without having to dip into
1364  * either the buffer or cluster management code.
1365  *
1366  * The caller must pass a referenced cluster on call and will retain
1367  * ownership of the reference on return.  The node will acquire its own
1368  * additional references, if necessary.
1369  */
1370 hammer_node_t
1371 hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp)
1372 {
1373         hammer_node_t node;
1374
1375         /*
1376          * Locate the structure, allocating one if necessary.
1377          */
1378 again:
1379         node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root,
1380                          node_offset);
1381         if (node == NULL) {
1382                 ++hammer_count_nodes;
1383                 node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
1384                 node->node_offset = node_offset;
1385                 node->cluster = cluster;
1386                 if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root,
1387                               node)) {
1388                         --hammer_count_nodes;
1389                         kfree(node, M_HAMMER);
1390                         goto again;
1391                 }
1392         }
1393         hammer_ref(&node->lock);
1394         *errorp = hammer_load_node(node);
1395         if (*errorp) {
1396                 hammer_rel_node(node);
1397                 node = NULL;
1398         }
1399         return(node);
1400 }
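
/*
 * Illustrative sketch only (hypothetical caller): typical lookup of a
 * B-Tree node by its cluster-relative offset.  The caller is assumed to
 * hold a cluster reference, as required above.
 *
 *	hammer_node_t node;
 *	int error;
 *
 *	node = hammer_get_node(cluster, node_offset, &error);
 *	if (node) {
 *		... use node->ondisk ...
 *		hammer_rel_node(node);
 *	}
 */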
1401
1402 /*
1403  * Reference an already-referenced node.
1404  */
1405 int
1406 hammer_ref_node(hammer_node_t node)
1407 {
1408         int error;
1409
1410         KKASSERT(node->lock.refs > 0);
1411         hammer_ref(&node->lock);
1412         if ((error = hammer_load_node(node)) != 0)
1413                 hammer_rel_node(node);
1414         return(error);
1415 }
1416
1417 /*
1418  * Load a node's on-disk data reference.
1419  */
1420 static int
1421 hammer_load_node(hammer_node_t node)
1422 {
1423         hammer_buffer_t buffer;
1424         int32_t buf_no;
1425         int error;
1426
1427         if (node->ondisk)
1428                 return(0);
1429         error = 0;
1430         hammer_lock_ex(&node->lock);
1431         if (node->ondisk == NULL) {
1432                 /*
1433                  * This is a little confusing but the gist is that
1434                  * node->buffer determines whether the node is on
1435                  * the buffer's clist and node->ondisk determines
1436                  * whether the buffer is referenced.
1437                  */
1438                 if ((buffer = node->buffer) != NULL) {
1439                         error = hammer_ref_buffer(buffer);
1440                 } else {
1441                         buf_no = node->node_offset / HAMMER_BUFSIZE;
1442                         buffer = hammer_get_buffer(node->cluster,
1443                                                    buf_no, 0, &error);
1444                         if (buffer) {
1445                                 KKASSERT(error == 0);
1446                                 TAILQ_INSERT_TAIL(&buffer->clist,
1447                                                   node, entry);
1448                                 node->buffer = buffer;
1449                         }
1450                 }
1451                 if (error == 0) {
1452                         node->ondisk = (void *)((char *)buffer->ondisk +
1453                                (node->node_offset & HAMMER_BUFMASK));
1454                 }
1455         }
1456         hammer_unlock(&node->lock);
1457         return (error);
1458 }
1459
1460 /*
1461  * Safely reference a node, interlocking against flushes via the IO subsystem.
1462  */
1463 hammer_node_t
1464 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
1465                      int *errorp)
1466 {
1467         hammer_node_t node;
1468
1469         if ((node = *cache) != NULL)
1470                 hammer_ref(&node->lock);
1471         if (node) {
1472                 *errorp = hammer_load_node(node);
1473                 if (*errorp) {
1474                         hammer_rel_node(node);
1475                         node = NULL;
1476                 }
1477         } else {
1478                 *errorp = ENOENT;
1479         }
1480         return(node);
1481 }
1482
1483 /*
1484  * Release a hammer_node.  On the last release the node dereferences
1485  * its underlying buffer and may or may not be destroyed.
1486  */
1487 void
1488 hammer_rel_node(hammer_node_t node)
1489 {
1490         hammer_cluster_t cluster;
1491         hammer_buffer_t buffer;
1492         int32_t node_offset;
1493         int flags;
1494
1495         /*
1496          * If this isn't the last ref just decrement the ref count and
1497          * return.
1498          */
1499         if (node->lock.refs > 1) {
1500                 hammer_unref(&node->lock);
1501                 return;
1502         }
1503
1504         /*
1505          * If there is no ondisk info the node failed to load (or was
1506          * never loaded); remove the last reference and destroy the node.
1507          */
1508         if (node->ondisk == NULL) {
1509                 hammer_unref(&node->lock);
1510                 hammer_flush_node(node);
1511                 /* node is stale now */
1512                 return;
1513         }
1514
1515         /*
1516          * Do final cleanups and then either destroy the node or leave it
1517          * passively cached.  The buffer reference is removed regardless.
1518          */
1519         buffer = node->buffer;
1520         node->ondisk = NULL;
1521
1522         if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
1523                 hammer_unref(&node->lock);
1524                 hammer_rel_buffer(buffer, 0);
1525                 return;
1526         }
1527
1528         /*
1529          * Destroy the node.  Record pertinent data because the node
1530          * becomes stale the instant we flush it.
1531          */
1532         flags = node->flags;
1533         node_offset = node->node_offset;
1534         hammer_unref(&node->lock);
1535         hammer_flush_node(node);
1536         /* node is stale */
1537
1538         cluster = buffer->cluster;
1539         if (flags & HAMMER_NODE_DELETED) {
1540                 if (node_offset == cluster->ondisk->clu_btree_root)
1541                         KKASSERT(cluster->flags & HAMMER_CLUSTER_DELETED);
1542                 hammer_free_btree(cluster, node_offset);
1543         }
1544         hammer_rel_buffer(buffer, 0);
1545 }
1546
1547 /*
1548  * Passively cache a referenced hammer_node in *cache.  The caller may
1549  * release the node on return.
1550  */
1551 void
1552 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
1553 {
1554         hammer_node_t old;
1555
1556         /*
1557          * If the node is being deleted, don't cache it!
1558          */
1559         if (node->flags & HAMMER_NODE_DELETED)
1560                 return;
1561
1562         /*
1563          * Cache the node.  If we previously cached a different node we
1564          * have to give HAMMER a chance to destroy it.
1565          */
1566 again:
1567         if (node->cache1 != cache) {
1568                 if (node->cache2 != cache) {
1569                         if ((old = *cache) != NULL) {
1570                                 KKASSERT(node->lock.refs != 0);
1571                                 hammer_uncache_node(cache);
1572                                 goto again;
1573                         }
1574                         if (node->cache2)
1575                                 *node->cache2 = NULL;
1576                         node->cache2 = node->cache1;
1577                         node->cache1 = cache;
1578                         *cache = node;
1579                 } else {
1580                         struct hammer_node **tmp;
1581                         tmp = node->cache1;
1582                         node->cache1 = node->cache2;
1583                         node->cache2 = tmp;
1584                 }
1585         }
1586 }
1587
1588 void
1589 hammer_uncache_node(struct hammer_node **cache)
1590 {
1591         hammer_node_t node;
1592
1593         if ((node = *cache) != NULL) {
1594                 *cache = NULL;
1595                 if (node->cache1 == cache) {
1596                         node->cache1 = node->cache2;
1597                         node->cache2 = NULL;
1598                 } else if (node->cache2 == cache) {
1599                         node->cache2 = NULL;
1600                 } else {
1601                         panic("hammer_uncache_node: missing cache linkage");
1602                 }
1603                 if (node->cache1 == NULL && node->cache2 == NULL)
1604                         hammer_flush_node(node);
1605         }
1606 }
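
/*
 * Illustrative sketch only (hypothetical caller): a structure such as an
 * inode can park a node in one of its struct hammer_node * slots and
 * reacquire it later.  "ip->cache", "hmp", and "error" are stand-ins for
 * caller-owned storage.
 *
 *	hammer_cache_node(node, &ip->cache);
 *	hammer_rel_node(node);
 *	...
 *	node = hammer_ref_node_safe(hmp, &ip->cache, &error);
 *	if (node) {
 *		... use node ...
 *		hammer_rel_node(node);
 *	}
 *
 * with hammer_uncache_node(&ip->cache) called when the owning structure
 * is torn down.
 */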
1607
1608 /*
1609  * Remove a node's cache references and destroy the node if it has no
1610  * other references or backing store.
1611  */
1612 void
1613 hammer_flush_node(hammer_node_t node)
1614 {
1615         hammer_buffer_t buffer;
1616
1617         if (node->cache1)
1618                 *node->cache1 = NULL;
1619         if (node->cache2)
1620                 *node->cache2 = NULL;
1621         if (node->lock.refs == 0 && node->ondisk == NULL) {
1622                 RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root,
1623                           node);
1624                 if ((buffer = node->buffer) != NULL) {
1625                         node->buffer = NULL;
1626                         TAILQ_REMOVE(&buffer->clist, node, entry);
1627                         /* buffer is unreferenced because ondisk is NULL */
1628                 }
1629                 --hammer_count_nodes;
1630                 kfree(node, M_HAMMER);
1631         }
1632 }
1633
1634 /*
1635  * Flush passively cached B-Tree nodes associated with this buffer.
1636  * This is only called when the buffer is about to be destroyed, so
1637  * none of the nodes should have any references.
1638  */
1639 void
1640 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1641 {
1642         hammer_node_t node;
1643
1644         while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1645                 KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
1646                 hammer_ref(&node->lock);
1647                 node->flags |= HAMMER_NODE_FLUSH;
1648                 hammer_rel_node(node);
1649         }
1650 }
1651
1652 /************************************************************************
1653  *                              A-LIST ALLOCATORS                       *
1654  ************************************************************************/
1655
1656 /*
1657  * Allocate HAMMER clusters
1658  */
1659 hammer_cluster_t
1660 hammer_alloc_cluster(hammer_mount_t hmp, hammer_cluster_t cluster_hint,
1661                      int *errorp)
1662 {
1663         hammer_volume_t volume;
1664         hammer_cluster_t cluster;
1665         int32_t clu_no;
1666         int32_t clu_hint;
1667         int32_t vol_beg;
1668         int32_t vol_no;
1669
1670         /*
1671          * Figure out our starting volume and hint.
1672          */
1673         if (cluster_hint) {
1674                 vol_beg = cluster_hint->volume->vol_no;
1675                 clu_hint = cluster_hint->clu_no;
1676         } else {
1677                 vol_beg = hmp->volume_iterator;
1678                 clu_hint = -1;
1679         }
1680
1681         /*
1682          * Loop through volumes looking for a free cluster.  If allocating
1683          * a new cluster relative to an existing cluster try to find a free
1684          * cluster on either side (clu_hint >= 0), otherwise just do a
1685          * forwards iteration.
1686          */
1687         vol_no = vol_beg;
1688         do {
1689                 volume = hammer_get_volume(hmp, vol_no, errorp);
1690                 if (*errorp) {
1691                         clu_no = HAMMER_ALIST_BLOCK_NONE;
1692                         break;
1693                 }
1694                 hammer_modify_volume(volume);
1695                 if (clu_hint == -1) {
1696                         clu_hint = volume->clu_iterator;
1697                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1698                                                         clu_hint);
1699                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1700                                 clu_no = hammer_alist_alloc_fwd(&volume->alist,
1701                                                                 1, 0);
1702                         }
1703                 } else {
1704                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1705                                                         clu_hint);
1706                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1707                                 clu_no = hammer_alist_alloc_rev(&volume->alist,
1708                                                                 1, clu_hint);
1709                         }
1710                 }
1711                 if (clu_no != HAMMER_ALIST_BLOCK_NONE)
1712                         break;
1713                 hammer_rel_volume(volume, 0);
1714                 volume = NULL;
1715                 *errorp = ENOSPC;
1716                 vol_no = (vol_no + 1) % hmp->nvolumes;
1717                 clu_hint = -1;
1718         } while (vol_no != vol_beg);
1719
1720         /*
1721          * Acquire the cluster.  On success this will force *errorp to 0.
1722          */
1723         if (clu_no != HAMMER_ALIST_BLOCK_NONE) {
1724                 if (hammer_debug_general & 0x40) {
1725                         kprintf("ALLOC CLUSTER %d:%d\n", 
1726                                 volume->vol_no, clu_no);
1727                 }
1728                 cluster = hammer_get_cluster(volume, clu_no, errorp,
1729                                              GET_CLUSTER_NEW);
1730                 volume->clu_iterator = clu_no;
1731                 hammer_rel_volume(volume, 0);
1732         } else {
1733                 cluster = NULL;
1734         }
1735         if (cluster)
1736                 hammer_lock_ex(&cluster->io.lock);
1737         return(cluster);
1738 }
1739
1740 void
1741 hammer_init_cluster(hammer_cluster_t cluster, hammer_base_elm_t left_bound, 
1742                     hammer_base_elm_t right_bound)
1743 {
1744         hammer_cluster_ondisk_t ondisk = cluster->ondisk;
1745
1746         hammer_modify_cluster(cluster);
1747         ondisk->clu_btree_beg = *left_bound;
1748         ondisk->clu_btree_end = *right_bound;
1749         cluster->clu_btree_beg = ondisk->clu_btree_beg;
1750         cluster->clu_btree_end = ondisk->clu_btree_end;
1751 }
1752
1753 /*
1754  * Deallocate a cluster
1755  */
1756 void
1757 hammer_free_cluster(hammer_cluster_t cluster)
1758 {
1759         hammer_modify_volume(cluster->volume);
1760         hammer_alist_free(&cluster->volume->alist, cluster->clu_no, 1);
1761 }
1762
1763 /*
1764  * Allocate HAMMER elements - btree nodes, data storage, and record elements
1765  *
1766  * The passed *bufferp should be initialized to NULL.  On successive calls
1767  * *bufferp caches the most recent buffer used until put away by the caller.
1768  * Note that previously returned pointers using the cached buffer become
1769  * invalid on successive calls which reuse *bufferp.
1770  *
1771  * All allocations first attempt to use the block found at the specified
1772  * iterator.  If that fails the first available block is used.  If that
1773  * fails a new buffer is allocated and associated with the buffer type
1774  * A-list and the element is allocated out of the new buffer.
1775  *
1776  * This function also ensures that the required minimum number of buffers is
1777  * reserved to guarantee that recovery operations succeed.
1778  */
1779
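/*
 * Illustrative sketch only (hypothetical caller): the *bufferp caching
 * convention used by hammer_alloc_data() and hammer_alloc_record().
 * "cluster" and "rec_type" stand in for caller-owned values.
 *
 *	hammer_buffer_t buffer = NULL;
 *	void *rec;
 *	int error;
 *
 *	rec = hammer_alloc_record(cluster, &error, rec_type, &buffer);
 *	... additional allocations may reuse the cached buffer ...
 *	if (buffer)
 *		hammer_rel_buffer(buffer, 0);
 *
 * Remember that pointers returned by earlier calls become invalid once a
 * later call replaces the cached *bufferp.
 */
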
1780 hammer_node_t
1781 hammer_alloc_btree(hammer_cluster_t cluster, int *errorp)
1782 {
1783         hammer_buffer_t buffer;
1784         hammer_alist_t live;
1785         hammer_node_t node;
1786         int32_t elm_no;
1787         int32_t buf_no;
1788         int32_t node_offset;
1789         int32_t n;
1790
1791         hammer_modify_cluster(cluster);
1792         buffer = NULL;
1793         live = &cluster->alist_btree;
1794
1795         /*
1796          * If we aren't recovering then ensure the required minimum
1797          * reservation is met. XXX if the recovery code packs the B-Tree
1798          * we don't have to do this.
1799          *
1800          * Calculate the number of buffers needed to hold the B-Tree.
1801          */
1802         if (cluster->io.validated) {
1803                 n = (cluster->ondisk->stat_records * 3 / 
1804                     HAMMER_BTREE_INT_ELMS / HAMMER_BTREE_NODES) + 1;
1805                 if (hammer_debug_general &&
1806                     cluster->ondisk->stat_idx_bufs < n) {
1807                         kprintf("hammer_alloc_btree: %d/%d buffers\n",
1808                                 cluster->ondisk->stat_idx_bufs, n);
1809                 }
1810                 while (cluster->ondisk->stat_idx_bufs < n) {
1811                         alloc_new_buffer(cluster, HAMMER_FSBUF_BTREE, live,
1812                                          cluster->ondisk->idx_index, errorp,
1813                                          &buffer);
1814                         if (*errorp) {
1815                                 if (buffer)
1816                                         hammer_rel_buffer(buffer, 0);
1817                                 return(NULL);
1818                         }
1819                 }
1820         }
1821
1823         /*
1824          * Allocate a B-Tree element
1825          */
1826         elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index);
1827         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1828                 elm_no = hammer_alist_alloc_fwd(live, 1, 0);
1829         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1830                 alloc_new_buffer(cluster, HAMMER_FSBUF_BTREE, live,
1831                                  cluster->ondisk->idx_index, errorp, &buffer);
1832                 elm_no = hammer_alist_alloc(live, 1);
1833                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1834                         *errorp = ENOSPC;
1835                         if (buffer)
1836                                 hammer_rel_buffer(buffer, 0);
1837                         return(NULL);
1838                 }
1839         }
1840         cluster->ondisk->idx_index = elm_no;
1841         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_BTREE_NODES);
1842
1843         /*
1844          * Load and return the B-Tree element
1845          */
1846         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1847         node_offset = buf_no * HAMMER_BUFSIZE +
1848                       offsetof(union hammer_fsbuf_ondisk,
1849                                btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]);
1850         node = hammer_get_node(cluster, node_offset, errorp);
1851         if (node) {
1852                 hammer_modify_node(node);
1853                 bzero(node->ondisk, sizeof(*node->ondisk));
1854                 KKASSERT((node->flags & (HAMMER_NODE_DELETED)) == 0);
1855         } else {
1856                 /* hammer_get_node() already released its ref on error */
1857                 hammer_alist_free(live, elm_no, 1);
1858                 node = NULL;
1859         }
1860         if (buffer)
1861                 hammer_rel_buffer(buffer, 0);
1862         return(node);
1863 }
1864
1865 void *
1866 hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes,
1867                   int *errorp, struct hammer_buffer **bufferp)
1868 {
1869         hammer_buffer_t buffer;
1870         hammer_alist_t live;
1871         int32_t elm_no;
1872         int32_t buf_no;
1873         int32_t nblks;
1874         void *item;
1875
1876         /*
1877          * Deal with large data blocks.  The blocksize is HAMMER_BUFSIZE
1878          * for these allocations.
1879          */
1880         hammer_modify_cluster(cluster);
1881         if ((bytes & HAMMER_BUFMASK) == 0) {
1882                 nblks = bytes / HAMMER_BUFSIZE;
1883                 /* only one block allowed for now (so buffer can hold it) */
1884                 KKASSERT(nblks == 1);
1885
1886                 buf_no = hammer_alloc_master(cluster, nblks,
1887                                              cluster->ondisk->idx_ldata, 1);
1888                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1889                         *errorp = ENOSPC;
1890                         return(NULL);
1891                 }
1892                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, nblks);
1893                 cluster->ondisk->idx_ldata = buf_no;
1894                 buffer = *bufferp;
1895                 *bufferp = hammer_get_buffer(cluster, buf_no, -1, errorp);
1896                 if (buffer)
1897                         hammer_rel_buffer(buffer, 0);
1898                 buffer = *bufferp;
1899                 return(buffer->ondisk);
1900         }
1901
1902         /*
1903          * Allocate a data element.  The block size is HAMMER_DATA_BLKSIZE
1904          * (64 bytes) for these allocations.
1905          */
1906         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1907         nblks /= HAMMER_DATA_BLKSIZE;
1908         live = &cluster->alist_mdata;
1909         elm_no = hammer_alist_alloc_fwd(live, nblks, cluster->ondisk->idx_data);
1910         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1911                 elm_no = hammer_alist_alloc_fwd(live, nblks, 0);
1912         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1913                 alloc_new_buffer(cluster, HAMMER_FSBUF_DATA, live,
1914                                  cluster->ondisk->idx_data, errorp, bufferp);
1915                 elm_no = hammer_alist_alloc(live, nblks);
1916                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1917                         *errorp = ENOSPC;
1918                         return(NULL);
1919                 }
1920         }
1921         cluster->ondisk->idx_data = elm_no;
1922
1923         /*
1924          * Load and return the data element
1925          */
1926         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1927         buffer = *bufferp;
1928         if (buffer == NULL || buffer->cluster != cluster ||
1929             buffer->buf_no != buf_no) {
1930                 if (buffer)
1931                         hammer_rel_buffer(buffer, 0);
1932                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1933                 *bufferp = buffer;
1934         }
1935         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_DATA);
1936         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_DATA_NODES);
1937         hammer_modify_buffer(buffer);
1938         item = &buffer->ondisk->data.data[elm_no & HAMMER_FSBUF_BLKMASK];
1939         bzero(item, nblks * HAMMER_DATA_BLKSIZE);
1940         *errorp = 0;
1941         return(item);
1942 }
1943
1944 void *
1945 hammer_alloc_record(hammer_cluster_t cluster, int *errorp,
1946                     u_int8_t rec_type, struct hammer_buffer **bufferp)
1947 {
1948         hammer_buffer_t buffer;
1949         hammer_alist_t live;
1950         int32_t elm_no;
1951         int32_t buf_no;
1952         void *item;
1953
1954         /*
1955          * Allocate a record element
1956          */
1957         hammer_modify_cluster(cluster);
1958         live = &cluster->alist_record;
1959         elm_no = hammer_alist_alloc_rev(live, 1, cluster->ondisk->idx_record);
1960         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1961                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1962         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1963                 alloc_new_buffer(cluster, HAMMER_FSBUF_RECORDS, live,
1964                                  cluster->ondisk->idx_record, errorp, bufferp);
1965                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1966                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1967                         *errorp = ENOSPC;
1968                         return(NULL);
1969                 }
1970         }
1971         cluster->ondisk->idx_record = elm_no;
1972
1973         /*
1974          * Load and return the record element
1975          */
1976         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1977         buffer = *bufferp;
1978         if (buffer == NULL || buffer->cluster != cluster ||
1979             buffer->buf_no != buf_no) {
1980                 if (buffer)
1981                         hammer_rel_buffer(buffer, 0);
1982                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1983                 *bufferp = buffer;
1984         }
1985         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_RECORDS);
1986         KASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_RECORD_NODES,
1987                 ("elm_no %d (%d) out of bounds", elm_no, elm_no & HAMMER_FSBUF_BLKMASK));
1988         hammer_modify_buffer(buffer);
1989         item = &buffer->ondisk->record.recs[elm_no & HAMMER_FSBUF_BLKMASK];
1990         bzero(item, sizeof(union hammer_record_ondisk));
1991         *errorp = 0;
1992         ++cluster->ondisk->stat_records;
1993         if (rec_type == HAMMER_RECTYPE_CLUSTER)
1994                 ++cluster->ondisk->stat_records;
1995         return(item);
1996 }
1997
1998 void
1999 hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes)
2000 {
2001         int32_t elm_no;
2002         int32_t nblks;
2003         hammer_alist_t live;
2004
2005         hammer_modify_cluster(buffer->cluster);
2006         if ((bytes & HAMMER_BUFMASK) == 0) {
2007                 nblks = bytes / HAMMER_BUFSIZE;
2008                 KKASSERT(nblks == 1 && data == (void *)buffer->ondisk);
2009                 hammer_alist_free(&buffer->cluster->alist_master,
2010                                   buffer->buf_no, nblks);
2011                 hammer_adjust_stats(buffer->cluster, HAMMER_FSBUF_DATA, -nblks);
2012                 return;
2013         }
2014
2015         elm_no = ((char *)data - (char *)buffer->ondisk->data.data) /
2016                  HAMMER_DATA_BLKSIZE;
2017         KKASSERT(elm_no >= 0 && elm_no < HAMMER_DATA_NODES);
2018         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
2019         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
2020         nblks /= HAMMER_DATA_BLKSIZE;
2021         live = &buffer->cluster->alist_mdata;
2022         hammer_alist_free(live, elm_no, nblks);
2023 }
2024
2025 void
2026 hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec,
2027                         u_int8_t rec_type)
2028 {
2029         int32_t elm_no;
2030         hammer_alist_t live;
2031
2032         hammer_modify_cluster(buffer->cluster);
2033         elm_no = rec - &buffer->ondisk->record.recs[0];
2034         KKASSERT(elm_no >= 0 && elm_no < HAMMER_RECORD_NODES);
2035         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
2036         live = &buffer->cluster->alist_record;
2037         hammer_alist_free(live, elm_no, 1);
2038         --buffer->cluster->ondisk->stat_records;
2039         if (rec_type == HAMMER_RECTYPE_CLUSTER)
2040                 --buffer->cluster->ondisk->stat_records;
2041 }
2042
2043 void
2044 hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset)
2045 {
2046         const int32_t blksize = sizeof(struct hammer_node_ondisk);
2047         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2048         hammer_alist_t live;
2049         int32_t elm_no;
2050
2051         hammer_modify_cluster(cluster);
2052         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2053         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, btree.nodes[0]);
2054         live = &cluster->alist_btree;
2055         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2056         elm_no += fsbuf_offset / blksize;
2057         hammer_alist_free(live, elm_no, 1);
2058 }
2059
2060 void
2061 hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes)
2062 {
2063         const int32_t blksize = HAMMER_DATA_BLKSIZE;
2064         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2065         hammer_alist_t live;
2066         int32_t elm_no;
2067         int32_t buf_no;
2068         int32_t nblks;
2069
2070         hammer_modify_cluster(cluster);
2071         if ((bytes & HAMMER_BUFMASK) == 0) {
2072                 nblks = bytes / HAMMER_BUFSIZE;
2073                 KKASSERT(nblks == 1 && (bclu_offset & HAMMER_BUFMASK) == 0);
2074                 buf_no = bclu_offset / HAMMER_BUFSIZE;
2075                 hammer_alist_free(&cluster->alist_master, buf_no, nblks);
2076                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, -nblks);
2077                 return;
2078         }
2079
2080         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2081         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, data.data[0][0]);
2082         live = &cluster->alist_mdata;
2083         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
2084         nblks /= HAMMER_DATA_BLKSIZE;
2085         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2086         elm_no += fsbuf_offset / blksize;
2087         hammer_alist_free(live, elm_no, nblks);
2088 }
2089
2090 void
2091 hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset,
2092                    u_int8_t rec_type)
2093 {
2094         const int32_t blksize = sizeof(union hammer_record_ondisk);
2095         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2096         hammer_alist_t live;
2097         int32_t elm_no;
2098
2099         hammer_modify_cluster(cluster);
2100         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2101         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, record.recs[0]);
2102         live = &cluster->alist_record;
2103         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2104         elm_no += fsbuf_offset / blksize;
2105         hammer_alist_free(live, elm_no, 1);
2106         --cluster->ondisk->stat_records;
2107         if (rec_type == HAMMER_RECTYPE_CLUSTER)
2108                 --cluster->ondisk->stat_records;
2109 }
2110
2112 /*
2113  * Allocate a new filesystem buffer and assign it to the specified
2114  * filesystem buffer type.  The new buffer will be added to the
2115  * type-specific A-list and initialized.
2116  */
2117 static void
2118 alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, hammer_alist_t live,
2119                  int start, int *errorp, struct hammer_buffer **bufferp)
2120 {
2121         hammer_buffer_t buffer;
2122         int32_t buf_no;
2123         int32_t base_blk;
2124         int isfwd;
2125
2126         if (*bufferp)
2127                 hammer_rel_buffer(*bufferp, 0);
2128         *bufferp = NULL;
2129
2130         start = start / HAMMER_FSBUF_MAXBLKS;   /* convert to buf_no */
2131         isfwd = (type != HAMMER_FSBUF_RECORDS);
2132         buf_no = hammer_alloc_master(cluster, 1, start, isfwd);
2133         if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2134                 *errorp = ENOSPC;
2135                 return;
2136         }
2137
2138         /*
2139          * The new buffer must be initialized (type != 0) regardless of
2140          * whether we already have it cached or not, so don't try to
2141          * optimize the cached buffer check.  Just call hammer_get_buffer().
2142          */
2143         buffer = hammer_get_buffer(cluster, buf_no, type, errorp);
2144         *bufferp = buffer;
2145
2146         /*
2147          * Do a meta-free of the buffer's elements into the type-specific
2148          * A-list and update our statistics to reflect the allocation.
2149          */
2150         if (buffer) {
2151                 hammer_modify_buffer(buffer);  /*XXX*/
2152                 hammer_adjust_stats(cluster, type, 1);
2153
2154                 /*
2155                  * Free the buffer to the appropriate slave list so the
2156                  * cluster-based allocator sees it.
2157                  */
2158                 base_blk = buf_no * HAMMER_FSBUF_MAXBLKS;
2159
2160                 switch(type) {
2161                 case HAMMER_FSBUF_BTREE:
2162                         hammer_alist_free(live, base_blk, HAMMER_BTREE_NODES);
2163                         break;
2164                 case HAMMER_FSBUF_DATA:
2165                         hammer_alist_free(live, base_blk, HAMMER_DATA_NODES);
2166                         break;
2167                 case HAMMER_FSBUF_RECORDS:
2168                         hammer_alist_free(live, base_blk, HAMMER_RECORD_NODES);
2169                         break;
2170                 }
2171         }
2172 }
2173
2174 /*
2175  * Sync dirty buffers to the media
2176  */
2177
2178 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
2179 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
2180
2181 int
2182 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
2183 {
2184         struct hammer_sync_info info;
2185
2186         info.error = 0;
2187         info.waitfor = waitfor;
2188
2189         vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
2190                       hammer_sync_scan1, hammer_sync_scan2, &info);
2191
2192         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
2193                 hammer_sync_volume, &info);
2194         return(info.error);
2195 }
2196
2197 static int
2198 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
2199 {
2200         struct hammer_inode *ip;
2201
2202         ip = VTOI(vp);
2203         if (vp->v_type == VNON || ip == NULL ||
2204             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2205              RB_EMPTY(&vp->v_rbdirty_tree))) {
2206                 return(-1);
2207         }
2208         return(0);
2209 }
2210
2211 static int
2212 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2213 {
2214         struct hammer_sync_info *info = data;
2215         struct hammer_inode *ip;
2216         int error;
2217
2218         ip = VTOI(vp);
2219         if (vp->v_type == VNON || vp->v_type == VBAD ||
2220             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2221              RB_EMPTY(&vp->v_rbdirty_tree))) {
2222                 return(0);
2223         }
2224         error = VOP_FSYNC(vp, info->waitfor);
2225         if (error)
2226                 info->error = error;
2227         return(0);
2228 }
2229
2230 int
2231 hammer_sync_volume(hammer_volume_t volume, void *data)
2232 {
2233         struct hammer_sync_info *info = data;
2234
2235         hammer_ref(&volume->io.lock);
2236         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
2237                 hammer_sync_cluster, info);
2238         hammer_rel_volume(volume, 1);
2239         return(0);
2240 }
2241
2242 int
2243 hammer_sync_cluster(hammer_cluster_t cluster, void *data)
2244 {
2245         struct hammer_sync_info *info = data;
2246
2247         /*
2248          * XXX check if cluster deleted and don't bother to sync it?
2249          */
2250         hammer_ref(&cluster->io.lock);
2251         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
2252                 hammer_sync_buffer, info);
2253         /*hammer_io_waitdep(&cluster->io);*/
2254         hammer_rel_cluster(cluster, 1);
2255         return(0);
2256 }
2257
2258 int
2259 hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
2260 {
2261         hammer_ref(&buffer->io.lock);
2262         hammer_rel_buffer(buffer, 1);
2263         return(0);
2264 }
2265
2266 /*
2267  * Generic buffer initialization.  Initialize the A-list into an all-allocated
2268  * state with the free block limit properly set.
2269  *
2270  * Note that alloc_new_buffer() will free the appropriate block range via
2271  * the appropriate cluster alist, so the free count is properly propagated.
2272  */
2273 void
2274 hammer_initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type)
2275 {
2276         head->buf_type = type;
2277
2278         switch(type) {
2279         case HAMMER_FSBUF_BTREE:
2280                 hammer_alist_init(live, 0, HAMMER_BTREE_NODES,
2281                                   HAMMER_ASTATE_ALLOC);
2282                 break;
2283         case HAMMER_FSBUF_DATA:
2284                 hammer_alist_init(live, 0, HAMMER_DATA_NODES,
2285                                   HAMMER_ASTATE_ALLOC);
2286                 break;
2287         case HAMMER_FSBUF_RECORDS:
2288                 hammer_alist_init(live, 0, HAMMER_RECORD_NODES,
2289                                   HAMMER_ASTATE_ALLOC);
2290                 break;
2291         default:
2292                 hammer_alist_init(live, 0, 0, HAMMER_ASTATE_ALLOC);
2293                 break;
2294         }
2295 }
2296
2297 /*
2298  * Calculate the cluster's offset in the volume.  This calculation is
2299  * slightly more complex when using superclusters because superclusters
2300  * are grouped in blocks of 16, followed by 16 x N clusters where N
2301  * is the number of clusters a supercluster can manage.
2302  */
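/*
 * Illustrative sketch of the indexing performed below (symbolic names
 * only; the real constants live in HAMMER's on-disk headers):
 *
 *	group    = clu_no / (HAMMER_VOL_SUPERCLUSTER_GROUP *
 *	                     HAMMER_SCL_MAXCLUSTERS);
 *	in_group = clu_no % (HAMMER_VOL_SUPERCLUSTER_GROUP *
 *	                     HAMMER_SCL_MAXCLUSTERS);
 *	off      = cluster_base + group * scl_group_size +
 *	           HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE +
 *	           (int64_t)in_group * volume->vol_clsize;
 *
 * i.e. each group begins with its supercluster buffers and is followed
 * by the clusters those superclusters manage.
 */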
2303 static int64_t
2304 calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no)
2305 {
2306         int32_t scl_group;
2307         int64_t scl_group_size;
2308         int64_t off;
2309
2310         if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
2311                 scl_group = clu_no / HAMMER_VOL_SUPERCLUSTER_GROUP /
2312                             HAMMER_SCL_MAXCLUSTERS;
2313                 scl_group_size = 
2314                             ((int64_t)HAMMER_BUFSIZE *
2315                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2316                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2317                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2318                 scl_group_size += 
2319                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2320
2321                 off = volume->cluster_base +
2322                       scl_group * scl_group_size +
2323                       (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
2324                       ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS *
2325                        HAMMER_VOL_SUPERCLUSTER_GROUP))
2326                       * volume->vol_clsize;
2327         } else {
2328                 off = volume->cluster_base +
2329                       (int64_t)clu_no * volume->vol_clsize;
2330         }
2331         return(off);
2332 }
2333
2334 /*
2335  * Calculate a super-cluster's offset in the volume.
2336  */
2337 static int64_t
2338 calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no)
2339 {
2340         int64_t off;
2341         int32_t scl_group;
2342         int64_t scl_group_size;
2343
2344         KKASSERT (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL);
2345         scl_group = scl_no / HAMMER_VOL_SUPERCLUSTER_GROUP;
2346         if (scl_group) {
2347                 scl_group_size = 
2348                             ((int64_t)HAMMER_BUFSIZE *
2349                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2350                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2351                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2352                 scl_group_size += 
2353                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2354                 off = volume->cluster_base + (scl_group * scl_group_size) +
2355                       (scl_no % HAMMER_VOL_SUPERCLUSTER_GROUP) * HAMMER_BUFSIZE;
2356         } else {
2357                 off = volume->cluster_base + (scl_no * HAMMER_BUFSIZE);
2358         }
2359         return(off);
2360 }
2361
2362 /*
2363  * Allocate nblks buffers from the cluster's master alist.
2364  */
2365 static int32_t
2366 hammer_alloc_master(hammer_cluster_t cluster, int nblks,
2367                     int32_t start, int isfwd)
2368 {
2369         int32_t buf_no;
2370
2371         hammer_modify_cluster(cluster);
2372         if (isfwd) {
2373                 buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2374                                                 nblks, start);
2375                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2376                         buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2377                                                 nblks, 0);
2378                 }
2379         } else {
2380                 buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2381                                                 nblks, start);
2382                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2383                         buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2384                                                 nblks, HAMMER_ALIST_BLOCK_MAX);
2385                 }
2386         }
2387
2388         /*
2389          * Recover space from empty record, b-tree, and data a-lists.
2390          */
2391
2392         return(buf_no);
2393 }
2394
2395 /*
2396  * Adjust allocation statistics
2397  */
2398 static void
2399 hammer_adjust_stats(hammer_cluster_t cluster, u_int64_t buf_type, int nblks)
2400 {
2401         if (nblks == 0)
2402                 return;
2403
2404         hammer_modify_cluster(cluster);
2405         hammer_modify_volume(cluster->volume);
2406         hammer_modify_volume(cluster->volume->hmp->rootvol);
2407
2408         switch(buf_type) {
2409         case HAMMER_FSBUF_BTREE:
2410                 cluster->ondisk->stat_idx_bufs += nblks;
2411                 cluster->volume->ondisk->vol_stat_idx_bufs += nblks;
2412                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_idx_bufs += nblks;
2413                 break;
2414         case HAMMER_FSBUF_DATA:
2415                 cluster->ondisk->stat_data_bufs += nblks;
2416                 cluster->volume->ondisk->vol_stat_data_bufs += nblks;
2417                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_data_bufs += nblks;
2418                 break;
2419         case HAMMER_FSBUF_RECORDS:
2420                 cluster->ondisk->stat_rec_bufs += nblks;
2421                 cluster->volume->ondisk->vol_stat_rec_bufs += nblks;
2422                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_rec_bufs += nblks;
2423                 break;
2424         }
2425 }
2426
2427 /*
2428  * A-LIST SUPPORT
2429  *
2430  * Setup the parameters for the various A-lists we use in hammer.  The
2431  * supercluster A-list must be chained to the cluster A-list and cluster
2432  * slave A-lists are chained to buffer A-lists.
2433  *
2434  * See hammer_init_alist_config() below.
2435  */
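
/*
 * Illustrative flow only (conceptual, not literal code): an allocation
 * against a chained A-list recurses through the installed callbacks,
 * for example
 *
 *	hammer_alist_alloc_fwd(&volume->alist, 1, hint)
 *	    -> bl_radix_alloc_fwd == super_alist_alloc_fwd()
 *	        -> hammer_alist_alloc_fwd(&supercl->alist, ...)
 *
 * and the cluster slave A-lists recurse into the per-buffer A-lists via
 * the buffer_alist_*() callbacks in the same way.
 */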
2436
2437 /*
2438  * A-LIST - cluster recursion into a filesystem buffer
2439  *
2440  * In the init case the buffer has already been initialized by
2441  * alloc_new_buffer() when it allocated the buffer out of the master
2442  * alist and marked it as free in the slave alist.
2443  *
2444  * Because we use a somewhat odd mechanism to assign buffers to slave
2445  * pools we can't actually free the buffer back to the master alist in
2446  * buffer_alist_destroy(), but instead must deal with that logic somewhere
2447  * else.
2448  */
2449 static int
2450 buffer_alist_init(void *info, int32_t blk, int32_t radix,
2451                   hammer_alloc_state_t state)
2452 {
2453         return(0);
2454 }
2455
2456 /*
2457  * Note: This routine is only called when freeing the last elements of
2458  * an initialized buffer.  Freeing all elements of the buffer when the
2459  * buffer was not previously initialized does not call this routine.
2460  */
2461 static int
2462 buffer_alist_destroy(void *info, int32_t blk, int32_t radix)
2463 {
2464         hammer_cluster_t cluster = info;
2465         int32_t buf_no;
2466
2467         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2468         if (hammer_debug_general & 0x80) {
2469                 kprintf("destroy buffer %d:%d:%d\n",
2470                         cluster->volume->vol_no, cluster->clu_no, buf_no);
2471         }
2472         return (0);
2473 }
2474
2475 /*
2476  * Note: atblk can be negative and atblk - blk can go negative.
2477  */
2478 static int
2479 buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2480                       int32_t count, int32_t atblk, int32_t *fullp)
2481 {
2482         hammer_cluster_t cluster = info;
2483         hammer_buffer_t buffer;
2484         int32_t buf_no;
2485         int32_t r;
2486         int error = 0;
2487
2488         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2489         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2490         if (buffer) {
2491                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2492
2493                 hammer_modify_buffer(buffer);
2494                 r = hammer_alist_alloc_fwd(&buffer->alist, count, atblk - blk);
2495                 if (r != HAMMER_ALIST_BLOCK_NONE)
2496                         r += blk;
2497                 *fullp = hammer_alist_isfull(&buffer->alist);
2498                 hammer_rel_buffer(buffer, 0);
2499         } else {
2500                 r = HAMMER_ALIST_BLOCK_NONE;
2501                 *fullp = 1;
2502         }
2503         return(r);
2504 }
2505
2506 static int
2507 buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2508                       int32_t count, int32_t atblk, int32_t *fullp)
2509 {
2510         hammer_cluster_t cluster = info;
2511         hammer_buffer_t buffer;
2512         int32_t buf_no;
2513         int32_t r;
2514         int error = 0;
2515
2516         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2517         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2518         if (buffer) {
2519                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2520                 hammer_modify_buffer(buffer);
2521                 r = hammer_alist_alloc_rev(&buffer->alist, count, atblk - blk);
2522                 if (r != HAMMER_ALIST_BLOCK_NONE)
2523                         r += blk;
2524                 *fullp = hammer_alist_isfull(&buffer->alist);
2525                 hammer_rel_buffer(buffer, 0);
2526         } else {
2527                 r = HAMMER_ALIST_BLOCK_NONE;
2528                 *fullp = 1;
2529         }
2530         return(r);
2531 }
2532
2533 static void
2534 buffer_alist_free(void *info, int32_t blk, int32_t radix,
2535                  int32_t base_blk, int32_t count, int32_t *emptyp)
2536 {
2537         hammer_cluster_t cluster = info;
2538         hammer_buffer_t buffer;
2539         int32_t buf_no;
2540         int error = 0;
2541
2542         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2543         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2544         if (buffer) {
2545                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2546                 hammer_modify_buffer(buffer);
2547                 hammer_alist_free(&buffer->alist, base_blk, count);
2548                 *emptyp = hammer_alist_isempty(&buffer->alist);
2549                 hammer_rel_buffer(buffer, 0);
2550         } else {
2551                 *emptyp = 0;
2552         }
2553 }
2554
2555 static int32_t
2556 buffer_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk,
2557                   int flags)
2558 {
2559         hammer_cluster_t cluster = info;
2560         hammer_buffer_t buffer;
2561         int32_t buf_no;
2562         int32_t maxblks;
2563         int error = 0;
2564
2565         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2566         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2567         if (buffer) {
2568                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2569                 switch(buffer->ondisk->head.buf_type) {
2570                 case HAMMER_FSBUF_RECORDS:
2571                         maxblks = HAMMER_RECORD_NODES;
2572                         break;
2573                 case HAMMER_FSBUF_BTREE:
2574                         maxblks = HAMMER_BTREE_NODES;
2575                         break;
2576                 case HAMMER_FSBUF_DATA:
2577                         maxblks = HAMMER_DATA_NODES;
2578                         break;
2579                 default:
2580                         panic("buffer_alist_find: unknown buffer type");
2581                         maxblks = 0;
2582                         break;
2583                 }
2584                 blk = hammer_alist_find(&buffer->alist, atblk - blk, maxblks,
2585                                         flags);
2586                 hammer_rel_buffer(buffer, 0);
2587         } else {
2588                 blk = HAMMER_ALIST_BLOCK_NONE;
2589         }
2590         return(blk);
2591 }
2592
2593 static void
2594 buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2595 {
2596 }
2597
2598 /*
2599  * A-LIST - super-cluster recursion into a cluster and cluster recursion
2600  * into a filesystem buffer.  A-Lists are mostly self-contained entities,
2601  * but callbacks must be installed to recurse from one A-List to another.
2602  *
2603  * Implementing these callbacks allows us to operate a multi-layered A-List
2604  * as a single entity.
2605  */
2606
2607 /*
2608  * This occurs when allocating a cluster via the volume a-list and the
2609  * entry in the volume a-list indicated all-free.  The underlying supercl
2610  * has not yet been initialized.
2611  */
2612 static int
2613 super_alist_init(void *info, int32_t blk, int32_t radix,
2614                  hammer_alloc_state_t state)
2615 {
2616         hammer_volume_t volume = info;
2617         hammer_supercl_t supercl;
2618         int32_t scl_no;
2619         int error = 0;
2620
2621         /*
2622          * Calculate the super-cluster number containing the cluster (blk)
2623          * and obtain the super-cluster buffer.
2624          */
2625         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2626         supercl = hammer_get_supercl(volume, scl_no, &error, state);
2627         if (supercl)
2628                 hammer_rel_supercl(supercl, 0);
2629         return (error);
2630 }
2631
2632 static int
2633 super_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
2634 {
2635         hammer_volume_t volume = info;
2636         hammer_supercl_t supercl;
2637         int32_t scl_no;
2638         int error = 0;
2639
2640         /*
2641          * Calculate the super-cluster number containing the cluster (blk)
2642          * and obtain the super-cluster buffer.
2643          */
2644         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2645         supercl = hammer_get_supercl(volume, scl_no, &error,
2646                                      HAMMER_ASTATE_NONE);
2647         if (supercl) {
2648                 hammer_modify_supercl(supercl);
2649                 error = hammer_alist_recover(&supercl->alist, blk, 0, count);
2650                 /* free block count is returned if >= 0 */
2651                 hammer_rel_supercl(supercl, 0);
2652         } else {
2653                 error = -error;
2654         }
2655         return (error);
2656 }
2657
2658 /*
2659  * This occurs when freeing a cluster via the volume a-list and the
2660  * supercl is now 100% free.  We can destroy the supercl.
2661  *
2662  * What we actually do is just unset the modify bit so it doesn't get
2663  * written out.
2664  */
2665 static int
2666 super_alist_destroy(void *info, int32_t blk, int32_t radix)
2667 {
2668         hammer_volume_t volume = info;
2669         hammer_supercl_t supercl;
2670         int32_t scl_no;
2671         int error = 0;
2672
2673         /*
2674          * Calculate the super-cluster number containing the cluster (blk)
2675          * and obtain the super-cluster buffer.
2676          */
2677         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2678         if (hammer_find_supercl(volume, scl_no)) {
2679                 supercl = hammer_get_supercl(volume, scl_no, &error,
2680                                              HAMMER_ASTATE_FREE);
2681                                              /* XXX */
2682                 if (supercl) {
2683                         hammer_io_clear_modify(&supercl->io);
2684                         hammer_rel_supercl(supercl, 0);
2685                 }
2686         }
2687         return (error);
2688 }
2689
2690 static int
2691 super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2692                       int32_t count, int32_t atblk, int32_t *fullp)
2693 {
2694         hammer_volume_t volume = info;
2695         hammer_supercl_t supercl;
2696         int32_t scl_no;
2697         int32_t r;
2698         int error = 0;
2699
2700         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2701         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2702         if (supercl) {
2703                 hammer_modify_supercl(supercl);
2704                 r = hammer_alist_alloc_fwd(&supercl->alist, count, atblk - blk);
2705                 if (r != HAMMER_ALIST_BLOCK_NONE)
2706                         r += blk;
2707                 *fullp = hammer_alist_isfull(&supercl->alist);
2708                 hammer_rel_supercl(supercl, 0);
2709         } else {
2710                 r = HAMMER_ALIST_BLOCK_NONE;
2711                 *fullp = 1;
2712         }
2713         return(r);
2714 }
2715
2716 static int
2717 super_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2718                       int32_t count, int32_t atblk, int32_t *fullp)
2719 {
2720         hammer_volume_t volume = info;
2721         hammer_supercl_t supercl;
2722         int32_t scl_no;
2723         int32_t r;
2724         int error = 0;
2725
2726         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2727         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2728         if (supercl) {
2729                 hammer_modify_supercl(supercl);
2730                 r = hammer_alist_alloc_rev(&supercl->alist, count, atblk - blk);
2731                 if (r != HAMMER_ALIST_BLOCK_NONE)
2732                         r += blk;
2733                 *fullp = hammer_alist_isfull(&supercl->alist);
2734                 hammer_rel_supercl(supercl, 0);
2735         } else { 
2736                 r = HAMMER_ALIST_BLOCK_NONE;
2737                 *fullp = 1;
2738         }
2739         return(r);
2740 }
2741
2742 static void
2743 super_alist_free(void *info, int32_t blk, int32_t radix,
2744                  int32_t base_blk, int32_t count, int32_t *emptyp)
2745 {
2746         hammer_volume_t volume = info;
2747         hammer_supercl_t supercl;
2748         int32_t scl_no;
2749         int error = 0;
2750
2751         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2752         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2753         if (supercl) {
2754                 hammer_modify_supercl(supercl);
2755                 hammer_alist_free(&supercl->alist, base_blk, count);
2756                 *emptyp = hammer_alist_isempty(&supercl->alist);
2757                 hammer_rel_supercl(supercl, 0);
2758         } else {
2759                 *emptyp = 0;
2760         }
2761 }
2762
2763 static int32_t
2764 super_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk,
2765                   int flags)
2766 {
2767         hammer_volume_t volume = info;
2768         hammer_supercl_t supercl;
2769         int32_t scl_no;
2770         int32_t nclusters;
2771         int error = 0;
2772
2773         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2774         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2775         if (supercl) {
2776                 nclusters = supercl->volume->ondisk->vol_nclusters -
2777                             ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS);
2778                 KKASSERT(nclusters > 0);
2779                 if (nclusters > HAMMER_SCL_MAXCLUSTERS)
2780                         nclusters = HAMMER_SCL_MAXCLUSTERS;
2781                 blk = hammer_alist_find(&supercl->alist, atblk - blk,
2782                                         nclusters, flags);
2783                 hammer_rel_supercl(supercl, 0);
2784         } else {
2785                 blk = HAMMER_ALIST_BLOCK_NONE;
2786         }
2787         return(blk);
2788 }
2789
2790 static void
2791 super_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2792 {
2793 }
2794
2795 void
2796 hammer_init_alist_config(void)
2797 {
2798         hammer_alist_config_t config;
2799
2800         hammer_alist_template(&Buf_alist_config, HAMMER_FSBUF_MAXBLKS,
2801                               1, HAMMER_FSBUF_METAELMS, 0);
2802         hammer_alist_template(&Vol_normal_alist_config, HAMMER_VOL_MAXCLUSTERS,
2803                               1, HAMMER_VOL_METAELMS_1LYR, 0);
2804         hammer_alist_template(&Vol_super_alist_config,
2805                           HAMMER_VOL_MAXSUPERCLUSTERS * HAMMER_SCL_MAXCLUSTERS,
2806                               HAMMER_SCL_MAXCLUSTERS, HAMMER_VOL_METAELMS_2LYR,
2807                               0);
2808         hammer_alist_template(&Supercl_alist_config, HAMMER_VOL_MAXCLUSTERS,
2809                               1, HAMMER_SUPERCL_METAELMS, 0);
2810         hammer_alist_template(&Clu_master_alist_config, HAMMER_CLU_MAXBUFFERS,
2811                               1, HAMMER_CLU_MASTER_METAELMS, 0);
2812         hammer_alist_template(&Clu_slave_alist_config,
2813                               HAMMER_CLU_MAXBUFFERS * HAMMER_FSBUF_MAXBLKS,
2814                               HAMMER_FSBUF_MAXBLKS, HAMMER_CLU_SLAVE_METAELMS,
2815                               1);
2816
2817         config = &Vol_super_alist_config;
2818         config->bl_radix_init = super_alist_init;
2819         config->bl_radix_recover = super_alist_recover;
2820         config->bl_radix_destroy = super_alist_destroy;
2821         config->bl_radix_alloc_fwd = super_alist_alloc_fwd;
2822         config->bl_radix_alloc_rev = super_alist_alloc_rev;
2823         config->bl_radix_free = super_alist_free;
2824         config->bl_radix_find = super_alist_find;
2825         config->bl_radix_print = super_alist_print;
2826
2827         config = &Clu_slave_alist_config;
2828         config->bl_radix_init = buffer_alist_init;
2829         config->bl_radix_recover = buffer_alist_recover;
2830         config->bl_radix_destroy = buffer_alist_destroy;
2831         config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd;
2832         config->bl_radix_alloc_rev = buffer_alist_alloc_rev;
2833         config->bl_radix_free = buffer_alist_free;
2834         config->bl_radix_find = buffer_alist_find;
2835         config->bl_radix_print = buffer_alist_print;
2836 }
2837