1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.25 2008/01/25 05:49:08 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_supercl(hammer_supercl_t supercl,
51                         hammer_alloc_state_t isnew);
52 static int hammer_load_cluster(hammer_cluster_t cluster, int getflags);
53 static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type);
54 static int hammer_load_node(hammer_node_t node);
55 static void alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type,
56                         hammer_alist_t live,
57                         int32_t start, int *errorp,
58                         struct hammer_buffer **bufferp);
59 #if 0
60 static void readhammerbuf(hammer_volume_t vol, void *data,
61                         int64_t offset);
62 static void writehammerbuf(hammer_volume_t vol, const void *data,
63                         int64_t offset);
64 #endif
65 static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no);
66 static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no);
67 static int32_t hammer_alloc_master(hammer_cluster_t cluster, int nblks,
68                         int32_t start, int isfwd);
69 static void hammer_adjust_stats(hammer_cluster_t cluster,
70                         u_int64_t buf_type, int nblks);
71
72 struct hammer_alist_config Buf_alist_config;
73 struct hammer_alist_config Vol_normal_alist_config;
74 struct hammer_alist_config Vol_super_alist_config;
75 struct hammer_alist_config Supercl_alist_config;
76 struct hammer_alist_config Clu_master_alist_config;
77 struct hammer_alist_config Clu_slave_alist_config;
78
79 /*
80  * Red-Black tree support for various structures
81  */
82 static int
83 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
84 {
85         if (ip1->obj_id < ip2->obj_id)
86                 return(-1);
87         if (ip1->obj_id > ip2->obj_id)
88                 return(1);
89         if (ip1->obj_asof < ip2->obj_asof)
90                 return(-1);
91         if (ip1->obj_asof > ip2->obj_asof)
92                 return(1);
93         return(0);
94 }
95
96 static int
97 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
98 {
99         if (info->obj_id < ip->obj_id)
100                 return(-1);
101         if (info->obj_id > ip->obj_id)
102                 return(1);
103         if (info->obj_asof < ip->obj_asof)
104                 return(-1);
105         if (info->obj_asof > ip->obj_asof)
106                 return(1);
107         return(0);
108 }
109
110 static int
111 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
112 {
113         if (vol1->vol_no < vol2->vol_no)
114                 return(-1);
115         if (vol1->vol_no > vol2->vol_no)
116                 return(1);
117         return(0);
118 }
119
120 static int
121 hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2)
122 {
123         if (cl1->scl_no < cl2->scl_no)
124                 return(-1);
125         if (cl1->scl_no > cl2->scl_no)
126                 return(1);
127         return(0);
128 }
129
130 static int
131 hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2)
132 {
133         if (cl1->clu_no < cl2->clu_no)
134                 return(-1);
135         if (cl1->clu_no > cl2->clu_no)
136                 return(1);
137         return(0);
138 }
139
140 static int
141 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
142 {
143         if (buf1->buf_no < buf2->buf_no)
144                 return(-1);
145         if (buf1->buf_no > buf2->buf_no)
146                 return(1);
147         return(0);
148 }
149
150 static int
151 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
152 {
153         if (node1->node_offset < node2->node_offset)
154                 return(-1);
155         if (node1->node_offset > node2->node_offset)
156                 return(1);
157         return(0);
158 }
159
160 /*
161  * Note: The lookup function for hammer_ino_rb_tree winds up being named
162  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
163  * functions are normal, e.g. hammer_clu_rb_tree_RB_LOOKUP(root, clu_no).
164  */
165 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
166 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
167                 hammer_inode_info_cmp, hammer_inode_info_t);
168 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
169              hammer_vol_rb_compare, int32_t, vol_no);
170 RB_GENERATE2(hammer_scl_rb_tree, hammer_supercl, rb_node,
171              hammer_scl_rb_compare, int32_t, scl_no);
172 RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node,
173              hammer_clu_rb_compare, int32_t, clu_no);
174 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
175              hammer_buf_rb_compare, int32_t, buf_no);
176 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
177              hammer_nod_rb_compare, int32_t, node_offset);
178
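/*
 * Illustrative sketch (not part of the original file, wrapped in #if 0):
 * how the generated lookup functions above are typically invoked.  The
 * hammer_ino_rb_tree uses the INFO variant because its key is the compound
 * (obj_id, obj_asof) carried in a hammer_inode_info structure; the other
 * trees use the plain RB_LOOKUP() form keyed on a single scalar.  The
 * rb_inos_root field name below is an assumption used only for
 * illustration.
 */
#if 0
static hammer_inode_t
example_lookup_inode(struct hammer_mount *hmp, hammer_inode_info_t info)
{
	/* compound-key lookup generated by RB_GENERATE_XLOOKUP() */
	return (hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, info));
}

static hammer_cluster_t
example_lookup_cluster(hammer_volume_t volume, int32_t clu_no)
{
	/* scalar-key lookup generated by RB_GENERATE2() */
	return (RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no));
}
#endif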
179 /************************************************************************
180  *                              VOLUMES                                 *
181  ************************************************************************
182  *
183  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
184  * code on failure.  Volumes must be loaded at mount time; get_volume() will
185  * not load a new volume.
186  *
187  * Calls made to hammer_load_volume() are single-threaded.
188  */
189 int
190 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
191 {
192         struct mount *mp;
193         hammer_volume_t volume;
194         struct hammer_volume_ondisk *ondisk;
195         struct nlookupdata nd;
196         struct buf *bp = NULL;
197         int error;
198         int ronly;
199
200         mp = hmp->mp;
201         ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
202
203         /*
204          * Allocate a volume structure
205          */
206         ++hammer_count_volumes;
207         volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
208         volume->vol_name = kstrdup(volname, M_HAMMER);
209         volume->hmp = hmp;
210         hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
211         volume->io.offset = 0LL;
212
213         /*
214          * Get the device vnode
215          */
216         error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
217         if (error == 0)
218                 error = nlookup(&nd);
219         if (error == 0)
220                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
221         nlookup_done(&nd);
222         if (error == 0) {
223                 vn_isdisk(volume->devvp, &error);
224         }
225         if (error == 0) {
226                 vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
227                 error = VOP_OPEN(volume->devvp, (ronly ? FREAD : FREAD|FWRITE),
228                                  FSCRED, NULL);
229                 vn_unlock(volume->devvp);
230         }
231         if (error) {
232                 hammer_free_volume(volume);
233                 return(error);
234         }
235
236         /*
237          * Extract the volume number from the volume header and do various
238          * sanity checks.
239          */
240         error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
241         if (error)
242                 goto late_failure;
243         ondisk = (void *)bp->b_data;
244         if (ondisk->head.buf_type != HAMMER_FSBUF_VOLUME) {
245                 kprintf("hammer_mount: volume %s has an invalid header\n",
246                         volume->vol_name);
247                 error = EFTYPE;
248                 goto late_failure;
249         }
250         volume->vol_no = ondisk->vol_no;
251         volume->cluster_base = ondisk->vol_clo_beg;
252         volume->vol_clsize = ondisk->vol_clsize;
253         volume->vol_flags = ondisk->vol_flags;
254         volume->nblocks = ondisk->vol_nblocks; 
255         RB_INIT(&volume->rb_clus_root);
256         RB_INIT(&volume->rb_scls_root);
257
258         hmp->mp->mnt_stat.f_blocks += volume->nblocks;
259
260         if (RB_EMPTY(&hmp->rb_vols_root)) {
261                 hmp->fsid = ondisk->vol_fsid;
262         } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
263                 kprintf("hammer_mount: volume %s's fsid does not match "
264                         "other volumes\n", volume->vol_name);
265                 error = EFTYPE;
266                 goto late_failure;
267         }
268
269         /*
270          * Insert the volume structure into the red-black tree.
271          */
272         if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
273                 kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
274                         volume->vol_name, volume->vol_no);
275                 error = EEXIST;
276         }
277
278         /*
279          * Set the root volume and load the root cluster.  HAMMER special
280          * cases rootvol and rootcl and will not deallocate the structures.
281          * We do not hold a ref because this would prevent related I/O
282          * from being flushed.
283          */
284         if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
285                 hmp->rootvol = volume;
286                 if (bp) {
287                         brelse(bp);
288                         bp = NULL;
289                 }
290                 hammer_ref_volume(volume);
291                 hmp->rootcl = hammer_get_cluster(volume,
292                                                  ondisk->vol0_root_clu_no,
293                                                  &error, GET_CLUSTER_NORECOVER);
294                 hammer_rel_cluster(hmp->rootcl, 0);
295                 hammer_rel_volume(volume, 0);
296                 hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
297         }
298 late_failure:
299         if (bp)
300                 brelse(bp);
301         if (error) {
302                 /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
303                 VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
304                 hammer_free_volume(volume);
305         }
306         return (error);
307 }
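
/*
 * Illustrative sketch (not part of the original file, wrapped in #if 0):
 * hammer_install_volume() is meant to be called once per volume path at
 * mount time, before any hammer_get_volume() calls.  The path array and
 * count below are hypothetical; the real iteration lives in the VFS
 * mount code.
 */
#if 0
static int
example_install_volumes(struct hammer_mount *hmp, const char **paths,
			int npaths)
{
	int error = 0;
	int i;

	for (i = 0; error == 0 && i < npaths; ++i)
		error = hammer_install_volume(hmp, paths[i]);
	return (error);
}
#endif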
308
309 /*
310  * Unload and free a HAMMER volume.  Must return >= 0 for the scan to
311  * continue, so -1 is returned on failure.
312  */
313 int
314 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
315 {
316         struct hammer_mount *hmp = volume->hmp;
317         hammer_cluster_t rootcl;
318         int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
319
320         /*
321          * Sync clusters, sync volume
322          */
323
324         hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
325
326         /*
327          * Clean up the root cluster, which is held unlocked in the root
328          * volume.
329          */
330         if (hmp->rootvol == volume) {
331                 if ((rootcl = hmp->rootcl) != NULL)
332                         hmp->rootcl = NULL;
333                 hmp->rootvol = NULL;
334         }
335
336         /*
337          * Unload clusters and super-clusters.  Unloading a super-cluster
338          * also unloads related clusters, but the filesystem may not be
339          * using super-clusters so unload clusters anyway.
340          */
341         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
342                         hammer_unload_cluster, NULL);
343         RB_SCAN(hammer_scl_rb_tree, &volume->rb_scls_root, NULL,
344                         hammer_unload_supercl, NULL);
345         hammer_io_waitdep(&volume->io);
346
347         /*
348          * Release our buffer and flush anything left in the buffer cache.
349          */
350         hammer_io_release(&volume->io, 2);
351
352         /*
353          * There should be no references on the volume, no clusters, and
354          * no super-clusters.
355          */
356         KKASSERT(volume->io.lock.refs == 0);
357         KKASSERT(RB_EMPTY(&volume->rb_clus_root));
358         KKASSERT(RB_EMPTY(&volume->rb_scls_root));
359
360         volume->ondisk = NULL;
361         if (volume->devvp) {
362                 if (ronly) {
363                         vinvalbuf(volume->devvp, 0, 0, 0);
364                         VOP_CLOSE(volume->devvp, FREAD);
365                 } else {
366                         vinvalbuf(volume->devvp, V_SAVE, 0, 0);
367                         VOP_CLOSE(volume->devvp, FREAD|FWRITE);
368                 }
369         }
370
371         /*
372          * Destroy the structure
373          */
374         RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
375         hammer_free_volume(volume);
376         return(0);
377 }
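
/*
 * Illustrative sketch (not part of the original file, wrapped in #if 0):
 * tearing down every installed volume at unmount time by scanning the
 * mount's volume tree, mirroring how clusters and super-clusters are
 * unloaded above.  Whether the real unmount path uses exactly this scan
 * is an assumption.
 */
#if 0
static void
example_unload_volumes(struct hammer_mount *hmp)
{
	RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
		hammer_unload_volume, NULL);
}
#endif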
378
379 static
380 void
381 hammer_free_volume(hammer_volume_t volume)
382 {
383         if (volume->vol_name) {
384                 kfree(volume->vol_name, M_HAMMER);
385                 volume->vol_name = NULL;
386         }
387         if (volume->devvp) {
388                 vrele(volume->devvp);
389                 volume->devvp = NULL;
390         }
391         --hammer_count_volumes;
392         kfree(volume, M_HAMMER);
393 }
394
395 /*
396  * Get a HAMMER volume.  The volume must already exist.
397  */
398 hammer_volume_t
399 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
400 {
401         struct hammer_volume *volume;
402
403         /*
404          * Locate the volume structure
405          */
406         volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
407         if (volume == NULL) {
408                 *errorp = ENOENT;
409                 return(NULL);
410         }
411         hammer_ref(&volume->io.lock);
412
413         /*
414          * Deal with on-disk info
415          */
416         if (volume->ondisk == NULL || volume->io.loading) {
417                 *errorp = hammer_load_volume(volume);
418                 if (*errorp) {
419                         hammer_rel_volume(volume, 1);
420                         volume = NULL;
421                 }
422         } else {
423                 *errorp = 0;
424         }
425         return(volume);
426 }
427
428 int
429 hammer_ref_volume(hammer_volume_t volume)
430 {
431         int error;
432
433         hammer_ref(&volume->io.lock);
434
435         /*
436          * Deal with on-disk info
437          */
438         if (volume->ondisk == NULL || volume->io.loading) {
439                 error = hammer_load_volume(volume);
440                 if (error)
441                         hammer_rel_volume(volume, 1);
442         } else {
443                 error = 0;
444         }
445         return (error);
446 }
447
448 hammer_volume_t
449 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
450 {
451         hammer_volume_t volume;
452
453         volume = hmp->rootvol;
454         KKASSERT(volume != NULL);
455         hammer_ref(&volume->io.lock);
456
457         /*
458          * Deal with on-disk info
459          */
460         if (volume->ondisk == NULL || volume->io.loading) {
461                 *errorp = hammer_load_volume(volume);
462                 if (*errorp) {
463                         hammer_rel_volume(volume, 1);
464                         volume = NULL;
465                 }
466         } else {
467                 *errorp = 0;
468         }
469         return (volume);
470 }
471
472 /*
473  * Load a volume's on-disk information.  The volume must be referenced and
474  * not locked.  We temporarily acquire an exclusive lock to interlock
475  * against releases or multiple gets.
476  */
477 static int
478 hammer_load_volume(hammer_volume_t volume)
479 {
480         struct hammer_volume_ondisk *ondisk;
481         int error;
482
483         hammer_lock_ex(&volume->io.lock);
484         KKASSERT(volume->io.loading == 0);
485         volume->io.loading = 1;
486
487         if (volume->ondisk == NULL) {
488                 error = hammer_io_read(volume->devvp, &volume->io);
489                 if (error) {
490                         volume->io.loading = 0;
491                         hammer_unlock(&volume->io.lock);
492                         return (error);
493                 }
494                 volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
495
496                 /*
497                  * Configure the volume's A-lists.  These are used to
498                  * allocate clusters.
499                  */
500                 if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
501                         volume->alist.config = &Vol_super_alist_config;
502                         volume->alist.meta = ondisk->vol_almeta.super;
503                         volume->alist.info = volume;
504                 } else {
505                         volume->alist.config = &Vol_normal_alist_config;
506                         volume->alist.meta = ondisk->vol_almeta.normal;
507                         volume->alist.info = NULL;
508                 }
509         } else {
510                 error = 0;
511         }
512         volume->io.loading = 0;
513         hammer_unlock(&volume->io.lock);
514         return (error);
515 }
516
517 /*
518  * Release a volume.  Call hammer_io_release on the last reference.  We have
519  * to acquire an exclusive lock to interlock against volume->ondisk tests
520  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
521  * lock to be held.
522  *
523  * Volumes are not unloaded from memory during normal operation.
524  */
525 void
526 hammer_rel_volume(hammer_volume_t volume, int flush)
527 {
528         if (volume->io.lock.refs == 1) {
529                 hammer_lock_ex(&volume->io.lock);
530                 if (volume->io.lock.refs == 1) {
531                         volume->ondisk = NULL;
532                         hammer_io_release(&volume->io, flush);
533                 } else if (flush) {
534                         hammer_io_flush(&volume->io);
535                 }
536                 hammer_unlock(&volume->io.lock);
537         }
538         hammer_unref(&volume->io.lock);
539 }
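
/*
 * Illustrative sketch (not part of the original file, wrapped in #if 0):
 * the usual reference discipline for volumes.  hammer_get_volume() returns
 * a referenced volume with its ondisk data loaded and the caller balances
 * it with hammer_rel_volume(); a flush argument of 0 does not force the
 * backing buffer out.  The vol_no argument is hypothetical.
 */
#if 0
static void
example_use_volume(struct hammer_mount *hmp, int32_t vol_no)
{
	hammer_volume_t volume;
	int error;

	volume = hammer_get_volume(hmp, vol_no, &error);
	if (volume) {
		/* ... examine volume->ondisk ... */
		hammer_rel_volume(volume, 0);
	}
}
#endif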
540
541 /************************************************************************
542  *                              SUPER-CLUSTERS                          *
543  ************************************************************************
544  *
545  * Manage super-clusters.  Note that a supercl holds a reference to its
546  * associated volume.
547  */
548 static int
549 hammer_find_supercl(hammer_volume_t volume, int32_t scl_no)
550 {
551         if (RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no))
552                 return(1);
553         return(0);
554 }
555
556 hammer_supercl_t
557 hammer_get_supercl(hammer_volume_t volume, int32_t scl_no,
558                    int *errorp, hammer_alloc_state_t isnew)
559 {
560         hammer_supercl_t supercl;
561
562         /*
563          * Locate and lock the super-cluster structure, creating one
564          * if necessary.
565          */
566 again:
567         supercl = RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no);
568         if (supercl == NULL) {
569                 ++hammer_count_supercls;
570                 supercl = kmalloc(sizeof(*supercl), M_HAMMER, M_WAITOK|M_ZERO);
571                 supercl->scl_no = scl_no;
572                 supercl->volume = volume;
573                 hammer_io_init(&supercl->io, HAMMER_STRUCTURE_SUPERCL);
574                 supercl->io.offset = calculate_supercl_offset(volume, scl_no);
575                 hammer_ref(&supercl->io.lock);
576
577                 /*
578                  * Insert the super-cluster into the RB tree and handle
579                  * late collisions.
580                  */
581                 if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) {
582                         hammer_unref(&supercl->io.lock);
583                         --hammer_count_supercls;
584                         kfree(supercl, M_HAMMER);
585                         goto again;
586                 }
587                 hammer_ref(&volume->io.lock);
588         } else {
589                 hammer_ref(&supercl->io.lock);
590         }
591
592         /*
593          * Deal with on-disk info
594          */
595         if (supercl->ondisk == NULL || isnew || supercl->io.loading) {
596                 *errorp = hammer_load_supercl(supercl, isnew);
597                 if (*errorp) {
598                         hammer_rel_supercl(supercl, 1);
599                         supercl = NULL;
600                 }
601         } else {
602                 *errorp = 0;
603         }
604         return(supercl);
605 }
606
607 static int
608 hammer_load_supercl(hammer_supercl_t supercl, hammer_alloc_state_t isnew)
609 {
610         struct hammer_supercl_ondisk *ondisk;
611         hammer_volume_t volume = supercl->volume;
612         int error;
613         int64_t nclusters;
614
615         hammer_lock_ex(&supercl->io.lock);
616         KKASSERT(supercl->io.loading == 0);
617         supercl->io.loading = 1;
618
619         if (supercl->ondisk == NULL) {
620                 if (isnew)
621                         error = hammer_io_new(volume->devvp, &supercl->io);
622                 else
623                         error = hammer_io_read(volume->devvp, &supercl->io);
624                 if (error) {
625                         supercl->io.loading = 0;
626                         hammer_unlock(&supercl->io.lock);
627                         return (error);
628                 }
629                 supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data;
630
631                 supercl->alist.config = &Supercl_alist_config;
632                 supercl->alist.meta = ondisk->scl_meta;
633                 supercl->alist.info = NULL;
634         } else if (isnew) {
635                 error = hammer_io_new(volume->devvp, &supercl->io);
636         } else {
637                 error = 0;
638         }
639         if (error == 0 && isnew) {
640                 /*
641                  * If this is a new super-cluster we have to initialize
642                  * various ondisk structural elements.  The caller is
643                  * responsible for the remainder.
644                  */
645                 struct hammer_alist_live dummy;
646
647                 hammer_modify_supercl(supercl);
648
649                 ondisk = supercl->ondisk;
650                 dummy.config = &Buf_alist_config;
651                 dummy.meta = ondisk->head.buf_almeta;
652                 dummy.info = NULL;
653                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL);
654
655                 nclusters = volume->ondisk->vol_nclusters -
656                             ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS);
657                 KKASSERT(nclusters > 0);
658                 if (nclusters > HAMMER_SCL_MAXCLUSTERS)
659                         nclusters = HAMMER_SCL_MAXCLUSTERS;
660                 hammer_alist_init(&supercl->alist, 0, (int32_t)nclusters,
661                                   isnew);
662         }
663         supercl->io.loading = 0;
664         hammer_unlock(&supercl->io.lock);
665         return (error);
666 }
667
668 /*
669  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
670  */
671 int
672 hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused)
673 {
674         KKASSERT(supercl->io.lock.refs == 0);
675         hammer_ref(&supercl->io.lock);
676         hammer_rel_supercl(supercl, 2);
677         return(0);
678 }
679
680 /*
681  * Release a super-cluster.  We have to deal with several places where
682  * another thread can ref the super-cluster.
683  *
684  * Only destroy the structure itself if the related buffer cache buffer
685  * was disassociated from it.  This ties the management of the structure
686  * to the buffer cache subsystem.
687  */
688 void
689 hammer_rel_supercl(hammer_supercl_t supercl, int flush)
690 {
691         hammer_volume_t volume;
692
693         if (supercl->io.lock.refs == 1) {
694                 hammer_lock_ex(&supercl->io.lock);
695                 if (supercl->io.lock.refs == 1) {
696                         hammer_io_release(&supercl->io, flush);
697                         if (supercl->io.bp == NULL &&
698                             supercl->io.lock.refs == 1) {
699                                 volume = supercl->volume;
700                                 RB_REMOVE(hammer_scl_rb_tree,
701                                           &volume->rb_scls_root, supercl);
702                                 supercl->volume = NULL; /* sanity */
703                                 --hammer_count_supercls;
704                                 kfree(supercl, M_HAMMER);
705                                 hammer_rel_volume(volume, 0);
706                                 return;
707                         }
708                 } else if (flush) {
709                         hammer_io_flush(&supercl->io);
710                 }
711                 hammer_unlock(&supercl->io.lock);
712         }
713         hammer_unref(&supercl->io.lock);
714 }
715
716 /************************************************************************
717  *                              CLUSTERS                                *
718  ************************************************************************
719  *
720  */
721 hammer_cluster_t
722 hammer_get_cluster(hammer_volume_t volume, int32_t clu_no,
723                    int *errorp, int getflags)
724 {
725         hammer_cluster_t cluster;
726
727 again:
728         cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no);
729         if (cluster == NULL) {
730                 ++hammer_count_clusters;
731                 cluster = kmalloc(sizeof(*cluster), M_HAMMER, M_WAITOK|M_ZERO);
732                 cluster->clu_no = clu_no;
733                 cluster->volume = volume;
734                 RB_INIT(&cluster->rb_bufs_root);
735                 RB_INIT(&cluster->rb_nods_root);
736                 hammer_io_init(&cluster->io, HAMMER_STRUCTURE_CLUSTER);
737                 cluster->io.offset = calculate_cluster_offset(volume, clu_no);
738                 hammer_ref(&cluster->io.lock);
739                 /* NOTE: cluster->io.validated expected to be 0 */
740
741                 /*
742                  * Insert the cluster into the RB tree and handle late
743                  * collisions.
744                  */
745                 if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) {
746                         hammer_unref(&cluster->io.lock);
747                         --hammer_count_clusters;
748                         kfree(cluster, M_HAMMER);
749                         goto again;
750                 }
751                 hammer_ref(&volume->io.lock);
752         } else {
753                 hammer_ref(&cluster->io.lock);
754         }
755
756         /*
757          * Deal with on-disk info
758          */
759         if (cluster->ondisk == NULL || getflags || cluster->io.validated == 0) {
760                 *errorp = hammer_load_cluster(cluster, getflags);
761                 if (*errorp) {
762                         hammer_rel_cluster(cluster, 1);
763                         cluster = NULL;
764                 }
765         } else {
766                 *errorp = 0;
767         }
768         return (cluster);
769 }
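
/*
 * Illustrative sketch (not part of the original file, wrapped in #if 0):
 * obtaining a cluster.  A getflags of 0 reads an existing cluster and lets
 * the automatic recovery logic in hammer_load_cluster() run; passing
 * GET_CLUSTER_NEW formats the cluster header and root B-Tree node instead
 * of reading them, and GET_CLUSTER_NORECOVER suppresses recovery (as the
 * root-cluster load in hammer_install_volume() does).  The caller below
 * would still be responsible for the remaining ondisk fields of a new
 * cluster.
 */
#if 0
static hammer_cluster_t
example_format_cluster(hammer_volume_t volume, int32_t clu_no, int *errorp)
{
	return (hammer_get_cluster(volume, clu_no, errorp, GET_CLUSTER_NEW));
}
#endif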
770
771 hammer_cluster_t
772 hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp)
773 {
774         hammer_cluster_t cluster;
775
776         cluster = hmp->rootcl;
777         KKASSERT(cluster != NULL);
778         hammer_ref(&cluster->io.lock);
779
780         /*
781          * Deal with on-disk info
782          */
783         if (cluster->ondisk == NULL || cluster->io.validated == 0) {
784                 *errorp = hammer_load_cluster(cluster, 0);
785                 if (*errorp) {
786                         hammer_rel_cluster(cluster, 1);
787                         cluster = NULL;
788                 }
789         } else {
790                 *errorp = 0;
791         }
792         return (cluster);
793 }
794
795 static
796 int
797 hammer_load_cluster(hammer_cluster_t cluster, int getflags)
798 {
799         hammer_volume_t volume = cluster->volume;
800         struct hammer_cluster_ondisk *ondisk;
801         int error;
802
803         hammer_lock_ex(&cluster->io.lock);
804         KKASSERT(cluster->io.loading == 0);
805         cluster->io.loading = 1;
806
807         if (cluster->ondisk == NULL) {
808                 KKASSERT(TAILQ_EMPTY(&cluster->io.deplist));
809
810                 /*
811                  * Unmodified buffers may be present, indicating that we
812                  * had already validated the cluster even though we no longer
813                  * have its ondisk info.
814                  */
815                 if (!RB_EMPTY(&cluster->rb_bufs_root))
816                         KKASSERT(cluster->io.validated);
817                 if (getflags & GET_CLUSTER_NEW)
818                         error = hammer_io_new(volume->devvp, &cluster->io);
819                 else
820                         error = hammer_io_read(volume->devvp, &cluster->io);
821                 if (error) {
822                         cluster->io.loading = 0;
823                         hammer_unlock(&cluster->io.lock);
824                         return (error);
825                 }
826                 cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data;
827
828                 cluster->alist_master.config = &Clu_master_alist_config;
829                 cluster->alist_master.meta = ondisk->clu_master_meta;
830                 cluster->alist_btree.config = &Clu_slave_alist_config;
831                 cluster->alist_btree.meta = ondisk->clu_btree_meta;
832                 cluster->alist_btree.info = cluster;
833                 cluster->alist_record.config = &Clu_slave_alist_config;
834                 cluster->alist_record.meta = ondisk->clu_record_meta;
835                 cluster->alist_record.info = cluster;
836                 cluster->alist_mdata.config = &Clu_slave_alist_config;
837                 cluster->alist_mdata.meta = ondisk->clu_mdata_meta;
838                 cluster->alist_mdata.info = cluster;
839
840                 if ((getflags & GET_CLUSTER_NEW) == 0) {
841                         /*
842                          * Load cluster range info for easy access
843                          */
844                         cluster->clu_btree_beg = ondisk->clu_btree_beg;
845                         cluster->clu_btree_end = ondisk->clu_btree_end;
846                 }
847         } else if (getflags & GET_CLUSTER_NEW) {
848                 error = hammer_io_new(volume->devvp, &cluster->io);
849         } else {
850                 error = 0;
851         }
852         if (error == 0 && (getflags & GET_CLUSTER_NEW)) {
853                 /*
854                  * If this is a new cluster we have to initialize
855                  * various ondisk structural elements.  The caller is
856                  * responsible for the remainder.
857                  */
858                 struct hammer_alist_live dummy;
859                 hammer_node_t croot;
860                 hammer_volume_ondisk_t voldisk;
861                 int32_t nbuffers;
862
863                 cluster->flags &= ~HAMMER_CLUSTER_DELETED;
864
865                 hammer_modify_cluster(cluster);
866                 ondisk = cluster->ondisk;
867                 voldisk = volume->ondisk;
868
869                 dummy.config = &Buf_alist_config;
870                 dummy.meta = ondisk->head.buf_almeta;
871                 dummy.info = NULL;
872                 hammer_initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER);
873
874                 ondisk->vol_fsid = voldisk->vol_fsid;
875                 ondisk->vol_fstype = voldisk->vol_fstype;
876                 ondisk->clu_gen = 1;
877                 ondisk->clu_id = 0;     /* XXX */
878                 ondisk->clu_no = cluster->clu_no;
879                 ondisk->clu_flags = 0;
880                 ondisk->clu_start = HAMMER_BUFSIZE;
881                 ondisk->synchronized_rec_id = 1; /* XXX timestamp */
882                 KKASSERT(voldisk->vol_clo_end > cluster->io.offset);
883                 if (voldisk->vol_clo_end - cluster->io.offset >
884                     voldisk->vol_clsize) {
885                         ondisk->clu_limit = voldisk->vol_clsize;
886                 } else {
887                         ondisk->clu_limit = (int32_t)(voldisk->vol_clo_end -
888                                                       cluster->io.offset);
889                 }
890                 nbuffers = ondisk->clu_limit / HAMMER_BUFSIZE;
891                 hammer_alist_init(&cluster->alist_master, 1, nbuffers - 1,
892                                   HAMMER_ASTATE_FREE);
893                 hammer_alist_init(&cluster->alist_btree,
894                                   HAMMER_FSBUF_MAXBLKS,
895                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
896                                   HAMMER_ASTATE_ALLOC);
897                 hammer_alist_init(&cluster->alist_record,
898                                   HAMMER_FSBUF_MAXBLKS,
899                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
900                                   HAMMER_ASTATE_ALLOC);
901                 hammer_alist_init(&cluster->alist_mdata,
902                                   HAMMER_FSBUF_MAXBLKS,
903                                   (nbuffers - 1) * HAMMER_FSBUF_MAXBLKS,
904                                   HAMMER_ASTATE_ALLOC);
905
906                 ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
907                 ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
908                 ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;
909
910                 /*
911                  * Initialize the B-Tree.  We don't know what the caller
912                  * intends to do with the cluster so make sure it causes
913                  * an assertion if the caller makes no changes.
914                  */
915                 ondisk->clu_btree_parent_vol_no = -2;
916                 ondisk->clu_btree_parent_clu_no = -2;
917                 ondisk->clu_btree_parent_offset = -2;
918                 ondisk->clu_btree_parent_clu_gen = -2;
919
920                 croot = hammer_alloc_btree(cluster, &error);
921                 if (error == 0) {
922                         hammer_modify_node(croot);
923                         bzero(croot->ondisk, sizeof(*croot->ondisk));
924                         croot->ondisk->count = 0;
925                         croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
926                         hammer_modify_cluster(cluster);
927                         ondisk->clu_btree_root = croot->node_offset;
928                         hammer_rel_node(croot);
929                 }
930
931                 /*
932                  * We just formatted this cluster, don't try to recover it!
933                  */
934                 cluster->io.validated = 1;
935         }
936
937         /*
938  * If no error occurred, handle automatic cluster recovery unless
939          * the NORECOVER flag is passed (prevents recovery recursions) or
940          * the cluster has been flagged for deletion (prevents an attempt
941          * to recover a cluster which is no longer hooked into the tree).
942          *
943          * Setting hammer_debug_recover to 1 will force recovery on load
944          * whether or not the cluster is marked open.
945          *
946          * Setting hammer_debug_recover to -1 will force NO recovery
947          * regardless of state.
948          *
949          * io.validated can only be cleared if the buffer RB list is empty,
950          * preventing us from trying to recover an actively referenced
951          * cluster (which would blow the filesystem to smithereens).
952          */
953         if (error == 0 && cluster->io.validated == 0) {
954                 if ((getflags & GET_CLUSTER_NORECOVER) == 0 &&
955                     (cluster->flags & HAMMER_CLUSTER_DELETED) == 0) {
956                         if ((cluster->ondisk->clu_flags & HAMMER_CLUF_OPEN) ||
957                             hammer_debug_recover > 0) {
958                                 if (hammer_debug_recover >= 0)
959                                         hammer_recover(cluster);
960                         }
961                         cluster->io.validated = 1;
962                 } else if ((cluster->ondisk->clu_flags & HAMMER_CLUF_OPEN)==0) {
963                         cluster->io.validated = 1;
964                 }
965         }
966         cluster->io.loading = 0;
967         hammer_unlock(&cluster->io.lock);
968         return (error);
969 }
970
971 /*
972  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
973  */
974 int
975 hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused)
976 {
977         hammer_ref(&cluster->io.lock);
978         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
979                 hammer_unload_buffer, NULL);
980         hammer_io_waitdep(&cluster->io);
981         KKASSERT(cluster->io.lock.refs == 1);
982         hammer_rel_cluster(cluster, 2);
983         return(0);
984 }
985
986 /*
987  * Allocate a record id from the cluster.  The synchronized record id is
988  * used during cluster recovery.  NOTE: The cluster header is not written
989  * out until all related records have been written out.
990  */
991 u_int64_t
992 hammer_alloc_recid(hammer_cluster_t cluster)
993 {
994         u_int64_t recid;
995
996         hammer_modify_cluster(cluster);
997         recid = cluster->ondisk->synchronized_rec_id++;
998         return(recid);
999 }
1000
1001 #if 0
1002 void
1003 hammer_update_syncid(hammer_cluster_t cluster, hammer_tid_t tid)
1004 {
1005         hammer_modify_cluster(cluster);
1006         if (cluster->ondisk->synchronized_tid < tid)
1007                 cluster->ondisk->synchronized_tid = tid;
1008 }
1009 #endif
1010
1011 /*
1012  * Reference a cluster that is either already referenced or is reachable via
1013  * a specially handled pointer (aka rootcl).
1014  */
1015 int
1016 hammer_ref_cluster(hammer_cluster_t cluster)
1017 {
1018         int error;
1019
1020         KKASSERT(cluster != NULL);
1021         hammer_ref(&cluster->io.lock);
1022
1023         /*
1024          * Deal with on-disk info
1025          */
1026         if (cluster->ondisk == NULL || cluster->io.validated == 0) {
1027                 error = hammer_load_cluster(cluster, 0);
1028                 if (error)
1029                         hammer_rel_cluster(cluster, 1);
1030         } else {
1031                 error = 0;
1032         }
1033         return(error);
1034 }
1035
1036 /*
1037  * Release a cluster.  We have to deal with several places where
1038  * another thread can ref the cluster.
1039  *
1040  * Only destroy the structure itself if we no longer have an IO or any
1041  * hammer buffers associated with the structure.
1042  */
1043 void
1044 hammer_rel_cluster(hammer_cluster_t cluster, int flush)
1045 {
1046         hammer_volume_t volume;
1047
1048         /*
1049          * Free a deleted cluster back to the pool when its last
1050          * active reference is released.  This prevents the cluster
1051          * from being reallocated until all its prior references go away.
1052          *
1053  * XXX implement a discard dependency list which holds references
1054          * on clusters, preventing their deletion, until their parent cluster
1055          * has been flushed to disk.
1056          */
1057         if (cluster->io.lock.refs == 1) {
1058                 if (cluster->flags & HAMMER_CLUSTER_DELETED) {
1059                         cluster->flags &= ~HAMMER_CLUSTER_DELETED;
1060                         kprintf("FREE CLUSTER %d", cluster->clu_no);
1061                         if (cluster->ondisk->stat_records) {
1062                                 struct hammer_sync_info info;
1063
1064                                 info.error = 0;
1065                                 info.waitfor = MNT_WAIT;
1066                                 kprintf(" (still has %d records!)\n",
1067                                         cluster->ondisk->stat_records);
1068                                 Debugger("continue to recover cluster");
1069                                 hammer_recover(cluster);
1070                                 Debugger("continue to sync cluster");
1071                                 hammer_sync_cluster(cluster, &info);
1072                                 Debugger("now debug it");
1073                         }
1074                         kprintf("\n");
1075
1076                         /*
1077                          * Clean up any statistics we left hanging in the
1078                          * cluster.
1079                          */
1080                         hammer_adjust_stats(cluster, HAMMER_FSBUF_BTREE,
1081                                             -cluster->ondisk->stat_idx_bufs);
1082                         hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA,
1083                                             -cluster->ondisk->stat_data_bufs);
1084                         hammer_adjust_stats(cluster, HAMMER_FSBUF_RECORDS,
1085                                             -cluster->ondisk->stat_rec_bufs);
1086                         /*
1087                          * hammer_discard_cluster(cluster) - throw away
1088                          * dirty backing store, recurse to any underlying
1089                          * buffers. XXX
1090                          */
1091                         hammer_free_cluster(cluster);
1092                 }
1093         }
1094
1095         if (cluster->io.lock.refs == 1) {
1096                 hammer_lock_ex(&cluster->io.lock);
1097                 if (cluster->io.lock.refs == 1) {
1098                         /*
1099                          * Release the I/O.  If we or the kernel wants to
1100                          * flush, this will release the bp.  Otherwise the
1101                          * bp may be written and flushed passively by the
1102                          * kernel later on.
1103                          */
1104                         hammer_io_release(&cluster->io, flush);
1105
1106                         /*
1107                          * Final cleanup
1108                          */
1109                         if (cluster != cluster->volume->hmp->rootcl &&
1110                             cluster->io.bp == NULL &&
1111                             cluster->io.lock.refs == 1 &&
1112                             RB_EMPTY(&cluster->rb_bufs_root)) {
1113                                 KKASSERT(RB_EMPTY(&cluster->rb_nods_root));
1114                                 volume = cluster->volume;
1115                                 RB_REMOVE(hammer_clu_rb_tree,
1116                                           &volume->rb_clus_root, cluster);
1117                                 cluster->volume = NULL; /* sanity */
1118                                 --hammer_count_clusters;
1119                                 kfree(cluster, M_HAMMER);
1120                                 hammer_rel_volume(volume, 0);
1121                                 return;
1122                         }
1123                 } else if (flush) {
1124                         hammer_io_flush(&cluster->io);
1125                 }
1126                 hammer_unlock(&cluster->io.lock);
1127         }
1128         hammer_unref(&cluster->io.lock);
1129 }
1130
1131 /************************************************************************
1132  *                              BUFFERS                                 *
1133  ************************************************************************
1134  *
1135  * Manage buffers.  Note that a buffer holds a reference to its associated
1136  * cluster, and its cluster will hold a reference to the cluster's volume.
1137  *
1138  * A non-zero buf_type indicates that a new buffer should be created and
1139  * zero'd.
1140  */
1141 hammer_buffer_t
1142 hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no,
1143                   u_int64_t buf_type, int *errorp)
1144 {
1145         hammer_buffer_t buffer;
1146
1147         /*
1148          * Find the buffer.  Note that buffer 0 corresponds to the cluster
1149          * header and should never be requested.
1150          */
1151         KKASSERT(buf_no >= cluster->ondisk->clu_start / HAMMER_BUFSIZE &&
1152                  buf_no < cluster->ondisk->clu_limit / HAMMER_BUFSIZE);
1153
1154         /*
1155          * Locate and lock the buffer structure, creating one if necessary.
1156          */
1157 again:
1158         buffer = RB_LOOKUP(hammer_buf_rb_tree, &cluster->rb_bufs_root, buf_no);
1159         if (buffer == NULL) {
1160                 ++hammer_count_buffers;
1161                 buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
1162                 buffer->buf_no = buf_no;
1163                 buffer->cluster = cluster;
1164                 buffer->volume = cluster->volume;
1165                 hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
1166                 buffer->io.offset = cluster->io.offset +
1167                                     (buf_no * HAMMER_BUFSIZE);
1168                 TAILQ_INIT(&buffer->clist);
1169                 hammer_ref(&buffer->io.lock);
1170
1171                 /*
1172                  * Insert the buffer into the RB tree and handle late
1173                  * collisions.
1174                  */
1175                 if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) {
1176                         hammer_unref(&buffer->io.lock);
1177                         --hammer_count_buffers;
1178                         kfree(buffer, M_HAMMER);
1179                         goto again;
1180                 }
1181                 hammer_ref(&cluster->io.lock);
1182         } else {
1183                 hammer_ref(&buffer->io.lock);
1184         }
1185
1186         /*
1187          * Deal with on-disk info
1188          */
1189         if (buffer->ondisk == NULL || buf_type || buffer->io.loading) {
1190                 *errorp = hammer_load_buffer(buffer, buf_type);
1191                 if (*errorp) {
1192                         hammer_rel_buffer(buffer, 1);
1193                         buffer = NULL;
1194                 }
1195         } else {
1196                 *errorp = 0;
1197         }
1198         return(buffer);
1199 }
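
/*
 * Illustrative sketch (not part of the original file, wrapped in #if 0):
 * a zero buf_type returns an existing buffer, while a non-zero type such
 * as HAMMER_FSBUF_BTREE asks hammer_load_buffer() to create and format a
 * new, zero'd buffer of that type.  The buf_no argument is assumed to be
 * a buffer number already allocated within the cluster.
 */
#if 0
static hammer_buffer_t
example_new_btree_buffer(hammer_cluster_t cluster, int32_t buf_no,
			 int *errorp)
{
	return (hammer_get_buffer(cluster, buf_no, HAMMER_FSBUF_BTREE, errorp));
}
#endif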
1200
1201 static int
1202 hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type)
1203 {
1204         hammer_volume_t volume;
1205         hammer_fsbuf_ondisk_t ondisk;
1206         int error;
1207
1208         /*
1209          * Load the buffer's on-disk info
1210          */
1211         volume = buffer->volume;
1212         hammer_lock_ex(&buffer->io.lock);
1213         KKASSERT(buffer->io.loading == 0);
1214         buffer->io.loading = 1;
1215
1216         if (buffer->ondisk == NULL) {
1217                 if (buf_type) {
1218                         error = hammer_io_new(volume->devvp, &buffer->io);
1219                 } else {
1220                         error = hammer_io_read(volume->devvp, &buffer->io);
1221                 }
1222                 if (error) {
1223                         buffer->io.loading = 0;
1224                         hammer_unlock(&buffer->io.lock);
1225                         return (error);
1226                 }
1227                 buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
1228                 buffer->alist.config = &Buf_alist_config;
1229                 buffer->alist.meta = ondisk->head.buf_almeta;
1230                 buffer->buf_type = ondisk->head.buf_type;
1231         } else if (buf_type) {
1232                 error = hammer_io_new(volume->devvp, &buffer->io);
1233         } else {
1234                 error = 0;
1235         }
1236         if (error == 0 && buf_type) {
1237                 hammer_modify_buffer(buffer);
1238                 ondisk = buffer->ondisk;
1239                 hammer_initbuffer(&buffer->alist, &ondisk->head, buf_type);
1240                 buffer->buf_type = ondisk->head.buf_type;
1241         }
1242         buffer->io.loading = 0;
1243         hammer_unlock(&buffer->io.lock);
1244         return (error);
1245 }
1246
1247 /*
1248  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
1249  */
1250 int
1251 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
1252 {
1253         hammer_ref(&buffer->io.lock);
1254         hammer_flush_buffer_nodes(buffer);
1255         KKASSERT(buffer->io.lock.refs == 1);
1256         hammer_rel_buffer(buffer, 2);
1257         return(0);
1258 }
1259
1260 /*
1261  * Reference a buffer that is either already referenced or is reachable via
1262  * a specially handled pointer (aka cursor->buffer).
1263  */
1264 int
1265 hammer_ref_buffer(hammer_buffer_t buffer)
1266 {
1267         int error;
1268
1269         hammer_ref(&buffer->io.lock);
1270         if (buffer->ondisk == NULL || buffer->io.loading) {
1271                 error = hammer_load_buffer(buffer, 0);
1272                 if (error) {
1273                         hammer_rel_buffer(buffer, 1);
1274                         /*
1275                          * NOTE: buffer pointer can become stale after
1276                          * the above release.
1277                          */
1278                 } else {
1279                         KKASSERT(buffer->buf_type ==
1280                                  buffer->ondisk->head.buf_type);
1281                 }
1282         } else {
1283                 error = 0;
1284         }
1285         return(error);
1286 }
1287
1288 /*
1289  * Release a buffer.  We have to deal with several places where
1290  * another thread can ref the buffer.
1291  *
1292  * Only destroy the structure itself if the related buffer cache buffer
1293  * was disassociated from it.  This ties the management of the structure
1294  * to the buffer cache subsystem.  buffer->ondisk determines whether the
1295  * embedded io is referenced or not.
1296  */
1297 void
1298 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
1299 {
1300         hammer_cluster_t cluster;
1301
1302         if (buffer->io.lock.refs == 1) {
1303                 hammer_lock_ex(&buffer->io.lock);
1304                 if (buffer->io.lock.refs == 1) {
1305                         hammer_io_release(&buffer->io, flush);
1306
1307                         if (buffer->io.bp == NULL &&
1308                             buffer->io.lock.refs == 1) {
1309                                 hammer_flush_buffer_nodes(buffer);
1310                                 KKASSERT(TAILQ_EMPTY(&buffer->clist));
1311                                 cluster = buffer->cluster;
1312                                 RB_REMOVE(hammer_buf_rb_tree,
1313                                           &cluster->rb_bufs_root, buffer);
1314                                 buffer->cluster = NULL; /* sanity */
1315                                 --hammer_count_buffers;
1316                                 kfree(buffer, M_HAMMER);
1317                                 hammer_rel_cluster(cluster, 0);
1318                                 return;
1319                         }
1320                 } else if (flush) {
1321                         hammer_io_flush(&buffer->io);
1322                 }
1323                 hammer_unlock(&buffer->io.lock);
1324         }
1325         hammer_unref(&buffer->io.lock);
1326 }
1327
1328 /************************************************************************
1329  *                              NODES                                   *
1330  ************************************************************************
1331  *
1332  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
1333  * method used by the HAMMER filesystem.
1334  *
1335  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
1336  * associated with its buffer, and will only reference the buffer while
1337  * the node itself is referenced.
1338  *
1339  * A hammer_node can also be passively associated with other HAMMER
1340  * structures, such as inodes, while retaining 0 references.  These
1341  * associations can be cleared backwards using a pointer-to-pointer in
1342  * the hammer_node.
1343  *
1344  * This allows the HAMMER implementation to cache hammer_nodes long-term
1345  * and short-cut a great deal of the infrastructure's complexity.  In
1346  * most cases a cached node can be reacquired without having to dip into
1347  * either the buffer or cluster management code.
1348  *
1349  * The caller must pass a referenced cluster on call and will retain
1350  * ownership of the reference on return.  The node will acquire its own
1351  * additional references, if necessary.
1352  */
1353 hammer_node_t
1354 hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp)
1355 {
1356         hammer_node_t node;
1357
1358         /*
1359          * Locate the structure, allocating one if necessary.
1360          */
1361 again:
1362         node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root,
1363                          node_offset);
1364         if (node == NULL) {
1365                 ++hammer_count_nodes;
1366                 node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
1367                 node->node_offset = node_offset;
1368                 node->cluster = cluster;
1369                 if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root,
1370                               node)) {
1371                         --hammer_count_nodes;
1372                         kfree(node, M_HAMMER);
1373                         goto again;
1374                 }
1375         }
1376         hammer_ref(&node->lock);
1377         *errorp = hammer_load_node(node);
1378         if (*errorp) {
1379                 hammer_rel_node(node);
1380                 node = NULL;
1381         }
1382         return(node);
1383 }
1384
1385 /*
1386  * Reference an already-referenced node.
1387  */
1388 int
1389 hammer_ref_node(hammer_node_t node)
1390 {
1391         int error;
1392
1393         KKASSERT(node->lock.refs > 0);
1394         hammer_ref(&node->lock);
1395         if ((error = hammer_load_node(node)) != 0)
1396                 hammer_rel_node(node);
1397         return(error);
1398 }
1399
1400 /*
1401  * Load a node's on-disk data reference.
1402  */
1403 static int
1404 hammer_load_node(hammer_node_t node)
1405 {
1406         hammer_buffer_t buffer;
1407         int32_t buf_no;
1408         int error;
1409
1410         if (node->ondisk)
1411                 return(0);
1412         error = 0;
1413         hammer_lock_ex(&node->lock);
1414         if (node->ondisk == NULL) {
1415                 /*
1416                  * This is a little confusing, but the gist is that
1417                  * node->buffer determines whether the node is on
1418                  * the buffer's clist and node->ondisk determines
1419                  * whether the buffer is referenced.
1420                  */
1421                 if ((buffer = node->buffer) != NULL) {
1422                         error = hammer_ref_buffer(buffer);
1423                 } else {
1424                         buf_no = node->node_offset / HAMMER_BUFSIZE;
1425                         buffer = hammer_get_buffer(node->cluster,
1426                                                    buf_no, 0, &error);
1427                         if (buffer) {
1428                                 KKASSERT(error == 0);
1429                                 TAILQ_INSERT_TAIL(&buffer->clist,
1430                                                   node, entry);
1431                                 node->buffer = buffer;
1432                         }
1433                 }
1434                 if (error == 0) {
1435                         node->ondisk = (void *)((char *)buffer->ondisk +
1436                                (node->node_offset & HAMMER_BUFMASK));
1437                 }
1438         }
1439         hammer_unlock(&node->lock);
1440         return (error);
1441 }
1442
1443 /*
1444  * Safely reference a node, interlocking against flushes via the IO subsystem.
1445  */
1446 hammer_node_t
1447 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
1448                      int *errorp)
1449 {
1450         hammer_node_t node;
1451
1452         if ((node = *cache) != NULL)
1453                 hammer_ref(&node->lock);
1454         if (node) {
1455                 *errorp = hammer_load_node(node);
1456                 if (*errorp) {
1457                         hammer_rel_node(node);
1458                         node = NULL;
1459                 }
1460         } else {
1461                 *errorp = ENOENT;
1462         }
1463         return(node);
1464 }
1465
1466 /*
1467  * Release a hammer_node.  On the last release the node dereferences
1468  * its underlying buffer and may or may not be destroyed.
1469  */
1470 void
1471 hammer_rel_node(hammer_node_t node)
1472 {
1473         hammer_cluster_t cluster;
1474         hammer_buffer_t buffer;
1475         int32_t node_offset;
1476         int flags;
1477
1478         /*
1479          * If this isn't the last ref just decrement the ref count and
1480          * return.
1481          */
1482         if (node->lock.refs > 1) {
1483                 hammer_unref(&node->lock);
1484                 return;
1485         }
1486
1487         /*
1488          * If there is no ondisk info or no buffer, the node failed to
1489          * load; remove the last reference and destroy the node.
1490          */
1491         if (node->ondisk == NULL) {
1492                 hammer_unref(&node->lock);
1493                 hammer_flush_node(node);
1494                 /* node is stale now */
1495                 return;
1496         }
1497
1498         /*
1499          * Do final cleanups and then either destroy the node or leave it
1500          * passively cached.  The buffer reference is removed regardless.
1501          */
1502         buffer = node->buffer;
1503         node->ondisk = NULL;
1504
1505         if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
1506                 hammer_unref(&node->lock);
1507                 hammer_rel_buffer(buffer, 0);
1508                 return;
1509         }
1510
1511         /*
1512          * Destroy the node.  Record pertinent data because the node
1513          * becomes stale the instant we flush it.
1514          */
1515         flags = node->flags;
1516         node_offset = node->node_offset;
1517         hammer_unref(&node->lock);
1518         hammer_flush_node(node);
1519         /* node is stale */
1520
1521         cluster = buffer->cluster;
1522         if (flags & HAMMER_NODE_DELETED) {
1523                 if (node_offset == cluster->ondisk->clu_btree_root)
1524                         KKASSERT(cluster->flags & HAMMER_CLUSTER_DELETED);
1525                 hammer_free_btree(cluster, node_offset);
1526         }
1527         hammer_rel_buffer(buffer, 0);
1528 }
1529
1530 /*
1531  * Passively cache a referenced hammer_node in *cache.  The caller may
1532  * release the node on return.
1533  */
1534 void
1535 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
1536 {
1537         hammer_node_t old;
1538
1539         /*
1540          * If the node is being deleted, don't cache it!
1541          */
1542         if (node->flags & HAMMER_NODE_DELETED)
1543                 return;
1544
1545         /*
1546          * Cache the node.  If we previously cached a different node we
1547          * have to give HAMMER a chance to destroy it.
1548          */
1549 again:
1550         if (node->cache1 != cache) {
1551                 if (node->cache2 != cache) {
1552                         if ((old = *cache) != NULL) {
1553                                 KKASSERT(node->lock.refs != 0);
1554                                 hammer_uncache_node(cache);
1555                                 goto again;
1556                         }
1557                         if (node->cache2)
1558                                 *node->cache2 = NULL;
1559                         node->cache2 = node->cache1;
1560                         node->cache1 = cache;
1561                         *cache = node;
1562                 } else {
1563                         struct hammer_node **tmp;
1564                         tmp = node->cache1;
1565                         node->cache1 = node->cache2;
1566                         node->cache2 = tmp;
1567                 }
1568         }
1569 }
1570
1571 void
1572 hammer_uncache_node(struct hammer_node **cache)
1573 {
1574         hammer_node_t node;
1575
1576         if ((node = *cache) != NULL) {
1577                 *cache = NULL;
1578                 if (node->cache1 == cache) {
1579                         node->cache1 = node->cache2;
1580                         node->cache2 = NULL;
1581                 } else if (node->cache2 == cache) {
1582                         node->cache2 = NULL;
1583                 } else {
1584                         panic("hammer_uncache_node: missing cache linkage");
1585                 }
1586                 if (node->cache1 == NULL && node->cache2 == NULL)
1587                         hammer_flush_node(node);
1588         }
1589 }
1590
1591 /*
1592  * Remove a node's cache references and destroy the node if it has no
1593  * other references or backing store.
1594  */
1595 void
1596 hammer_flush_node(hammer_node_t node)
1597 {
1598         hammer_buffer_t buffer;
1599
1600         if (node->cache1)
1601                 *node->cache1 = NULL;
1602         if (node->cache2)
1603                 *node->cache2 = NULL;
1604         if (node->lock.refs == 0 && node->ondisk == NULL) {
1605                 RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root,
1606                           node);
1607                 if ((buffer = node->buffer) != NULL) {
1608                         node->buffer = NULL;
1609                         TAILQ_REMOVE(&buffer->clist, node, entry);
1610                         /* buffer is unreferenced because ondisk is NULL */
1611                 }
1612                 --hammer_count_nodes;
1613                 kfree(node, M_HAMMER);
1614         }
1615 }
1616
1617 /*
1618  * Flush passively cached B-Tree nodes associated with this buffer.
1619  * This is only called when the buffer is about to be destroyed, so
1620  * none of the nodes should have any references.
1621  */
1622 void
1623 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1624 {
1625         hammer_node_t node;
1626
1627         while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1628                 KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
1629                 hammer_ref(&node->lock);
1630                 node->flags |= HAMMER_NODE_FLUSH;
1631                 hammer_rel_node(node);
1632         }
1633 }
1634
1635 /************************************************************************
1636  *                              A-LIST ALLOCATORS                       *
1637  ************************************************************************/
1638
1639 /*
1640  * Allocate HAMMER clusters
1641  */
1642 hammer_cluster_t
1643 hammer_alloc_cluster(hammer_mount_t hmp, hammer_cluster_t cluster_hint,
1644                      int *errorp)
1645 {
1646         hammer_volume_t volume;
1647         hammer_cluster_t cluster;
1648         int32_t clu_no;
1649         int32_t clu_hint;
1650         int32_t vol_beg;
1651         int32_t vol_no;
1652
1653         /*
1654          * Figure out our starting volume and hint.
1655          */
1656         if (cluster_hint) {
1657                 vol_beg = cluster_hint->volume->vol_no;
1658                 clu_hint = cluster_hint->clu_no;
1659         } else {
1660                 vol_beg = hmp->volume_iterator;
1661                 clu_hint = -1;
1662         }
1663
1664         /*
1665          * Loop through volumes looking for a free cluster.  If allocating
1666          * a new cluster relative to an existing cluster try to find a free
1667          * cluster on either side (clu_hint >= 0), otherwise just do a
1668          * forward iteration.
1669          */
1670         vol_no = vol_beg;
1671         do {
1672                 volume = hammer_get_volume(hmp, vol_no, errorp);
1673                 kprintf("VOLUME %p %d\n", volume, vol_no);
1674                 if (*errorp) {
1675                         clu_no = HAMMER_ALIST_BLOCK_NONE;
1676                         break;
1677                 }
1678                 hammer_modify_volume(volume);
1679                 if (clu_hint == -1) {
1680                         clu_hint = volume->clu_iterator;
1681                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1682                                                         clu_hint);
1683                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1684                                 clu_no = hammer_alist_alloc_fwd(&volume->alist,
1685                                                                 1, 0);
1686                         }
1687                 } else {
1688                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1689                                                         clu_hint);
1690                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1691                                 clu_no = hammer_alist_alloc_rev(&volume->alist,
1692                                                                 1, clu_hint);
1693                         }
1694                 }
1695                 if (clu_no != HAMMER_ALIST_BLOCK_NONE)
1696                         break;
1697                 hammer_rel_volume(volume, 0);
1698                 volume = NULL;
1699                 *errorp = ENOSPC;
1700                 vol_no = (vol_no + 1) % hmp->nvolumes;
1701                 clu_hint = -1;
1702         } while (vol_no != vol_beg);
1703
1704         /*
1705          * Acquire the cluster.  On success this will force *errorp to 0.
1706          */
1707         if (clu_no != HAMMER_ALIST_BLOCK_NONE) {
1708                 kprintf("ALLOC CLUSTER %d:%d\n", volume->vol_no, clu_no);
1709                 cluster = hammer_get_cluster(volume, clu_no, errorp,
1710                                              GET_CLUSTER_NEW);
1711                 volume->clu_iterator = clu_no;
1712                 hammer_rel_volume(volume, 0);
1713         } else {
1714                 cluster = NULL;
1715         }
1716         if (cluster)
1717                 hammer_lock_ex(&cluster->io.lock);
1718         return(cluster);
1719 }
1720
1721 void
1722 hammer_init_cluster(hammer_cluster_t cluster, hammer_base_elm_t left_bound, 
1723                     hammer_base_elm_t right_bound)
1724 {
1725         hammer_cluster_ondisk_t ondisk = cluster->ondisk;
1726
1727         hammer_modify_cluster(cluster);
1728         ondisk->clu_btree_beg = *left_bound;
1729         ondisk->clu_btree_end = *right_bound;
1730         cluster->clu_btree_beg = ondisk->clu_btree_beg;
1731         cluster->clu_btree_end = ondisk->clu_btree_end;
1732 }
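
/*
 * Illustrative sketch only (not compiled): one plausible way a caller
 * could pair hammer_alloc_cluster() with hammer_init_cluster().  The
 * helper name and the bound arguments are placeholders; see the B-Tree
 * and inode code for the real call sites.
 */
#if 0
static hammer_cluster_t
example_make_cluster(hammer_mount_t hmp, hammer_cluster_t hint,
                     hammer_base_elm_t left_bound,
                     hammer_base_elm_t right_bound, int *errorp)
{
        hammer_cluster_t cluster;

        cluster = hammer_alloc_cluster(hmp, hint, errorp);
        if (cluster == NULL)
                return(NULL);
        /* the returned cluster is referenced and exclusively locked */
        hammer_init_cluster(cluster, left_bound, right_bound);
        hammer_unlock(&cluster->io.lock);
        /* caller drops the reference with hammer_rel_cluster() when done */
        return(cluster);
}
#endif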
1733
1734 /*
1735  * Deallocate a cluster
1736  */
1737 void
1738 hammer_free_cluster(hammer_cluster_t cluster)
1739 {
1740         hammer_modify_volume(cluster->volume);
1741         hammer_alist_free(&cluster->volume->alist, cluster->clu_no, 1);
1742 }
1743
1744 /*
1745  * Allocate HAMMER elements - btree nodes, data storage, and record elements
1746  *
1747  * The passed *bufferp should be initialized to NULL.  On successive calls
1748  * *bufferp caches the most recent buffer used until put away by the caller.
1749  * Note that previously returned pointers using the cached buffer become
1750  * invalid on successive calls which reuse *bufferp.
1751  *
1752  * All allocations first attempt to use the block found at the specified
1753  * iterator.  If that fails, the first available block is used.  If that
1754  * also fails, a new buffer is allocated, associated with the buffer-type
1755  * A-list, and the element is allocated out of the new buffer.
1756  *
1757  * This function also ensures that the required minimum number of buffers is
1758  * reserved to guarantee that recovery operations succeed.
1759  */
1760
1761 hammer_node_t
1762 hammer_alloc_btree(hammer_cluster_t cluster, int *errorp)
1763 {
1764         hammer_buffer_t buffer;
1765         hammer_alist_t live;
1766         hammer_node_t node;
1767         int32_t elm_no;
1768         int32_t buf_no;
1769         int32_t node_offset;
1770         int32_t n;
1771
1772         hammer_modify_cluster(cluster);
1773         buffer = NULL;
1774         live = &cluster->alist_btree;
1775
1776         /*
1777          * If we aren't recovering then ensure the required minimum
1778          * reservation is met. XXX if the recovery code packs the B-Tree
1779          * we don't have to do this.
1780          *
1781          * Calculate the number of buffers needed to hold the B-Tree.
1782          */
1783         if (cluster->io.validated) {
1784                 n = (cluster->ondisk->stat_records * 3 / 
1785                     HAMMER_BTREE_INT_ELMS / HAMMER_BTREE_NODES) + 1;
1786                 if (cluster->ondisk->stat_idx_bufs < n) {
1787                         kprintf("hammer_alloc_btree: %d/%d buffers\n",
1788                                 cluster->ondisk->stat_idx_bufs, n);
1789                 }
1790                 while (cluster->ondisk->stat_idx_bufs < n) {
1791                         alloc_new_buffer(cluster, HAMMER_FSBUF_BTREE, live,
1792                                          cluster->ondisk->idx_index, errorp,
1793                                          &buffer);
1794                         if (*errorp) {
1795                                 if (buffer)
1796                                         hammer_rel_buffer(buffer, 0);
1797                                 return(NULL);
1798                         }
1799                 }
1800         }
1801 
1803         /*
1804          * Allocate a B-Tree element
1805          */
1806         elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index);
1807         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1808                 elm_no = hammer_alist_alloc_fwd(live, 1, 0);
1809         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1810                 alloc_new_buffer(cluster, HAMMER_FSBUF_BTREE, live,
1811                                  cluster->ondisk->idx_index, errorp, &buffer);
1812                 elm_no = hammer_alist_alloc(live, 1);
1813                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1814                         *errorp = ENOSPC;
1815                         if (buffer)
1816                                 hammer_rel_buffer(buffer, 0);
1817                         return(NULL);
1818                 }
1819         }
1820         cluster->ondisk->idx_index = elm_no;
1821         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_BTREE_NODES);
1822
1823         /*
1824          * Load and return the B-Tree element
1825          */
1826         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1827         node_offset = buf_no * HAMMER_BUFSIZE +
1828                       offsetof(union hammer_fsbuf_ondisk,
1829                                btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]);
1830         node = hammer_get_node(cluster, node_offset, errorp);
1831         if (node) {
1832                 hammer_modify_node(node);
1833                 bzero(node->ondisk, sizeof(*node->ondisk));
1834                 KKASSERT((node->flags & (HAMMER_NODE_DELETED)) == 0);
1835         } else {
1836                 hammer_alist_free(live, elm_no, 1);
1837                 /* node is already NULL; nothing to release */
1839         }
1840         if (buffer)
1841                 hammer_rel_buffer(buffer, 0);
1842         return(node);
1843 }
1844
1845 void *
1846 hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes,
1847                   int *errorp, struct hammer_buffer **bufferp)
1848 {
1849         hammer_buffer_t buffer;
1850         hammer_alist_t live;
1851         int32_t elm_no;
1852         int32_t buf_no;
1853         int32_t nblks;
1854         void *item;
1855
1856         /*
1857          * Deal with large data blocks.  The blocksize is HAMMER_BUFSIZE
1858          * for these allocations.
1859          */
1860         hammer_modify_cluster(cluster);
1861         if ((bytes & HAMMER_BUFMASK) == 0) {
1862                 nblks = bytes / HAMMER_BUFSIZE;
1863                 /* only one block allowed for now (so buffer can hold it) */
1864                 KKASSERT(nblks == 1);
1865
1866                 buf_no = hammer_alloc_master(cluster, nblks,
1867                                              cluster->ondisk->idx_ldata, 1);
1868                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1869                         *errorp = ENOSPC;
1870                         return(NULL);
1871                 }
1872                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, nblks);
1873                 cluster->ondisk->idx_ldata = buf_no;
1874                 buffer = *bufferp;
1875                 *bufferp = hammer_get_buffer(cluster, buf_no, -1, errorp);
1876                 if (buffer)
1877                         hammer_rel_buffer(buffer, 0);
1878                 buffer = *bufferp;
1879                 return(buffer->ondisk);
1880         }
1881
1882         /*
1883          * Allocate a data element.  The block size is HAMMER_DATA_BLKSIZE
1884          * (64 bytes) for these allocations.
1885          */
1886         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1887         nblks /= HAMMER_DATA_BLKSIZE;
1888         live = &cluster->alist_mdata;
1889         elm_no = hammer_alist_alloc_fwd(live, nblks, cluster->ondisk->idx_data);
1890         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1891                 elm_no = hammer_alist_alloc_fwd(live, nblks, 0);
1892         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1893                 alloc_new_buffer(cluster, HAMMER_FSBUF_DATA, live,
1894                                  cluster->ondisk->idx_data, errorp, bufferp);
1895                 elm_no = hammer_alist_alloc(live, nblks);
1896                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1897                         *errorp = ENOSPC;
1898                         return(NULL);
1899                 }
1900         }
1901         cluster->ondisk->idx_data = elm_no;
1902
1903         /*
1904          * Load and return the data element
1905          */
1906         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1907         buffer = *bufferp;
1908         if (buffer == NULL || buffer->cluster != cluster ||
1909             buffer->buf_no != buf_no) {
1910                 if (buffer)
1911                         hammer_rel_buffer(buffer, 0);
1912                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1913                 *bufferp = buffer;
1914         }
1915         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_DATA);
1916         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_DATA_NODES);
1917         hammer_modify_buffer(buffer);
1918         item = &buffer->ondisk->data.data[elm_no & HAMMER_FSBUF_BLKMASK];
1919         bzero(item, nblks * HAMMER_DATA_BLKSIZE);
1920         *errorp = 0;
1921         return(item);
1922 }
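
/*
 * Illustrative sketch only (not compiled): the *bufferp protocol used by
 * the allocators above, seen from the caller's side.  The loop and the
 * 64-byte request size are arbitrary; the point is that *bufferp starts
 * out NULL, is recycled across calls, and is released exactly once by
 * the caller when it is finished with it.
 */
#if 0
static void
example_alloc_data_loop(hammer_cluster_t cluster, int count, int *errorp)
{
        struct hammer_buffer *buffer = NULL;
        void *data;
        int i;

        for (i = 0; i < count; ++i) {
                data = hammer_alloc_data(cluster, 64, errorp, &buffer);
                if (data == NULL)
                        break;
                /*
                 * NOTE: data pointers returned by earlier iterations may
                 * no longer be valid if *bufferp was recycled.
                 */
        }
        if (buffer)
                hammer_rel_buffer(buffer, 0);
}
#endif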
1923
1924 void *
1925 hammer_alloc_record(hammer_cluster_t cluster, int *errorp,
1926                     u_int8_t rec_type, struct hammer_buffer **bufferp)
1927 {
1928         hammer_buffer_t buffer;
1929         hammer_alist_t live;
1930         int32_t elm_no;
1931         int32_t buf_no;
1932         void *item;
1933
1934         /*
1935          * Allocate a record element
1936          */
1937         hammer_modify_cluster(cluster);
1938         live = &cluster->alist_record;
1939         elm_no = hammer_alist_alloc_rev(live, 1, cluster->ondisk->idx_record);
1940         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1941                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1942         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1943                 alloc_new_buffer(cluster, HAMMER_FSBUF_RECORDS, live,
1944                                  cluster->ondisk->idx_record, errorp, bufferp);
1945                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1946                 kprintf("hammer_alloc_record elm again %08x\n", elm_no);
1947                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1948                         *errorp = ENOSPC;
1949                         return(NULL);
1950                 }
1951         }
1952         cluster->ondisk->idx_record = elm_no;
1953
1954         /*
1955          * Load and return the record element
1956          */
1957         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1958         buffer = *bufferp;
1959         if (buffer == NULL || buffer->cluster != cluster ||
1960             buffer->buf_no != buf_no) {
1961                 if (buffer)
1962                         hammer_rel_buffer(buffer, 0);
1963                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1964                 *bufferp = buffer;
1965         }
1966         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_RECORDS);
1967         KASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_RECORD_NODES,
1968                 ("elm_no %d (%d) out of bounds", elm_no, elm_no & HAMMER_FSBUF_BLKMASK));
1969         hammer_modify_buffer(buffer);
1970         item = &buffer->ondisk->record.recs[elm_no & HAMMER_FSBUF_BLKMASK];
1971         bzero(item, sizeof(union hammer_record_ondisk));
1972         *errorp = 0;
1973         ++cluster->ondisk->stat_records;
1974         if (rec_type == HAMMER_RECTYPE_CLUSTER)
1975                 ++cluster->ondisk->stat_records;
1976         return(item);
1977 }
1978
1979 void
1980 hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes)
1981 {
1982         int32_t elm_no;
1983         int32_t nblks;
1984         hammer_alist_t live;
1985
1986         hammer_modify_cluster(buffer->cluster);
1987         if ((bytes & HAMMER_BUFMASK) == 0) {
1988                 nblks = bytes / HAMMER_BUFSIZE;
1989                 KKASSERT(nblks == 1 && data == (void *)buffer->ondisk);
1990                 hammer_alist_free(&buffer->cluster->alist_master,
1991                                   buffer->buf_no, nblks);
1992                 hammer_adjust_stats(buffer->cluster, HAMMER_FSBUF_DATA, -nblks);
1993                 return;
1994         }
1995
1996         elm_no = ((char *)data - (char *)buffer->ondisk->data.data) /
1997                  HAMMER_DATA_BLKSIZE;
1998         KKASSERT(elm_no >= 0 && elm_no < HAMMER_DATA_NODES);
1999         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
2000         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
2001         nblks /= HAMMER_DATA_BLKSIZE;
2002         live = &buffer->cluster->alist_mdata;
2003         hammer_alist_free(live, elm_no, nblks);
2004 }
2005
2006 void
2007 hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec,
2008                         u_int8_t rec_type)
2009 {
2010         int32_t elm_no;
2011         hammer_alist_t live;
2012
2013         hammer_modify_cluster(buffer->cluster);
2014         elm_no = rec - &buffer->ondisk->record.recs[0];
2015         KKASSERT(elm_no >= 0 && elm_no < HAMMER_RECORD_NODES);
2016         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
2017         live = &buffer->cluster->alist_record;
2018         hammer_alist_free(live, elm_no, 1);
2019         --buffer->cluster->ondisk->stat_records;
2020         if (rec_type == HAMMER_RECTYPE_CLUSTER)
2021                 --buffer->cluster->ondisk->stat_records;
2022 }
2023
2024 void
2025 hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset)
2026 {
2027         const int32_t blksize = sizeof(struct hammer_node_ondisk);
2028         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2029         hammer_alist_t live;
2030         int32_t elm_no;
2031
2032         hammer_modify_cluster(cluster);
2033         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2034         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, btree.nodes[0]);
2035         live = &cluster->alist_btree;
2036         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2037         elm_no += fsbuf_offset / blksize;
2038         hammer_alist_free(live, elm_no, 1);
2039 }
2040
2041 void
2042 hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes)
2043 {
2044         const int32_t blksize = HAMMER_DATA_BLKSIZE;
2045         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2046         hammer_alist_t live;
2047         int32_t elm_no;
2048         int32_t buf_no;
2049         int32_t nblks;
2050
2051         hammer_modify_cluster(cluster);
2052         if ((bytes & HAMMER_BUFMASK) == 0) {
2053                 nblks = bytes / HAMMER_BUFSIZE;
2054                 KKASSERT(nblks == 1 && (bclu_offset & HAMMER_BUFMASK) == 0);
2055                 buf_no = bclu_offset / HAMMER_BUFSIZE;
2056                 hammer_alist_free(&cluster->alist_master, buf_no, nblks);
2057                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, -nblks);
2058                 return;
2059         }
2060
2061         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2062         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, data.data[0][0]);
2063         live = &cluster->alist_mdata;
2064         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
2065         nblks /= HAMMER_DATA_BLKSIZE;
2066         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2067         elm_no += fsbuf_offset / blksize;
2068         hammer_alist_free(live, elm_no, nblks);
2069 }
2070
2071 void
2072 hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset,
2073                    u_int8_t rec_type)
2074 {
2075         const int32_t blksize = sizeof(union hammer_record_ondisk);
2076         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
2077         hammer_alist_t live;
2078         int32_t elm_no;
2079
2080         hammer_modify_cluster(cluster);
2081         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
2082         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, record.recs[0]);
2083         live = &cluster->alist_record;
2084         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
2085         elm_no += fsbuf_offset / blksize;
2086         hammer_alist_free(live, elm_no, 1);
2087         --cluster->ondisk->stat_records;
2088         if (rec_type == HAMMER_RECTYPE_CLUSTER)
2089                 --cluster->ondisk->stat_records;
2090 }
2091
2092
2093 /*
2094  * Allocate a new filesystem buffer and assign it to the specified
2095  * filesystem buffer type.  The new buffer will be added to the
2096  * type-specific A-list and initialized.
2097  */
2098 static void
2099 alloc_new_buffer(hammer_cluster_t cluster, u_int64_t type, hammer_alist_t live,
2100                  int start, int *errorp, struct hammer_buffer **bufferp)
2101 {
2102         hammer_buffer_t buffer;
2103         int32_t buf_no;
2104         int32_t base_blk;
2105         int isfwd;
2106
2107         if (*bufferp)
2108                 hammer_rel_buffer(*bufferp, 0);
2109         *bufferp = NULL;
2110
2111         start = start / HAMMER_FSBUF_MAXBLKS;   /* convert to buf_no */
2112         isfwd = (type != HAMMER_FSBUF_RECORDS);
2113         buf_no = hammer_alloc_master(cluster, 1, start, isfwd);
2114         if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2115                 *errorp = ENOSPC;
2116                 return;
2117         }
2118
2119         /*
2120          * The new buffer must be initialized (type != 0) regardless of
2121          * whether we already have it cached or not, so don't try to
2122          * optimize the cached buffer check.  Just call hammer_get_buffer().
2123          */
2124         buffer = hammer_get_buffer(cluster, buf_no, type, errorp);
2125         *bufferp = buffer;
2126
2127         /*
2128          * Do a meta-free of the buffer's elements into the type-specific
2129          * A-list and update our statistics to reflect the allocation.
2130          */
2131         if (buffer) {
2132 #if 0
2133                 kprintf("alloc_new_buffer buf_no %d type %016llx nelms %d\n",
2134                         buf_no, type, nelements);
2135 #endif
2136                 hammer_modify_buffer(buffer);  /*XXX*/
2137                 hammer_adjust_stats(cluster, type, 1);
2138
2139                 /*
2140                  * Free the buffer to the appropriate slave list so the
2141                  * cluster-based allocator sees it.
2142                  */
2143                 base_blk = buf_no * HAMMER_FSBUF_MAXBLKS;
2144
2145                 switch(type) {
2146                 case HAMMER_FSBUF_BTREE:
2147                         hammer_alist_free(live, base_blk, HAMMER_BTREE_NODES);
2148                         break;
2149                 case HAMMER_FSBUF_DATA:
2150                         hammer_alist_free(live, base_blk, HAMMER_DATA_NODES);
2151                         break;
2152                 case HAMMER_FSBUF_RECORDS:
2153                         hammer_alist_free(live, base_blk, HAMMER_RECORD_NODES);
2154                         break;
2155                 }
2156         }
2157 }
2158
2159 /*
2160  * Sync dirty buffers to the media
2161  */
2162
2163 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
2164 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
2165
2166 int
2167 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
2168 {
2169         struct hammer_sync_info info;
2170
2171         info.error = 0;
2172         info.waitfor = waitfor;
2173
2174         vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
2175                       hammer_sync_scan1, hammer_sync_scan2, &info);
2176
2177         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
2178                 hammer_sync_volume, &info);
2179         return(info.error);
2180 }
2181
2182 static int
2183 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
2184 {
2185         struct hammer_inode *ip;
2186
2187         ip = VTOI(vp);
2188         if (vp->v_type == VNON || ip == NULL ||
2189             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2190              RB_EMPTY(&vp->v_rbdirty_tree))) {
2191                 return(-1);
2192         }
2193         return(0);
2194 }
2195
2196 static int
2197 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2198 {
2199         struct hammer_sync_info *info = data;
2200         struct hammer_inode *ip;
2201         int error;
2202
2203         ip = VTOI(vp);
2204         if (vp->v_type == VNON || vp->v_type == VBAD ||
2205             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
2206              RB_EMPTY(&vp->v_rbdirty_tree))) {
2207                 return(0);
2208         }
2209         if (vp->v_type != VCHR) {
2210                 error = VOP_FSYNC(vp, info->waitfor);
2211                 if (error)
2212                         info->error = error;
2213         }
2214         return(0);
2215 }
2216
2217 int
2218 hammer_sync_volume(hammer_volume_t volume, void *data)
2219 {
2220         struct hammer_sync_info *info = data;
2221
2222         hammer_ref(&volume->io.lock);
2223         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
2224                 hammer_sync_cluster, info);
2225         hammer_rel_volume(volume, 1);
2226         return(0);
2227 }
2228
2229 int
2230 hammer_sync_cluster(hammer_cluster_t cluster, void *data)
2231 {
2232         struct hammer_sync_info *info = data;
2233
2234         /*
2235          * XXX check if cluster deleted and don't bother to sync it?
2236          */
2237         hammer_ref(&cluster->io.lock);
2238         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
2239                 hammer_sync_buffer, info);
2240         /*hammer_io_waitdep(&cluster->io);*/
2241         hammer_rel_cluster(cluster, 1);
2242         return(0);
2243 }
2244
2245 int
2246 hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
2247 {
2248         hammer_ref(&buffer->io.lock);
2249         hammer_rel_buffer(buffer, 1);
2250         return(0);
2251 }
2252
2253 /*
2254  * Generic buffer initialization.  Initialize the A-list into an all-allocated
2255  * state with the free block limit properly set.
2256  *
2257  * Note that alloc_new_buffer() will free the appropriate block range via
2258  * the matching cluster alist, so the free count is properly propagated.
2259  */
2260 void
2261 hammer_initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type)
2262 {
2263         head->buf_type = type;
2264
2265         switch(type) {
2266         case HAMMER_FSBUF_BTREE:
2267                 hammer_alist_init(live, 0, HAMMER_BTREE_NODES,
2268                                   HAMMER_ASTATE_ALLOC);
2269                 break;
2270         case HAMMER_FSBUF_DATA:
2271                 hammer_alist_init(live, 0, HAMMER_DATA_NODES,
2272                                   HAMMER_ASTATE_ALLOC);
2273                 break;
2274         case HAMMER_FSBUF_RECORDS:
2275                 hammer_alist_init(live, 0, HAMMER_RECORD_NODES,
2276                                   HAMMER_ASTATE_ALLOC);
2277                 break;
2278         default:
2279                 hammer_alist_init(live, 0, 0, HAMMER_ASTATE_ALLOC);
2280                 break;
2281         }
2282 }
2283
2284 /*
2285  * Calculate the cluster's offset in the volume.  This calculation is
2286  * slightly more complex when using superclusters because superclusters
2287  * are grouped in blocks of 16, followed by 16 x N clusters where N
2288  * is the number of clusters a supercluster can manage.
2289  */
2290 static int64_t
2291 calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no)
2292 {
2293         int32_t scl_group;
2294         int64_t scl_group_size;
2295         int64_t off;
2296
2297         if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
2298                 scl_group = clu_no / HAMMER_VOL_SUPERCLUSTER_GROUP /
2299                             HAMMER_SCL_MAXCLUSTERS;
2300                 scl_group_size = 
2301                             ((int64_t)HAMMER_BUFSIZE *
2302                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2303                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2304                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2305                 scl_group_size += 
2306                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2307
2308                 off = volume->cluster_base +
2309                       scl_group * scl_group_size +
2310                       (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
2311                       ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS *
2312                        HAMMER_VOL_SUPERCLUSTER_GROUP))
2313                       * volume->vol_clsize;
2314         } else {
2315                 off = volume->cluster_base +
2316                       (int64_t)clu_no * volume->vol_clsize;
2317         }
2318         return(off);
2319 }
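
/*
 * For readability, the supercluster case above reduces to the following,
 * with G = HAMMER_VOL_SUPERCLUSTER_GROUP and N = HAMMER_SCL_MAXCLUSTERS
 * (a symbolic restatement of the code, not an independent derivation):
 *
 *      scl_group_size = 2 * G * HAMMER_BUFSIZE + G * N * vol_clsize
 *
 *      off = cluster_base
 *          + (clu_no / (G * N)) * scl_group_size
 *          + G * HAMMER_BUFSIZE
 *          + (clu_no % (G * N)) * vol_clsize
 */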
2320
2321 /*
2322  * Calculate a super-cluster's offset in the volume.
2323  */
2324 static int64_t
2325 calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no)
2326 {
2327         int64_t off;
2328         int32_t scl_group;
2329         int64_t scl_group_size;
2330
2331         KKASSERT (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL);
2332         scl_group = scl_no / HAMMER_VOL_SUPERCLUSTER_GROUP;
2333         if (scl_group) {
2334                 scl_group_size = 
2335                             ((int64_t)HAMMER_BUFSIZE *
2336                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2337                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2338                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2339                 scl_group_size += 
2340                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2341                 off = volume->cluster_base + (scl_group * scl_group_size) +
2342                       (scl_no % HAMMER_VOL_SUPERCLUSTER_GROUP) * HAMMER_BUFSIZE;
2343         } else {
2344                 off = volume->cluster_base + (scl_no * HAMMER_BUFSIZE);
2345         }
2346         return(off);
2347 }
2348
2349 /*
2350  * Allocate nblks buffers from the cluster's master alist.
2351  */
2352 static int32_t
2353 hammer_alloc_master(hammer_cluster_t cluster, int nblks,
2354                     int32_t start, int isfwd)
2355 {
2356         int32_t buf_no;
2357
2358         hammer_modify_cluster(cluster);
2359         if (isfwd) {
2360                 buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2361                                                 nblks, start);
2362                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2363                         buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2364                                                 nblks, 0);
2365                 }
2366         } else {
2367                 buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2368                                                 nblks, start);
2369                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2370                         buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2371                                                 nblks, HAMMER_ALIST_BLOCK_MAX);
2372                 }
2373         }
2374
2375         /*
2376          * XXX recover space from empty record, b-tree, and data a-lists.
2377          */
2378
2379         return(buf_no);
2380 }
2381
2382 /*
2383  * Adjust allocation statistics
2384  */
2385 static void
2386 hammer_adjust_stats(hammer_cluster_t cluster, u_int64_t buf_type, int nblks)
2387 {
2388         if (nblks == 0)
2389                 return;
2390
2391         hammer_modify_cluster(cluster);
2392         hammer_modify_volume(cluster->volume);
2393         hammer_modify_volume(cluster->volume->hmp->rootvol);
2394
2395         switch(buf_type) {
2396         case HAMMER_FSBUF_BTREE:
2397                 cluster->ondisk->stat_idx_bufs += nblks;
2398                 cluster->volume->ondisk->vol_stat_idx_bufs += nblks;
2399                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_idx_bufs += nblks;
2400                 break;
2401         case HAMMER_FSBUF_DATA:
2402                 cluster->ondisk->stat_data_bufs += nblks;
2403                 cluster->volume->ondisk->vol_stat_data_bufs += nblks;
2404                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_data_bufs += nblks;
2405                 break;
2406         case HAMMER_FSBUF_RECORDS:
2407                 cluster->ondisk->stat_rec_bufs += nblks;
2408                 cluster->volume->ondisk->vol_stat_rec_bufs += nblks;
2409                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_rec_bufs += nblks;
2410                 break;
2411         }
2412 }
2413
2414 /*
2415  * A-LIST SUPPORT
2416  *
2417  * Set up the parameters for the various A-lists we use in hammer.  The
2418  * supercluster A-list must be chained to the cluster A-list and cluster
2419  * slave A-lists are chained to buffer A-lists.
2420  *
2421  * See hammer_init_alist_config() below.
2422  */
2423
2424 /*
2425  * A-LIST - cluster recursion into a filesystem buffer
2426  *
2427  * In the init case the buffer has already been initialized by
2428  * alloc_new_buffer() when it allocated the buffer out of the master
2429  * alist and marked it as free in the slave alist.
2430  *
2431  * Because we use a somewhat odd mechanism to assign buffers to slave
2432  * pools we can't actually free the buffer back to the master alist in
2433  * buffer_alist_destroy(), but instead must deal with that logic somewhere
2434  * else.
2435  */
2436 static int
2437 buffer_alist_init(void *info, int32_t blk, int32_t radix,
2438                   hammer_alloc_state_t state)
2439 {
2440         return(0);
2441 }
2442
2443 /*
2444  * Note: This routine is only called when freeing the last elements of
2445  * an initialized buffer.  Freeing all elements of the buffer when the
2446  * buffer was not previously initialized does not call this routine.
2447  */
2448 static int
2449 buffer_alist_destroy(void *info, int32_t blk, int32_t radix)
2450 {
2451         hammer_cluster_t cluster = info;
2452         int32_t buf_no;
2453
2454         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2455         kprintf("destroy buffer %d:%d:%d\n", cluster->volume->vol_no, cluster->clu_no, buf_no);
2456         return (0);
2457 }
2458
2459 /*
2460  * Note: atblk can be negative and atblk - blk can go negative.
2461  */
2462 static int
2463 buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2464                       int32_t count, int32_t atblk, int32_t *fullp)
2465 {
2466         hammer_cluster_t cluster = info;
2467         hammer_buffer_t buffer;
2468         int32_t buf_no;
2469         int32_t r;
2470         int error = 0;
2471
2472         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2473         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2474         if (buffer) {
2475                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2476
2477                 hammer_modify_buffer(buffer);
2478                 r = hammer_alist_alloc_fwd(&buffer->alist, count, atblk - blk);
2479                 if (r != HAMMER_ALIST_BLOCK_NONE)
2480                         r += blk;
2481                 *fullp = hammer_alist_isfull(&buffer->alist);
2482                 hammer_rel_buffer(buffer, 0);
2483         } else {
2484                 r = HAMMER_ALIST_BLOCK_NONE;
2485                 *fullp = 1;
2486         }
2487         return(r);
2488 }
2489
2490 static int
2491 buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2492                       int32_t count, int32_t atblk, int32_t *fullp)
2493 {
2494         hammer_cluster_t cluster = info;
2495         hammer_buffer_t buffer;
2496         int32_t buf_no;
2497         int32_t r;
2498         int error = 0;
2499
2500         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2501         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2502         if (buffer) {
2503                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2504                 hammer_modify_buffer(buffer);
2505                 r = hammer_alist_alloc_rev(&buffer->alist, count, atblk - blk);
2506                 if (r != HAMMER_ALIST_BLOCK_NONE)
2507                         r += blk;
2508                 *fullp = hammer_alist_isfull(&buffer->alist);
2509                 hammer_rel_buffer(buffer, 0);
2510         } else {
2511                 r = HAMMER_ALIST_BLOCK_NONE;
2512                 *fullp = 1;
2513         }
2514         return(r);
2515 }
2516
2517 static void
2518 buffer_alist_free(void *info, int32_t blk, int32_t radix,
2519                  int32_t base_blk, int32_t count, int32_t *emptyp)
2520 {
2521         hammer_cluster_t cluster = info;
2522         hammer_buffer_t buffer;
2523         int32_t buf_no;
2524         int error = 0;
2525
2526         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2527         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2528         if (buffer) {
2529                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2530                 hammer_modify_buffer(buffer);
2531                 hammer_alist_free(&buffer->alist, base_blk, count);
2532                 *emptyp = hammer_alist_isempty(&buffer->alist);
2533                 hammer_rel_buffer(buffer, 0);
2534         } else {
2535                 *emptyp = 0;
2536         }
2537 }
2538
2539 static int32_t
2540 buffer_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk,
2541                   int flags)
2542 {
2543         hammer_cluster_t cluster = info;
2544         hammer_buffer_t buffer;
2545         int32_t buf_no;
2546         int32_t maxblks;
2547         int error = 0;
2548
2549         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2550         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2551         if (buffer) {
2552                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2553                 switch(buffer->ondisk->head.buf_type) {
2554                 case HAMMER_FSBUF_RECORDS:
2555                         maxblks = HAMMER_RECORD_NODES;
2556                         break;
2557                 case HAMMER_FSBUF_BTREE:
2558                         maxblks = HAMMER_BTREE_NODES;
2559                         break;
2560                 case HAMMER_FSBUF_DATA:
2561                         maxblks = HAMMER_DATA_NODES;
2562                         break;
2563                 default:
2564                         panic("buffer_alist_find: unknown buffer type");
2565                         maxblks = 0;
2566                         break;
2567                 }
2568                 blk = hammer_alist_find(&buffer->alist, atblk - blk, maxblks,
2569                                         flags);
2570                 hammer_rel_buffer(buffer, 0);
2571         } else {
2572                 blk = HAMMER_ALIST_BLOCK_NONE;
2573         }
2574         return(blk);
2575 }
2576
2577 static void
2578 buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2579 {
2580 }
2581
2582 /*
2583  * A-LIST - super-cluster recursion into a cluster and cluster recursion
2584  * into a filesystem buffer.  A-List's are mostly self-contained entities,
2585  * but callbacks must be installed to recurse from one A-List to another.
2586  *
2587  * Implementing these callbacks allows us to operate a multi-layered A-List
2588  * as a single entity.
2589  */
2590
2591 /*
2592  * This occurs when allocating a cluster via the volume a-list and the
2593  * entry in the volume a-list indicates all-free, but the underlying
2594  * supercl has not yet been initialized.
2595  */
2596 static int
2597 super_alist_init(void *info, int32_t blk, int32_t radix,
2598                  hammer_alloc_state_t state)
2599 {
2600         hammer_volume_t volume = info;
2601         hammer_supercl_t supercl;
2602         int32_t scl_no;
2603         int error = 0;
2604
2605         /*
2606          * Calculate the super-cluster number containing the cluster (blk)
2607          * and obtain the super-cluster buffer.
2608          */
2609         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2610         supercl = hammer_get_supercl(volume, scl_no, &error, state);
2611         if (supercl)
2612                 hammer_rel_supercl(supercl, 0);
2613         return (error);
2614 }
2615
2616 static int
2617 super_alist_recover(void *info, int32_t blk, int32_t radix, int32_t count)
2618 {
2619         hammer_volume_t volume = info;
2620         hammer_supercl_t supercl;
2621         int32_t scl_no;
2622         int error = 0;
2623
2624         /*
2625          * Calculate the super-cluster number containing the cluster (blk)
2626          * and obtain the super-cluster buffer.
2627          */
2628         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2629         supercl = hammer_get_supercl(volume, scl_no, &error,
2630                                      HAMMER_ASTATE_NONE);
2631         if (supercl) {
2632                 hammer_modify_supercl(supercl);
2633                 error = hammer_alist_recover(&supercl->alist, blk, 0, count);
2634                 /* free block count is returned if >= 0 */
2635                 hammer_rel_supercl(supercl, 0);
2636         } else {
2637                 error = -error;
2638         }
2639         return (error);
2640 }
2641
2642 /*
2643  * This occurs when freeing a cluster via the volume a-list and the
2644  * supercl is now 100% free.  We can destroy the supercl.
2645  *
2646  * What we actually do is just unset the modify bit so it doesn't get
2647  * written out.
2648  */
2649 static int
2650 super_alist_destroy(void *info, int32_t blk, int32_t radix)
2651 {
2652         hammer_volume_t volume = info;
2653         hammer_supercl_t supercl;
2654         int32_t scl_no;
2655         int error = 0;
2656
2657         /*
2658          * Calculate the super-cluster number containing the cluster (blk)
2659          * and obtain the super-cluster buffer.
2660          */
2661         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2662         if (hammer_find_supercl(volume, scl_no)) {
2663                 supercl = hammer_get_supercl(volume, scl_no, &error,
2664                                              HAMMER_ASTATE_FREE);
2665                                              /* XXX */
2666                 if (supercl) {
2667                         hammer_io_clear_modify(&supercl->io);
2668                         hammer_rel_supercl(supercl, 0);
2669                 }
2670         }
2671         return (error);
2672 }
2673
2674 static int
2675 super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2676                       int32_t count, int32_t atblk, int32_t *fullp)
2677 {
2678         hammer_volume_t volume = info;
2679         hammer_supercl_t supercl;
2680         int32_t scl_no;
2681         int32_t r;
2682         int error = 0;
2683
2684         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2685         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2686         if (supercl) {
2687                 hammer_modify_supercl(supercl);
2688                 r = hammer_alist_alloc_fwd(&supercl->alist, count, atblk - blk);
2689                 if (r != HAMMER_ALIST_BLOCK_NONE)
2690                         r += blk;
2691                 *fullp = hammer_alist_isfull(&supercl->alist);
2692                 hammer_rel_supercl(supercl, 0);
2693         } else {
2694                 r = HAMMER_ALIST_BLOCK_NONE;
2695                 *fullp = 1;
2696         }
2697         return(r);
2698 }
2699
2700 static int
2701 super_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2702                       int32_t count, int32_t atblk, int32_t *fullp)
2703 {
2704         hammer_volume_t volume = info;
2705         hammer_supercl_t supercl;
2706         int32_t scl_no;
2707         int32_t r;
2708         int error = 0;
2709
2710         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2711         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2712         if (supercl) {
2713                 hammer_modify_supercl(supercl);
2714                 r = hammer_alist_alloc_rev(&supercl->alist, count, atblk - blk);
2715                 if (r != HAMMER_ALIST_BLOCK_NONE)
2716                         r += blk;
2717                 *fullp = hammer_alist_isfull(&supercl->alist);
2718                 hammer_rel_supercl(supercl, 0);
2719         } else { 
2720                 r = HAMMER_ALIST_BLOCK_NONE;
2721                 *fullp = 1;
2722         }
2723         return(r);
2724 }
2725
2726 static void
2727 super_alist_free(void *info, int32_t blk, int32_t radix,
2728                  int32_t base_blk, int32_t count, int32_t *emptyp)
2729 {
2730         hammer_volume_t volume = info;
2731         hammer_supercl_t supercl;
2732         int32_t scl_no;
2733         int error = 0;
2734
2735         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2736         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2737         if (supercl) {
2738                 hammer_modify_supercl(supercl);
2739                 hammer_alist_free(&supercl->alist, base_blk, count);
2740                 *emptyp = hammer_alist_isempty(&supercl->alist);
2741                 hammer_rel_supercl(supercl, 0);
2742         } else {
2743                 *emptyp = 0;
2744         }
2745 }
2746
2747 static int32_t
2748 super_alist_find(void *info, int32_t blk, int32_t radix, int32_t atblk,
2749                   int flags)
2750 {
2751         hammer_volume_t volume = info;
2752         hammer_supercl_t supercl;
2753         int32_t scl_no;
2754         int32_t nclusters;
2755         int error = 0;
2756
2757         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2758         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2759         if (supercl) {
2760                 nclusters = supercl->volume->ondisk->vol_nclusters -
2761                             ((int64_t)supercl->scl_no * HAMMER_SCL_MAXCLUSTERS);
2762                 KKASSERT(nclusters > 0);
2763                 if (nclusters > HAMMER_SCL_MAXCLUSTERS)
2764                         nclusters = HAMMER_SCL_MAXCLUSTERS;
2765                 blk = hammer_alist_find(&supercl->alist, atblk - blk,
2766                                         nclusters, flags);
2767                 hammer_rel_supercl(supercl, 0);
2768         } else {
2769                 blk = HAMMER_ALIST_BLOCK_NONE;
2770         }
2771         return(blk);
2772 }
2773
2774 static void
2775 super_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2776 {
2777 }
2778
2779 void
2780 hammer_init_alist_config(void)
2781 {
2782         hammer_alist_config_t config;
2783
2784         hammer_alist_template(&Buf_alist_config, HAMMER_FSBUF_MAXBLKS,
2785                               1, HAMMER_FSBUF_METAELMS, 0);
2786         hammer_alist_template(&Vol_normal_alist_config, HAMMER_VOL_MAXCLUSTERS,
2787                               1, HAMMER_VOL_METAELMS_1LYR, 0);
2788         hammer_alist_template(&Vol_super_alist_config,
2789                           HAMMER_VOL_MAXSUPERCLUSTERS * HAMMER_SCL_MAXCLUSTERS,
2790                               HAMMER_SCL_MAXCLUSTERS, HAMMER_VOL_METAELMS_2LYR,
2791                               0);
2792         hammer_alist_template(&Supercl_alist_config, HAMMER_VOL_MAXCLUSTERS,
2793                               1, HAMMER_SUPERCL_METAELMS, 0);
2794         hammer_alist_template(&Clu_master_alist_config, HAMMER_CLU_MAXBUFFERS,
2795                               1, HAMMER_CLU_MASTER_METAELMS, 0);
2796         hammer_alist_template(&Clu_slave_alist_config,
2797                               HAMMER_CLU_MAXBUFFERS * HAMMER_FSBUF_MAXBLKS,
2798                               HAMMER_FSBUF_MAXBLKS, HAMMER_CLU_SLAVE_METAELMS,
2799                               1);
2800
2801         config = &Vol_super_alist_config;
2802         config->bl_radix_init = super_alist_init;
2803         config->bl_radix_recover = super_alist_recover;
2804         config->bl_radix_destroy = super_alist_destroy;
2805         config->bl_radix_alloc_fwd = super_alist_alloc_fwd;
2806         config->bl_radix_alloc_rev = super_alist_alloc_rev;
2807         config->bl_radix_free = super_alist_free;
2808         config->bl_radix_find = super_alist_find;
2809         config->bl_radix_print = super_alist_print;
2810
2811         config = &Clu_slave_alist_config;
2812         config->bl_radix_init = buffer_alist_init;
2813         config->bl_radix_recover = buffer_alist_recover;
2814         config->bl_radix_destroy = buffer_alist_destroy;
2815         config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd;
2816         config->bl_radix_alloc_rev = buffer_alist_alloc_rev;
2817         config->bl_radix_free = buffer_alist_free;
2818         config->bl_radix_find = buffer_alist_find;
2819         config->bl_radix_print = buffer_alist_print;
2820 }
2821