HAMMER 12/many - add VOPs for symlinks, device, and fifo support.
sys/vfs/hammer/hammer_ondisk.c (dragonfly.git)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.12 2007/12/30 00:47:22 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_supercl(hammer_supercl_t supercl, int isnew);
51 static int hammer_load_cluster(hammer_cluster_t cluster, int isnew);
52 static int hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type);
53 static void hammer_remove_node_clist(hammer_buffer_t buffer,
54                         hammer_node_t node);
55 static void initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head,
56                         u_int64_t type);
57 static void alloc_new_buffer(hammer_cluster_t cluster,
58                         hammer_alist_t live, u_int64_t type, int32_t nelements,
59                         int32_t start,
60                         int *errorp, struct hammer_buffer **bufferp);
61 #if 0
62 static void readhammerbuf(hammer_volume_t vol, void *data,
63                         int64_t offset);
64 static void writehammerbuf(hammer_volume_t vol, const void *data,
65                         int64_t offset);
66 #endif
67 static int64_t calculate_cluster_offset(hammer_volume_t vol, int32_t clu_no);
68 static int64_t calculate_supercl_offset(hammer_volume_t vol, int32_t scl_no);
69 static int32_t hammer_alloc_master(hammer_cluster_t cluster, int nblks,
70                         int32_t start, int isfwd);
71 static void hammer_adjust_stats(hammer_cluster_t cluster,
72                         u_int64_t buf_type, int nblks);
73
74 struct hammer_alist_config Buf_alist_config;
75 struct hammer_alist_config Vol_normal_alist_config;
76 struct hammer_alist_config Vol_super_alist_config;
77 struct hammer_alist_config Supercl_alist_config;
78 struct hammer_alist_config Clu_master_alist_config;
79 struct hammer_alist_config Clu_slave_alist_config;
80
81 /*
82  * Red-Black tree support for various structures
83  */
84 static int
85 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
86 {
87         if (ip1->obj_id < ip2->obj_id)
88                 return(-1);
89         if (ip1->obj_id > ip2->obj_id)
90                 return(1);
91         if (ip1->obj_asof < ip2->obj_asof)
92                 return(-1);
93         if (ip1->obj_asof > ip2->obj_asof)
94                 return(1);
95         return(0);
96 }
97
98 static int
99 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
100 {
101         if (info->obj_id < ip->obj_id)
102                 return(-1);
103         if (info->obj_id > ip->obj_id)
104                 return(1);
105         if (info->obj_asof < ip->obj_asof)
106                 return(-1);
107         if (info->obj_asof > ip->obj_asof)
108                 return(1);
109         return(0);
110 }
111
112 static int
113 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
114 {
115         if (vol1->vol_no < vol2->vol_no)
116                 return(-1);
117         if (vol1->vol_no > vol2->vol_no)
118                 return(1);
119         return(0);
120 }
121
122 static int
123 hammer_scl_rb_compare(hammer_supercl_t cl1, hammer_supercl_t cl2)
124 {
125         if (cl1->scl_no < cl2->scl_no)
126                 return(-1);
127         if (cl1->scl_no > cl2->scl_no)
128                 return(1);
129         return(0);
130 }
131
132 static int
133 hammer_clu_rb_compare(hammer_cluster_t cl1, hammer_cluster_t cl2)
134 {
135         if (cl1->clu_no < cl2->clu_no)
136                 return(-1);
137         if (cl1->clu_no > cl2->clu_no)
138                 return(1);
139         return(0);
140 }
141
142 static int
143 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
144 {
145         if (buf1->buf_no < buf2->buf_no)
146                 return(-1);
147         if (buf1->buf_no > buf2->buf_no)
148                 return(1);
149         return(0);
150 }
151
152 static int
153 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
154 {
155         if (node1->node_offset < node2->node_offset)
156                 return(-1);
157         if (node1->node_offset > node2->node_offset)
158                 return(1);
159         return(0);
160 }
161
162 /*
163  * Note: The lookup function for hammer_ino_rb_tree winds up being named
164  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
165  * functions are normal, e.g. hammer_clu_rb_tree_RB_LOOKUP(root, clu_no).
166  */
167 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
168 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
169                 hammer_inode_info_cmp, hammer_inode_info_t);
170 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
171              hammer_vol_rb_compare, int32_t, vol_no);
172 RB_GENERATE2(hammer_scl_rb_tree, hammer_supercl, rb_node,
173              hammer_scl_rb_compare, int32_t, scl_no);
174 RB_GENERATE2(hammer_clu_rb_tree, hammer_cluster, rb_node,
175              hammer_clu_rb_compare, int32_t, clu_no);
176 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
177              hammer_buf_rb_compare, int32_t, buf_no);
178 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
179              hammer_nod_rb_compare, int32_t, node_offset);
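/*
 * Illustrative usage of the generated lookups (the RB_LOOKUP() macro form
 * and the direct generated name are equivalent):
 *
 *      cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no);
 *      ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info);
 *
 * where info is a hammer_inode_info_t describing the (obj_id, obj_asof)
 * key and root is the mount's inode tree.
 */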
180
181 /************************************************************************
182  *                              VOLUMES                                 *
183  ************************************************************************
184  *
185  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
186  * code on failure.  Volumes must be loaded at mount time; get_volume() will
187  * not load a new volume.
188  *
189  * Calls made to hammer_load_volume() are single-threaded.
190  */
191 int
192 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
193 {
194         struct mount *mp;
195         hammer_volume_t volume;
196         struct hammer_volume_ondisk *ondisk;
197         struct nlookupdata nd;
198         struct buf *bp = NULL;
199         int error;
200         int ronly;
201
202         mp = hmp->mp;
203         ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
204
205         /*
206          * Allocate a volume structure
207          */
208         volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
209         volume->vol_name = kstrdup(volname, M_HAMMER);
210         volume->hmp = hmp;
211         volume->io.type = HAMMER_STRUCTURE_VOLUME;
212         volume->io.offset = 0LL;
213
214         /*
215          * Get the device vnode
216          */
217         error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
218         if (error == 0)
219                 error = nlookup(&nd);
220         if (error == 0)
221                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
222         nlookup_done(&nd);
223         if (error == 0) {
224                 vn_isdisk(volume->devvp, &error);
225         }
226         if (error == 0) {
227                 vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
228                 error = VOP_OPEN(volume->devvp, (ronly ? FREAD : FREAD|FWRITE),
229                                  FSCRED, NULL);
230                 vn_unlock(volume->devvp);
231         }
232         if (error) {
233                 hammer_free_volume(volume);
234                 return(error);
235         }
236
237         /*
238          * Extract the volume number from the volume header and do various
239          * sanity checks.
240          */
241         error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
242         if (error)
243                 goto late_failure;
244         ondisk = (void *)bp->b_data;
245         if (ondisk->head.buf_type != HAMMER_FSBUF_VOLUME) {
246                 kprintf("hammer_mount: volume %s has an invalid header\n",
247                         volume->vol_name);
248                 error = EFTYPE;
249                 goto late_failure;
250         }
251         volume->vol_no = ondisk->vol_no;
252         volume->cluster_base = ondisk->vol_clo_beg;
253         volume->vol_clsize = ondisk->vol_clsize;
254         volume->vol_flags = ondisk->vol_flags;
255         volume->nblocks = ondisk->vol_nblocks; 
256         RB_INIT(&volume->rb_clus_root);
257         RB_INIT(&volume->rb_scls_root);
258
259         hmp->mp->mnt_stat.f_blocks += volume->nblocks;
260
261         if (RB_EMPTY(&hmp->rb_vols_root)) {
262                 hmp->fsid = ondisk->vol_fsid;
263         } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
264                 kprintf("hammer_mount: volume %s's fsid does not match "
265                         "other volumes\n", volume->vol_name);
266                 error = EFTYPE;
267                 goto late_failure;
268         }
269
270         /*
271          * Insert the volume structure into the red-black tree.
272          */
273         if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
274                 kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
275                         volume->vol_name, volume->vol_no);
276                 error = EEXIST;
277         }
278
279         /*
280          * Set the root volume and load the root cluster.  HAMMER special
281          * cases rootvol and rootcl and will not deallocate the structures.
282          * We do not hold a ref because this would prevent related I/O
283          * from being flushed.
284          */
285         if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
286                 hmp->rootvol = volume;
287                 hmp->rootcl = hammer_get_cluster(volume,
288                                                  ondisk->vol0_root_clu_no,
289                                                  &error, 0);
290                 if (hmp->rootcl)
                            hammer_rel_cluster(hmp->rootcl, 0);
291                 hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
292         }
293 late_failure:
294         if (bp)
295                 brelse(bp);
296         if (error) {
297                 /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
298                 VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
299                 hammer_free_volume(volume);
300         }
301         return (error);
302 }
303
304 /*
305  * Unload and free a HAMMER volume.  Must return >= 0 for the RB_SCAN to
306  * continue; this routine always returns 0.
307  */
308 int
309 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
310 {
311         struct hammer_mount *hmp = volume->hmp;
312         hammer_cluster_t rootcl;
313         int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
314
315         /*
316          * Sync clusters, sync volume
317          */
318
319         hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
320
321         /*
322          * Clean up the root cluster, which is held unlocked in the root
323          * volume.
324          */
325         if (hmp->rootvol == volume) {
326                 if ((rootcl = hmp->rootcl) != NULL)
327                         hmp->rootcl = NULL;
328                 hmp->rootvol = NULL;
329         }
330
331         /*
332          * Unload clusters and super-clusters.  Unloading a super-cluster
333          * also unloads related clusters, but the filesystem may not be
334          * using super-clusters so unload clusters anyway.
335          */
336         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
337                         hammer_unload_cluster, NULL);
338         RB_SCAN(hammer_scl_rb_tree, &volume->rb_scls_root, NULL,
339                         hammer_unload_supercl, NULL);
340
341         /*
342          * Release our buffer and flush anything left in the buffer cache.
343          */
344         hammer_io_release(&volume->io, 1);
345
346         /*
347          * There should be no references on the volume, no clusters, and
348          * no super-clusters.
349          */
350         KKASSERT(volume->io.lock.refs == 0);
351         KKASSERT(RB_EMPTY(&volume->rb_clus_root));
352         KKASSERT(RB_EMPTY(&volume->rb_scls_root));
353
354         volume->ondisk = NULL;
355         if (volume->devvp) {
356                 if (ronly) {
357                         vinvalbuf(volume->devvp, 0, 0, 0);
358                         VOP_CLOSE(volume->devvp, FREAD);
359                 } else {
360                         vinvalbuf(volume->devvp, V_SAVE, 0, 0);
361                         VOP_CLOSE(volume->devvp, FREAD|FWRITE);
362                 }
363         }
364
365         /*
366          * Destroy the structure
367          */
368         RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
369         hammer_free_volume(volume);
370         return(0);
371 }
372
373 static
374 void
375 hammer_free_volume(hammer_volume_t volume)
376 {
377         if (volume->vol_name) {
378                 kfree(volume->vol_name, M_HAMMER);
379                 volume->vol_name = NULL;
380         }
381         if (volume->devvp) {
382                 vrele(volume->devvp);
383                 volume->devvp = NULL;
384         }
385         kfree(volume, M_HAMMER);
386 }
387
388 /*
389  * Get a HAMMER volume.  The volume must already exist.
390  */
391 hammer_volume_t
392 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
393 {
394         struct hammer_volume *volume;
395
396         /*
397          * Locate the volume structure
398          */
399         volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
400         if (volume == NULL) {
401                 *errorp = ENOENT;
402                 return(NULL);
403         }
404         hammer_ref(&volume->io.lock);
405
406         /*
407          * Deal with on-disk info
408          */
409         if (volume->ondisk == NULL) {
410                 *errorp = hammer_load_volume(volume);
411                 if (*errorp) {
412                         hammer_rel_volume(volume, 1);
413                         volume = NULL;
414                 }
415         } else {
416                 *errorp = 0;
417         }
418         return(volume);
419 }
420
421 int
422 hammer_ref_volume(hammer_volume_t volume)
423 {
424         int error;
425
426         hammer_ref(&volume->io.lock);
427
428         /*
429          * Deal with on-disk info
430          */
431         if (volume->ondisk == NULL) {
432                 error = hammer_load_volume(volume);
433                 if (error)
434                         hammer_rel_volume(volume, 1);
435         } else {
436                 error = 0;
437         }
438         return (error);
439 }
440
441 hammer_volume_t
442 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
443 {
444         hammer_volume_t volume;
445
446         volume = hmp->rootvol;
447         KKASSERT(volume != NULL);
448         hammer_ref(&volume->io.lock);
449
450         /*
451          * Deal with on-disk info
452          */
453         if (volume->ondisk == NULL) {
454                 *errorp = hammer_load_volume(volume);
455                 if (*errorp) {
456                         hammer_rel_volume(volume, 1);
457                         volume = NULL;
458                 }
459         } else {
460                 *errorp = 0;
461         }
462         return (volume);
463 }
464
465 /*
466  * Load a volume's on-disk information.  The volume must be referenced and
467  * not locked.  We temporarily acquire an exclusive lock to interlock
468  * against releases or multiple gets.
469  */
470 static int
471 hammer_load_volume(hammer_volume_t volume)
472 {
473         struct hammer_volume_ondisk *ondisk;
474         int error;
475
476         hammer_lock_ex(&volume->io.lock);
477         if (volume->ondisk == NULL) {
478                 error = hammer_io_read(volume->devvp, &volume->io);
479                 if (error) {
480                         hammer_unlock(&volume->io.lock);
481                         return (error);
482                 }
483                 volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
484
485                 /*
486                  * Configure the volume's A-lists.  These are used to
487                  * allocate clusters.
488                  */
489                 if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
490                         volume->alist.config = &Vol_super_alist_config;
491                         volume->alist.meta = ondisk->vol_almeta.super;
492                         volume->alist.info = volume;
493                 } else {
494                         volume->alist.config = &Vol_normal_alist_config;
495                         volume->alist.meta = ondisk->vol_almeta.normal;
496                         volume->alist.info = NULL;
497                 }
498         } else {
499                 error = 0;
500         }
501         hammer_unlock(&volume->io.lock);
502         return(error);
503 }
504
505 /*
506  * Release a volume.  Call hammer_io_release on the last reference.  We have
507  * to acquire an exclusive lock to interlock against volume->ondisk tests
508  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
509  * lock to be held.
510  *
511  * Volumes are not unloaded from memory during normal operation.
512  */
513 void
514 hammer_rel_volume(hammer_volume_t volume, int flush)
515 {
516         if (volume->io.lock.refs == 1) {
517                 hammer_lock_ex(&volume->io.lock);
518                 if (volume->io.lock.refs == 1) {
519                         volume->ondisk = NULL;
520                         hammer_io_release(&volume->io, flush);
521                 }
522                 hammer_unlock(&volume->io.lock);
523         }
524         hammer_unref(&volume->io.lock);
525 }
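#if 0
/*
 * Illustrative sketch only: the typical get/use/release pattern for a
 * volume.  The function name is hypothetical; real callers in this file
 * follow the same sequence.
 */
static void
example_volume_access(struct hammer_mount *hmp, int32_t vol_no)
{
        hammer_volume_t volume;
        int error;

        volume = hammer_get_volume(hmp, vol_no, &error);
        if (volume) {
                /* volume->ondisk is valid while the reference is held */
                hammer_rel_volume(volume, 0);
        }
}
#endif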
526
527 /************************************************************************
528  *                              SUPER-CLUSTERS                          *
529  ************************************************************************
530  *
531  * Manage super-clusters.  Note that a supercl holds a reference to its
532  * associated volume.
533  */
534 hammer_supercl_t
535 hammer_get_supercl(hammer_volume_t volume, int32_t scl_no,
536                    int *errorp, int isnew)
537 {
538         hammer_supercl_t supercl;
539
540         /*
541          * Locate and lock the super-cluster structure, creating one
542          * if necessary.
543          */
544 again:
545         supercl = RB_LOOKUP(hammer_scl_rb_tree, &volume->rb_scls_root, scl_no);
546         if (supercl == NULL) {
547                 supercl = kmalloc(sizeof(*supercl), M_HAMMER, M_WAITOK|M_ZERO);
548                 supercl->scl_no = scl_no;
549                 supercl->volume = volume;
550                 supercl->io.offset = calculate_supercl_offset(volume, scl_no);
551                 supercl->io.type = HAMMER_STRUCTURE_SUPERCL;
552                 hammer_ref(&supercl->io.lock);
553
554                 /*
555                  * Insert the super-cluster into the RB tree and handle late
556                  * collisions.
557                  */
558                 if (RB_INSERT(hammer_scl_rb_tree, &volume->rb_scls_root, supercl)) {
559                         hammer_unref(&supercl->io.lock);
560                         kfree(supercl, M_HAMMER);
561                         goto again;
562                 }
563                 hammer_ref(&volume->io.lock);
564         } else {
565                 hammer_ref(&supercl->io.lock);
566         }
567
568         /*
569          * Deal with on-disk info
570          */
571         if (supercl->ondisk == NULL || isnew) {
572                 *errorp = hammer_load_supercl(supercl, isnew);
573                 if (*errorp) {
574                         hammer_rel_supercl(supercl, 1);
575                         supercl = NULL;
576                 }
577         } else {
578                 *errorp = 0;
579         }
580         return(supercl);
581 }
582
583 static int
584 hammer_load_supercl(hammer_supercl_t supercl, int isnew)
585 {
586         struct hammer_supercl_ondisk *ondisk;
587         hammer_volume_t volume = supercl->volume;
588         int error;
589
590         hammer_lock_ex(&supercl->io.lock);
591         if (supercl->ondisk == NULL) {
592                 if (isnew)
593                         error = hammer_io_new(volume->devvp, &supercl->io);
594                 else
595                         error = hammer_io_read(volume->devvp, &supercl->io);
596                 if (error) {
597                         hammer_unlock(&supercl->io.lock);
598                         return (error);
599                 }
600                 supercl->ondisk = ondisk = (void *)supercl->io.bp->b_data;
601
602                 supercl->alist.config = &Supercl_alist_config;
603                 supercl->alist.meta = ondisk->scl_meta;
604                 supercl->alist.info = NULL;
605         } else if (isnew) {
606                 error = hammer_io_new(volume->devvp, &supercl->io);
607         } else {
608                 error = 0;
609         }
610         if (error == 0 && isnew) {
611                 /*
612                  * If this is a new super-cluster we have to initialize
613                  * various ondisk structural elements.  The caller is
614                  * responsible for the remainder.
615                  */
616                 struct hammer_alist_live dummy;
617
618                 ondisk = supercl->ondisk;
619                 dummy.config = &Buf_alist_config;
620                 dummy.meta = ondisk->head.buf_almeta;
621                 dummy.info = NULL;
622                 initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_SUPERCL);
623                 hammer_alist_init(&supercl->alist);
624         }
625         hammer_unlock(&supercl->io.lock);
626         return (error);
627 }
628
629 /*
630  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
631  */
632 int
633 hammer_unload_supercl(hammer_supercl_t supercl, void *data __unused)
634 {
635         KKASSERT(supercl->io.lock.refs == 0);
636         hammer_ref(&supercl->io.lock);
637         hammer_rel_supercl(supercl, 1);
638         return(0);
639 }
640
641 /*
642  * Release a super-cluster.  We have to deal with several places where
643  * another thread can ref the super-cluster.
644  *
645  * Only destroy the structure itself if the related buffer cache buffer
646  * was disassociated from it.  This ties the management of the structure
647  * to the buffer cache subsystem.
648  */
649 void
650 hammer_rel_supercl(hammer_supercl_t supercl, int flush)
651 {
652         hammer_volume_t volume;
653
654         if (supercl->io.lock.refs == 1) {
655                 hammer_lock_ex(&supercl->io.lock);
656                 if (supercl->io.lock.refs == 1) {
657                         hammer_io_release(&supercl->io, flush);
658                         if (supercl->io.bp == NULL &&
659                             supercl->io.lock.refs == 1) {
660                                 volume = supercl->volume;
661                                 RB_REMOVE(hammer_scl_rb_tree,
662                                           &volume->rb_scls_root, supercl);
663                                 supercl->volume = NULL; /* sanity */
664                                 kfree(supercl, M_HAMMER);
665                                 hammer_rel_volume(volume, 0);
666                                 return;
667                         }
668                 }
669                 hammer_unlock(&supercl->io.lock);
670         }
671         hammer_unref(&supercl->io.lock);
672 }
673
674 /************************************************************************
675  *                              CLUSTERS                                *
676  ************************************************************************
677  *
678  * Manage clusters.  Note that a cluster holds a reference to its
679  * associated volume.
680  */
681 hammer_cluster_t
682 hammer_get_cluster(hammer_volume_t volume, int32_t clu_no,
683                    int *errorp, int isnew)
684 {
685         hammer_cluster_t cluster;
686
687 again:
688         cluster = RB_LOOKUP(hammer_clu_rb_tree, &volume->rb_clus_root, clu_no);
689         if (cluster == NULL) {
690                 cluster = kmalloc(sizeof(*cluster), M_HAMMER, M_WAITOK|M_ZERO);
691                 cluster->clu_no = clu_no;
692                 cluster->volume = volume;
693                 cluster->io.offset = calculate_cluster_offset(volume, clu_no);
694                 cluster->state = HAMMER_CLUSTER_IDLE;
695                 RB_INIT(&cluster->rb_bufs_root);
696                 RB_INIT(&cluster->rb_nods_root);
697                 cluster->io.type = HAMMER_STRUCTURE_CLUSTER;
698                 hammer_ref(&cluster->io.lock);
699
700                 /*
701                  * Insert the cluster into the RB tree and handle late
702                  * collisions.
703                  */
704                 if (RB_INSERT(hammer_clu_rb_tree, &volume->rb_clus_root, cluster)) {
705                         hammer_unref(&cluster->io.lock);
706                         kfree(cluster, M_HAMMER);
707                         goto again;
708                 }
709                 hammer_ref(&volume->io.lock);
710         } else {
711                 hammer_ref(&cluster->io.lock);
712         }
713
714         /*
715          * Deal with on-disk info
716          */
717         if (cluster->ondisk == NULL || isnew) {
718                 *errorp = hammer_load_cluster(cluster, isnew);
719                 if (*errorp) {
720                         hammer_rel_cluster(cluster, 1);
721                         cluster = NULL;
722                 }
723         } else {
724                 *errorp = 0;
725         }
726         return (cluster);
727 }
728
729 hammer_cluster_t
730 hammer_get_root_cluster(struct hammer_mount *hmp, int *errorp)
731 {
732         hammer_cluster_t cluster;
733
734         cluster = hmp->rootcl;
735         KKASSERT(cluster != NULL);
736         hammer_ref(&cluster->io.lock);
737
738         /*
739          * Deal with on-disk info
740          */
741         if (cluster->ondisk == NULL) {
742                 *errorp = hammer_load_cluster(cluster, 0);
743                 if (*errorp) {
744                         hammer_rel_cluster(cluster, 1);
745                         cluster = NULL;
746                 }
747         } else {
748                 *errorp = 0;
749         }
750         return (cluster);
751 }
752
753 static
754 int
755 hammer_load_cluster(hammer_cluster_t cluster, int isnew)
756 {
757         hammer_volume_t volume = cluster->volume;
758         struct hammer_cluster_ondisk *ondisk;
759         int error;
760
761         /*
762          * Load the cluster's on-disk info
763          */
764         hammer_lock_ex(&cluster->io.lock);
765         if (cluster->ondisk == NULL) {
766                 if (isnew)
767                         error = hammer_io_new(volume->devvp, &cluster->io);
768                 else
769                         error = hammer_io_read(volume->devvp, &cluster->io);
770                 if (error) {
771                         hammer_unlock(&cluster->io.lock);
772                         return (error);
773                 }
774                 cluster->ondisk = ondisk = (void *)cluster->io.bp->b_data;
775
776                 cluster->alist_master.config = &Clu_master_alist_config;
777                 cluster->alist_master.meta = ondisk->clu_master_meta;
778                 cluster->alist_btree.config = &Clu_slave_alist_config;
779                 cluster->alist_btree.meta = ondisk->clu_btree_meta;
780                 cluster->alist_btree.info = cluster;
781                 cluster->alist_record.config = &Clu_slave_alist_config;
782                 cluster->alist_record.meta = ondisk->clu_record_meta;
783                 cluster->alist_record.info = cluster;
784                 cluster->alist_mdata.config = &Clu_slave_alist_config;
785                 cluster->alist_mdata.meta = ondisk->clu_mdata_meta;
786                 cluster->alist_mdata.info = cluster;
787
788                 if (isnew == 0) {
789                         cluster->clu_btree_beg = ondisk->clu_btree_beg;
790                         cluster->clu_btree_end = ondisk->clu_btree_end;
791                 }
792         } else if (isnew) {
793                 error = hammer_io_new(volume->devvp, &cluster->io);
794         } else {
795                 error = 0;
796         }
797         if (error == 0 && isnew) {
798                 /*
799                  * If this is a new cluster we have to initialize
800                  * various ondisk structural elements.  The caller is
801                  * responsible for the remainder.
802                  */
803                 struct hammer_alist_live dummy;
804                 hammer_node_t croot;
805                 hammer_volume_ondisk_t voldisk;
806                 int32_t nbuffers;
807
808                 ondisk = cluster->ondisk;
809                 voldisk = volume->ondisk;
810
811                 dummy.config = &Buf_alist_config;
812                 dummy.meta = ondisk->head.buf_almeta;
813                 dummy.info = NULL;
814                 initbuffer(&dummy, &ondisk->head, HAMMER_FSBUF_CLUSTER);
815
816                 hammer_alist_init(&cluster->alist_master);
817                 hammer_alist_init(&cluster->alist_btree);
818                 hammer_alist_init(&cluster->alist_record);
819                 hammer_alist_init(&cluster->alist_mdata);
820
821                 ondisk->vol_fsid = voldisk->vol_fsid;
822                 ondisk->vol_fstype = voldisk->vol_fstype;
823                 ondisk->clu_gen = 1;
824                 ondisk->clu_id = 0;     /* XXX */
825                 ondisk->clu_no = cluster->clu_no;
826                 ondisk->clu_flags = 0;
827                 ondisk->clu_start = HAMMER_BUFSIZE;
828                 KKASSERT(voldisk->vol_clo_end > cluster->io.offset);
829                 if (voldisk->vol_clo_end - cluster->io.offset >
830                     voldisk->vol_clsize) {
831                         ondisk->clu_limit = voldisk->vol_clsize;
832                 } else {
833                         ondisk->clu_limit = (int32_t)(voldisk->vol_clo_end -
834                                                       cluster->io.offset);
835                 }
836                 nbuffers = ondisk->clu_limit / HAMMER_BUFSIZE;
837                 hammer_alist_free(&cluster->alist_master, 1, nbuffers - 1);
838                 ondisk->idx_data = 1 * HAMMER_FSBUF_MAXBLKS;
839                 ondisk->idx_index = 0 * HAMMER_FSBUF_MAXBLKS;
840                 ondisk->idx_record = nbuffers * HAMMER_FSBUF_MAXBLKS;
841
842                 /*
843                  * Initialize the B-Tree.  We don't know what the caller
844                  * intends to do with the cluster so make sure it causes
845                  * an assertion if the caller makes no changes.
846                  */
847                 ondisk->clu_btree_parent_vol_no = -2;
848                 ondisk->clu_btree_parent_clu_no = -2;
849                 ondisk->clu_btree_parent_offset = -2;
850                 ondisk->clu_btree_parent_clu_gen = -2;
851
852                 croot = hammer_alloc_btree(cluster, &error);
853                 if (error == 0) {
854                         bzero(croot->ondisk, sizeof(*croot->ondisk));
855                         croot->ondisk->count = 0;
856                         croot->ondisk->type = HAMMER_BTREE_TYPE_LEAF;
857                         ondisk->clu_btree_root = croot->node_offset;
858                         hammer_modify_cluster(cluster);
859                 }
860         }
861         hammer_unlock(&cluster->io.lock);
862         return (error);
863 }
864
865 /*
866  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
867  */
868 int
869 hammer_unload_cluster(hammer_cluster_t cluster, void *data __unused)
870 {
871         hammer_ref(&cluster->io.lock);
872         RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
873                 hammer_unload_buffer, NULL);
874         KKASSERT(cluster->io.lock.refs == 1);
875         hammer_rel_cluster(cluster, 1);
876         return(0);
877 }
878
879 /*
880  * Reference a cluster that is either already referenced or is reached via
881  * a specially handled pointer (aka rootcl).
882  */
883 int
884 hammer_ref_cluster(hammer_cluster_t cluster)
885 {
886         int error;
887
888         KKASSERT(cluster != NULL);
889         hammer_ref(&cluster->io.lock);
890
891         /*
892          * Deal with on-disk info
893          */
894         if (cluster->ondisk == NULL) {
895                 error = hammer_load_cluster(cluster, 0);
896                 if (error)
897                         hammer_rel_cluster(cluster, 1);
898         } else {
899                 error = 0;
900         }
901         return(error);
902 }
903
904 /*
905  * Release a cluster.  We have to deal with several places where
906  * another thread can ref the cluster.
907  *
908  * Only destroy the structure itself if the related buffer cache buffer
909  * was disassociated from it.  This ties the management of the structure
910  * to the buffer cache subsystem.
911  */
912 void
913 hammer_rel_cluster(hammer_cluster_t cluster, int flush)
914 {
915         hammer_node_t node;
916         hammer_volume_t volume;
917
918         if (cluster->io.lock.refs == 1) {
919                 hammer_lock_ex(&cluster->io.lock);
920                 if (cluster->io.lock.refs == 1) {
921                         /*
922                          * Release the I/O.  If we or the kernel wants to
923                          * flush, this will release the bp.  Otherwise the
924                          * bp may be written and flushed passively by the
925                          * kernel later on.
926                          */
927                         hammer_io_release(&cluster->io, flush);
928
929                         /*
930                          * The B-Tree node cache is not counted in the
931                          * cluster's reference count.  Clean out the
932                          * cache.
933                          *
934                          * If the cluster acquires a new reference while we
935                          * are trying to clean it out, abort the cleaning.
936                          * 
937                          * Any actively referenced nodes will reference the
938                          * related buffer and cluster, so a ref count check
939                          * should be sufficient.
940                          */
941                         while (cluster->io.bp == NULL &&
942                                cluster->io.lock.refs == 1 &&
943                                (node = RB_ROOT(&cluster->rb_nods_root)) != NULL
944                         ) {
945                                 KKASSERT(node->lock.refs == 0);
946                                 hammer_flush_node(node);
947                         }
948
949                         /*
950                          * Final cleanup
951                          */
952                         if (cluster != cluster->volume->hmp->rootcl &&
953                             cluster->io.bp == NULL &&
954                             cluster->io.lock.refs == 1 &&
955                             RB_EMPTY(&cluster->rb_nods_root)) {
956                                 KKASSERT(RB_EMPTY(&cluster->rb_bufs_root));
957                                 volume = cluster->volume;
958                                 RB_REMOVE(hammer_clu_rb_tree,
959                                           &volume->rb_clus_root, cluster);
960                                 cluster->volume = NULL; /* sanity */
961                                 kfree(cluster, M_HAMMER);
962                                 hammer_rel_volume(volume, 0);
963                                 return;
964                         }
965                 }
966                 hammer_unlock(&cluster->io.lock);
967         }
968         hammer_unref(&cluster->io.lock);
969 }
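#if 0
/*
 * Illustrative sketch only: acquiring an existing cluster (isnew == 0).
 * Passing a non-zero isnew instead creates and initializes a fresh cluster
 * header (see hammer_load_cluster above).  The function name is
 * hypothetical.
 */
static void
example_cluster_access(hammer_volume_t volume, int32_t clu_no)
{
        hammer_cluster_t cluster;
        int error;

        cluster = hammer_get_cluster(volume, clu_no, &error, 0);
        if (cluster) {
                /* cluster->ondisk is valid while the reference is held */
                hammer_rel_cluster(cluster, 0);
        }
}
#endif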
970
971 /************************************************************************
972  *                              BUFFERS                                 *
973  ************************************************************************
974  *
975  * Manage buffers.  Note that a buffer holds a reference to its associated
976  * cluster, and its cluster will hold a reference to the cluster's volume.
977  *
978  * A non-zero buf_type indicates that a new buffer should be created and
979  * zero'd.
980  */
981 hammer_buffer_t
982 hammer_get_buffer(hammer_cluster_t cluster, int32_t buf_no,
983                   u_int64_t buf_type, int *errorp)
984 {
985         hammer_buffer_t buffer;
986
987         /*
988          * Find the buffer.  Note that buffer 0 corresponds to the cluster
989          * header and should never be requested.
990          */
991         KKASSERT(buf_no >= cluster->ondisk->clu_start / HAMMER_BUFSIZE &&
992                  buf_no < cluster->ondisk->clu_limit / HAMMER_BUFSIZE);
993
994         /*
995          * Locate and lock the buffer structure, creating one if necessary.
996          */
997 again:
998         buffer = RB_LOOKUP(hammer_buf_rb_tree, &cluster->rb_bufs_root, buf_no);
999         if (buffer == NULL) {
1000                 buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
1001                 buffer->buf_no = buf_no;
1002                 buffer->cluster = cluster;
1003                 buffer->volume = cluster->volume;
1004                 buffer->io.offset = cluster->io.offset +
1005                                     (buf_no * HAMMER_BUFSIZE);
1006                 buffer->io.type = HAMMER_STRUCTURE_BUFFER;
1007                 TAILQ_INIT(&buffer->clist);
1008                 hammer_ref(&buffer->io.lock);
1009
1010                 /*
1011                  * Insert the buffer into the RB tree and handle late
1012                  * collisions.
1013                  */
1014                 if (RB_INSERT(hammer_buf_rb_tree, &cluster->rb_bufs_root, buffer)) {
1015                         hammer_unref(&buffer->io.lock);
1016                         kfree(buffer, M_HAMMER);
1017                         goto again;
1018                 }
1019                 hammer_ref(&cluster->io.lock);
1020         } else {
1021                 hammer_ref(&buffer->io.lock);
1022         }
1023
1024         /*
1025          * Deal with on-disk info
1026          */
1027         if (buffer->ondisk == NULL || buf_type) {
1028                 *errorp = hammer_load_buffer(buffer, buf_type);
1029                 if (*errorp) {
1030                         hammer_rel_buffer(buffer, 1);
1031                         buffer = NULL;
1032                 }
1033         } else {
1034                 *errorp = 0;
1035         }
1036         return(buffer);
1037 }
1038
1039 static int
1040 hammer_load_buffer(hammer_buffer_t buffer, u_int64_t buf_type)
1041 {
1042         hammer_volume_t volume;
1043         hammer_fsbuf_ondisk_t ondisk;
1044         int error;
1045
1046         /*
1047          * Load the buffer's on-disk info
1048          */
1049         volume = buffer->volume;
1050         hammer_lock_ex(&buffer->io.lock);
1051         if (buffer->ondisk == NULL) {
1052                 if (buf_type) {
1053                         error = hammer_io_new(volume->devvp, &buffer->io);
1054                 } else {
1055                         error = hammer_io_read(volume->devvp, &buffer->io);
1056                 }
1057                 if (error) {
1058                         hammer_unlock(&buffer->io.lock);
1059                         return (error);
1060                 }
1061                 buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
1062                 buffer->alist.config = &Buf_alist_config;
1063                 buffer->alist.meta = ondisk->head.buf_almeta;
1064                 buffer->buf_type = ondisk->head.buf_type;
1065         } else if (buf_type) {
1066                 error = hammer_io_new(volume->devvp, &buffer->io);
1067         } else {
1068                 error = 0;
1069         }
1070         if (error == 0 && buf_type) {
1071                 ondisk = buffer->ondisk;
1072                 initbuffer(&buffer->alist, &ondisk->head, buf_type);
1073                 buffer->buf_type = ondisk->head.buf_type;
1074         }
1075         hammer_unlock(&buffer->io.lock);
1076         return (error);
1077 }
1078
1079 /*
1080  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
1081  */
1082 int
1083 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
1084 {
1085         hammer_ref(&buffer->io.lock);
1086         hammer_flush_buffer_nodes(buffer);
1087         KKASSERT(buffer->io.lock.refs == 1);
1088         hammer_rel_buffer(buffer, 1);
1089         return(0);
1090 }
1091
1092 /*
1093  * Reference a buffer that is either already referenced or is reached via
1094  * a specially handled pointer (aka cursor->buffer).
1095  */
1096 int
1097 hammer_ref_buffer(hammer_buffer_t buffer)
1098 {
1099         int error;
1100
1101         hammer_ref(&buffer->io.lock);
1102         if (buffer->ondisk == NULL) {
1103                 error = hammer_load_buffer(buffer, 0);
1104                 if (error) {
1105                         hammer_rel_buffer(buffer, 1);
1106                         /*
1107                          * NOTE: buffer pointer can become stale after
1108                          * the above release.
1109                          */
1110                 } else {
1111                         KKASSERT(buffer->buf_type ==
1112                                  buffer->ondisk->head.buf_type);
1113                 }
1114         } else {
1115                 error = 0;
1116         }
1117         return(error);
1118 }
1119
1120 /*
1121  * Release a buffer.  We have to deal with several places where
1122  * another thread can ref the buffer.
1123  *
1124  * Only destroy the structure itself if the related buffer cache buffer
1125  * was disassociated from it.  This ties the management of the structure
1126  * to the buffer cache subsystem.  buffer->ondisk determines whether the
1127  * embedded io is referenced or not.
1128  */
1129 void
1130 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
1131 {
1132         hammer_cluster_t cluster;
1133         hammer_node_t node;
1134
1135         if (buffer->io.lock.refs == 1) {
1136                 hammer_lock_ex(&buffer->io.lock);
1137                 if (buffer->io.lock.refs == 1) {
1138                         hammer_io_release(&buffer->io, flush);
1139
1140                         /*
1141                          * Clean out the B-Tree node cache, if any, then
1142                          * clean up the cluster ref and free the buffer.
1143                          *
1144                          * If the buffer acquires a new reference while we
1145                          * are trying to clean it out, abort the cleaning.
1146                          */
1147                         while (buffer->io.bp == NULL &&
1148                                buffer->io.lock.refs == 1 &&
1149                                (node = TAILQ_FIRST(&buffer->clist)) != NULL
1150                         ) {
1151                                 KKASSERT(node->lock.refs == 0);
1152                                 hammer_flush_node(node);
1153                         }
1154                         if (buffer->io.bp == NULL &&
1155                             hammer_islastref(&buffer->io.lock)) {
1156                                 cluster = buffer->cluster;
1157                                 RB_REMOVE(hammer_buf_rb_tree,
1158                                           &cluster->rb_bufs_root, buffer);
1159                                 buffer->cluster = NULL; /* sanity */
1160                                 kfree(buffer, M_HAMMER);
1161                                 hammer_rel_cluster(cluster, 0);
1162                                 return;
1163                         }
1164                 }
1165                 hammer_unlock(&buffer->io.lock);
1166         }
1167         hammer_unref(&buffer->io.lock);
1168 }
1169
1170 /*
1171  * Flush passively cached B-Tree nodes associated with this buffer.
1172  *
1173  * NOTE: The buffer is referenced and locked.
1174  */
1175 void
1176 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1177 {
1178         hammer_node_t node;
1179
1180         node = TAILQ_FIRST(&buffer->clist);
1181         while (node) {
1182                 buffer->save_scan = TAILQ_NEXT(node, entry);
1183                 if (node->lock.refs == 0)
1184                         hammer_flush_node(node);
1185                 node = buffer->save_scan;
1186         }
1187 }
1188
1189 /************************************************************************
1190  *                              NODES                                   *
1191  ************************************************************************
1192  *
1193  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
1194  * method used by the HAMMER filesystem.
1195  *
1196  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
1197  * associated with its buffer.  It can have an active buffer reference
1198  * even when the node itself has no references.  The node also passively
1199  * associates itself with its cluster without holding any cluster refs.
1200  * The cluster ref is indirectly maintained by the active buffer ref when
1201  * a node is acquired.
1202  *
1203  * A hammer_node can also be passively associated with other HAMMER
1204  * structures, such as inodes, while retaining 0 references.  These
1205  * associations can be cleared backwards using a pointer-to-pointer in
1206  * the hammer_node.
1207  *
1208  * This allows the HAMMER implementation to cache hammer_node's long-term
1209  * and short-cut a great deal of the infrastructure's complexity.  In
1210  * most cases a cached node can be reacquired without having to dip into
1211  * either the buffer or cluster management code.
1212  *
1213  * The caller must pass a referenced cluster on call and will retain
1214  * ownership of the reference on return.  The node will acquire its own
1215  * additional references, if necessary.
1216  * additional references, if necessary.
 */
1217 hammer_node_t
1218 hammer_get_node(hammer_cluster_t cluster, int32_t node_offset, int *errorp)
1219 {
1220         hammer_node_t node;
1221
1222         /*
1223          * Locate the structure, allocating one if necessary.
1224          */
1225 again:
1226         node = RB_LOOKUP(hammer_nod_rb_tree, &cluster->rb_nods_root,
1227                          node_offset);
1228         if (node == NULL) {
1229                 node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
1230                 node->node_offset = node_offset;
1231                 node->cluster = cluster;
1232                 if (RB_INSERT(hammer_nod_rb_tree, &cluster->rb_nods_root,
1233                               node)) {
1234                         kfree(node, M_HAMMER);
1235                         goto again;
1236                 }
1237         }
1238         *errorp = hammer_ref_node(node);
1239         if (*errorp) {
1240                 /*
1241                  * NOTE: The node pointer may be stale on error return.
1242                  * In fact, it has probably been destroyed.
1243                  */
1244                 node = NULL;
1245         }
1246         return(node);
1247 }
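#if 0
/*
 * Illustrative sketch only: the get/cache/release pattern described above.
 * The node can later be re-acquired cheaply through the passive cache
 * pointer, which HAMMER may NULL out at any time.  The function name and
 * the cache argument are hypothetical.
 */
static void
example_node_access(hammer_cluster_t cluster, int32_t node_offset,
                    struct hammer_node **cache)
{
        hammer_node_t node;
        int error;

        node = hammer_get_node(cluster, node_offset, &error);
        if (node) {
                /* node->ondisk points at the B-Tree node's on-disk data */
                hammer_cache_node(node, cache);
                hammer_rel_node(node);
        }
}
#endif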
1248
1249 /*
1250  * Reference the node to prevent disassociations, then associate and
1251  * load the related buffer.  This routine can also be called to reference
1252  * a node from a cache pointer.
1253  *
1254  * NOTE: Because the caller does not have a ref on the node, the caller's
1255  * node pointer will be stale if an error is returned.  We may also wind
1256  * up clearing the related cache pointers.
1257  *
1258  * NOTE: The cluster is indirectly referenced by our buffer ref.
1259  */
1260 int
1261 hammer_ref_node(hammer_node_t node)
1262 {
1263         hammer_buffer_t buffer;
1264         int32_t buf_no;
1265         int error;
1266
1267         hammer_ref(&node->lock);
1268         error = 0;
1269         if (node->ondisk == NULL) {
1270                 hammer_lock_ex(&node->lock);
1271                 if (node->ondisk == NULL) {
1272                         /*
1273                          * This is a little confusing but the gist is that
1274                          * node->buffer determines whether the node is on
1275                          * the buffer's clist and node->ondisk determines
1276                          * whether the buffer is referenced.
1277                          */
1278                         if ((buffer = node->buffer) != NULL) {
1279                                 error = hammer_ref_buffer(buffer);
1280                         } else {
1281                                 buf_no = node->node_offset / HAMMER_BUFSIZE;
1282                                 buffer = hammer_get_buffer(node->cluster,
1283                                                            buf_no, 0, &error);
1284                                 if (buffer) {
1285                                         KKASSERT(error == 0);
1286                                         TAILQ_INSERT_TAIL(&buffer->clist,
1287                                                           node, entry);
1288                                         node->buffer = buffer;
1289                                 }
1290                         }
1291                         if (error == 0) {
1292                                 node->ondisk = (void *)((char *)buffer->ondisk +
1293                                        (node->node_offset & HAMMER_BUFMASK));
1294                         }
1295                 }
1296                 hammer_unlock(&node->lock);
1297         }
1298         if (error)
1299                 hammer_rel_node(node);
1300         return (error);
1301 }
1302
1303 /*
1304  * Release a hammer_node.  The node retains a passive association with
1305  * its cluster, buffer and caches.
1306  *
1307  * However, to avoid cluttering up kernel memory with tons of B-Tree
1308  * node cache structures we destroy the node if no passive cache or
1309  * (instantiated) buffer references exist.
1310  */
1311 void
1312 hammer_rel_node(hammer_node_t node)
1313 {
1314         hammer_cluster_t cluster;
1315         hammer_buffer_t buffer;
1316
1317         if (hammer_islastref(&node->lock)) {
1318                 cluster = node->cluster;
1319                 /*
1320                  * Clutter control: this case only occurs after a failed
1321                  * load since otherwise ondisk will be non-NULL.
1322                  */
1323                 if (node->cache1 == NULL && node->cache2 == NULL && 
1324                     node->ondisk == NULL) {
1325                         RB_REMOVE(hammer_nod_rb_tree, &cluster->rb_nods_root,
1326                                   node);
1327                         if ((buffer = node->buffer) != NULL) {
1328                                 node->buffer = NULL;
1329                                 hammer_remove_node_clist(buffer, node);
1330                         }
1331                         kfree(node, M_HAMMER);
1332                         return;
1333                 }
1334
1335                 /*
1336                  * node->ondisk determines whether we have a buffer reference
1337                  * to get rid of or not.  Only get rid of the reference if
1338                  * the kernel tried to flush the buffer.
1339                  *
1340                  * NOTE: Once unref'd the node can be physically destroyed,
1341                  * so our node is stale afterwards.
1342                  *
1343                  * This case occurs if the node still has cache references.
1344                  * We could remove the references and free the structure
1345                  * but for now we allow them (and the node structure) to
1346                  * remain intact.
1347                  */
1348                 if (node->ondisk && hammer_io_checkflush(&node->buffer->io)) {
1349                         buffer = node->buffer;
1350                         node->buffer = NULL;
1351                         node->ondisk = NULL;
1352                         hammer_remove_node_clist(buffer, node);
1353                         hammer_unref(&node->lock);
1354                         hammer_rel_buffer(buffer, 0);
1355                 } else {
1356                         hammer_unref(&node->lock);
1357                 }
1358         } else {
1359                 hammer_unref(&node->lock);
1360         }
1361 }
1362
1363 /*
1364  * Cache-and-release a hammer_node.  Kinda like catching and releasing a
1365  * fish, but keeping an eye on him.  The node is passively cached in *cache.
1366  *
1367  * NOTE!  HAMMER may NULL *cache at any time, even after you have
1368  * referenced the node!
1369  */
1370 void
1371 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
1372 {
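        /*
         * The node keeps at most two passive cache back-pointers, with
         * cache1 the most recently used.  If *cache is already cache1
         * nothing changes; if it is cache2 the two slots are swapped;
         * otherwise the old cache2 pointer is cleared, cache1 is demoted
         * to cache2, and the new pointer becomes cache1.
         */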
1373         if (node->cache1 != cache) {
1374                 if (node->cache2 == cache) {
1375                         struct hammer_node **tmp;
1376                         tmp = node->cache1;
1377                         node->cache1 = node->cache2;
1378                         node->cache2 = tmp;
1379                 } else {
1380                         if (node->cache2)
1381                                 *node->cache2 = NULL;
1382                         node->cache2 = node->cache1;
1383                         node->cache1 = cache;
1384                         *cache = node;
1385                 }
1386         }
1387 }
1388
1389 void
1390 hammer_uncache_node(struct hammer_node **cache)
1391 {
1392         hammer_node_t node;
1393
1394         if ((node = *cache) != NULL) {
1395                 *cache = NULL;
1396                 if (node->cache1 == cache) {
1397                         node->cache1 = node->cache2;
1398                         node->cache2 = NULL;
1399                 } else if (node->cache2 == cache) {
1400                         node->cache2 = NULL;
1401                 } else {
1402                         panic("hammer_uncache_node: missing cache linkage");
1403                 }
1404                 if (node->cache1 == NULL && node->cache2 == NULL &&
1405                     node->lock.refs == 0) {
1406                         hammer_flush_node(node);
1407                 }
1408         }
1409 }
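/*
 * Illustrative usage sketch only (not compiled): "my_cache" below is a
 * hypothetical pointer owned by the caller, e.g. a field in an inode
 * structure.  It shows the catch-and-release pattern the two routines
 * above implement: reference a node, passively cache it, drop the
 * reference, and clear the cache before the owner goes away.
 */
#if 0
static void
example_cache_usage(hammer_cluster_t cluster, int32_t node_offset,
                    struct hammer_node **my_cache)
{
        hammer_node_t node;
        int error;

        node = hammer_get_node(cluster, node_offset, &error);
        if (node) {
                hammer_cache_node(node, my_cache);      /* passive cache */
                hammer_rel_node(node);                  /* drop our ref */
        }
        /* ... later, before the owner of *my_cache is destroyed ... */
        hammer_uncache_node(my_cache);
}
#endif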
1410
1411 /*
1412  * Remove a node's cache references and destroy the node if it has no
1413  * references.  This is typically called from the buffer handling code.
1414  *
1415  * The node may have an active buffer reference (ondisk != NULL) even
1416  * if the node itself has no references.
1417  *
1418  * Note that a caller iterating through nodes via a buffer must have its
1419  * own reference on the buffer or our hammer_rel_buffer() call below may
1420  * rip it out from under the caller.
1421  */
1422 void
1423 hammer_flush_node(hammer_node_t node)
1424 {
1425         hammer_buffer_t buffer;
1426
1427         if (node->cache1)
1428                 *node->cache1 = NULL;
1429         if (node->cache2)
1430                 *node->cache2 = NULL;
1431         if (node->lock.refs == 0) {
1432                 RB_REMOVE(hammer_nod_rb_tree, &node->cluster->rb_nods_root,
1433                           node);
1434                 if ((buffer = node->buffer) != NULL) {
1435                         node->buffer = NULL;
1436                         hammer_remove_node_clist(buffer, node);
1437                         if (node->ondisk) {
1438                                 node->ondisk = NULL;
1439                                 hammer_rel_buffer(buffer, 0);
1440                         }
1441                 }
1442                 kfree(node, M_HAMMER);
1443         }
1444 }
1445
1446 /*
1447  * Remove a node from the buffer's clist.  Adjust save_scan as appropriate.
1448  * This is in its own little routine to properly handle interactions with
1449  * save_scan, so it is possible to block while scanning a buffer's node list.
1450  */
1451 static
1452 void
1453 hammer_remove_node_clist(hammer_buffer_t buffer, hammer_node_t node)
1454 {
1455         if (buffer->save_scan == node)
1456                 buffer->save_scan = TAILQ_NEXT(node, entry);
1457         TAILQ_REMOVE(&buffer->clist, node, entry);
1458 }
1459
1460 /************************************************************************
1461  *                              A-LIST ALLOCATORS                       *
1462  ************************************************************************/
1463
1464 /*
1465  * Allocate HAMMER clusters
1466  */
1467 hammer_cluster_t
1468 hammer_alloc_cluster(hammer_mount_t hmp, hammer_cluster_t cluster_hint,
1469                      int *errorp)
1470 {
1471         hammer_volume_t volume;
1472         hammer_cluster_t cluster;
1473         int32_t clu_no;
1474         int32_t clu_hint;
1475         int32_t vol_beg;
1476         int32_t vol_no;
1477
1478         /*
1479          * Figure out our starting volume and hint.
1480          */
1481         if (cluster_hint) {
1482                 vol_beg = cluster_hint->volume->vol_no;
1483                 clu_hint = cluster_hint->clu_no;
1484         } else {
1485                 vol_beg = hmp->volume_iterator;
1486                 clu_hint = -1;
1487         }
1488
1489         /*
1490          * Loop through volumes looking for a free cluster.  If allocating
1491          * a new cluster relative to an existing cluster try to find a free
1492          * cluster on either side (clu_hint >= 0), otherwise just do a
1493          * forwards iteration.
1494          */
1495         vol_no = vol_beg;
1496         do {
1497                 volume = hammer_get_volume(hmp, vol_no, errorp);
1498                 kprintf("VOLUME %p %d\n", volume, vol_no);
1499                 if (*errorp) {
1500                         clu_no = HAMMER_ALIST_BLOCK_NONE;
1501                         break;
1502                 }
1503                 if (clu_hint == -1) {
1504                         clu_hint = volume->clu_iterator;
1505                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1506                                                         clu_hint);
1507                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1508                                 clu_no = hammer_alist_alloc_fwd(&volume->alist,
1509                                                                 1, 0);
1510                         }
1511                 } else {
1512                         clu_no = hammer_alist_alloc_fwd(&volume->alist, 1,
1513                                                         clu_hint);
1514                         if (clu_no == HAMMER_ALIST_BLOCK_NONE) {
1515                                 clu_no = hammer_alist_alloc_rev(&volume->alist,
1516                                                                 1, clu_hint);
1517                         }
1518                 }
1519                 if (clu_no != HAMMER_ALIST_BLOCK_NONE) {
1520                         hammer_modify_volume(volume);
1521                         break;
1522                 }
1523                 hammer_rel_volume(volume, 0);
1524                 volume = NULL;
1525                 *errorp = ENOSPC;
1526                 vol_no = (vol_no + 1) % hmp->nvolumes;
1527                 clu_hint = -1;
1528         } while (vol_no != vol_beg);
1529
1530         /*
1531          * Acquire the cluster.  On success this will force *errorp to 0.
1532          */
1533         if (clu_no != HAMMER_ALIST_BLOCK_NONE) {
1534                 kprintf("ALLOC CLUSTER %d\n", clu_no);
1535                 cluster = hammer_get_cluster(volume, clu_no, errorp, 1);
1536                 volume->clu_iterator = clu_no;
1537                 hammer_rel_volume(volume, 0);
1538         } else {
1539                 cluster = NULL;
1540         }
1541         if (cluster)
1542                 hammer_lock_ex(&cluster->io.lock);
1543         return(cluster);
1544 }
1545
1546 void
1547 hammer_init_cluster(hammer_cluster_t cluster, hammer_base_elm_t left_bound, 
1548                     hammer_base_elm_t right_bound)
1549 {
1550         hammer_cluster_ondisk_t ondisk = cluster->ondisk;
1551
1552         ondisk->clu_btree_beg = *left_bound;
1553         ondisk->clu_btree_end = *right_bound;
1554         cluster->clu_btree_beg = ondisk->clu_btree_beg;
1555         cluster->clu_btree_end = ondisk->clu_btree_end;
1556 }
1557
1558 /*
1559  * Deallocate a cluster
1560  */
1561 void
1562 hammer_free_cluster(hammer_cluster_t cluster)
1563 {
1564         hammer_alist_free(&cluster->volume->alist, cluster->clu_no, 1);
1565 }
1566
1567 /*
1568  * Allocate HAMMER elements - btree nodes, data storage, and record elements
1569  *
1570  * The passed *bufferp should be initialized to NULL.  On successive calls
1571  * *bufferp caches the most recent buffer used until put away by the caller.
1572  * Note that previously returned pointers using the cached buffer become
1573  * invalid on successive calls which reuse *bufferp.
1574  *
1575  * All allocations first attempt to use the block found at the specified
1576  * iterator.  If that fails the first available block is used.  If that
1577  * fails a new buffer is allocated and associated with the buffer type
1578  * A-list and the element is allocated out of the new buffer.
1579  */
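/*
 * Illustrative caller pattern only (not compiled; error handling omitted):
 * 'cluster' and 'bytes' are assumed to be supplied by the caller.  The
 * buffer cookie starts out NULL, is reused across calls, and is released
 * by the caller once it is finished with the returned pointers.
 */
#if 0
static void
example_alloc_usage(hammer_cluster_t cluster, int32_t bytes)
{
        struct hammer_buffer *buffer = NULL;
        union hammer_record_ondisk *rec;
        void *data;
        int error;

        data = hammer_alloc_data(cluster, bytes, &error, &buffer);
        rec = hammer_alloc_record(cluster, &error, &buffer);
        /*
         * NOTE: if the second call reused the cookie, 'data' is already
         * stale, per the note above.
         */
        if (buffer)
                hammer_rel_buffer(buffer, 0);
}
#endif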
1580
1581 hammer_node_t
1582 hammer_alloc_btree(hammer_cluster_t cluster, int *errorp)
1583 {
1584         hammer_buffer_t buffer;
1585         hammer_alist_t live;
1586         hammer_node_t node;
1587         int32_t elm_no;
1588         int32_t buf_no;
1589         int32_t node_offset;
1590
1591         /*
1592          * Allocate a B-Tree element
1593          */
1594         buffer = NULL;
1595         live = &cluster->alist_btree;
1596         elm_no = hammer_alist_alloc_fwd(live, 1, cluster->ondisk->idx_index);
1597         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1598                 elm_no = hammer_alist_alloc_fwd(live, 1, 0);
1599         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1600                 alloc_new_buffer(cluster, live,
1601                                  HAMMER_FSBUF_BTREE, HAMMER_BTREE_NODES,
1602                                  cluster->ondisk->idx_index, errorp, &buffer);
1603                 elm_no = hammer_alist_alloc(live, 1);
1604                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1605                         *errorp = ENOSPC;
1606                         if (buffer)
1607                                 hammer_rel_buffer(buffer, 0);
1608                         hammer_modify_cluster(cluster);
1609                         return(NULL);
1610                 }
1611         }
1612         cluster->ondisk->idx_index = elm_no;
1613         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_BTREE_NODES);
1614         hammer_modify_cluster(cluster);
1615
1616         /*
1617          * Load and return the B-Tree element
1618          */
1619         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1620         node_offset = buf_no * HAMMER_BUFSIZE +
1621                       offsetof(union hammer_fsbuf_ondisk,
1622                                btree.nodes[elm_no & HAMMER_FSBUF_BLKMASK]);
1623         node = hammer_get_node(cluster, node_offset, errorp);
1624         if (node) {
1625                 bzero(node->ondisk, sizeof(*node->ondisk));
1626         } else {
1627                 hammer_alist_free(live, elm_no, 1);
1628                 /* hammer_get_node() failed; there is no node ref to drop */
1629                 node = NULL;
1630         }
1631         if (buffer)
1632                 hammer_rel_buffer(buffer, 0);
1633         return(node);
1634 }
1635
1636 void *
1637 hammer_alloc_data(hammer_cluster_t cluster, int32_t bytes,
1638                   int *errorp, struct hammer_buffer **bufferp)
1639 {
1640         hammer_buffer_t buffer;
1641         hammer_alist_t live;
1642         int32_t elm_no;
1643         int32_t buf_no;
1644         int32_t nblks;
1645         void *item;
1646
1647         /*
1648          * Deal with large data blocks.  The blocksize is HAMMER_BUFSIZE
1649          * for these allocations.
1650          */
1651         if ((bytes & HAMMER_BUFMASK) == 0) {
1652                 nblks = bytes / HAMMER_BUFSIZE;
1653                 /* only one block allowed for now (so buffer can hold it) */
1654                 KKASSERT(nblks == 1);
1655
1656                 buf_no = hammer_alloc_master(cluster, nblks,
1657                                              cluster->ondisk->idx_ldata, 1);
1658                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1659                         *errorp = ENOSPC;
1660                         return(NULL);
1661                 }
1662                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, nblks);
1663                 cluster->ondisk->idx_ldata = buf_no;
1664                 buffer = *bufferp;
1665                 *bufferp = hammer_get_buffer(cluster, buf_no, -1, errorp);
1666                 if (buffer)
1667                         hammer_rel_buffer(buffer, 0);
1668                 buffer = *bufferp;
1669                 return(buffer->ondisk);
1670         }
1671
1672         /*
1673          * Allocate a data element.  The block size is HAMMER_DATA_BLKSIZE
1674          * (64 bytes) for these allocations.
1675          */
1676         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1677         nblks /= HAMMER_DATA_BLKSIZE;
1678         live = &cluster->alist_mdata;
1679         elm_no = hammer_alist_alloc_fwd(live, nblks, cluster->ondisk->idx_data);
1680         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1681                 elm_no = hammer_alist_alloc_fwd(live, nblks, 0);
1682         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1683                 alloc_new_buffer(cluster, live,
1684                                  HAMMER_FSBUF_DATA, HAMMER_DATA_NODES,
1685                                  cluster->ondisk->idx_data, errorp, bufferp);
1686                 elm_no = hammer_alist_alloc(live, nblks);
1687                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1688                         *errorp = ENOSPC;
1689                         hammer_modify_cluster(cluster);
1690                         return(NULL);
1691                 }
1692         }
1693         cluster->ondisk->idx_data = elm_no;
1694         hammer_modify_cluster(cluster);
1695
1696         /*
1697          * Load and return the data element
1698          */
1699         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1700         buffer = *bufferp;
1701         if (buffer == NULL || buffer->cluster != cluster ||
1702             buffer->buf_no != buf_no) {
1703                 if (buffer)
1704                         hammer_rel_buffer(buffer, 0);
1705                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1706                 *bufferp = buffer;
1707         }
1708         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_DATA);
1709         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_DATA_NODES);
1710         item = &buffer->ondisk->data.data[elm_no & HAMMER_FSBUF_BLKMASK];
1711         bzero(item, nblks * HAMMER_DATA_BLKSIZE);
1712         *errorp = 0;
1713         return(item);
1714 }
1715
1716 void *
1717 hammer_alloc_record(hammer_cluster_t cluster,
1718                     int *errorp, struct hammer_buffer **bufferp)
1719 {
1720         hammer_buffer_t buffer;
1721         hammer_alist_t live;
1722         int32_t elm_no;
1723         int32_t buf_no;
1724         void *item;
1725
1726         /*
1727          * Allocate a record element
1728          */
1729         live = &cluster->alist_record;
1730         elm_no = hammer_alist_alloc_rev(live, 1, cluster->ondisk->idx_record);
1731         if (elm_no == HAMMER_ALIST_BLOCK_NONE)
1732                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1733         if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1734                 alloc_new_buffer(cluster, live,
1735                                  HAMMER_FSBUF_RECORDS, HAMMER_RECORD_NODES,
1736                                  cluster->ondisk->idx_record, errorp, bufferp);
1737                 elm_no = hammer_alist_alloc_rev(live, 1,HAMMER_ALIST_BLOCK_MAX);
1738                 kprintf("hammer_alloc_record elm again %08x\n", elm_no);
1739                 if (elm_no == HAMMER_ALIST_BLOCK_NONE) {
1740                         *errorp = ENOSPC;
1741                         hammer_modify_cluster(cluster);
1742                         return(NULL);
1743                 }
1744         }
1745         cluster->ondisk->idx_record = elm_no;
1746         hammer_modify_cluster(cluster);
1747
1748         /*
1749          * Load and return the record element
1750          */
1751         buf_no = elm_no / HAMMER_FSBUF_MAXBLKS;
1752         buffer = *bufferp;
1753         if (buffer == NULL || buffer->cluster != cluster ||
1754             buffer->buf_no != buf_no) {
1755                 if (buffer)
1756                         hammer_rel_buffer(buffer, 0);
1757                 buffer = hammer_get_buffer(cluster, buf_no, 0, errorp);
1758                 *bufferp = buffer;
1759         }
1760         KKASSERT(buffer->ondisk->head.buf_type == HAMMER_FSBUF_RECORDS);
1761         KKASSERT((elm_no & HAMMER_FSBUF_BLKMASK) < HAMMER_RECORD_NODES);
1762         item = &buffer->ondisk->record.recs[elm_no & HAMMER_FSBUF_BLKMASK];
1763         bzero(item, sizeof(union hammer_record_ondisk));
1764         *errorp = 0;
1765         return(item);
1766 }
1767
1768 void
1769 hammer_free_data_ptr(hammer_buffer_t buffer, void *data, int bytes)
1770 {
1771         int32_t elm_no;
1772         int32_t nblks;
1773         hammer_alist_t live;
1774
1775         if ((bytes & HAMMER_BUFMASK) == 0) {
1776                 nblks = bytes / HAMMER_BUFSIZE;
1777                 KKASSERT(nblks == 1 && data == (void *)buffer->ondisk);
1778                 hammer_alist_free(&buffer->cluster->alist_master,
1779                                   buffer->buf_no, nblks);
1780                 hammer_adjust_stats(buffer->cluster, HAMMER_FSBUF_DATA, -nblks);
1781                 hammer_modify_cluster(buffer->cluster);
1782                 return;
1783         }
1784
1785         elm_no = ((char *)data - (char *)buffer->ondisk->data.data) /
1786                  HAMMER_DATA_BLKSIZE;
1787         KKASSERT(elm_no >= 0 && elm_no < HAMMER_DATA_NODES);
1788         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1789         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1790         nblks /= HAMMER_DATA_BLKSIZE;
1791         live = &buffer->cluster->alist_mdata;
1792         hammer_alist_free(live, elm_no, nblks);
1793         hammer_modify_cluster(buffer->cluster);
1794 }
1795
1796 void
1797 hammer_free_record_ptr(hammer_buffer_t buffer, union hammer_record_ondisk *rec)
1798 {
1799         int32_t elm_no;
1800         hammer_alist_t live;
1801
1802         elm_no = rec - &buffer->ondisk->record.recs[0];
1803         KKASSERT(elm_no >= 0 && elm_no < HAMMER_RECORD_NODES);
1804         elm_no += buffer->buf_no * HAMMER_FSBUF_MAXBLKS;
1805         live = &buffer->cluster->alist_record;
1806         hammer_alist_free(live, elm_no, 1);
1807         hammer_modify_cluster(buffer->cluster);
1808 }
1809
1810 void
1811 hammer_free_btree(hammer_cluster_t cluster, int32_t bclu_offset)
1812 {
1813         const int32_t blksize = sizeof(struct hammer_node_ondisk);
1814         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1815         hammer_alist_t live;
1816         int32_t elm_no;
1817
1818         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1819         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, btree.nodes[0]);
1820         live = &cluster->alist_btree;
1821         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1822         elm_no += fsbuf_offset / blksize;
1823         hammer_alist_free(live, elm_no, 1);
1824         hammer_modify_cluster(cluster);
1825 }
1826
1827 void
1828 hammer_free_data(hammer_cluster_t cluster, int32_t bclu_offset, int32_t bytes)
1829 {
1830         const int32_t blksize = HAMMER_DATA_BLKSIZE;
1831         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1832         hammer_alist_t live;
1833         int32_t elm_no;
1834         int32_t buf_no;
1835         int32_t nblks;
1836
1837         if ((bytes & HAMMER_BUFMASK) == 0) {
1838                 nblks = bytes / HAMMER_BUFSIZE;
1839                 KKASSERT(nblks == 1 && (bclu_offset & HAMMER_BUFMASK) == 0);
1840                 buf_no = bclu_offset / HAMMER_BUFSIZE;
1841                 hammer_alist_free(&cluster->alist_master, buf_no, nblks);
1842                 hammer_adjust_stats(cluster, HAMMER_FSBUF_DATA, -nblks);
1843                 hammer_modify_cluster(cluster);
1844                 return;
1845         }
1846
1847         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1848         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, data.data[0][0]);
1849         live = &cluster->alist_mdata;
1850         nblks = (bytes + HAMMER_DATA_BLKMASK) & ~HAMMER_DATA_BLKMASK;
1851         nblks /= HAMMER_DATA_BLKSIZE;
1852         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1853         elm_no += fsbuf_offset / blksize;
1854         hammer_alist_free(live, elm_no, nblks);
1855         hammer_modify_cluster(cluster);
1856 }
1857
1858 void
1859 hammer_free_record(hammer_cluster_t cluster, int32_t bclu_offset)
1860 {
1861         const int32_t blksize = sizeof(union hammer_record_ondisk);
1862         int32_t fsbuf_offset = bclu_offset & HAMMER_BUFMASK;
1863         hammer_alist_t live;
1864         int32_t elm_no;
1865
1866         elm_no = bclu_offset / HAMMER_BUFSIZE * HAMMER_FSBUF_MAXBLKS;
1867         fsbuf_offset -= offsetof(union hammer_fsbuf_ondisk, record.recs[0]);
1868         live = &cluster->alist_record;
1869         KKASSERT(fsbuf_offset >= 0 && fsbuf_offset % blksize == 0);
1870         elm_no += fsbuf_offset / blksize;
1871         hammer_alist_free(live, elm_no, 1);
1872         hammer_modify_cluster(cluster);
1873 }
1874
1875
1876 /*
1877  * Allocate a new filesystem buffer and assign it to the specified
1878  * filesystem buffer type.  The new buffer will be added to the
1879  * type-specific A-list and initialized.
1880  */
1881 static void
1882 alloc_new_buffer(hammer_cluster_t cluster, hammer_alist_t live,
1883                  u_int64_t type, int32_t nelements,
1884                  int start, int *errorp, struct hammer_buffer **bufferp)
1885 {
1886         hammer_buffer_t buffer;
1887         int32_t buf_no;
1888         int isfwd;
1889
1890         start = start / HAMMER_FSBUF_MAXBLKS;   /* convert to buf_no */
1891
1892         isfwd = (type != HAMMER_FSBUF_RECORDS);
1893         buf_no = hammer_alloc_master(cluster, 1, start, isfwd);
1894         if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
1895                 *errorp = ENOSPC;
1896                 *bufferp = NULL;
1897                 return;
1898         }
1899
1900         hammer_modify_cluster(cluster);
1901
1902         /*
1903          * The new buffer must be initialized (type != 0) regardless of
1904          * whether we already have it cached or not, so don't try to
1905          * optimize the cached buffer check.  Just call hammer_get_buffer().
1906          */
1907         buffer = hammer_get_buffer(cluster, buf_no, type, errorp);
1908         if (*bufferp)
1909                 hammer_rel_buffer(*bufferp, 0);
1910         *bufferp = buffer;
1911
1912         /*
1913          * Finally, do a meta-free of the buffer's elements into the
1914          * type-specific A-list and update our statistics to reflect
1915          * the allocation.
1916          */
1917         if (buffer) {
1918                 kprintf("alloc_new_buffer buf_no %d type %016llx nelms %d\n",
1919                         buf_no, type, nelements);
1920                 hammer_alist_free(live, buf_no * HAMMER_FSBUF_MAXBLKS,
1921                                   nelements);
1922                 hammer_adjust_stats(cluster, type, 1);
1923         }
1924 }
1925
1926 /*
1927  * Sync dirty buffers to the media
1928  */
1929
1930 /*
1931  * Sync the entire filesystem.
1932  */
1933 int
1934 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
1935 {
1936         struct hammer_sync_info info;
1937
1938         info.error = 0;
1939         info.waitfor = waitfor;
1940
1941         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1942                 hammer_sync_volume, &info);
1943         return(info.error);
1944 }
1945
1946 int
1947 hammer_sync_volume(hammer_volume_t volume, void *data)
1948 {
1949         struct hammer_sync_info *info = data;
1950
1951         RB_SCAN(hammer_clu_rb_tree, &volume->rb_clus_root, NULL,
1952                 hammer_sync_cluster, info);
1953         if (hammer_ref_volume(volume) == 0) {
1954                 hammer_io_flush(&volume->io, info);
1955                 hammer_rel_volume(volume, 0);
1956         }
1957         return(0);
1958 }
1959
1960 int
1961 hammer_sync_cluster(hammer_cluster_t cluster, void *data)
1962 {
1963         struct hammer_sync_info *info = data;
1964
1965         if (cluster->state != HAMMER_CLUSTER_IDLE) {
1966                 RB_SCAN(hammer_buf_rb_tree, &cluster->rb_bufs_root, NULL,
1967                         hammer_sync_buffer, info);
1968                 if (hammer_ref_cluster(cluster) == 0) {
1969                         hammer_io_flush(&cluster->io, info);
1970                         hammer_rel_cluster(cluster, 0);
1971                 }
1972         }
1973         return(0);
1974 }
1975
1976 int
1977 hammer_sync_buffer(hammer_buffer_t buffer, void *data)
1978 {
1979         struct hammer_sync_info *info = data;
1980
1981         if (hammer_ref_buffer(buffer) == 0) {
1982                 hammer_lock_ex(&buffer->io.lock);
1983                 hammer_flush_buffer_nodes(buffer);
1984                 hammer_unlock(&buffer->io.lock);
1985                 hammer_io_flush(&buffer->io, info);
1986                 hammer_rel_buffer(buffer, 0);
1987         }
1988         return(0);
1989 }
1990
1991 #if 0
1992
1997 /*
1998  * Flush various tracking structures to disk
1999  */
2000 void
2001 flush_all_volumes(void)
2002 {
2003         hammer_volume_t vol;
2004
2005         for (vol = VolBase; vol; vol = vol->next)
2006                 flush_volume(vol);
2007 }
2008
2009 void
2010 flush_volume(hammer_volume_t vol)
2011 {
2012         hammer_supercl_t supercl;
2013         hammer_cluster_t cl;
2014
2015         for (supercl = vol->supercl_base; supercl; supercl = supercl->next)
2016                 flush_supercl(supercl);
2017         for (cl = vol->cluster_base; cl; cl = cl->next)
2018                 flush_cluster(cl);
2019         writehammerbuf(vol, vol->ondisk, 0);
2020 }
2021
2022 void
2023 flush_supercl(hammer_supercl_t supercl)
2024 {
2025         int64_t supercl_offset;
2026
2027         supercl_offset = supercl->scl_offset;
2028         writehammerbuf(supercl->volume, supercl->ondisk, supercl_offset);
2029 }
2030
2031 void
2032 flush_cluster(hammer_cluster_t cl)
2033 {
2034         hammer_buffer_t buf;
2035         int64_t cluster_offset;
2036
2037         for (buf = cl->buffer_base; buf; buf = buf->next)
2038                 flush_buffer(buf);
2039         cluster_offset = cl->clu_offset;
2040         writehammerbuf(cl->volume, cl->ondisk, cluster_offset);
2041 }
2042
2043 void
2044 flush_buffer(hammer_buffer_t buf)
2045 {
2046         int64_t buffer_offset;
2047
2048         buffer_offset = buf->buf_offset + buf->cluster->clu_offset;
2049         writehammerbuf(buf->volume, buf->ondisk, buffer_offset);
2050 }
2051
2052 #endif
2053
2054 /*
2055  * Generic buffer initialization
2056  */
2057 static void
2058 initbuffer(hammer_alist_t live, hammer_fsbuf_head_t head, u_int64_t type)
2059 {
2060         head->buf_type = type;
2061         hammer_alist_init(live);
2062 }
2063
2064 /*
2065  * Calculate the cluster's offset in the volume.  This calculation is
2066  * slightly more complex when using superclusters because superclusters
2067  * are grouped in blocks of 16, followed by 16 x N clusters where N
2068  * is the number of clusters a supercluster can manage.
2069  */
2070 static int64_t
2071 calculate_cluster_offset(hammer_volume_t volume, int32_t clu_no)
2072 {
2073         int32_t scl_group;
2074         int64_t scl_group_size;
2075         int64_t off;
2076
2077         if (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL) {
2078                 scl_group = clu_no / HAMMER_VOL_SUPERCLUSTER_GROUP /
2079                             HAMMER_SCL_MAXCLUSTERS;
2080                 scl_group_size = 
2081                             ((int64_t)HAMMER_BUFSIZE *
2082                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2083                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2084                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2085                 scl_group_size += 
2086                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2087
2088                 off = volume->cluster_base +
2089                       scl_group * scl_group_size +
2090                       (HAMMER_BUFSIZE * HAMMER_VOL_SUPERCLUSTER_GROUP) +
2091                       ((int64_t)clu_no % ((int64_t)HAMMER_SCL_MAXCLUSTERS *
2092                        HAMMER_VOL_SUPERCLUSTER_GROUP))
2093                       * volume->vol_clsize;
2094         } else {
2095                 off = volume->cluster_base +
2096                       (int64_t)clu_no * volume->vol_clsize;
2097         }
2098         return(off);
2099 }
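/*
 * Worked layout sketch for the supercluster case handled above
 * (illustrative only, writing G for HAMMER_VOL_SUPERCLUSTER_GROUP and
 * N for HAMMER_SCL_MAXCLUSTERS): each group consists of G supercluster
 * buffers followed by the G*N clusters they manage, and groups repeat
 * at a stride of scl_group_size.  A cluster clu_no therefore falls in
 * group clu_no / (G * N), and its offset is that group's base, plus
 * the G supercluster buffers heading the group, plus
 * (clu_no % (G * N)) * vol_clsize.
 */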
2100
2101 /*
2102  * Calculate a super-cluster's offset in the volume.
2103  */
2104 static int64_t
2105 calculate_supercl_offset(hammer_volume_t volume, int32_t scl_no)
2106 {
2107         int64_t off;
2108         int32_t scl_group;
2109         int64_t scl_group_size;
2110
2111         KKASSERT (volume->vol_flags & HAMMER_VOLF_USINGSUPERCL);
2112         scl_group = scl_no / HAMMER_VOL_SUPERCLUSTER_GROUP;
2113         if (scl_group) {
2114                 scl_group_size = 
2115                             ((int64_t)HAMMER_BUFSIZE *
2116                              HAMMER_VOL_SUPERCLUSTER_GROUP) +
2117                             ((int64_t)HAMMER_VOL_SUPERCLUSTER_GROUP *
2118                              volume->vol_clsize * HAMMER_SCL_MAXCLUSTERS);
2119                 scl_group_size += 
2120                             HAMMER_VOL_SUPERCLUSTER_GROUP * HAMMER_BUFSIZE;
2121                 off = volume->cluster_base + (scl_group * scl_group_size) +
2122                       (scl_no % HAMMER_VOL_SUPERCLUSTER_GROUP) * HAMMER_BUFSIZE;
2123         } else {
2124                 off = volume->cluster_base + (scl_no * HAMMER_BUFSIZE);
2125         }
2126         return(off);
2127 }
2128
2129 /*
2130  * Allocate nblks buffers from the cluster's master A-list, searching
2131  * forwards or backwards from 'start' and wrapping around on failure.
2132  */
2133 static int32_t
2134 hammer_alloc_master(hammer_cluster_t cluster, int nblks,
2135                     int32_t start, int isfwd)
2136 {
2137         int32_t buf_no;
2138
2139         if (isfwd) {
2140                 buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2141                                                 nblks, start);
2142                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2143                         buf_no = hammer_alist_alloc_fwd(&cluster->alist_master,
2144                                                 nblks, 0);
2145                 }
2146         } else {
2147                 buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2148                                                 nblks, start);
2149                 if (buf_no == HAMMER_ALIST_BLOCK_NONE) {
2150                         buf_no = hammer_alist_alloc_rev(&cluster->alist_master,
2151                                                 nblks, HAMMER_ALIST_BLOCK_MAX);
2152                 }
2153         }
2154
2155         /*
2156          * TODO: Recover space from empty record, b-tree, and data a-lists.
2157          */
2158
2159         return(buf_no);
2160 }
2161
2162 /*
2163  * Adjust allocation statistics
2164  */
2165 static void
2166 hammer_adjust_stats(hammer_cluster_t cluster, u_int64_t buf_type, int nblks)
2167 {
2168         switch(buf_type) {
2169         case HAMMER_FSBUF_BTREE:
2170                 cluster->ondisk->stat_idx_bufs += nblks;
2171                 cluster->volume->ondisk->vol_stat_idx_bufs += nblks;
2172                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_idx_bufs += nblks;
2173                 break;
2174         case HAMMER_FSBUF_DATA:
2175                 cluster->ondisk->stat_data_bufs += nblks;
2176                 cluster->volume->ondisk->vol_stat_data_bufs += nblks;
2177                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_data_bufs += nblks;
2178                 break;
2179         case HAMMER_FSBUF_RECORDS:
2180                 cluster->ondisk->stat_rec_bufs += nblks;
2181                 cluster->volume->ondisk->vol_stat_rec_bufs += nblks;
2182                 cluster->volume->hmp->rootvol->ondisk->vol0_stat_rec_bufs += nblks;
2183                 break;
2184         }
2185         hammer_modify_cluster(cluster);
2186         hammer_modify_volume(cluster->volume);
2187         hammer_modify_volume(cluster->volume->hmp->rootvol);
2188 }
2189
2190 /*
2191  * A-LIST SUPPORT
2192  *
2193  * Setup the parameters for the various A-lists we use in hammer.  The
2194  * supercluster A-list must be chained to the cluster A-list and cluster
2195  * slave A-lists are chained to buffer A-lists.
2196  *
2197  * See hammer_init_alist_config() below.
2198  */
2199
2200 /*
2201  * A-LIST - cluster recursion into a filesystem buffer
2202  */
2203 static int
2204 buffer_alist_init(void *info, int32_t blk, int32_t radix)
2205 {
2206         return(0);
2207 #if 0
2208         hammer_cluster_t cluster = info;
2209         hammer_buffer_t buffer;
2210         int32_t buf_no;
2211         int error = 0;
2212
2213         /*
2214          * Calculate the buffer number, initialize based on the buffer type.
2215          * The buffer has already been allocated so assert that it has been
2216          * initialized.
2217          */
2218         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2219         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2220         if (buffer) {
2221                 hammer_adjust_stats(cluster, buffer->ondisk->head.buf_type, 1);
2222                 hammer_rel_buffer(buffer, 0);
2223         }
2224         return (error);
2225 #endif
2226 }
2227
2228 static int
2229 buffer_alist_destroy(void *info, int32_t blk, int32_t radix)
2230 {
2231         return(0);
2232 #if 0
2233         hammer_cluster_t cluster = info;
2234         hammer_buffer_t buffer;
2235         int32_t buf_no;
2236         int error = 0;
2237
2238         /*
2239          * Calculate the buffer number, initialize based on the buffer type.
2240          * The buffer has already been allocated so assert that it has been
2241          * initialized.
2242          */
2243         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2244         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2245         if (buffer) {
2246                 hammer_adjust_stats(cluster, buffer->ondisk->head.buf_type, -1);
2247                 hammer_rel_buffer(buffer, 0);
2248         }
2249         return (error);
2250 #endif
2251 }
2252
2253 /*
2254  * Note: atblk can be negative and atblk - blk can go negative.
2255  */
2256 static int
2257 buffer_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2258                       int32_t count, int32_t atblk, int32_t *fullp)
2259 {
2260         hammer_cluster_t cluster = info;
2261         hammer_buffer_t buffer;
2262         int32_t buf_no;
2263         int32_t r;
2264         int error = 0;
2265
2266         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2267         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2268         if (buffer) {
2269                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2270
2271                 r = hammer_alist_alloc_fwd(&buffer->alist, count, atblk - blk);
2272                 if (r != HAMMER_ALIST_BLOCK_NONE)
2273                         r += blk;
2274                 *fullp = hammer_alist_isfull(&buffer->alist);
2275                 hammer_modify_buffer(buffer);
2276                 hammer_rel_buffer(buffer, 0);
2277         } else {
2278                 r = HAMMER_ALIST_BLOCK_NONE;
2279         }
2280         return(r);
2281 }
2282
2283 static int
2284 buffer_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2285                       int32_t count, int32_t atblk, int32_t *fullp)
2286 {
2287         hammer_cluster_t cluster = info;
2288         hammer_buffer_t buffer;
2289         int32_t buf_no;
2290         int32_t r;
2291         int error = 0;
2292
2293         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2294         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2295         if (buffer) {
2296                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2297
2298                 r = hammer_alist_alloc_rev(&buffer->alist, count, atblk - blk);
2299                 if (r != HAMMER_ALIST_BLOCK_NONE)
2300                         r += blk;
2301                 *fullp = hammer_alist_isfull(&buffer->alist);
2302                 hammer_modify_buffer(buffer);
2303                 hammer_rel_buffer(buffer, 0);
2304         } else {
2305                 r = HAMMER_ALIST_BLOCK_NONE;
2306                 *fullp = 0;
2307         }
2308         return(r);
2309 }
2310
2311 static void
2312 buffer_alist_free(void *info, int32_t blk, int32_t radix,
2313                  int32_t base_blk, int32_t count, int32_t *emptyp)
2314 {
2315         hammer_cluster_t cluster = info;
2316         hammer_buffer_t buffer;
2317         int32_t buf_no;
2318         int error = 0;
2319
2320         buf_no = blk / HAMMER_FSBUF_MAXBLKS;
2321         buffer = hammer_get_buffer(cluster, buf_no, 0, &error);
2322         if (buffer) {
2323                 KKASSERT(buffer->ondisk->head.buf_type != 0);
2324                 hammer_alist_free(&buffer->alist, base_blk, count);
2325                 *emptyp = hammer_alist_isempty(&buffer->alist);
2326                 /* XXX don't bother updating if the buffer is completely empty? */
2327                 hammer_modify_buffer(buffer);
2328                 hammer_rel_buffer(buffer, 0);
2329         } else {
2330                 *emptyp = 0;
2331         }
2332 }
2333
2334 static void
2335 buffer_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2336 {
2337 }
2338
2339 /*
2340  * A-LIST - super-cluster recursion into a cluster and cluster recursion
2341  * into a filesystem buffer.  A-Lists are mostly self-contained entities,
2342  * but callbacks must be installed to recurse from one A-List to another.
2343  *
2344  * Implementing these callbacks allows us to operate a multi-layered A-List
2345  * as a single entity.
2346  */
2347 static int
2348 super_alist_init(void *info, int32_t blk, int32_t radix)
2349 {
2350         hammer_volume_t volume = info;
2351         hammer_supercl_t supercl;
2352         int32_t scl_no;
2353         int error = 0;
2354
2355         /*
2356          * Calculate the super-cluster number containing the cluster (blk)
2357          * and obtain the super-cluster buffer.
2358          */
2359         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2360         supercl = hammer_get_supercl(volume, scl_no, &error, 1);
2361         if (supercl)
2362                 hammer_rel_supercl(supercl, 0);
2363         return (error);
2364 }
2365
2366 static int
2367 super_alist_destroy(void *info, int32_t blk, int32_t radix)
2368 {
2369         return(0);
2370 }
2371
2372 static int
2373 super_alist_alloc_fwd(void *info, int32_t blk, int32_t radix,
2374                       int32_t count, int32_t atblk, int32_t *fullp)
2375 {
2376         hammer_volume_t volume = info;
2377         hammer_supercl_t supercl;
2378         int32_t scl_no;
2379         int32_t r;
2380         int error = 0;
2381
2382         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2383         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2384         if (supercl) {
2385                 r = hammer_alist_alloc_fwd(&supercl->alist, count, atblk - blk);
2386                 if (r != HAMMER_ALIST_BLOCK_NONE)
2387                         r += blk;
2388                 *fullp = hammer_alist_isfull(&supercl->alist);
2389                 hammer_modify_supercl(supercl);
2390                 hammer_rel_supercl(supercl, 0);
2391         } else {
2392                 r = HAMMER_ALIST_BLOCK_NONE;
2393                 *fullp = 0;
2394         }
2395         return(r);
2396 }
2397
2398 static int
2399 super_alist_alloc_rev(void *info, int32_t blk, int32_t radix,
2400                       int32_t count, int32_t atblk, int32_t *fullp)
2401 {
2402         hammer_volume_t volume = info;
2403         hammer_supercl_t supercl;
2404         int32_t scl_no;
2405         int32_t r;
2406         int error = 0;
2407
2408         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2409         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2410         if (supercl) {
2411                 r = hammer_alist_alloc_rev(&supercl->alist, count, atblk - blk);
2412                 if (r != HAMMER_ALIST_BLOCK_NONE)
2413                         r += blk;
2414                 *fullp = hammer_alist_isfull(&supercl->alist);
2415                 hammer_modify_supercl(supercl);
2416                 hammer_rel_supercl(supercl, 0);
2417         } else { 
2418                 r = HAMMER_ALIST_BLOCK_NONE;
2419                 *fullp = 0;
2420         }
2421         return(r);
2422 }
2423
2424 static void
2425 super_alist_free(void *info, int32_t blk, int32_t radix,
2426                  int32_t base_blk, int32_t count, int32_t *emptyp)
2427 {
2428         hammer_volume_t volume = info;
2429         hammer_supercl_t supercl;
2430         int32_t scl_no;
2431         int error = 0;
2432
2433         scl_no = blk / HAMMER_SCL_MAXCLUSTERS;
2434         supercl = hammer_get_supercl(volume, scl_no, &error, 0);
2435         if (supercl) {
2436                 hammer_alist_free(&supercl->alist, base_blk, count);
2437                 *emptyp = hammer_alist_isempty(&supercl->alist);
2438                 hammer_modify_supercl(supercl);
2439                 hammer_rel_supercl(supercl, 0);
2440         } else {
2441                 *emptyp = 0;
2442         }
2443 }
2444
2445 static void
2446 super_alist_print(void *info, int32_t blk, int32_t radix, int tab)
2447 {
2448 }
2449
2450 void
2451 hammer_init_alist_config(void)
2452 {
2453         hammer_alist_config_t config;
2454
2455         hammer_alist_template(&Buf_alist_config, HAMMER_FSBUF_MAXBLKS,
2456                               1, HAMMER_FSBUF_METAELMS);
2457         hammer_alist_template(&Vol_normal_alist_config, HAMMER_VOL_MAXCLUSTERS,
2458                               1, HAMMER_VOL_METAELMS_1LYR);
2459         hammer_alist_template(&Vol_super_alist_config,
2460                           HAMMER_VOL_MAXSUPERCLUSTERS * HAMMER_SCL_MAXCLUSTERS,
2461                               HAMMER_SCL_MAXCLUSTERS, HAMMER_VOL_METAELMS_2LYR);
2462         hammer_alist_template(&Supercl_alist_config, HAMMER_VOL_MAXCLUSTERS,
2463                               1, HAMMER_SUPERCL_METAELMS);
2464         hammer_alist_template(&Clu_master_alist_config, HAMMER_CLU_MAXBUFFERS,
2465                               1, HAMMER_CLU_MASTER_METAELMS);
2466         hammer_alist_template(&Clu_slave_alist_config,
2467                               HAMMER_CLU_MAXBUFFERS * HAMMER_FSBUF_MAXBLKS,
2468                               HAMMER_FSBUF_MAXBLKS, HAMMER_CLU_SLAVE_METAELMS);
2469
2470         config = &Vol_super_alist_config;
2471         config->bl_radix_init = super_alist_init;
2472         config->bl_radix_destroy = super_alist_destroy;
2473         config->bl_radix_alloc_fwd = super_alist_alloc_fwd;
2474         config->bl_radix_alloc_rev = super_alist_alloc_rev;
2475         config->bl_radix_free = super_alist_free;
2476         config->bl_radix_print = super_alist_print;
2477
2478         config = &Clu_slave_alist_config;
2479         config->bl_radix_init = buffer_alist_init;
2480         config->bl_radix_destroy = buffer_alist_destroy;
2481         config->bl_radix_alloc_fwd = buffer_alist_alloc_fwd;
2482         config->bl_radix_alloc_rev = buffer_alist_alloc_rev;
2483         config->bl_radix_free = buffer_alist_free;
2484         config->bl_radix_print = buffer_alist_print;
2485 }
2486