/*
 * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.28 2008/02/08 08:30:59 dillon Exp $
 */
/*
 * Manage HAMMER's on-disk structures.  These routines are primarily
 * responsible for interfacing with the kernel's I/O subsystem and for
 * managing in-memory structures.
 */

#include "hammer.h"
#include <sys/fcntl.h>
#include <sys/nlookup.h>
#include <sys/buf.h>
#include <sys/buf2.h>

static void hammer_free_volume(hammer_volume_t volume);
static int hammer_load_volume(hammer_volume_t volume);
static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
static int hammer_load_node(hammer_node_t node);
static hammer_off_t hammer_advance_fifo(hammer_volume_t volume,
                hammer_off_t off, int32_t bytes);

static hammer_off_t hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len,
                int32_t data_len, struct hammer_buffer **rec_bufferp,
                u_int16_t hdr_type, int can_cross,
                struct hammer_buffer **data2_bufferp, int *errorp);
/*
 * Red-Black tree support for various structures
 */
static int
hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
{
        if (ip1->obj_id < ip2->obj_id)
                return(-1);
        if (ip1->obj_id > ip2->obj_id)
                return(1);
        if (ip1->obj_asof < ip2->obj_asof)
                return(-1);
        if (ip1->obj_asof > ip2->obj_asof)
                return(1);
        return(0);
}

static int
hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
{
        if (info->obj_id < ip->obj_id)
                return(-1);
        if (info->obj_id > ip->obj_id)
                return(1);
        if (info->obj_asof < ip->obj_asof)
                return(-1);
        if (info->obj_asof > ip->obj_asof)
                return(1);
        return(0);
}

static int
hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
{
        if (vol1->vol_no < vol2->vol_no)
                return(-1);
        if (vol1->vol_no > vol2->vol_no)
                return(1);
        return(0);
}

static int
hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
{
        if (buf1->buf_offset < buf2->buf_offset)
                return(-1);
        if (buf1->buf_offset > buf2->buf_offset)
                return(1);
        return(0);
}

static int
hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
{
        if (node1->node_offset < node2->node_offset)
                return(-1);
        if (node1->node_offset > node2->node_offset)
                return(1);
        return(0);
}

/*
 * Note: The lookup function for hammer_ino_rb_tree winds up being named
 * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
 * functions are normal, e.g. hammer_buf_rb_tree_RB_LOOKUP(root, buf_offset).
 */
RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
                hammer_inode_info_cmp, hammer_inode_info_t);
RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
             hammer_vol_rb_compare, int32_t, vol_no);
RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
             hammer_buf_rb_compare, hammer_off_t, buf_offset);
RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
             hammer_nod_rb_compare, hammer_off_t, node_offset);
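
/*
 * Illustrative sketch (not part of the original source): the generated
 * lookup functions are invoked through the RB_LOOKUP macros, as done
 * throughout this file:
 *
 *      volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
 *
 * The inode tree uses the XLOOKUP variant.  Assuming a populated
 * struct hammer_inode_info 'iinfo' and an inode tree rooted at
 * hmp->rb_inos_root (field name assumed here):
 *
 *      ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
 */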

/************************************************************************
 *                              VOLUMES                                 *
 ************************************************************************
 *
 * Load a HAMMER volume by name.  Returns 0 on success or a positive error
 * code on failure.  Volumes must be loaded at mount time, get_volume() will
 * not load a new volume.
 *
 * Calls made to hammer_load_volume() are single-threaded.
 */
int
hammer_install_volume(struct hammer_mount *hmp, const char *volname)
{
        struct mount *mp;
        hammer_volume_t volume;
        struct hammer_volume_ondisk *ondisk;
        struct nlookupdata nd;
        struct buf *bp = NULL;
        int error;
        int ronly;

        mp = hmp->mp;
        ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

        /*
         * Allocate a volume structure
         */
        ++hammer_count_volumes;
        volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
        volume->vol_name = kstrdup(volname, M_HAMMER);
        volume->hmp = hmp;
        hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
        volume->io.offset = 0LL;

        /*
         * Get the device vnode
         */
        error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
        if (error == 0)
                error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
        nlookup_done(&nd);
        if (error == 0) {
                if (vn_isdisk(volume->devvp, &error)) {
                        error = vfs_mountedon(volume->devvp);
                }
        }
        if (error == 0 &&
            count_udev(volume->devvp->v_umajor, volume->devvp->v_uminor) > 0) {
                error = EBUSY;
        }
        if (error == 0) {
                vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
                error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
                if (error == 0) {
                        error = VOP_OPEN(volume->devvp,
                                         (ronly ? FREAD : FREAD|FWRITE),
                                         FSCRED, NULL);
                }
                vn_unlock(volume->devvp);
        }
        if (error) {
                hammer_free_volume(volume);
                return(error);
        }
        volume->devvp->v_rdev->si_mountpoint = mp;

        /*
         * Extract the volume number from the volume header and do various
         * sanity checks.
         */
        error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
        if (error)
                goto late_failure;
        ondisk = (void *)bp->b_data;
        if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
                kprintf("hammer_mount: volume %s has an invalid header\n",
                        volume->vol_name);
                error = EFTYPE;
                goto late_failure;
        }
        volume->vol_no = ondisk->vol_no;
        volume->buffer_base = ondisk->vol_buf_beg;
        volume->vol_flags = ondisk->vol_flags;
        volume->nblocks = ondisk->vol_nblocks;
        volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
                                    ondisk->vol_buf_end - ondisk->vol_buf_beg);
        RB_INIT(&volume->rb_bufs_root);

        hmp->mp->mnt_stat.f_blocks += volume->nblocks;

        if (RB_EMPTY(&hmp->rb_vols_root)) {
                hmp->fsid = ondisk->vol_fsid;
        } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
                kprintf("hammer_mount: volume %s's fsid does not match "
                        "other volumes\n", volume->vol_name);
                error = EFTYPE;
                goto late_failure;
        }

        /*
         * Insert the volume structure into the red-black tree.
         */
        if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
                kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
                        volume->vol_name, volume->vol_no);
                error = EEXIST;
        }
        /*
         * Set the root volume.  HAMMER special-cases the rootvol structure.
         * We do not hold a ref because this would prevent related I/O
         * from being flushed.
         */
        if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
                hmp->rootvol = volume;
                if (bp) {
                        brelse(bp);
                        bp = NULL;
                }
                hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
        }
late_failure:
        if (bp)
                brelse(bp);
        if (error) {
                /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
                VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
                hammer_free_volume(volume);
        }
        return (error);
}
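
/*
 * Illustrative sketch (not part of the original source): mount-time code
 * is expected to install every volume by name before using get_volume().
 * The name array below is hypothetical.
 *
 *      for (i = 0; i < nvolumes; ++i) {
 *              error = hammer_install_volume(hmp, volnames[i]);
 *              if (error)
 *                      break;
 *      }
 */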

/*
 * Unload and free a HAMMER volume.  Must return >= 0 to continue the scan
 * regardless of errors; a negative return would abort the RB_SCAN.
 */
int
hammer_unload_volume(hammer_volume_t volume, void *data __unused)
{
        struct hammer_mount *hmp = volume->hmp;
        int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

        /*
         * Sync clusters, sync volume
         */

        hmp->mp->mnt_stat.f_blocks -= volume->nblocks;

        /*
         * Clean up the root volume pointer, which is held unlocked in hmp.
         */
        if (hmp->rootvol == volume)
                hmp->rootvol = NULL;

        /*
         * Unload buffers.  Buffers hang directly off the volume via its
         * red-black tree, so a single scan unloads everything.
         */
        RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
                        hammer_unload_buffer, NULL);
        hammer_io_waitdep(&volume->io);

        /*
         * Release our buffer and flush anything left in the buffer cache.
         */
        hammer_io_release(&volume->io, 2);

        /*
         * There should be no references on the volume and no remaining
         * buffers.
         */
        KKASSERT(volume->io.lock.refs == 0);
        KKASSERT(RB_EMPTY(&volume->rb_bufs_root));

        volume->ondisk = NULL;
        if (volume->devvp) {
                if (ronly) {
                        vinvalbuf(volume->devvp, 0, 0, 0);
                        VOP_CLOSE(volume->devvp, FREAD);
                } else {
                        vinvalbuf(volume->devvp, V_SAVE, 0, 0);
                        VOP_CLOSE(volume->devvp, FREAD|FWRITE);
                }
        }

        /*
         * Destroy the structure
         */
        RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
        hammer_free_volume(volume);
        return(0);
}

static
void
hammer_free_volume(hammer_volume_t volume)
{
        if (volume->vol_name) {
                kfree(volume->vol_name, M_HAMMER);
                volume->vol_name = NULL;
        }
        if (volume->devvp) {
                if (vn_isdisk(volume->devvp, NULL) &&
                    volume->devvp->v_rdev &&
                    volume->devvp->v_rdev->si_mountpoint == volume->hmp->mp
                ) {
                        volume->devvp->v_rdev->si_mountpoint = NULL;
                }
                vrele(volume->devvp);
                volume->devvp = NULL;
        }
        --hammer_count_volumes;
        kfree(volume, M_HAMMER);
}

/*
 * Get a HAMMER volume.  The volume must already exist.
 */
hammer_volume_t
hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
{
        struct hammer_volume *volume;

        /*
         * Locate the volume structure
         */
        volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
        if (volume == NULL) {
                *errorp = ENOENT;
                return(NULL);
        }
        hammer_ref(&volume->io.lock);

        /*
         * Deal with on-disk info
         */
        if (volume->ondisk == NULL || volume->io.loading) {
                *errorp = hammer_load_volume(volume);
                if (*errorp) {
                        hammer_rel_volume(volume, 1);
                        volume = NULL;
                }
        } else {
                *errorp = 0;
        }
        return(volume);
}

int
hammer_ref_volume(hammer_volume_t volume)
{
        int error;

        hammer_ref(&volume->io.lock);

        /*
         * Deal with on-disk info
         */
        if (volume->ondisk == NULL || volume->io.loading) {
                error = hammer_load_volume(volume);
                if (error)
                        hammer_rel_volume(volume, 1);
        } else {
                error = 0;
        }
        return (error);
}

hammer_volume_t
hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
{
        hammer_volume_t volume;

        volume = hmp->rootvol;
        KKASSERT(volume != NULL);
        hammer_ref(&volume->io.lock);

        /*
         * Deal with on-disk info
         */
        if (volume->ondisk == NULL || volume->io.loading) {
                *errorp = hammer_load_volume(volume);
                if (*errorp) {
                        hammer_rel_volume(volume, 1);
                        volume = NULL;
                }
        } else {
                *errorp = 0;
        }
        return (volume);
}

/*
 * Load a volume's on-disk information.  The volume must be referenced and
 * not locked.  We temporarily acquire an exclusive lock to interlock
 * against releases or multiple get's.
 */
static int
hammer_load_volume(hammer_volume_t volume)
{
        struct hammer_volume_ondisk *ondisk;
        int error;

        hammer_lock_ex(&volume->io.lock);
        KKASSERT(volume->io.loading == 0);
        volume->io.loading = 1;

        if (volume->ondisk == NULL) {
                error = hammer_io_read(volume->devvp, &volume->io);
                if (error) {
                        volume->io.loading = 0;
                        hammer_unlock(&volume->io.lock);
                        return (error);
                }
                volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
        } else {
                error = 0;
        }
        volume->io.loading = 0;
        hammer_unlock(&volume->io.lock);
        return(0);
}

/*
 * Release a volume.  Call hammer_io_release on the last reference.  We have
 * to acquire an exclusive lock to interlock against volume->ondisk tests
 * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
 * lock to be held.
 *
 * Volumes are not unloaded from memory during normal operation.
 */
void
hammer_rel_volume(hammer_volume_t volume, int flush)
{
        if (volume->io.lock.refs == 1) {
                hammer_lock_ex(&volume->io.lock);
                if (volume->io.lock.refs == 1) {
                        volume->ondisk = NULL;
                        hammer_io_release(&volume->io, flush);
                } else if (flush) {
                        hammer_io_flush(&volume->io);
                }
                hammer_unlock(&volume->io.lock);
        }
        hammer_unref(&volume->io.lock);
}
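
/*
 * Illustrative sketch (not part of the original source): get/rel calls
 * must balance.  A typical consumer references a volume, uses its ondisk
 * data while the reference is held, and releases without forcing a flush:
 *
 *      volume = hammer_get_volume(hmp, vol_no, &error);
 *      if (volume) {
 *              ... use volume->ondisk ...
 *              hammer_rel_volume(volume, 0);
 *      }
 */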

/************************************************************************
 *                              BUFFERS                                 *
 ************************************************************************
 *
 * Manage buffers.  Note that a buffer holds a reference to its associated
 * volume.
 */
hammer_buffer_t
hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
                  int isnew, int *errorp)
{
        hammer_buffer_t buffer;
        hammer_volume_t volume;
        int vol_no;

        buf_offset &= ~HAMMER_BUFMASK64;
        KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
        vol_no = HAMMER_VOL_DECODE(buf_offset);
        volume = hammer_get_volume(hmp, vol_no, errorp);
        if (volume == NULL)
                return(NULL);
        /*
         * NOTE: buf_offset and maxbuf_off are both full offset
         * specifications.
         */
        KKASSERT(buf_offset < volume->maxbuf_off);

        /*
         * Locate and lock the buffer structure, creating one if necessary.
         */
again:
        buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
                           buf_offset);
        if (buffer == NULL) {
                ++hammer_count_buffers;
                buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
                buffer->buf_offset = buf_offset;
                buffer->volume = volume;
                hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
                buffer->io.offset = volume->ondisk->vol_buf_beg +
                                    (buf_offset & HAMMER_OFF_SHORT_MASK);
                TAILQ_INIT(&buffer->clist);
                hammer_ref(&buffer->io.lock);

                /*
                 * Insert the buffer into the RB tree and handle late
                 * collisions.
                 */
                if (RB_INSERT(hammer_buf_rb_tree, &volume->rb_bufs_root, buffer)) {
                        hammer_unref(&buffer->io.lock);
                        --hammer_count_buffers;
                        kfree(buffer, M_HAMMER);
                        goto again;
                }
                hammer_ref(&volume->io.lock);
        } else {
                hammer_ref(&buffer->io.lock);
        }

        /*
         * Deal with on-disk info
         */
        if (buffer->ondisk == NULL || buffer->io.loading) {
                *errorp = hammer_load_buffer(buffer, isnew);
                if (*errorp) {
                        hammer_rel_buffer(buffer, 1);
                        buffer = NULL;
                }
        } else {
                *errorp = 0;
        }
        hammer_rel_volume(volume, 0);
        return(buffer);
}

static int
hammer_load_buffer(hammer_buffer_t buffer, int isnew)
{
        hammer_volume_t volume;
        void *ondisk;
        int error;

        /*
         * Load the buffer's on-disk info
         */
        volume = buffer->volume;
        hammer_lock_ex(&buffer->io.lock);
        KKASSERT(buffer->io.loading == 0);
        buffer->io.loading = 1;

        if (buffer->ondisk == NULL) {
                if (isnew) {
                        error = hammer_io_new(volume->devvp, &buffer->io);
                } else {
                        error = hammer_io_read(volume->devvp, &buffer->io);
                }
                if (error) {
                        buffer->io.loading = 0;
                        hammer_unlock(&buffer->io.lock);
                        return (error);
                }
                buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
        } else if (isnew) {
                error = hammer_io_new(volume->devvp, &buffer->io);
        } else {
                error = 0;
        }
        if (error == 0 && isnew) {
                hammer_modify_buffer(buffer, NULL, 0);
                /* additional initialization goes here */
        }
        buffer->io.loading = 0;
        hammer_unlock(&buffer->io.lock);
        return (error);
}

/*
 * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
 */
int
hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
{
        hammer_ref(&buffer->io.lock);
        hammer_flush_buffer_nodes(buffer);
        KKASSERT(buffer->io.lock.refs == 1);
        hammer_rel_buffer(buffer, 2);
        return(0);
}

/*
 * Reference a buffer that is either already referenced or via a specially
 * handled pointer (aka cursor->buffer).
 */
int
hammer_ref_buffer(hammer_buffer_t buffer)
{
        int error;

        hammer_ref(&buffer->io.lock);
        if (buffer->ondisk == NULL || buffer->io.loading) {
                error = hammer_load_buffer(buffer, 0);
                if (error) {
                        hammer_rel_buffer(buffer, 1);
                        /*
                         * NOTE: buffer pointer can become stale after
                         * the above release.
                         */
                }
        } else {
                error = 0;
        }
        return(error);
}

/*
 * Release a buffer.  We have to deal with several places where
 * another thread can ref the buffer.
 *
 * Only destroy the structure itself if the related buffer cache buffer
 * was disassociated from it.  This ties the management of the structure
 * to the buffer cache subsystem.  buffer->ondisk determines whether the
 * embedded io is referenced or not.
 */
void
hammer_rel_buffer(hammer_buffer_t buffer, int flush)
{
        hammer_volume_t volume;

        if (buffer->io.lock.refs == 1) {
                hammer_lock_ex(&buffer->io.lock);
                if (buffer->io.lock.refs == 1) {
                        hammer_io_release(&buffer->io, flush);

                        if (buffer->io.bp == NULL &&
                            buffer->io.lock.refs == 1) {
                                hammer_flush_buffer_nodes(buffer);
                                KKASSERT(TAILQ_EMPTY(&buffer->clist));
                                volume = buffer->volume;
                                RB_REMOVE(hammer_buf_rb_tree,
                                          &volume->rb_bufs_root, buffer);
                                buffer->volume = NULL; /* sanity */
                                --hammer_count_buffers;
                                kfree(buffer, M_HAMMER);
                                hammer_rel_volume(volume, 0);
                                return;
                        }
                } else if (flush) {
                        hammer_io_flush(&buffer->io);
                }
                hammer_unlock(&buffer->io.lock);
        }
        hammer_unref(&buffer->io.lock);
}

/*
 * Access the filesystem buffer containing the specified hammer offset.
 * buf_offset is a conglomeration of the volume number and vol_buf_beg
 * relative buffer offset.  It must also have bit 55 set to be valid.
 * (see hammer_off_t in hammer_disk.h).
 *
 * Any prior buffer in *bufferp will be released and replaced by the
 * requested buffer.
 */
void *
hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp,
             struct hammer_buffer **bufferp)
{
        hammer_buffer_t buffer;
        int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;

        buf_offset &= ~HAMMER_BUFMASK64;

        buffer = *bufferp;
        if (buffer == NULL || buffer->buf_offset != buf_offset) {
                if (buffer)
                        hammer_rel_buffer(buffer, 0);
                buffer = hammer_get_buffer(hmp, buf_offset, 0, errorp);
                *bufferp = buffer;
        } else {
                *errorp = 0;
        }

        /*
         * Return a pointer to the buffer data.
         */
        if (buffer == NULL)
                return(NULL);
        else
                return((char *)buffer->ondisk + xoff);
}
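
/*
 * Illustrative sketch (not part of the original source): hammer_bread()
 * callers typically cache the buffer pointer across calls and release it
 * when finished, as hammer_free_fifo() does below:
 *
 *      struct hammer_buffer *buffer = NULL;
 *      void *data;
 *      int error;
 *
 *      data = hammer_bread(hmp, buf_offset, &error, &buffer);
 *      ... use data ...
 *      if (buffer)
 *              hammer_rel_buffer(buffer, 0);
 */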

/*
 * Access the filesystem buffer containing the specified hammer offset.
 * No disk read operation occurs.  The result buffer may contain garbage.
 *
 * Any prior buffer in *bufferp will be released and replaced by the
 * requested buffer.
 */
void *
hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp,
             struct hammer_buffer **bufferp)
{
        hammer_buffer_t buffer;
        int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;

        buf_offset &= ~HAMMER_BUFMASK64;

        buffer = *bufferp;
        if (buffer == NULL || buffer->buf_offset != buf_offset) {
                if (buffer)
                        hammer_rel_buffer(buffer, 0);
                buffer = hammer_get_buffer(hmp, buf_offset, 1, errorp);
                *bufferp = buffer;
        } else {
                *errorp = 0;
        }

        /*
         * Return a pointer to the buffer data.
         */
        if (buffer == NULL)
                return(NULL);
        else
                return((char *)buffer->ondisk + xoff);
}

/************************************************************************
 *                              NODES                                   *
 ************************************************************************
 *
 * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
 * method used by the HAMMER filesystem.
 *
 * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
 * associated with its buffer, and will only reference the buffer while
 * the node itself is referenced.
 *
 * A hammer_node can also be passively associated with other HAMMER
 * structures, such as inodes, while retaining 0 references.  These
 * associations can be cleared backwards using a pointer-to-pointer in
 * the hammer_node.
 *
 * This allows the HAMMER implementation to cache hammer_nodes long-term
 * and short-cut a great deal of the infrastructure's complexity.  In
 * most cases a cached node can be reacquired without having to dip into
 * the buffer management code.
 *
 * The node will acquire its own additional references, if necessary.
 */
hammer_node_t
hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset, int *errorp)
{
        hammer_volume_t volume;
        hammer_node_t node;
        int32_t vol_no;

        KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) ==
                 HAMMER_ZONE_RAW_BUFFER);
        vol_no = HAMMER_VOL_DECODE(node_offset);
        volume = hammer_get_volume(hmp, vol_no, errorp);
        if (volume == NULL)
                return(NULL);

        /*
         * Locate the structure, allocating one if necessary.
         */
again:
        node = RB_LOOKUP(hammer_nod_rb_tree, &volume->rb_nods_root,
                         node_offset);
        if (node == NULL) {
                ++hammer_count_nodes;
                node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
                node->node_offset = node_offset;
                node->volume = volume;  /* not directly referenced */
                if (RB_INSERT(hammer_nod_rb_tree, &volume->rb_nods_root,
                              node)) {
                        --hammer_count_nodes;
                        kfree(node, M_HAMMER);
                        goto again;
                }
        }
        hammer_ref(&node->lock);
        *errorp = hammer_load_node(node);
        if (*errorp) {
                hammer_rel_node(node);
                node = NULL;
        }
        hammer_rel_volume(volume, 0);
        return(node);
}

/*
 * Reference an already-referenced node.
 */
int
hammer_ref_node(hammer_node_t node)
{
        int error;

        KKASSERT(node->lock.refs > 0);
        hammer_ref(&node->lock);
        if ((error = hammer_load_node(node)) != 0)
                hammer_rel_node(node);
        return(error);
}

/*
 * Load a node's on-disk data reference.
 */
static int
hammer_load_node(hammer_node_t node)
{
        hammer_buffer_t buffer;
        int error;

        if (node->ondisk)
                return(0);
        error = 0;
        hammer_lock_ex(&node->lock);
        if (node->ondisk == NULL) {
                /*
                 * This is a little confusing but the gist is that
                 * node->buffer determines whether the node is on
                 * the buffer's clist and node->ondisk determines
                 * whether the buffer is referenced.
                 */
                if ((buffer = node->buffer) != NULL) {
                        error = hammer_ref_buffer(buffer);
                } else {
                        buffer = hammer_get_buffer(node->volume->hmp,
                                                   node->node_offset, 0,
                                                   &error);
                        if (buffer) {
                                KKASSERT(error == 0);
                                TAILQ_INSERT_TAIL(&buffer->clist,
                                                  node, entry);
                                node->buffer = buffer;
                        }
                }
                if (error == 0) {
                        node->ondisk = (void *)((char *)buffer->ondisk +
                               (node->node_offset & HAMMER_BUFMASK));
                }
        }
        hammer_unlock(&node->lock);
        return (error);
}

/*
 * Safely reference a node, interlock against flushes via the IO subsystem.
 */
hammer_node_t
hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
                     int *errorp)
{
        hammer_node_t node;

        if ((node = *cache) != NULL)
                hammer_ref(&node->lock);
        if (node) {
                *errorp = hammer_load_node(node);
                if (*errorp) {
                        hammer_rel_node(node);
                        node = NULL;
                }
        } else {
                *errorp = ENOENT;
        }
        return(node);
}

/*
 * Release a hammer_node.  On the last release the node dereferences
 * its underlying buffer and may or may not be destroyed.
 */
void
hammer_rel_node(hammer_node_t node)
{
        hammer_buffer_t buffer;

        /*
         * If this isn't the last ref just decrement the ref count and
         * return.
         */
        if (node->lock.refs > 1) {
                hammer_unref(&node->lock);
                return;
        }

        /*
         * If there is no ondisk info or no buffer the node failed to load,
         * remove the last reference and destroy the node.
         */
        if (node->ondisk == NULL) {
                hammer_unref(&node->lock);
                hammer_flush_node(node);
                /* node is stale now */
                return;
        }

        /*
         * Do final cleanups and then either destroy the node or leave it
         * passively cached.  The buffer reference is removed regardless.
         */
        buffer = node->buffer;
        node->ondisk = NULL;

        if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
                hammer_unref(&node->lock);
                hammer_rel_buffer(buffer, 0);
                return;
        }

        /*
         * Destroy the node if it has been marked for deletion.  We mark
         * it as being free.  Note that the disk space is physically
         * freed when the fifo cycles back through the node.
         */
        if (node->flags & HAMMER_NODE_DELETED)
                hammer_free_fifo(node->volume->hmp, node->node_offset);

        /*
         * Destroy the node.  Record pertinent data because the node
         * becomes stale the instant we flush it.
         */
        hammer_unref(&node->lock);
        hammer_flush_node(node);
        /* node is stale */
        hammer_rel_buffer(buffer, 0);
}

/*
 * Passively cache a referenced hammer_node in *cache.  The caller may
 * release the node on return.
 */
void
hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
{
        hammer_node_t old;

        /*
         * If the node is being deleted, don't cache it!
         */
        if (node->flags & HAMMER_NODE_DELETED)
                return;

        /*
         * Cache the node.  If we previously cached a different node we
         * have to give HAMMER a chance to destroy it.
         */
again:
        if (node->cache1 != cache) {
                if (node->cache2 != cache) {
                        if ((old = *cache) != NULL) {
                                KKASSERT(node->lock.refs != 0);
                                hammer_uncache_node(cache);
                                goto again;
                        }
                        if (node->cache2)
                                *node->cache2 = NULL;
                        node->cache2 = node->cache1;
                        node->cache1 = cache;
                        *cache = node;
                } else {
                        struct hammer_node **tmp;
                        tmp = node->cache1;
                        node->cache1 = node->cache2;
                        node->cache2 = tmp;
                }
        }
}

void
hammer_uncache_node(struct hammer_node **cache)
{
        hammer_node_t node;

        if ((node = *cache) != NULL) {
                *cache = NULL;
                if (node->cache1 == cache) {
                        node->cache1 = node->cache2;
                        node->cache2 = NULL;
                } else if (node->cache2 == cache) {
                        node->cache2 = NULL;
                } else {
                        panic("hammer_uncache_node: missing cache linkage");
                }
                if (node->cache1 == NULL && node->cache2 == NULL)
                        hammer_flush_node(node);
        }
}
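
/*
 * Illustrative sketch (not part of the original source): a structure that
 * wants to passively remember a B-Tree node (for example an inode) pairs
 * the two calls.  The cache field name below is hypothetical:
 *
 *      hammer_cache_node(node, &ip->cache);    (while holding a node ref)
 *      ...
 *      hammer_uncache_node(&ip->cache);        (during structure teardown)
 */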

/*
 * Remove a node's cache references and destroy the node if it has no
 * other references or backing store.
 */
void
hammer_flush_node(hammer_node_t node)
{
        hammer_buffer_t buffer;

        if (node->cache1)
                *node->cache1 = NULL;
        if (node->cache2)
                *node->cache2 = NULL;
        if (node->lock.refs == 0 && node->ondisk == NULL) {
                RB_REMOVE(hammer_nod_rb_tree, &node->volume->rb_nods_root,
                          node);
                if ((buffer = node->buffer) != NULL) {
                        node->buffer = NULL;
                        TAILQ_REMOVE(&buffer->clist, node, entry);
                        /* buffer is unreferenced because ondisk is NULL */
                }
                --hammer_count_nodes;
                kfree(node, M_HAMMER);
        }
}

/*
 * Flush passively cached B-Tree nodes associated with this buffer.
 * This is only called when the buffer is about to be destroyed, so
 * none of the nodes should have any references.
 */
void
hammer_flush_buffer_nodes(hammer_buffer_t buffer)
{
        hammer_node_t node;

        while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
                KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
                hammer_ref(&node->lock);
                node->flags |= HAMMER_NODE_FLUSH;
                hammer_rel_node(node);
        }
}


/************************************************************************
 *                              ALLOCATORS                              *
 ************************************************************************/

/*
 * Allocate a B-Tree node.
 */
hammer_node_t
hammer_alloc_btree(hammer_mount_t hmp, int *errorp)
{
        hammer_buffer_t buffer = NULL;
        hammer_node_t node = NULL;
        hammer_off_t node_offset;

        node_offset = hammer_alloc_fifo(hmp, sizeof(struct hammer_node_ondisk),
                                        0, &buffer, HAMMER_HEAD_TYPE_BTREE,
                                        0, NULL,
                                        errorp);
        if (*errorp == 0)
                node = hammer_get_node(hmp, node_offset, errorp);
        if (buffer)
                hammer_rel_buffer(buffer, 0);
        return(node);
}

/*
 * The returned buffers are already appropriately marked as being modified.
 * If the caller marks them again, unnecessary undo records may be generated.
 *
 * The core record (rec_len) cannot cross a buffer boundary.  The record +
 * data is only allowed to cross a buffer boundary for HAMMER_RECTYPE_DATA
 * records.
 */
void *
hammer_alloc_record(hammer_mount_t hmp,
                        hammer_off_t *rec_offp, u_int8_t rec_type,
                        int32_t rec_len, struct hammer_buffer **rec_bufferp,
                        hammer_off_t *data_offp, int32_t data_len,
                        void **data1p, void **data2p, int32_t *data2_index,
                        struct hammer_buffer **data2_bufferp,
                        int *errorp)
{
        int32_t aligned_rec_len, n;
        hammer_off_t rec_offset;
        hammer_record_ondisk_t rec;
        int can_cross;

        aligned_rec_len = (rec_len + HAMMER_HEAD_ALIGN_MASK) &
                          ~HAMMER_HEAD_ALIGN_MASK;
        can_cross = (rec_type == HAMMER_RECTYPE_DATA);

        rec_offset = hammer_alloc_fifo(hmp, aligned_rec_len, data_len,
                                       rec_bufferp, HAMMER_HEAD_TYPE_RECORD,
                                       can_cross, data2_bufferp, errorp);
        if (*errorp)
                return(NULL);

        /*
         * Basic return values.
         */
        *rec_offp = rec_offset;
        if (data_offp)
                *data_offp = rec_offset + aligned_rec_len;
        rec = (void *)((char *)(*rec_bufferp)->ondisk +
                       ((int32_t)rec_offset & HAMMER_BUFMASK));
        if (data_len)
                rec->base.data_off = rec_offset + aligned_rec_len;
        rec->base.data_len = data_len;
        if (data1p)
                *data1p = (void *)((char *)rec + aligned_rec_len);
        if (data2_index) {
                n = ((int32_t)rec_offset & HAMMER_BUFMASK) +
                     aligned_rec_len + data_len;
                if (n > HAMMER_BUFSIZE) {
                        *data2_index = data_len - (n - HAMMER_BUFSIZE);
                        KKASSERT(can_cross != 0);
                        *data2p = (*data2_bufferp)->ondisk;
                } else {
                        *data2_index = data_len;
                        *data2p = NULL;
                }
        } else {
                KKASSERT(data2p == NULL);
        }
        return(rec);
}
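
/*
 * Illustrative arithmetic (not part of the original source): the
 * round-up above is the usual power-of-two alignment idiom.  If, for
 * example, HAMMER_HEAD_ALIGN_MASK were 63 (64-byte alignment), a 44-byte
 * record would be padded out:
 *
 *      aligned_rec_len = (44 + 63) & ~63;      yields 64
 */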

/*
 * Generate an undo fifo entry and return the buffer to the caller (XXX).
 * The caller must create a dependency to ensure that the undo record is
 * flushed before the modified buffer is flushed.
 */
int
hammer_generate_undo(hammer_mount_t hmp, hammer_off_t off, void *base, int len)
{
        hammer_off_t rec_offset;
        hammer_fifo_undo_t undo;
        hammer_buffer_t buffer = NULL;
        int error;

        rec_offset = hammer_alloc_fifo(hmp, sizeof(*undo), len,
                                       &buffer, HAMMER_HEAD_TYPE_UNDO,
                                       0, NULL, &error);
        if (error == 0) {
                undo = (void *)((char *)buffer->ondisk +
                                ((int32_t)rec_offset & HAMMER_BUFMASK));
                undo->undo_offset = off;
                bcopy(base, undo + 1, len);
        }
        if (buffer)
                hammer_rel_buffer(buffer, 0);
        return(error);
}
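
/*
 * Illustrative sketch (not part of the original source): a caller is
 * expected to generate the undo record before making its modification so
 * recovery can roll the change back:
 *
 *      error = hammer_generate_undo(hmp, off, ptr, len);
 *      if (error == 0)
 *              ... modify the len bytes at ptr (on-disk offset off) ...
 */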

/*
 * Allocate space from the FIFO.  The first rec_len bytes will be zero'd.
 * The entire space is marked modified (the caller should not remark it as
 * that will cause unnecessary undo records to be added).
 */
static
hammer_off_t
hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len, int32_t data_len,
                  struct hammer_buffer **rec_bufferp, u_int16_t hdr_type,
                  int can_cross,
                  struct hammer_buffer **data2_bufferp, int *errorp)
{
        hammer_volume_t root_volume;
        hammer_volume_t end_volume;
        hammer_volume_ondisk_t ondisk;
        hammer_fifo_head_t head;
        hammer_off_t end_off = 0;
        hammer_off_t tmp_off = 0;
        int32_t end_vol_no;
        int32_t tmp_vol_no;
        int32_t xoff;
        int32_t aligned_bytes;
        int must_pad;

        aligned_bytes = (rec_len + data_len + HAMMER_HEAD_ALIGN_MASK) &
                        ~HAMMER_HEAD_ALIGN_MASK;

        root_volume = hammer_get_root_volume(hmp, errorp);
        while (root_volume) {
                hammer_modify_volume(root_volume, NULL, 0);
                ondisk = root_volume->ondisk;

                end_off = ondisk->vol0_fifo_end;
                end_vol_no = HAMMER_VOL_DECODE(end_off);

                end_volume = hammer_get_volume(hmp, end_vol_no, errorp);
                if (*errorp)
                        goto done;

                /*
                 * Check to see if we ran out of space.  Include some extra
                 * room.
                 *
                 * vol0_fifo_end cannot be advanced into the same buffer
                 * that vol0_fifo_beg resides in.  This allows us to
                 * instantiate a new buffer without reading it in.
                 *
                 * XXX messy.
                 */
                tmp_off = ondisk->vol0_fifo_beg & ~HAMMER_BUFMASK64;
                tmp_vol_no = HAMMER_VOL_DECODE(tmp_off);
                if ((tmp_off & HAMMER_OFF_SHORT_MASK) == 0) {
                        if (end_vol_no + 1 == tmp_vol_no) {
                                tmp_vol_no = end_vol_no;
                                tmp_off = end_volume->maxbuf_off;
                        } else if (end_vol_no + 1 == hmp->nvolumes &&
                                   tmp_vol_no == 0) {
                                tmp_vol_no = end_vol_no;
                                tmp_off = end_volume->maxbuf_off;
                        }
                }
                hammer_rel_volume(end_volume, 0);

                /*
                 * XXX dummy head at end of fifo
                 */
                if (end_vol_no == tmp_vol_no &&
                    end_off < tmp_off &&
                    end_off + aligned_bytes + sizeof(*head) >= tmp_off) {
                        *errorp = ENOSPC;
                        goto done;
                }

                if ((int32_t)end_off & HAMMER_BUFMASK)
                        head = hammer_bread(hmp, end_off, errorp, rec_bufferp);
                else
                        head = hammer_bnew(hmp, end_off, errorp, rec_bufferp);
                if (*errorp)
                        goto done;

                /*
                 * Load the buffer, retry if someone else squeezed in
                 * while we were blocked.
                 */

                if (ondisk->vol0_fifo_end != end_off)
                        continue;

                /*
                 * Ok, we're gonna do something.  Modify the buffer
                 */
                hammer_modify_buffer(*rec_bufferp, NULL, 0);
                if (ondisk->vol0_fifo_end != end_off)
                        continue;
                xoff = (int32_t)end_off & HAMMER_BUFMASK;

                /*
                 * The non-data portion of the fifo record cannot cross
                 * a buffer boundary.
                 *
                 * The entire record cannot cross a buffer boundary if
                 * can_cross is 0.
                 *
                 * It is illegal for a record to cross a volume boundary.
                 *
                 * It is illegal for a record to cross a recovery boundary
                 * (this is so recovery code is guaranteed a record rather
                 * than data at certain points).
                 *
                 * Add a pad record and loop if it does.
                 */
                must_pad = 0;
                if (xoff + rec_len > HAMMER_BUFSIZE)
                        must_pad = 1;
                if (can_cross == 0) {
                        if (xoff + aligned_bytes > HAMMER_BUFSIZE)
                                must_pad = 1;
                } else {
                        if (xoff + aligned_bytes > HAMMER_BUFSIZE &&
                            (end_off + aligned_bytes) >=
                            (*rec_bufferp)->volume->maxbuf_off) {
                                must_pad = 1;
                        }
                        if ((end_off ^ (end_off + aligned_bytes)) &
                            HAMMER_OFF_SHORT_REC_MASK) {
                                must_pad = 1;
                        }
                }
                if (must_pad) {
                        must_pad = HAMMER_BUFSIZE - xoff;
                        head->hdr_signature = HAMMER_HEAD_SIGNATURE;
                        head->hdr_type = HAMMER_HEAD_TYPE_PAD;
                        head->hdr_fwd_link = must_pad;
                        head->hdr_seq = 0; /* XXX seq */
                        KKASSERT((must_pad & 7) == 0);
                        ondisk->vol0_fifo_end =
                                hammer_advance_fifo((*rec_bufferp)->volume,
                                                    end_off, must_pad);
                        /* XXX rev_link */
                        continue;
                }

                if (xoff + aligned_bytes > HAMMER_BUFSIZE) {
                        KKASSERT(xoff + aligned_bytes <= HAMMER_BUFSIZE * 2);
                        hammer_bnew(hmp, end_off + (HAMMER_BUFSIZE - xoff),
                                    errorp, data2_bufferp);
                        hammer_modify_buffer(*data2_bufferp, NULL, 0);
                        if (*errorp)
                                goto done;
                }

                head->hdr_signature = HAMMER_HEAD_SIGNATURE;
                head->hdr_type = hdr_type;
                head->hdr_fwd_link = aligned_bytes / 64;
                head->hdr_rev_link = -1; /* XXX */
                head->hdr_crc = 0;
                head->hdr_seq = 0;      /* XXX */
                ondisk->vol0_fifo_end =
                        hammer_advance_fifo((*rec_bufferp)->volume,
                                            end_off, aligned_bytes);
done:
                hammer_rel_volume(root_volume, 0);
                break;
        }
        if (*errorp)
                end_off = 0;
        return(end_off);
}

/*
 * Mark a fifo record as having been freed.  XXX needs undo.
 */
void
hammer_free_fifo(hammer_mount_t hmp, hammer_off_t fifo_offset)
{
        hammer_buffer_t buffer = NULL;
        hammer_fifo_head_t head;
        int error;

        head = hammer_bread(hmp, fifo_offset, &error, &buffer);
        if (head) {
                hammer_modify_buffer(buffer, &head->hdr_type,
                                     sizeof(head->hdr_type));
                head->hdr_type |= HAMMER_HEAD_TYPEF_FREED;
        }
        if (buffer)
                hammer_rel_buffer(buffer, 0);
}

/*
 * Attempt to rewind the FIFO
 *
 * This routine is allowed to do nothing.
 */
void
hammer_unwind_fifo(hammer_mount_t hmp, hammer_off_t rec_offset)
{
}

/*
 * Advance the FIFO a certain number of bytes.
 */
static
hammer_off_t
hammer_advance_fifo(hammer_volume_t volume, hammer_off_t off, int32_t bytes)
{
        int32_t vol_no;

        off += bytes;
        KKASSERT(off <= volume->maxbuf_off);
        KKASSERT((off & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
        if (off == volume->maxbuf_off) {
                vol_no = volume->vol_no + 1;
                if (vol_no == volume->hmp->nvolumes)
                        vol_no = 0;
                off = HAMMER_ENCODE_RAW_BUFFER(vol_no, 0);
        }
        return(off);
}
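
/*
 * Illustrative example (not part of the original source): on a
 * two-volume filesystem, advancing to the exact end of volume 1 wraps
 * the fifo to the start of volume 0's raw-buffer zone:
 *
 *      off = hammer_advance_fifo(vol1, vol1->maxbuf_off - 64, 64);
 *      off == HAMMER_ENCODE_RAW_BUFFER(0, 0)
 */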

/*
 * Sync dirty buffers to the media
 */

static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

int
hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
{
        struct hammer_sync_info info;

        info.error = 0;
        info.waitfor = waitfor;

        vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
                      hammer_sync_scan1, hammer_sync_scan2, &info);

        RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
                hammer_sync_volume, &info);
        return(info.error);
}

static int
hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
{
        struct hammer_inode *ip;

        ip = VTOI(vp);
        if (vp->v_type == VNON || ip == NULL ||
            ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
             RB_EMPTY(&vp->v_rbdirty_tree))) {
                return(-1);
        }
        return(0);
}

static int
hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
        struct hammer_sync_info *info = data;
        struct hammer_inode *ip;
        int error;

        ip = VTOI(vp);
        if (vp->v_type == VNON || vp->v_type == VBAD ||
            ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
             RB_EMPTY(&vp->v_rbdirty_tree))) {
                return(0);
        }
        error = VOP_FSYNC(vp, info->waitfor);
        if (error)
                info->error = error;
        return(0);
}

int
hammer_sync_volume(hammer_volume_t volume, void *data)
{
        struct hammer_sync_info *info = data;

        hammer_ref(&volume->io.lock);
        RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
                hammer_sync_buffer, info);
        hammer_rel_volume(volume, 1);
        return(0);
}

int
hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
{
        hammer_ref(&buffer->io.lock);
        hammer_rel_buffer(buffer, 1);
        return(0);
}

/*
 * Generic fifo record head initialization.  Initialize the head into a
 * clean state with the proper signature and type and all links, crc, and
 * sequence fields zero'd.
 */
void
hammer_init_fifo(hammer_fifo_head_t head, u_int16_t type)
{
        head->hdr_signature = HAMMER_HEAD_SIGNATURE;
        head->hdr_type = type;
        head->hdr_rev_link = 0;
        head->hdr_fwd_link = 0;
        head->hdr_crc = 0;
        head->hdr_seq = 0;
}