1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.32 2008/02/24 20:08:50 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
51 static int hammer_load_node(hammer_node_t node);
52 #if 0
53 static hammer_off_t hammer_advance_fifo(hammer_volume_t volume,
54                 hammer_off_t off, int32_t bytes);
55
56 static hammer_off_t hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len,
57                 int32_t data_len, struct hammer_buffer **rec_bufferp,
58                 u_int16_t hdr_type, int can_cross, 
59                 struct hammer_buffer **data2_bufferp, int *errorp);
60 #endif
61
62 /*
63  * Red-Black tree support for various structures
64  */
65 static int
66 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
67 {
68         if (ip1->obj_id < ip2->obj_id)
69                 return(-1);
70         if (ip1->obj_id > ip2->obj_id)
71                 return(1);
72         if (ip1->obj_asof < ip2->obj_asof)
73                 return(-1);
74         if (ip1->obj_asof > ip2->obj_asof)
75                 return(1);
76         return(0);
77 }
78
79 static int
80 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
81 {
82         if (info->obj_id < ip->obj_id)
83                 return(-1);
84         if (info->obj_id > ip->obj_id)
85                 return(1);
86         if (info->obj_asof < ip->obj_asof)
87                 return(-1);
88         if (info->obj_asof > ip->obj_asof)
89                 return(1);
90         return(0);
91 }
92
93 static int
94 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
95 {
96         if (vol1->vol_no < vol2->vol_no)
97                 return(-1);
98         if (vol1->vol_no > vol2->vol_no)
99                 return(1);
100         return(0);
101 }
102
103 static int
104 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
105 {
106         if (buf1->zone2_offset < buf2->zone2_offset)
107                 return(-1);
108         if (buf1->zone2_offset > buf2->zone2_offset)
109                 return(1);
110         return(0);
111 }
112
113 static int
114 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
115 {
116         if (node1->node_offset < node2->node_offset)
117                 return(-1);
118         if (node1->node_offset > node2->node_offset)
119                 return(1);
120         return(0);
121 }
122
123 /*
124  * Note: The lookup function for hammer_ino_rb_tree winds up being named
125  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
126  * functions are normal, e.g. hammer_buf_rb_tree_RB_LOOKUP(root, zone2_offset).
127  */
128 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
129 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
130                 hammer_inode_info_cmp, hammer_inode_info_t);
131 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
132              hammer_vol_rb_compare, int32_t, vol_no);
133 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
134              hammer_buf_rb_compare, hammer_off_t, zone2_offset);
135 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
136              hammer_nod_rb_compare, hammer_off_t, node_offset);
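/*
 * Illustrative sketch (not part of the original source, kept under #if 0):
 * how the lookup functions generated above are typically invoked.  The
 * argument and field names used here (notably the rb_inos_root field for
 * the inode tree and the obj_id/asof parameters) are assumptions for the
 * example only.
 */
#if 0
static void
hammer_rb_lookup_example(hammer_mount_t hmp, hammer_volume_t volume,
			 hammer_off_t zone2_offset, hammer_off_t node_offset,
			 u_int64_t obj_id, hammer_tid_t asof)
{
	struct hammer_inode_info info;
	hammer_buffer_t buffer;
	hammer_node_t node;
	hammer_inode_t ip;

	/*
	 * Keyed lookups generated by RB_GENERATE2() take the key directly.
	 */
	buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
			   zone2_offset);
	node = RB_LOOKUP(hammer_nod_rb_tree, &hmp->rb_nods_root, node_offset);

	/*
	 * The extended lookup generated by RB_GENERATE_XLOOKUP() takes an
	 * info structure, as described in the comment above.
	 */
	info.obj_id = obj_id;
	info.obj_asof = asof;
	ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &info);
}
#endif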
137
138 /************************************************************************
139  *                              VOLUMES                                 *
140  ************************************************************************
141  *
142  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
143  * code on failure.  Volumes must be loaded at mount time, get_volume() will
144  * not load a new volume.
145  *
146  * Calls made to hammer_load_volume() are interlocked or single-threaded.
147  */
148 int
149 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
150 {
151         struct mount *mp;
152         hammer_volume_t volume;
153         struct hammer_volume_ondisk *ondisk;
154         struct nlookupdata nd;
155         struct buf *bp = NULL;
156         int error;
157         int ronly;
158         int setmp = 0;
159
160         mp = hmp->mp;
161         ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
162
163         /*
164          * Allocate a volume structure
165          */
166         ++hammer_count_volumes;
167         volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
168         volume->vol_name = kstrdup(volname, M_HAMMER);
169         volume->hmp = hmp;
170         hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
171         volume->io.offset = 0LL;
172
173         /*
174          * Get the device vnode
175          */
176         error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
177         if (error == 0)
178                 error = nlookup(&nd);
179         if (error == 0)
180                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
181         nlookup_done(&nd);
182         if (error == 0) {
183                 if (vn_isdisk(volume->devvp, &error)) {
184                         error = vfs_mountedon(volume->devvp);
185                 }
186         }
187         if (error == 0 &&
188             count_udev(volume->devvp->v_umajor, volume->devvp->v_uminor) > 0) {
189                 error = EBUSY;
190         }
191         if (error == 0) {
192                 vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
193                 error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
194                 if (error == 0) {
195                         error = VOP_OPEN(volume->devvp, 
196                                          (ronly ? FREAD : FREAD|FWRITE),
197                                          FSCRED, NULL);
198                 }
199                 vn_unlock(volume->devvp);
200         }
201         if (error) {
202                 hammer_free_volume(volume);
203                 return(error);
204         }
205         volume->devvp->v_rdev->si_mountpoint = mp;
206         setmp = 1;
207
208         /*
209          * Extract the volume number from the volume header and do various
210          * sanity checks.
211          */
212         error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
213         if (error)
214                 goto late_failure;
215         ondisk = (void *)bp->b_data;
216         if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
217                 kprintf("hammer_mount: volume %s has an invalid header\n",
218                         volume->vol_name);
219                 error = EFTYPE;
220                 goto late_failure;
221         }
222         volume->vol_no = ondisk->vol_no;
223         volume->buffer_base = ondisk->vol_buf_beg;
224         volume->vol_flags = ondisk->vol_flags;
225         volume->nblocks = ondisk->vol_nblocks; 
226         volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
227                                     ondisk->vol_buf_end - ondisk->vol_buf_beg);
228         RB_INIT(&volume->rb_bufs_root);
229
230         hmp->mp->mnt_stat.f_blocks += volume->nblocks;
231
232         if (RB_EMPTY(&hmp->rb_vols_root)) {
233                 hmp->fsid = ondisk->vol_fsid;
234         } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
235                 kprintf("hammer_mount: volume %s's fsid does not match "
236                         "other volumes\n", volume->vol_name);
237                 error = EFTYPE;
238                 goto late_failure;
239         }
240
241         /*
242          * Insert the volume structure into the red-black tree.
243          */
244         if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
245                 kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
246                         volume->vol_name, volume->vol_no);
247                 error = EEXIST;
248         }
249
250         /*
251          * Set the root volume.  HAMMER special cases the rootvol structure.
252          * We do not hold a ref because this would prevent related I/O
253          * from being flushed.
254          */
255         if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
256                 hmp->rootvol = volume;
257                 if (bp) {
258                         brelse(bp);
259                         bp = NULL;
260                 }
261                 hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
262         }
263 late_failure:
264         if (bp)
265                 brelse(bp);
266         if (error) {
267                 /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
268                 if (setmp)
269                         volume->devvp->v_rdev->si_mountpoint = NULL;
270                 VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
271                 hammer_free_volume(volume);
272         }
273         return (error);
274 }
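/*
 * Illustrative sketch (not part of the original source, kept under #if 0):
 * a mount-time loop installing every volume named in the mount arguments,
 * as required by the comment at the top of this section.  The volume name
 * array and count are hypothetical; the real mount code obtains them from
 * the mount arguments and lives in the vfsops code.
 */
#if 0
static int
hammer_install_all_volumes_example(struct hammer_mount *hmp,
				   const char **volnames, int nvolumes)
{
	int error = 0;
	int i;

	for (i = 0; i < nvolumes && error == 0; ++i)
		error = hammer_install_volume(hmp, volnames[i]);

	/*
	 * On failure unload anything that was successfully installed.
	 * hammer_unload_volume() always returns 0 so the scan covers all
	 * volumes.
	 */
	if (error) {
		RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
			hammer_unload_volume, NULL);
	}
	return (error);
}
#endif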
275
276 /*
277  * Unload and free a HAMMER volume.  Must return >= 0 to continue the
278  * scan, so errors are ignored (this function always returns 0).
279  */
280 int
281 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
282 {
283         struct hammer_mount *hmp = volume->hmp;
284         int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
285
286         /*
287          * Remove this volume's block count from the mount statistics.
288          */
289
290         hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
291
292         /*
293          * Clean up the root volume pointer, which is held unlocked in hmp.
294          */
295         if (hmp->rootvol == volume)
296                 hmp->rootvol = NULL;
297
298         /*
299          * Unload all buffers associated with this volume, then wait for
300          * any dependent I/O against the volume itself to complete before
301          * releasing it.
302          */
303         RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
304                         hammer_unload_buffer, NULL);
305         hammer_io_waitdep(&volume->io);
306
307         /*
308          * Release our buffer and flush anything left in the buffer cache.
309          */
310         hammer_io_release(&volume->io, 2);
311
312         /*
313          * There should be no references on the volume and no remaining
314          * buffers.
315          */
316         KKASSERT(volume->io.lock.refs == 0);
317         KKASSERT(RB_EMPTY(&volume->rb_bufs_root));
318
319         volume->ondisk = NULL;
320         if (volume->devvp) {
321                 if (volume->devvp->v_rdev &&
322                     volume->devvp->v_rdev->si_mountpoint == hmp->mp
323                 ) {
324                         volume->devvp->v_rdev->si_mountpoint = NULL;
325                 }
326                 if (ronly) {
327                         vinvalbuf(volume->devvp, 0, 0, 0);
328                         VOP_CLOSE(volume->devvp, FREAD);
329                 } else {
330                         vinvalbuf(volume->devvp, V_SAVE, 0, 0);
331                         VOP_CLOSE(volume->devvp, FREAD|FWRITE);
332                 }
333         }
334
335         /*
336          * Destroy the structure
337          */
338         RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
339         hammer_free_volume(volume);
340         return(0);
341 }
342
343 static
344 void
345 hammer_free_volume(hammer_volume_t volume)
346 {
347         if (volume->vol_name) {
348                 kfree(volume->vol_name, M_HAMMER);
349                 volume->vol_name = NULL;
350         }
351         if (volume->devvp) {
352                 vrele(volume->devvp);
353                 volume->devvp = NULL;
354         }
355         --hammer_count_volumes;
356         kfree(volume, M_HAMMER);
357 }
358
359 /*
360  * Get a HAMMER volume.  The volume must already exist.
361  */
362 hammer_volume_t
363 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
364 {
365         struct hammer_volume *volume;
366
367         /*
368          * Locate the volume structure
369          */
370         volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
371         if (volume == NULL) {
372                 *errorp = ENOENT;
373                 return(NULL);
374         }
375         hammer_ref(&volume->io.lock);
376
377         /*
378          * Deal with on-disk info
379          */
380         if (volume->ondisk == NULL || volume->io.loading) {
381                 *errorp = hammer_load_volume(volume);
382                 if (*errorp) {
383                         hammer_rel_volume(volume, 1);
384                         volume = NULL;
385                 }
386         } else {
387                 *errorp = 0;
388         }
389         return(volume);
390 }
391
392 int
393 hammer_ref_volume(hammer_volume_t volume)
394 {
395         int error;
396
397         hammer_ref(&volume->io.lock);
398
399         /*
400          * Deal with on-disk info
401          */
402         if (volume->ondisk == NULL || volume->io.loading) {
403                 error = hammer_load_volume(volume);
404                 if (error)
405                         hammer_rel_volume(volume, 1);
406         } else {
407                 error = 0;
408         }
409         return (error);
410 }
411
412 hammer_volume_t
413 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
414 {
415         hammer_volume_t volume;
416
417         volume = hmp->rootvol;
418         KKASSERT(volume != NULL);
419         hammer_ref(&volume->io.lock);
420
421         /*
422          * Deal with on-disk info
423          */
424         if (volume->ondisk == NULL || volume->io.loading) {
425                 *errorp = hammer_load_volume(volume);
426                 if (*errorp) {
427                         hammer_rel_volume(volume, 1);
428                         volume = NULL;
429                 }
430         } else {
431                 *errorp = 0;
432         }
433         return (volume);
434 }
435
436 /*
437  * Load a volume's on-disk information.  The volume must be referenced and
438  * not locked.  We temporarily acquire an exclusive lock to interlock
439  * against releases or multiple get's.
440  */
441 static int
442 hammer_load_volume(hammer_volume_t volume)
443 {
444         struct hammer_volume_ondisk *ondisk;
445         int error;
446
447         hammer_lock_ex(&volume->io.lock);
448         KKASSERT(volume->io.loading == 0);
449         volume->io.loading = 1;
450
451         if (volume->ondisk == NULL) {
452                 error = hammer_io_read(volume->devvp, &volume->io);
453                 if (error) {
454                         volume->io.loading = 0;
455                         hammer_unlock(&volume->io.lock);
456                         return (error);
457                 }
458                 volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
459         } else {
460                 error = 0;
461         }
462         volume->io.loading = 0;
463         hammer_unlock(&volume->io.lock);
464         return(0);
465 }
466
467 /*
468  * Release a volume.  Call hammer_io_release on the last reference.  We have
469  * to acquire an exclusive lock to interlock against volume->ondisk tests
470  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
471  * lock to be held.
472  *
473  * Volumes are not unloaded from memory during normal operation.
474  */
475 void
476 hammer_rel_volume(hammer_volume_t volume, int flush)
477 {
478         if (volume->io.lock.refs == 1) {
479                 hammer_lock_ex(&volume->io.lock);
480                 if (volume->io.lock.refs == 1) {
481                         volume->ondisk = NULL;
482                         hammer_io_release(&volume->io, flush);
483                 } else if (flush) {
484                         hammer_io_flush(&volume->io);
485                 }
486                 hammer_unlock(&volume->io.lock);
487         }
488         hammer_unref(&volume->io.lock);
489 }
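/*
 * Illustrative sketch (not part of the original source, kept under #if 0):
 * the typical get/use/release pattern for a volume.  The ondisk data is
 * only guaranteed to remain mapped while the caller holds its reference,
 * which is why the reference is held across the access and released
 * afterwards.  The vol_no argument and the kprintf are for illustration.
 */
#if 0
static int
hammer_volume_access_example(struct hammer_mount *hmp, int32_t vol_no)
{
	hammer_volume_t volume;
	int error;

	volume = hammer_get_volume(hmp, vol_no, &error);
	if (volume == NULL)
		return (error);
	kprintf("volume %d: max buffer offset %016llx\n",
		volume->vol_no, (long long)volume->maxbuf_off);
	hammer_rel_volume(volume, 0);
	return (0);
}
#endif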
490
491 /************************************************************************
492  *                              BUFFERS                                 *
493  ************************************************************************
494  *
495  * Manage buffers.  Currently all blockmap-backed zones are translated
496  * to zone-2 buffer offsets.
497  */
498 hammer_buffer_t
499 hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
500                   int isnew, int *errorp)
501 {
502         hammer_buffer_t buffer;
503         hammer_volume_t volume;
504         hammer_off_t    zoneX_offset;
505         int vol_no;
506         int zone;
507
508         zoneX_offset = buf_offset;
509         zone = HAMMER_ZONE_DECODE(buf_offset);
510         if (zone > HAMMER_ZONE_RAW_BUFFER_INDEX) {
511                 buf_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp);
512                 KKASSERT(*errorp == 0);
513         }
514         buf_offset &= ~HAMMER_BUFMASK64;
515         KKASSERT((buf_offset & HAMMER_ZONE_RAW_BUFFER) ==
516                  HAMMER_ZONE_RAW_BUFFER);
517         vol_no = HAMMER_VOL_DECODE(buf_offset);
518         volume = hammer_get_volume(hmp, vol_no, errorp);
519         if (volume == NULL)
520                 return(NULL);
521
522         /*
523          * NOTE: buf_offset and maxbuf_off are both full offset
524          * specifications.
525          */
526         KKASSERT(buf_offset < volume->maxbuf_off);
527
528         /*
529          * Locate and lock the buffer structure, creating one if necessary.
530          */
531 again:
532         buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
533                            buf_offset);
534         if (buffer == NULL) {
535                 ++hammer_count_buffers;
536                 buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
537                 buffer->zone2_offset = buf_offset;
538                 buffer->volume = volume;
539                 hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
540                 buffer->io.offset = volume->ondisk->vol_buf_beg +
541                                     (buf_offset & HAMMER_OFF_SHORT_MASK);
542                 TAILQ_INIT(&buffer->clist);
543                 hammer_ref(&buffer->io.lock);
544
545                 /*
546                  * Insert the buffer into the RB tree and handle late
547                  * collisions.
548                  */
549                 if (RB_INSERT(hammer_buf_rb_tree, &volume->rb_bufs_root, buffer)) {
550                         hammer_unref(&buffer->io.lock);
551                         --hammer_count_buffers;
552                         kfree(buffer, M_HAMMER);
553                         goto again;
554                 }
555                 hammer_ref(&volume->io.lock);
556         } else {
557                 hammer_ref(&buffer->io.lock);
558         }
559
560         /*
561          * Cache the blockmap translation
562          */
563         if ((zoneX_offset & HAMMER_ZONE_RAW_BUFFER) != HAMMER_ZONE_RAW_BUFFER)
564                 buffer->zoneX_offset = zoneX_offset;
565
566         /*
567          * Deal with on-disk info
568          */
569         if (buffer->ondisk == NULL || buffer->io.loading) {
570                 *errorp = hammer_load_buffer(buffer, isnew);
571                 if (*errorp) {
572                         hammer_rel_buffer(buffer, 1);
573                         buffer = NULL;
574                 }
575         } else {
576                 *errorp = 0;
577         }
578         hammer_rel_volume(volume, 0);
579         return(buffer);
580 }
581
582 static int
583 hammer_load_buffer(hammer_buffer_t buffer, int isnew)
584 {
585         hammer_volume_t volume;
586         void *ondisk;
587         int error;
588
589         /*
590          * Load the buffer's on-disk info
591          */
592         volume = buffer->volume;
593         hammer_lock_ex(&buffer->io.lock);
594         KKASSERT(buffer->io.loading == 0);
595         buffer->io.loading = 1;
596
597         if (buffer->ondisk == NULL) {
598                 if (isnew) {
599                         error = hammer_io_new(volume->devvp, &buffer->io);
600                 } else {
601                         error = hammer_io_read(volume->devvp, &buffer->io);
602                 }
603                 if (error) {
604                         buffer->io.loading = 0;
605                         hammer_unlock(&buffer->io.lock);
606                         return (error);
607                 }
608                 buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
609         } else if (isnew) {
610                 error = hammer_io_new(volume->devvp, &buffer->io);
611         } else {
612                 error = 0;
613         }
614         if (error == 0 && isnew) {
615                 hammer_modify_buffer(buffer, NULL, 0);
616                 /* additional initialization goes here */
617         }
618         buffer->io.loading = 0;
619         hammer_unlock(&buffer->io.lock);
620         return (error);
621 }
622
623 /*
624  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
625  */
626 int
627 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
628 {
629         hammer_ref(&buffer->io.lock);
630         hammer_flush_buffer_nodes(buffer);
631         KKASSERT(buffer->io.lock.refs == 1);
632         hammer_rel_buffer(buffer, 2);
633         return(0);
634 }
635
636 /*
637  * Reference a buffer that is either already referenced or is reachable
638  * via a specially handled pointer (aka cursor->buffer).
639  */
640 int
641 hammer_ref_buffer(hammer_buffer_t buffer)
642 {
643         int error;
644
645         hammer_ref(&buffer->io.lock);
646         if (buffer->ondisk == NULL || buffer->io.loading) {
647                 error = hammer_load_buffer(buffer, 0);
648                 if (error) {
649                         hammer_rel_buffer(buffer, 1);
650                         /*
651                          * NOTE: buffer pointer can become stale after
652                          * the above release.
653                          */
654                 }
655         } else {
656                 error = 0;
657         }
658         return(error);
659 }
660
661 /*
662  * Release a buffer.  We have to deal with several places where
663  * another thread can ref the buffer.
664  *
665  * Only destroy the structure itself if the related buffer cache buffer
666  * was disassociated from it.  This ties the management of the structure
667  * to the buffer cache subsystem.  buffer->ondisk determines whether the
668  * embedded io is referenced or not.
669  */
670 void
671 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
672 {
673         hammer_volume_t volume;
674
675         if (buffer->io.lock.refs == 1) {
676                 hammer_lock_ex(&buffer->io.lock);
677                 if (buffer->io.lock.refs == 1) {
678                         hammer_io_release(&buffer->io, flush);
679
680                         if (buffer->io.bp == NULL &&
681                             buffer->io.lock.refs == 1) {
682                                 hammer_flush_buffer_nodes(buffer);
683                                 KKASSERT(TAILQ_EMPTY(&buffer->clist));
684                                 volume = buffer->volume;
685                                 RB_REMOVE(hammer_buf_rb_tree,
686                                           &volume->rb_bufs_root, buffer);
687                                 buffer->volume = NULL; /* sanity */
688                                 --hammer_count_buffers;
689                                 kfree(buffer, M_HAMMER);
690                                 hammer_rel_volume(volume, 0);
691                                 return;
692                         }
693                 } else if (flush) {
694                         hammer_io_flush(&buffer->io);
695                 }
696                 hammer_unlock(&buffer->io.lock);
697         }
698         hammer_unref(&buffer->io.lock);
699 }
700
701 /*
702  * Access the filesystem buffer containing the specified hammer offset.
703  * buf_offset is a conglomeration of the volume number and vol_buf_beg
704  * relative buffer offset.  It must also have bit 55 set to be valid.
705  * (see hammer_off_t in hammer_disk.h).
706  *
707  * Any prior buffer in *bufferp will be released and replaced by the
708  * requested buffer.
709  */
710 void *
711 hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp, 
712              struct hammer_buffer **bufferp)
713 {
714         hammer_buffer_t buffer;
715         int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
716
717         buf_offset &= ~HAMMER_BUFMASK64;
718         KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) != 0);
719
720         buffer = *bufferp;
721         if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
722                                buffer->zoneX_offset != buf_offset)) {
723                 if (buffer)
724                         hammer_rel_buffer(buffer, 0);
725                 buffer = hammer_get_buffer(hmp, buf_offset, 0, errorp);
726                 *bufferp = buffer;
727         } else {
728                 *errorp = 0;
729         }
730
731         /*
732          * Return a pointer to the buffer data.
733          */
734         if (buffer == NULL)
735                 return(NULL);
736         else
737                 return((char *)buffer->ondisk + xoff);
738 }
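/*
 * Illustrative sketch (not part of the original source, kept under #if 0):
 * the caller-cached buffer pattern used with hammer_bread().  A single
 * struct hammer_buffer pointer is passed by reference across multiple
 * accesses; hammer_bread() releases and replaces it whenever the
 * underlying large buffer changes, and the caller releases whatever is
 * left at the end.  The offsets walked here are hypothetical.
 */
#if 0
static int
hammer_bread_pattern_example(hammer_mount_t hmp, hammer_off_t base_offset)
{
	hammer_buffer_t buffer = NULL;
	void *data;
	int error = 0;
	int i;

	for (i = 0; i < 4 && error == 0; ++i) {
		data = hammer_bread(hmp, base_offset + i * HAMMER_BUFSIZE,
				    &error, &buffer);
		if (data == NULL)
			break;
		/* ... examine the mapped data here ... */
	}
	if (buffer)
		hammer_rel_buffer(buffer, 0);
	return (error);
}
#endif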
739
740 /*
741  * Access the filesystem buffer containing the specified hammer offset.
742  * No disk read operation occurs.  The result buffer may contain garbage.
743  *
744  * Any prior buffer in *bufferp will be released and replaced by the
745  * requested buffer.
746  */
747 void *
748 hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp, 
749              struct hammer_buffer **bufferp)
750 {
751         hammer_buffer_t buffer;
752         int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
753
754         buf_offset &= ~HAMMER_BUFMASK64;
755
756         buffer = *bufferp;
757         if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
758                                buffer->zoneX_offset != buf_offset)) {
759                 if (buffer)
760                         hammer_rel_buffer(buffer, 0);
761                 buffer = hammer_get_buffer(hmp, buf_offset, 1, errorp);
762                 *bufferp = buffer;
763         } else {
764                 *errorp = 0;
765         }
766
767         /*
768          * Return a pointer to the buffer data.
769          */
770         if (buffer == NULL)
771                 return(NULL);
772         else
773                 return((char *)buffer->ondisk + xoff);
774 }
775
776 /************************************************************************
777  *                              NODES                                   *
778  ************************************************************************
779  *
780  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
781  * method used by the HAMMER filesystem.
782  *
783  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
784  * associated with its buffer, and will only reference the buffer while
785  * the node itself is referenced.
786  *
787  * A hammer_node can also be passively associated with other HAMMER
788  * structures, such as inodes, while retaining 0 references.  These
789  * associations can be cleared backwards using a pointer-to-pointer in
790  * the hammer_node.
791  *
792  * This allows the HAMMER implementation to cache hammer_nodes long-term
793  * and short-cut a great deal of the infrastructure's complexity.  In
794  * most cases a cached node can be reacquired without having to dip into
795  * either the buffer or cluster management code.
796  *
797  * The caller must pass a referenced mount structure on call and will
798  * retain ownership of that reference on return.  The node will acquire
799  * its own additional references, if necessary.
800  */
801 hammer_node_t
802 hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset, int *errorp)
803 {
804         hammer_node_t node;
805
806         KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE);
807
808         /*
809          * Locate the structure, allocating one if necessary.
810          */
811 again:
812         node = RB_LOOKUP(hammer_nod_rb_tree, &hmp->rb_nods_root, node_offset);
813         if (node == NULL) {
814                 ++hammer_count_nodes;
815                 node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
816                 node->node_offset = node_offset;
817                 node->hmp = hmp;
818                 if (RB_INSERT(hammer_nod_rb_tree, &hmp->rb_nods_root, node)) {
819                         --hammer_count_nodes;
820                         kfree(node, M_HAMMER);
821                         goto again;
822                 }
823         }
824         hammer_ref(&node->lock);
825         *errorp = hammer_load_node(node);
826         if (*errorp) {
827                 hammer_rel_node(node);
828                 node = NULL;
829         }
830         return(node);
831 }
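/*
 * Illustrative sketch (not part of the original source, kept under #if 0):
 * the passive caching lifecycle described in the NODES comment above.  A
 * structure such as an inode keeps a struct hammer_node * cache pointer
 * which holds no reference; the node can later be reacquired cheaply with
 * hammer_ref_node_safe().  The cache pointer passed in is hypothetical.
 */
#if 0
static int
hammer_node_cache_example(hammer_mount_t hmp, hammer_off_t node_offset,
			  struct hammer_node **cache)
{
	hammer_node_t node;
	int error;

	/*
	 * Acquire a referenced node, remember it passively, then drop the
	 * reference.  The cache pointer does not pin the node in memory.
	 */
	node = hammer_get_node(hmp, node_offset, &error);
	if (node == NULL)
		return (error);
	hammer_cache_node(node, cache);
	hammer_rel_node(node);

	/*
	 * Later: reacquire via the cache.  If the node was flushed in the
	 * meantime *cache is NULL, ENOENT is returned, and the caller falls
	 * back to a full B-Tree lookup.
	 */
	node = hammer_ref_node_safe(hmp, cache, &error);
	if (node) {
		/* ... use node->ondisk ... */
		hammer_rel_node(node);
	}

	/*
	 * The cache linkage must be cleared before the owning structure is
	 * destroyed.
	 */
	hammer_uncache_node(cache);
	return (error);
}
#endif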
832
833 /*
834  * Reference an already-referenced node.
835  */
836 int
837 hammer_ref_node(hammer_node_t node)
838 {
839         int error;
840
841         KKASSERT(node->lock.refs > 0);
842         hammer_ref(&node->lock);
843         if ((error = hammer_load_node(node)) != 0)
844                 hammer_rel_node(node);
845         return(error);
846 }
847
848 /*
849  * Load a node's on-disk data reference.
850  */
851 static int
852 hammer_load_node(hammer_node_t node)
853 {
854         hammer_buffer_t buffer;
855         int error;
856
857         if (node->ondisk)
858                 return(0);
859         error = 0;
860         hammer_lock_ex(&node->lock);
861         if (node->ondisk == NULL) {
862                 /*
863                  * This is a little confusing but the gist is that
864                  * node->buffer determines whether the node is on
865                  * the buffer's clist and node->ondisk determines
866                  * whether the buffer is referenced.
867                  */
868                 if ((buffer = node->buffer) != NULL) {
869                         error = hammer_ref_buffer(buffer);
870                 } else {
871                         buffer = hammer_get_buffer(node->hmp,
872                                                    node->node_offset, 0,
873                                                    &error);
874                         if (buffer) {
875                                 KKASSERT(error == 0);
876                                 TAILQ_INSERT_TAIL(&buffer->clist,
877                                                   node, entry);
878                                 node->buffer = buffer;
879                         }
880                 }
881                 if (error == 0) {
882                         node->ondisk = (void *)((char *)buffer->ondisk +
883                                (node->node_offset & HAMMER_BUFMASK));
884                 }
885         }
886         hammer_unlock(&node->lock);
887         return (error);
888 }
889
890 /*
891  * Safely reference a node, interlock against flushes via the IO subsystem.
892  */
893 hammer_node_t
894 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
895                      int *errorp)
896 {
897         hammer_node_t node;
898
899         if ((node = *cache) != NULL)
900                 hammer_ref(&node->lock);
901         if (node) {
902                 *errorp = hammer_load_node(node);
903                 if (*errorp) {
904                         hammer_rel_node(node);
905                         node = NULL;
906                 }
907         } else {
908                 *errorp = ENOENT;
909         }
910         return(node);
911 }
912
913 /*
914  * Release a hammer_node.  On the last release the node dereferences
915  * its underlying buffer and may or may not be destroyed.
916  */
917 void
918 hammer_rel_node(hammer_node_t node)
919 {
920         hammer_buffer_t buffer;
921
922         /*
923          * If this isn't the last ref just decrement the ref count and
924          * return.
925          */
926         if (node->lock.refs > 1) {
927                 hammer_unref(&node->lock);
928                 return;
929         }
930
931         /*
932          * If there is no ondisk info or no buffer then the node failed to
933          * load; remove the last reference and destroy the node.
934          */
935         if (node->ondisk == NULL) {
936                 hammer_unref(&node->lock);
937                 hammer_flush_node(node);
938                 /* node is stale now */
939                 return;
940         }
941
942         /*
943          * Do final cleanups and then either destroy the node or leave it
944          * passively cached.  The buffer reference is removed regardless.
945          */
946         buffer = node->buffer;
947         node->ondisk = NULL;
948
949         if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
950                 hammer_unref(&node->lock);
951                 hammer_rel_buffer(buffer, 0);
952                 return;
953         }
954
955         /*
956          * Destroy the node if it has been marked for deletion.  We mark
957          * it as being free.  Note that the disk space is physically
958          * freed when the fifo cycles back through the node.
959          */
960         if (node->flags & HAMMER_NODE_DELETED) {
961                 hammer_blockmap_free(node->hmp, node->node_offset,
962                                      sizeof(*node->ondisk));
963         }
964
965         /*
966          * Destroy the node.  Record pertinent data because the node
967          * becomes stale the instant we flush it.
968          */
969         hammer_unref(&node->lock);
970         hammer_flush_node(node);
971         /* node is stale */
972         hammer_rel_buffer(buffer, 0);
973 }
974
975 /*
976  * Passively cache a referenced hammer_node in *cache.  The caller may
977  * release the node on return.
978  */
979 void
980 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
981 {
982         hammer_node_t old;
983
984         /*
985          * If the node is being deleted, don't cache it!
986          */
987         if (node->flags & HAMMER_NODE_DELETED)
988                 return;
989
990         /*
991          * Cache the node.  If we previously cached a different node we
992          * have to give HAMMER a chance to destroy it.
993          */
994 again:
995         if (node->cache1 != cache) {
996                 if (node->cache2 != cache) {
997                         if ((old = *cache) != NULL) {
998                                 KKASSERT(node->lock.refs != 0);
999                                 hammer_uncache_node(cache);
1000                                 goto again;
1001                         }
1002                         if (node->cache2)
1003                                 *node->cache2 = NULL;
1004                         node->cache2 = node->cache1;
1005                         node->cache1 = cache;
1006                         *cache = node;
1007                 } else {
1008                         struct hammer_node **tmp;
1009                         tmp = node->cache1;
1010                         node->cache1 = node->cache2;
1011                         node->cache2 = tmp;
1012                 }
1013         }
1014 }
1015
1016 void
1017 hammer_uncache_node(struct hammer_node **cache)
1018 {
1019         hammer_node_t node;
1020
1021         if ((node = *cache) != NULL) {
1022                 *cache = NULL;
1023                 if (node->cache1 == cache) {
1024                         node->cache1 = node->cache2;
1025                         node->cache2 = NULL;
1026                 } else if (node->cache2 == cache) {
1027                         node->cache2 = NULL;
1028                 } else {
1029                         panic("hammer_uncache_node: missing cache linkage");
1030                 }
1031                 if (node->cache1 == NULL && node->cache2 == NULL)
1032                         hammer_flush_node(node);
1033         }
1034 }
1035
1036 /*
1037  * Remove a node's cache references and destroy the node if it has no
1038  * other references or backing store.
1039  */
1040 void
1041 hammer_flush_node(hammer_node_t node)
1042 {
1043         hammer_buffer_t buffer;
1044
1045         if (node->cache1)
1046                 *node->cache1 = NULL;
1047         if (node->cache2)
1048                 *node->cache2 = NULL;
1049         if (node->lock.refs == 0 && node->ondisk == NULL) {
1050                 RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node);
1051                 if ((buffer = node->buffer) != NULL) {
1052                         node->buffer = NULL;
1053                         TAILQ_REMOVE(&buffer->clist, node, entry);
1054                         /* buffer is unreferenced because ondisk is NULL */
1055                 }
1056                 --hammer_count_nodes;
1057                 kfree(node, M_HAMMER);
1058         }
1059 }
1060
1061 /*
1062  * Flush passively cached B-Tree nodes associated with this buffer.
1063  * This is only called when the buffer is about to be destroyed, so
1064  * none of the nodes should have any references.
1065  */
1066 void
1067 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1068 {
1069         hammer_node_t node;
1070
1071         while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1072                 KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
1073                 hammer_ref(&node->lock);
1074                 node->flags |= HAMMER_NODE_FLUSH;
1075                 hammer_rel_node(node);
1076         }
1077 }
1078
1079
1080 /************************************************************************
1081  *                              ALLOCATORS                              *
1082  ************************************************************************/
1083
1084 /*
1085  * Allocate a B-Tree node.
1086  */
1087 hammer_node_t
1088 hammer_alloc_btree(hammer_mount_t hmp, int *errorp)
1089 {
1090         hammer_buffer_t buffer = NULL;
1091         hammer_node_t node = NULL;
1092         hammer_off_t node_offset;
1093
1094         node_offset = hammer_blockmap_alloc(hmp, HAMMER_ZONE_BTREE_INDEX,
1095                                             sizeof(struct hammer_node_ondisk),
1096                                             errorp);
1097         if (*errorp == 0) {
1098                 node = hammer_get_node(hmp, node_offset, errorp);
1099                 hammer_modify_node(node);
1100                 bzero(node->ondisk, sizeof(*node->ondisk));
1101         }
1102         if (buffer)
1103                 hammer_rel_buffer(buffer, 0);
1104         return(node);
1105 }
1106
1107 /*
1108  * The returned buffers are already appropriately marked as being modified.
1109  * If the caller marks them again, unnecessary undo records may be generated.
1110  *
1111  * In-band data is indicated by data_bufferp == NULL.  Pass a data_len of 0
1112  * for zero-fill (caller modifies data_len afterwards).
1113  */
1114 void *
1115 hammer_alloc_record(hammer_mount_t hmp, 
1116                     hammer_off_t *rec_offp, u_int8_t rec_type, 
1117                     struct hammer_buffer **rec_bufferp,
1118                     int32_t data_len, void **datap,
1119                     struct hammer_buffer **data_bufferp, int *errorp)
1120 {
1121         hammer_record_ondisk_t rec;
1122         hammer_off_t rec_offset;
1123         hammer_off_t data_offset;
1124         int32_t reclen;
1125
1126         if (datap)
1127                 *datap = NULL;
1128
1129         /*
1130          * Allocate the record
1131          */
1132         rec_offset = hammer_blockmap_alloc(hmp, HAMMER_ZONE_RECORD_INDEX,
1133                                            HAMMER_RECORD_SIZE, errorp);
1134         if (*errorp)
1135                 return(NULL);
1136
1137         /*
1138          * Allocate data
1139          */
1140         if (data_len) {
1141                 if (data_bufferp == NULL) {
1142                         switch(rec_type) {
1143                         case HAMMER_RECTYPE_DATA:
1144                                 reclen = offsetof(struct hammer_data_record,
1145                                                   data[0]);
1146                                 break;
1147                         case HAMMER_RECTYPE_DIRENTRY:
1148                                 reclen = offsetof(struct hammer_entry_record,
1149                                                   name[0]);
1150                                 break;
1151                         default:
1152                                 panic("hammer_alloc_record: illegal "
1153                                       "in-band data");
1154                                 /* NOT REACHED */
1155                                 reclen = 0;
1156                                 break;
1157                         }
1158                         KKASSERT(reclen + data_len <= HAMMER_RECORD_SIZE);
1159                         data_offset = rec_offset + reclen;
1160                 } else if (data_len < HAMMER_BUFSIZE) {
1161                         data_offset = hammer_blockmap_alloc(hmp,
1162                                                 HAMMER_ZONE_SMALL_DATA_INDEX,
1163                                                 data_len, errorp);
1164                 } else {
1165                         data_offset = hammer_blockmap_alloc(hmp,
1166                                                 HAMMER_ZONE_LARGE_DATA_INDEX,
1167                                                 data_len, errorp);
1168                 }
1169         } else {
1170                 data_offset = 0;
1171         }
1172         if (*errorp) {
1173                 hammer_blockmap_free(hmp, rec_offset, HAMMER_RECORD_SIZE);
1174                 return(NULL);
1175         }
1176
1177         /*
1178          * Basic return values.
1179          */
1180         *rec_offp = rec_offset;
1181         rec = hammer_bread(hmp, rec_offset, errorp, rec_bufferp);
1182         hammer_modify_buffer(*rec_bufferp, NULL, 0);
1183         bzero(rec, sizeof(*rec));
1184         KKASSERT(*errorp == 0);
1185         rec->base.data_off = data_offset;
1186         rec->base.data_len = data_len;
1187
1188         if (data_bufferp) {
1189                 if (data_len) {
1190                         *datap = hammer_bread(hmp, data_offset, errorp,
1191                                               data_bufferp);
1192                         KKASSERT(*errorp == 0);
1193                         hammer_modify_buffer(*data_bufferp, NULL, 0);
1194                 } else {
1195                         *datap = NULL;
1196                 }
1197         } else if (data_len) {
1198                 KKASSERT(data_offset + data_len - rec_offset <=
1199                          HAMMER_RECORD_SIZE); 
1200                 if (datap) {
1201                         *datap = (void *)((char *)rec +
1202                                           (int32_t)(data_offset - rec_offset));
1203                 }
1204         } else {
1205                 KKASSERT(datap == NULL);
1206         }
1207         KKASSERT(*errorp == 0);
1208         return(rec);
1209 }
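/*
 * Illustrative sketch (not part of the original source, kept under #if 0):
 * allocating a directory-entry record whose name is stored in-band, i.e.
 * with data_bufferp == NULL as described in the comment above.  The name
 * length must be small enough to fit in the record; the helper name and
 * its caller are hypothetical.
 */
#if 0
static int
hammer_alloc_dirent_example(hammer_mount_t hmp, const char *name, int bytes)
{
	hammer_buffer_t rec_buffer = NULL;
	hammer_record_ondisk_t rec;
	hammer_off_t rec_offset;
	void *data;
	int error;

	rec = hammer_alloc_record(hmp, &rec_offset, HAMMER_RECTYPE_DIRENTRY,
				  &rec_buffer, bytes, &data, NULL, &error);
	if (rec) {
		/*
		 * For in-band data the returned data pointer aims into the
		 * record itself and the buffer is already marked modified,
		 * so the name can simply be copied in.
		 */
		bcopy(name, data, bytes);
	}
	if (rec_buffer)
		hammer_rel_buffer(rec_buffer, 0);
	return (error);
}
#endif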
1210
1211 /*
1212  * Generate an undo fifo entry and return the buffer to the caller (XXX).
1213  * The caller must create a dependency to ensure that the undo record is
1214  * flushed before the modified buffer is flushed.
1215  */
1216 int
1217 hammer_generate_undo(hammer_mount_t hmp, hammer_off_t off, void *base, int len)
1218 {
1219         return(0);
1220 #if 0
1221         hammer_off_t rec_offset;
1222         hammer_fifo_undo_t undo;
1223         hammer_buffer_t buffer = NULL;
1224         int error;
1225
1226         rec_offset = hammer_alloc_fifo(hmp, sizeof(*undo), len,
1227                                        &buffer, HAMMER_HEAD_TYPE_UNDO,
1228                                        0, NULL, &error);
1229         if (error == 0) {
1230                 undo = (void *)((char *)buffer->ondisk + 
1231                                 ((int32_t)rec_offset & HAMMER_BUFMASK));
1232                 undo->undo_offset = off;
1233                 bcopy(base, undo + 1, len);
1234         }
1235         if (buffer)
1236                 hammer_rel_buffer(buffer, 0);
1237         return(error);
1238 #endif
1239 }
1240
1241 #if 0
1242
1243 /*
1244  * Allocate space from the FIFO.  The first rec_len bytes will be zero'd.
1245  * The entire space is marked modified (the caller should not remark it as
1246  * that will cause unnecessary undo records to be added).
1247  */
1248 static
1249 hammer_off_t
1250 hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len, int32_t data_len,
1251                   struct hammer_buffer **rec_bufferp, u_int16_t hdr_type,
1252                   int can_cross, 
1253                   struct hammer_buffer **data2_bufferp, int *errorp)
1254 {
1255         hammer_volume_t root_volume;
1256         hammer_volume_t end_volume;
1257         hammer_volume_ondisk_t ondisk;
1258         hammer_fifo_head_t head;
1259         hammer_fifo_tail_t tail;
1260         hammer_off_t end_off = 0;
1261         hammer_off_t tmp_off = 0;
1262         int32_t end_vol_no;
1263         int32_t tmp_vol_no;
1264         int32_t xoff;
1265         int32_t aligned_bytes;
1266         int must_pad;
1267
1268         aligned_bytes = (rec_len + data_len + HAMMER_TAIL_ONDISK_SIZE +
1269                          HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK;
1270
1271         root_volume = hammer_get_root_volume(hmp, errorp);
1272         if (root_volume)
1273                 hammer_modify_volume(root_volume, NULL, 0);
1274
1275         while (root_volume) {
1276                 ondisk = root_volume->ondisk;
1277
1278                 end_off = ondisk->vol0_fifo_end;
1279                 end_vol_no = HAMMER_VOL_DECODE(end_off);
1280
1281                 end_volume = hammer_get_volume(hmp, end_vol_no, errorp);
1282                 if (*errorp)
1283                         goto done;
1284
1285                 /*
1286                  * Check to see if we ran out of space.  Include some extra
1287                  * room.
1288                  *
1289                  * vol0_fifo_end cannot be advanced into the same buffer
1290                  * that vol0_fifo_beg resides in.  This allows us to
1291                  * instantiate a new buffer without reading it in.
1292                  *
1293                  * XXX messy.
1294                  */
1295                 tmp_off = ondisk->vol0_fifo_beg & ~HAMMER_BUFMASK64;
1296                 tmp_vol_no = HAMMER_VOL_DECODE(tmp_off);
1297                 if ((tmp_off & HAMMER_OFF_SHORT_MASK) == 0) {
1298                         if (end_vol_no + 1 == tmp_vol_no) {
1299                                 tmp_vol_no = end_vol_no;
1300                                 tmp_off = end_volume->maxbuf_off;
1301                         } else if (end_vol_no + 1 == hmp->nvolumes &&
1302                                    tmp_vol_no == 0) {
1303                                 tmp_vol_no = end_vol_no;
1304                                 tmp_off = end_volume->maxbuf_off;
1305                         }
1306                 }
1307                 hammer_rel_volume(end_volume, 0);
1308
1309                 /*
1310                  * XXX dummy head at end of fifo
1311                  */
1312                 if (end_vol_no == tmp_vol_no &&
1313                     end_off < tmp_off &&
1314                     end_off + aligned_bytes + sizeof(*head) >= tmp_off) {
1315                         *errorp = ENOSPC;
1316                         goto done;
1317                 }
1318
1319                 if ((int32_t)end_off & HAMMER_BUFMASK)
1320                         head = hammer_bread(hmp, end_off, errorp, rec_bufferp);
1321                 else
1322                         head = hammer_bnew(hmp, end_off, errorp, rec_bufferp);
1323                 if (*errorp)
1324                         goto done;
1325
1326                 /*
1327                  * Load the buffer, retry if someone else squeaked in
1328                  * while we were blocked.
1329                  */
1330
1331                 if (ondisk->vol0_fifo_end != end_off)
1332                         continue;
1333
1334                 /*
1335                  * Ok, we're gonna do something.  Modify the buffer
1336                  */
1337                 hammer_modify_buffer(*rec_bufferp, NULL, 0);
1338                 if (ondisk->vol0_fifo_end != end_off)
1339                         continue;
1340                 xoff = (int32_t)end_off & HAMMER_BUFMASK;
1341
1342                 /*
1343                  * The non-data portion of the fifo record cannot cross
1344                  * a buffer boundary.
1345                  *
1346                  * The entire record cannot cross a buffer boundary if
1347                  * can_cross is 0.
1348                  *
1349                  * The entire record cannot cover more than two whole buffers
1350                  * regardless.  Even if the data portion is 16K, this case
1351                  * can occur due to the addition of the fifo_tail.
1352                  *
1353                  * It is illegal for a record to cross a volume boundary.
1354                  *
1355                  * It is illegal for a record to cross a recovery boundary
1356                  * (this is so recovery code is guaranteed a record rather
1357                  * than data at certain points).
1358                  *
1359                  * Add a pad record and loop if it does.
1360                  */
1361                 must_pad = 0;
1362                 if (xoff + rec_len > HAMMER_BUFSIZE)
1363                         must_pad = 1;
1364                 if (can_cross == 0) {
1365                         if (xoff + aligned_bytes > HAMMER_BUFSIZE)
1366                                 must_pad = 1;
1367                 } else {
1368                         if (xoff + aligned_bytes > HAMMER_BUFSIZE &&
1369                             (end_off + aligned_bytes) >=
1370                             (*rec_bufferp)->volume->maxbuf_off) {
1371                                 must_pad = 1;
1372                         }
1373                         if ((end_off ^ (end_off + aligned_bytes)) &
1374                             HAMMER_OFF_SHORT_REC_MASK) {
1375                                 must_pad = 1;
1376                         }
1377                         if (xoff + aligned_bytes - HAMMER_BUFSIZE >
1378                             HAMMER_BUFSIZE) {
1379                                 KKASSERT(xoff != 0);
1380                                 must_pad = 1;
1381                         }
1382                 }
1383
1384                 /*
1385                  * Pad to end of the buffer if necessary.  PADs can be
1386                  * squeezed into as little as 8 bytes (hence our alignment
1387                  * requirement).  The crc, reserved, and sequence number
1388                  * fields are not used, but initialize them anyway if there
1389                  * is enough room.
1390                  */
1391                 if (must_pad) {
1392                         xoff = HAMMER_BUFSIZE - xoff;
1393                         head->hdr_signature = HAMMER_HEAD_SIGNATURE;
1394                         head->hdr_type = HAMMER_HEAD_TYPE_PAD;
1395                         head->hdr_size = xoff;
1396                         if (xoff >= HAMMER_HEAD_ONDISK_SIZE +
1397                                     HAMMER_TAIL_ONDISK_SIZE) {
1398                                 head->hdr_crc = 0;
1399                                 head->hdr_reserved02 = 0;
1400                                 head->hdr_seq = 0;
1401                         }
1402
1403                         tail = (void *)((char *)head + xoff -
1404                                         HAMMER_TAIL_ONDISK_SIZE);
1405                         if ((void *)head != (void *)tail) {
1406                                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
1407                                 tail->tail_type = HAMMER_HEAD_TYPE_PAD;
1408                                 tail->tail_size = xoff;
1409                         }
1410                         KKASSERT((xoff & HAMMER_HEAD_ALIGN_MASK) == 0);
1411                         ondisk->vol0_fifo_end =
1412                                 hammer_advance_fifo((*rec_bufferp)->volume,
1413                                                     end_off, xoff);
1414                         continue;
1415                 }
1416
1417                 if (xoff + aligned_bytes > HAMMER_BUFSIZE) {
1418                         xoff = xoff + aligned_bytes - HAMMER_BUFSIZE;
1419
1420                         KKASSERT(xoff <= HAMMER_BUFSIZE);
1421                         tail = hammer_bnew(hmp, end_off + aligned_bytes -
1422                                                 HAMMER_TAIL_ONDISK_SIZE,
1423                                            errorp, data2_bufferp);
1424                         hammer_modify_buffer(*data2_bufferp, NULL, 0);
1425                         if (*errorp)
1426                                 goto done;
1427
1428                         /*
1429                          * Retry if someone else appended to the fifo while
1430                          * we were blocked.
1431                          */
1432                         if (ondisk->vol0_fifo_end != end_off)
1433                                 continue;
1434                 } else {
1435                         tail = (void *)((char *)head + aligned_bytes -
1436                                         HAMMER_TAIL_ONDISK_SIZE);
1437                 }
1438
1439                 bzero(head, rec_len);
1440                 head->hdr_signature = HAMMER_HEAD_SIGNATURE;
1441                 head->hdr_type = hdr_type;
1442                 head->hdr_size = aligned_bytes;
1443                 head->hdr_crc = 0;
1444                 head->hdr_seq = root_volume->ondisk->vol0_next_seq++;
1445
1446                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
1447                 tail->tail_type = hdr_type;
1448                 tail->tail_size = aligned_bytes;
1449
1450                 ondisk->vol0_fifo_end =
1451                         hammer_advance_fifo((*rec_bufferp)->volume,
1452                                             end_off, aligned_bytes);
1453 done:
1454                 hammer_rel_volume(root_volume, 0);
1455                 break;
1456         }
1457         if (*errorp)
1458                 end_off = 0;
1459         return(end_off);
1460 }
1461
1462 /*
1463  * Mark a fifo record as having been freed.  XXX needs undo.
1464  */
1465 void
1466 hammer_free_fifo(hammer_mount_t hmp, hammer_off_t fifo_offset)
1467 {
1468         hammer_buffer_t buffer = NULL;
1469         hammer_fifo_head_t head;
1470         int error;
1471
1472         head = hammer_bread(hmp, fifo_offset, &error, &buffer);
1473         if (head) {
1474                 hammer_modify_buffer(buffer, &head->hdr_type,
1475                                      sizeof(head->hdr_type));
1476                 head->hdr_type |= HAMMER_HEAD_FLAG_FREE;
1477         }
1478         if (buffer)
1479                 hammer_rel_buffer(buffer, 0);
1480 }
1481
1482 /*
1483  * Attempt to rewind the FIFO
1484  *
1485  * This routine is allowed to do nothing.
1486  */
1487 void
1488 hammer_unwind_fifo(hammer_mount_t hmp, hammer_off_t rec_offset)
1489 {
1490 }
1491
1492 /*
1493  * Advance the FIFO a certain number of bytes.
1494  */
1495 static
1496 hammer_off_t
1497 hammer_advance_fifo(hammer_volume_t volume, hammer_off_t off, int32_t bytes)
1498 {
1499         int32_t vol_no;
1500
1501         off += bytes;
1502         KKASSERT(off <= volume->maxbuf_off);
1503         KKASSERT((off & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
1504         if (off == volume->maxbuf_off) {
1505                 vol_no = volume->vol_no + 1;
1506                 if (vol_no == volume->hmp->nvolumes)
1507                         vol_no = 0;
1508                 off = HAMMER_ENCODE_RAW_BUFFER(vol_no, 0);
1509         }
1510         return(off);
1511 }
1512 #endif
1513
1514 /*
1515  * Sync dirty buffers to the media
1516  */
1517
1518 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
1519 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
1520
1521 int
1522 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
1523 {
1524         struct hammer_sync_info info;
1525
1526         info.error = 0;
1527         info.waitfor = waitfor;
1528
1529         vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
1530                       hammer_sync_scan1, hammer_sync_scan2, &info);
1531
1532         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1533                 hammer_sync_volume, &info);
1534         return(info.error);
1535 }
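/*
 * Illustrative sketch (not part of the original source, kept under #if 0):
 * how a VFS sync entry point might drive hammer_sync_hmp().  The mount
 * glue shown here is an assumption; the real entry point lives in the
 * vfsops code.  MNT_WAIT requests a synchronous flush, MNT_NOWAIT merely
 * initiates the I/O.
 */
#if 0
static int
hammer_vfs_sync_example(struct mount *mp, int waitfor)
{
	struct hammer_mount *hmp = (void *)mp->mnt_data;

	return (hammer_sync_hmp(hmp, waitfor));
}
#endif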
1536
1537 static int
1538 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
1539 {
1540         struct hammer_inode *ip;
1541
1542         ip = VTOI(vp);
1543         if (vp->v_type == VNON || ip == NULL ||
1544             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1545              RB_EMPTY(&vp->v_rbdirty_tree))) {
1546                 return(-1);
1547         }
1548         return(0);
1549 }
1550
1551 static int
1552 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1553 {
1554         struct hammer_sync_info *info = data;
1555         struct hammer_inode *ip;
1556         int error;
1557
1558         ip = VTOI(vp);
1559         if (vp->v_type == VNON || vp->v_type == VBAD ||
1560             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1561              RB_EMPTY(&vp->v_rbdirty_tree))) {
1562                 return(0);
1563         }
1564         error = VOP_FSYNC(vp, info->waitfor);
1565         if (error)
1566                 info->error = error;
1567         return(0);
1568 }
1569
1570 int
1571 hammer_sync_volume(hammer_volume_t volume, void *data)
1572 {
1573         struct hammer_sync_info *info = data;
1574
1575         hammer_ref(&volume->io.lock);
1576         RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
1577                 hammer_sync_buffer, info);
1578         hammer_rel_volume(volume, 1);
1579         return(0);
1580 }
1581
1582 int
1583 hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
1584 {
1585         hammer_ref(&buffer->io.lock);
1586         hammer_rel_buffer(buffer, 1);
1587         return(0);
1588 }
1589
1590 #if 0
1591 /*
1592  * Generic buffer initialization.  Initialize the A-list into an all-allocated
1593  * state with the free block limit properly set.
1594  *
1595  * Note that alloc_new_buffer() will free the appropriate block range via
1596  * the appropriate cluster alist, so the free count is properly propagated.
1597  */
1598 void
1599 hammer_init_fifo(hammer_fifo_head_t head, u_int16_t type)
1600 {
1601         head->hdr_signature = HAMMER_HEAD_SIGNATURE;
1602         head->hdr_type = type;
1603         head->hdr_size = 0;
1604         head->hdr_crc = 0;
1605         head->hdr_seq = 0;
1606 }
1607
1608 #endif
1609