26cb8f7db0e9b98d2d8645938ae8dde00745c0fa
[dragonfly.git] / sys / vfs / hammer / hammer_ondisk.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.30 2008/02/10 18:58:22 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
51 static int hammer_load_node(hammer_node_t node);
52 #if 0
53 static hammer_off_t hammer_advance_fifo(hammer_volume_t volume,
54                 hammer_off_t off, int32_t bytes);
55
56 static hammer_off_t hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len,
57                 int32_t data_len, struct hammer_buffer **rec_bufferp,
58                 u_int16_t hdr_type, int can_cross, 
59                 struct hammer_buffer **data2_bufferp, int *errorp);
60 #endif
61
62 /*
63  * Red-Black tree support for various structures
64  */
65 static int
66 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
67 {
68         if (ip1->obj_id < ip2->obj_id)
69                 return(-1);
70         if (ip1->obj_id > ip2->obj_id)
71                 return(1);
72         if (ip1->obj_asof < ip2->obj_asof)
73                 return(-1);
74         if (ip1->obj_asof > ip2->obj_asof)
75                 return(1);
76         return(0);
77 }
78
79 static int
80 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
81 {
82         if (info->obj_id < ip->obj_id)
83                 return(-1);
84         if (info->obj_id > ip->obj_id)
85                 return(1);
86         if (info->obj_asof < ip->obj_asof)
87                 return(-1);
88         if (info->obj_asof > ip->obj_asof)
89                 return(1);
90         return(0);
91 }
92
93 static int
94 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
95 {
96         if (vol1->vol_no < vol2->vol_no)
97                 return(-1);
98         if (vol1->vol_no > vol2->vol_no)
99                 return(1);
100         return(0);
101 }
102
103 static int
104 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
105 {
106         if (buf1->zone2_offset < buf2->zone2_offset)
107                 return(-1);
108         if (buf1->zone2_offset > buf2->zone2_offset)
109                 return(1);
110         return(0);
111 }
112
113 static int
114 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
115 {
116         if (node1->node_offset < node2->node_offset)
117                 return(-1);
118         if (node1->node_offset > node2->node_offset)
119                 return(1);
120         return(0);
121 }
122
/*
 * Note: The lookup function for hammer_ino_rb_tree winds up being named
 * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
 * functions are normal, e.g. hammer_buf_rb_tree_RB_LOOKUP(root, zone2_offset).
 *
 * These macros expand to the full red-black tree implementations for the
 * inode, volume, buffer, and B-Tree node indexes declared in hammer.h.
 */
RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
		hammer_inode_info_cmp, hammer_inode_info_t);
RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
	     hammer_vol_rb_compare, int32_t, vol_no);
RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
	     hammer_buf_rb_compare, hammer_off_t, zone2_offset);
RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
	     hammer_nod_rb_compare, hammer_off_t, node_offset);
137
/************************************************************************
 *                              VOLUMES                                 *
 ************************************************************************
 *
 * Load a HAMMER volume by name.  Returns 0 on success or a positive error
 * code on failure.  Volumes must be loaded at mount time, get_volume() will
 * not load a new volume.
 *
 * Calls made to hammer_load_volume() are single-threaded at mount time,
 * so no interlock is required here.
 */
int
hammer_install_volume(struct hammer_mount *hmp, const char *volname)
{
	struct mount *mp;
	hammer_volume_t volume;
	struct hammer_volume_ondisk *ondisk;
	struct nlookupdata nd;
	struct buf *bp = NULL;
	int error;
	int ronly;

	mp = hmp->mp;
	ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

	/*
	 * Allocate a volume structure
	 */
	++hammer_count_volumes;
	volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
	volume->vol_name = kstrdup(volname, M_HAMMER);
	volume->hmp = hmp;
	hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
	volume->io.offset = 0LL;

	/*
	 * Get the device vnode.  The path must resolve to a disk device
	 * that is not already mounted.
	 */
	error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
	if (error == 0)
		error = nlookup(&nd);
	if (error == 0)
		error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
	nlookup_done(&nd);
	if (error == 0) {
		if (vn_isdisk(volume->devvp, &error)) {
			error = vfs_mountedon(volume->devvp);
		}
	}
	/*
	 * Refuse the device if it is already open via its udev alias.
	 */
	if (error == 0 &&
	    count_udev(volume->devvp->v_umajor, volume->devvp->v_uminor) > 0) {
		error = EBUSY;
	}
	if (error == 0) {
		/* flush any stale buffers, then open the device */
		vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
		error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
		if (error == 0) {
			error = VOP_OPEN(volume->devvp, 
					 (ronly ? FREAD : FREAD|FWRITE),
					 FSCRED, NULL);
		}
		vn_unlock(volume->devvp);
	}
	if (error) {
		/* device not opened yet; just free the structure */
		hammer_free_volume(volume);
		return(error);
	}
	volume->devvp->v_rdev->si_mountpoint = mp;

	/*
	 * Extract the volume number from the volume header and do various
	 * sanity checks.
	 */
	error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
	if (error)
		goto late_failure;
	ondisk = (void *)bp->b_data;
	if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
		kprintf("hammer_mount: volume %s has an invalid header\n",
			volume->vol_name);
		error = EFTYPE;
		goto late_failure;
	}
	volume->vol_no = ondisk->vol_no;
	volume->buffer_base = ondisk->vol_buf_beg;
	volume->vol_flags = ondisk->vol_flags;
	volume->nblocks = ondisk->vol_nblocks; 
	volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
				    ondisk->vol_buf_end - ondisk->vol_buf_beg);
	RB_INIT(&volume->rb_bufs_root);

	hmp->mp->mnt_stat.f_blocks += volume->nblocks;

	/*
	 * The first volume installed establishes the filesystem id; any
	 * additional volume must carry the same fsid.
	 */
	if (RB_EMPTY(&hmp->rb_vols_root)) {
		hmp->fsid = ondisk->vol_fsid;
	} else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
		kprintf("hammer_mount: volume %s's fsid does not match "
			"other volumes\n", volume->vol_name);
		error = EFTYPE;
		goto late_failure;
	}

	/*
	 * Insert the volume structure into the red-black tree.  A non-NULL
	 * return from RB_INSERT indicates a vol_no collision; fall through
	 * to the error path with EEXIST.
	 */
	if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
		kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
			volume->vol_name, volume->vol_no);
		error = EEXIST;
	}

	/*
	 * Set the root volume.  HAMMER special cases the root volume
	 * structure.  We do not hold a ref because this would prevent
	 * related I/O from being flushed.
	 */
	if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
		hmp->rootvol = volume;
		if (bp) {
			brelse(bp);
			bp = NULL;
		}
		hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
	}
late_failure:
	if (bp)
		brelse(bp);
	if (error) {
		/* the device was opened above; close it before freeing */
		/*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
		VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
		hammer_free_volume(volume);
	}
	return (error);
}
271
/*
 * Unload and free a HAMMER volume.  Must return >= 0 to continue scan
 * so returns -1 on failure.
 *
 * NOTE(review): this is an RB_SCAN callback (data is unused); the current
 * implementation always returns 0.
 */
int
hammer_unload_volume(hammer_volume_t volume, void *data __unused)
{
	struct hammer_mount *hmp = volume->hmp;
	int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);

	/*
	 * Sync clusters, sync volume
	 */

	hmp->mp->mnt_stat.f_blocks -= volume->nblocks;

	/*
	 * Clean up the root volume pointer, which is held unlocked in hmp.
	 */
	if (hmp->rootvol == volume)
		hmp->rootvol = NULL;

	/*
	 * Unload all buffers associated with this volume, then wait for
	 * any dependent I/O to complete.  (The comments elsewhere about
	 * clusters/super-clusters refer to an older on-disk layout; the
	 * per-volume index here holds hammer_buffers.)
	 */
	RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
			hammer_unload_buffer, NULL);
	hammer_io_waitdep(&volume->io);

	/*
	 * Release our buffer and flush anything left in the buffer cache.
	 */
	hammer_io_release(&volume->io, 2);

	/*
	 * There should be no references on the volume and no buffers left
	 * in its index at this point.
	 */
	KKASSERT(volume->io.lock.refs == 0);
	KKASSERT(RB_EMPTY(&volume->rb_bufs_root));

	volume->ondisk = NULL;
	if (volume->devvp) {
		if (ronly) {
			/* read-only mount: nothing to save */
			vinvalbuf(volume->devvp, 0, 0, 0);
			VOP_CLOSE(volume->devvp, FREAD);
		} else {
			/* flush dirty buffers before closing the device */
			vinvalbuf(volume->devvp, V_SAVE, 0, 0);
			VOP_CLOSE(volume->devvp, FREAD|FWRITE);
		}
	}

	/*
	 * Destroy the structure
	 */
	RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
	hammer_free_volume(volume);
	return(0);
}
333
334 static
335 void
336 hammer_free_volume(hammer_volume_t volume)
337 {
338         if (volume->vol_name) {
339                 kfree(volume->vol_name, M_HAMMER);
340                 volume->vol_name = NULL;
341         }
342         if (volume->devvp) {
343                 if (vn_isdisk(volume->devvp, NULL) &&
344                     volume->devvp->v_rdev &&
345                     volume->devvp->v_rdev->si_mountpoint == volume->hmp->mp
346                 ) {
347                         volume->devvp->v_rdev->si_mountpoint = NULL;
348                 }
349                 vrele(volume->devvp);
350                 volume->devvp = NULL;
351         }
352         --hammer_count_volumes;
353         kfree(volume, M_HAMMER);
354 }
355
356 /*
357  * Get a HAMMER volume.  The volume must already exist.
358  */
359 hammer_volume_t
360 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
361 {
362         struct hammer_volume *volume;
363
364         /*
365          * Locate the volume structure
366          */
367         volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
368         if (volume == NULL) {
369                 *errorp = ENOENT;
370                 return(NULL);
371         }
372         hammer_ref(&volume->io.lock);
373
374         /*
375          * Deal with on-disk info
376          */
377         if (volume->ondisk == NULL || volume->io.loading) {
378                 *errorp = hammer_load_volume(volume);
379                 if (*errorp) {
380                         hammer_rel_volume(volume, 1);
381                         volume = NULL;
382                 }
383         } else {
384                 *errorp = 0;
385         }
386         return(volume);
387 }
388
389 int
390 hammer_ref_volume(hammer_volume_t volume)
391 {
392         int error;
393
394         hammer_ref(&volume->io.lock);
395
396         /*
397          * Deal with on-disk info
398          */
399         if (volume->ondisk == NULL || volume->io.loading) {
400                 error = hammer_load_volume(volume);
401                 if (error)
402                         hammer_rel_volume(volume, 1);
403         } else {
404                 error = 0;
405         }
406         return (error);
407 }
408
409 hammer_volume_t
410 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
411 {
412         hammer_volume_t volume;
413
414         volume = hmp->rootvol;
415         KKASSERT(volume != NULL);
416         hammer_ref(&volume->io.lock);
417
418         /*
419          * Deal with on-disk info
420          */
421         if (volume->ondisk == NULL || volume->io.loading) {
422                 *errorp = hammer_load_volume(volume);
423                 if (*errorp) {
424                         hammer_rel_volume(volume, 1);
425                         volume = NULL;
426                 }
427         } else {
428                 *errorp = 0;
429         }
430         return (volume);
431 }
432
433 /*
434  * Load a volume's on-disk information.  The volume must be referenced and
435  * not locked.  We temporarily acquire an exclusive lock to interlock
436  * against releases or multiple get's.
437  */
438 static int
439 hammer_load_volume(hammer_volume_t volume)
440 {
441         struct hammer_volume_ondisk *ondisk;
442         int error;
443
444         hammer_lock_ex(&volume->io.lock);
445         KKASSERT(volume->io.loading == 0);
446         volume->io.loading = 1;
447
448         if (volume->ondisk == NULL) {
449                 error = hammer_io_read(volume->devvp, &volume->io);
450                 if (error) {
451                         volume->io.loading = 0;
452                         hammer_unlock(&volume->io.lock);
453                         return (error);
454                 }
455                 volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
456         } else {
457                 error = 0;
458         }
459         volume->io.loading = 0;
460         hammer_unlock(&volume->io.lock);
461         return(0);
462 }
463
/*
 * Release a volume.  Call hammer_io_release on the last reference.  We have
 * to acquire an exclusive lock to interlock against volume->ondisk tests
 * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
 * lock to be held.
 *
 * Volumes are not unloaded from memory during normal operation.
 */
void
hammer_rel_volume(hammer_volume_t volume, int flush)
{
	if (volume->io.lock.refs == 1) {
		hammer_lock_ex(&volume->io.lock);
		/*
		 * Recheck under the exclusive lock: another thread may
		 * have gained a ref between the test above and lock
		 * acquisition (double-checked pattern).
		 */
		if (volume->io.lock.refs == 1) {
			volume->ondisk = NULL;
			hammer_io_release(&volume->io, flush);
		} else if (flush) {
			/* lost the race but still honor the flush request */
			hammer_io_flush(&volume->io);
		}
		hammer_unlock(&volume->io.lock);
	}
	hammer_unref(&volume->io.lock);
}
487
/************************************************************************
 *                              BUFFERS                                 *
 ************************************************************************
 *
 * Manage buffers.  Currently all blockmap-backed zones are translated
 * to zone-2 buffer offsets.
 *
 * Returns a referenced buffer structure for buf_offset, creating and
 * loading it if necessary.  If isnew is non-zero the backing store is
 * instantiated without a disk read.  Returns NULL and sets *errorp on
 * failure.
 */
hammer_buffer_t
hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
		  int isnew, int *errorp)
{
	hammer_buffer_t buffer;
	hammer_volume_t volume;
	hammer_off_t	zoneX_offset;
	int vol_no;
	int zone;

	/*
	 * Translate blockmap-backed zone offsets down to a raw zone-2
	 * buffer offset.  zoneX_offset preserves the caller's original
	 * offset for the translation cache below.
	 */
	zoneX_offset = buf_offset;
	zone = HAMMER_ZONE_DECODE(buf_offset);
	if (zone > HAMMER_ZONE_RAW_BUFFER_INDEX) {
		buf_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp);
		KKASSERT(*errorp == 0);
	}
	buf_offset &= ~HAMMER_BUFMASK64;
	KKASSERT((buf_offset & HAMMER_ZONE_RAW_BUFFER) ==
		 HAMMER_ZONE_RAW_BUFFER);
	vol_no = HAMMER_VOL_DECODE(buf_offset);
	volume = hammer_get_volume(hmp, vol_no, errorp);
	if (volume == NULL)
		return(NULL);

	/*
	 * NOTE: buf_offset and maxbuf_off are both full offset
	 * specifications.
	 */
	KKASSERT(buf_offset < volume->maxbuf_off);

	/*
	 * Locate and lock the buffer structure, creating one if necessary.
	 */
again:
	buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
			   buf_offset);
	if (buffer == NULL) {
		++hammer_count_buffers;
		buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
		buffer->zone2_offset = buf_offset;
		buffer->volume = volume;
		hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
		buffer->io.offset = volume->ondisk->vol_buf_beg +
				    (buf_offset & HAMMER_OFF_SHORT_MASK);
		TAILQ_INIT(&buffer->clist);
		hammer_ref(&buffer->io.lock);

		/*
		 * Insert the buffer into the RB tree and handle late
		 * collisions.  kmalloc(M_WAITOK) may have blocked,
		 * allowing another thread to insert the same offset
		 * first; discard ours and retry the lookup.
		 */
		if (RB_INSERT(hammer_buf_rb_tree, &volume->rb_bufs_root, buffer)) {
			hammer_unref(&buffer->io.lock);
			--hammer_count_buffers;
			kfree(buffer, M_HAMMER);
			goto again;
		}
		/* the buffer holds a long-term ref on its volume */
		hammer_ref(&volume->io.lock);
	} else {
		hammer_ref(&buffer->io.lock);
	}

	/*
	 * Cache the blockmap translation
	 */
	if ((zoneX_offset & HAMMER_ZONE_RAW_BUFFER) != HAMMER_ZONE_RAW_BUFFER)
		buffer->zoneX_offset = zoneX_offset;

	/*
	 * Deal with on-disk info
	 */
	if (buffer->ondisk == NULL || buffer->io.loading) {
		*errorp = hammer_load_buffer(buffer, isnew);
		if (*errorp) {
			hammer_rel_buffer(buffer, 1);
			buffer = NULL;
		}
	} else {
		*errorp = 0;
	}
	/* drop the temporary volume ref taken by hammer_get_volume() */
	hammer_rel_volume(volume, 0);
	return(buffer);
}
578
579 static int
580 hammer_load_buffer(hammer_buffer_t buffer, int isnew)
581 {
582         hammer_volume_t volume;
583         void *ondisk;
584         int error;
585
586         /*
587          * Load the buffer's on-disk info
588          */
589         volume = buffer->volume;
590         hammer_lock_ex(&buffer->io.lock);
591         KKASSERT(buffer->io.loading == 0);
592         buffer->io.loading = 1;
593
594         if (buffer->ondisk == NULL) {
595                 if (isnew) {
596                         error = hammer_io_new(volume->devvp, &buffer->io);
597                 } else {
598                         error = hammer_io_read(volume->devvp, &buffer->io);
599                 }
600                 if (error) {
601                         buffer->io.loading = 0;
602                         hammer_unlock(&buffer->io.lock);
603                         return (error);
604                 }
605                 buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
606         } else if (isnew) {
607                 error = hammer_io_new(volume->devvp, &buffer->io);
608         } else {
609                 error = 0;
610         }
611         if (error == 0 && isnew) {
612                 hammer_modify_buffer(buffer, NULL, 0);
613                 /* additional initialization goes here */
614         }
615         buffer->io.loading = 0;
616         hammer_unlock(&buffer->io.lock);
617         return (error);
618 }
619
/*
 * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
 *
 * Unload a single buffer during volume teardown.  A temporary ref is
 * taken so the node flush cannot destroy the buffer out from under us;
 * it must then be the only ref remaining before the final release.
 */
int
hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
{
	hammer_ref(&buffer->io.lock);
	hammer_flush_buffer_nodes(buffer);
	KKASSERT(buffer->io.lock.refs == 1);
	hammer_rel_buffer(buffer, 2);
	return(0);
}
632
633 /*
634  * Reference a buffer that is either already referenced or via a specially
635  * handled pointer (aka cursor->buffer).
636  */
637 int
638 hammer_ref_buffer(hammer_buffer_t buffer)
639 {
640         int error;
641
642         hammer_ref(&buffer->io.lock);
643         if (buffer->ondisk == NULL || buffer->io.loading) {
644                 error = hammer_load_buffer(buffer, 0);
645                 if (error) {
646                         hammer_rel_buffer(buffer, 1);
647                         /*
648                          * NOTE: buffer pointer can become stale after
649                          * the above release.
650                          */
651                 }
652         } else {
653                 error = 0;
654         }
655         return(error);
656 }
657
/*
 * Release a buffer.  We have to deal with several places where
 * another thread can ref the buffer.
 *
 * Only destroy the structure itself if the related buffer cache buffer
 * was disassociated from it.  This ties the management of the structure
 * to the buffer cache subsystem.  buffer->ondisk determines whether the
 * embedded io is referenced or not.
 */
void
hammer_rel_buffer(hammer_buffer_t buffer, int flush)
{
	hammer_volume_t volume;

	if (buffer->io.lock.refs == 1) {
		hammer_lock_ex(&buffer->io.lock);
		/*
		 * Recheck under the exclusive lock (double-checked
		 * pattern): another thread may have re-referenced the
		 * buffer before we acquired the lock.
		 */
		if (buffer->io.lock.refs == 1) {
			hammer_io_release(&buffer->io, flush);

			/*
			 * Only destroy the structure if the backing
			 * buffer-cache bp was torn down and nobody
			 * re-referenced us during the release.
			 */
			if (buffer->io.bp == NULL &&
			    buffer->io.lock.refs == 1) {
				hammer_flush_buffer_nodes(buffer);
				KKASSERT(TAILQ_EMPTY(&buffer->clist));
				volume = buffer->volume;
				RB_REMOVE(hammer_buf_rb_tree,
					  &volume->rb_bufs_root, buffer);
				buffer->volume = NULL; /* sanity */
				--hammer_count_buffers;
				kfree(buffer, M_HAMMER);
				/* drop the buffer's long-term volume ref */
				hammer_rel_volume(volume, 0);
				return;
			}
		} else if (flush) {
			/* lost the race but still honor the flush request */
			hammer_io_flush(&buffer->io);
		}
		hammer_unlock(&buffer->io.lock);
	}
	hammer_unref(&buffer->io.lock);
}
697
698 /*
699  * Access the filesystem buffer containing the specified hammer offset.
700  * buf_offset is a conglomeration of the volume number and vol_buf_beg
701  * relative buffer offset.  It must also have bit 55 set to be valid.
702  * (see hammer_off_t in hammer_disk.h).
703  *
704  * Any prior buffer in *bufferp will be released and replaced by the
705  * requested buffer.
706  */
707 void *
708 hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp, 
709              struct hammer_buffer **bufferp)
710 {
711         hammer_buffer_t buffer;
712         int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
713
714         buf_offset &= ~HAMMER_BUFMASK64;
715         KKASSERT((buf_offset & HAMMER_OFF_ZONE_MASK) != 0);
716
717         buffer = *bufferp;
718         if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
719                                buffer->zoneX_offset != buf_offset)) {
720                 if (buffer)
721                         hammer_rel_buffer(buffer, 0);
722                 buffer = hammer_get_buffer(hmp, buf_offset, 0, errorp);
723                 *bufferp = buffer;
724         } else {
725                 *errorp = 0;
726         }
727
728         /*
729          * Return a pointer to the buffer data.
730          */
731         if (buffer == NULL)
732                 return(NULL);
733         else
734                 return((char *)buffer->ondisk + xoff);
735 }
736
737 /*
738  * Access the filesystem buffer containing the specified hammer offset.
739  * No disk read operation occurs.  The result buffer may contain garbage.
740  *
741  * Any prior buffer in *bufferp will be released and replaced by the
742  * requested buffer.
743  */
744 void *
745 hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp, 
746              struct hammer_buffer **bufferp)
747 {
748         hammer_buffer_t buffer;
749         int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
750
751         buf_offset &= ~HAMMER_BUFMASK64;
752
753         buffer = *bufferp;
754         if (buffer == NULL || (buffer->zone2_offset != buf_offset &&
755                                buffer->zoneX_offset != buf_offset)) {
756                 if (buffer)
757                         hammer_rel_buffer(buffer, 0);
758                 buffer = hammer_get_buffer(hmp, buf_offset, 1, errorp);
759                 *bufferp = buffer;
760         } else {
761                 *errorp = 0;
762         }
763
764         /*
765          * Return a pointer to the buffer data.
766          */
767         if (buffer == NULL)
768                 return(NULL);
769         else
770                 return((char *)buffer->ondisk + xoff);
771 }
772
/************************************************************************
 *                              NODES                                   *
 ************************************************************************
 *
 * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
 * method used by the HAMMER filesystem.
 *
 * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
 * associated with its buffer, and will only reference the buffer while
 * the node itself is referenced.
 *
 * A hammer_node can also be passively associated with other HAMMER
 * structures, such as inodes, while retaining 0 references.  These
 * associations can be cleared backwards using a pointer-to-pointer in
 * the hammer_node.
 *
 * This allows the HAMMER implementation to cache hammer_nodes long-term
 * and short-cut a great deal of the infrastructure's complexity.  In
 * most cases a cached node can be reacquired without having to dip into
 * either the buffer or cluster management code.
 *
 * The caller must pass a referenced cluster on call and will retain
 * ownership of the reference on return.  The node will acquire its own
 * additional references, if necessary.
 */
hammer_node_t
hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset, int *errorp)
{
	hammer_node_t node;

	/* node offsets must fall in the B-Tree zone */
	KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE);

	/*
	 * Locate the structure, allocating one if necessary.  kmalloc
	 * may block, so a racing insert of the same offset is handled
	 * by discarding our copy and retrying the lookup.
	 */
again:
	node = RB_LOOKUP(hammer_nod_rb_tree, &hmp->rb_nods_root, node_offset);
	if (node == NULL) {
		++hammer_count_nodes;
		node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
		node->node_offset = node_offset;
		node->hmp = hmp;
		if (RB_INSERT(hammer_nod_rb_tree, &hmp->rb_nods_root, node)) {
			--hammer_count_nodes;
			kfree(node, M_HAMMER);
			goto again;
		}
	}
	hammer_ref(&node->lock);
	*errorp = hammer_load_node(node);
	if (*errorp) {
		hammer_rel_node(node);
		node = NULL;
	}
	return(node);
}
829
830 /*
831  * Reference an already-referenced node.
832  */
833 int
834 hammer_ref_node(hammer_node_t node)
835 {
836         int error;
837
838         KKASSERT(node->lock.refs > 0);
839         hammer_ref(&node->lock);
840         if ((error = hammer_load_node(node)) != 0)
841                 hammer_rel_node(node);
842         return(error);
843 }
844
/*
 * Load a node's on-disk data reference.
 *
 * The node may be passively associated with a buffer (node->buffer set
 * but node->ondisk NULL); in that case we only need to re-reference the
 * buffer.  Otherwise the buffer is acquired and the node is added to the
 * buffer's clist so the association can be torn down later.
 */
static int
hammer_load_node(hammer_node_t node)
{
	hammer_buffer_t buffer;
	int error;

	/* fast path: already resident, no lock required */
	if (node->ondisk)
		return(0);
	error = 0;
	hammer_lock_ex(&node->lock);
	if (node->ondisk == NULL) {
		/*
		 * This is a little confusing but the jist is that
		 * node->buffer determines whether the node is on
		 * the buffer's clist and node->ondisk determines
		 * whether the buffer is referenced.
		 */
		if ((buffer = node->buffer) != NULL) {
			error = hammer_ref_buffer(buffer);
		} else {
			buffer = hammer_get_buffer(node->hmp,
						   node->node_offset, 0,
						   &error);
			if (buffer) {
				KKASSERT(error == 0);
				TAILQ_INSERT_TAIL(&buffer->clist,
						  node, entry);
				node->buffer = buffer;
			}
		}
		/* on success, point ondisk into the buffer's data */
		if (error == 0) {
			node->ondisk = (void *)((char *)buffer->ondisk +
			       (node->node_offset & HAMMER_BUFMASK));
		}
	}
	hammer_unlock(&node->lock);
	return (error);
}
886
887 /*
888  * Safely reference a node, interlock against flushes via the IO subsystem.
889  */
890 hammer_node_t
891 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
892                      int *errorp)
893 {
894         hammer_node_t node;
895
896         if ((node = *cache) != NULL)
897                 hammer_ref(&node->lock);
898         if (node) {
899                 *errorp = hammer_load_node(node);
900                 if (*errorp) {
901                         hammer_rel_node(node);
902                         node = NULL;
903                 }
904         } else {
905                 *errorp = ENOENT;
906         }
907         return(node);
908 }
909
910 /*
911  * Release a hammer_node.  On the last release the node dereferences
912  * its underlying buffer and may or may not be destroyed.
913  */
void
hammer_rel_node(hammer_node_t node)
{
	hammer_buffer_t buffer;

	/*
	 * If this isn't the last ref just decrement the ref count and
	 * return.
	 */
	if (node->lock.refs > 1) {
		hammer_unref(&node->lock);
		return;
	}

	/*
	 * If there is no ondisk info or no buffer the node failed to load,
	 * remove the last reference and destroy the node.
	 */
	if (node->ondisk == NULL) {
		hammer_unref(&node->lock);
		hammer_flush_node(node);
		/* node is stale now */
		return;
	}

	/*
	 * Do final cleanups and then either destroy the node or leave it
	 * passively cached.  The buffer reference is removed regardless.
	 */
	buffer = node->buffer;
	node->ondisk = NULL;

	if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
		/* leave passively cached: drop ref and buffer only */
		hammer_unref(&node->lock);
		hammer_rel_buffer(buffer, 0);
		return;
	}

	/*
	 * Destroy the node if it has been marked for deletion.  We mark
	 * it as being free.  Note that the disk space is physically
	 * freed when the fifo cycles back through the node.
	 */
	if (node->flags & HAMMER_NODE_DELETED) {
		hammer_blockmap_free(node->hmp, node->node_offset,
				     sizeof(*node->ondisk));
	}

	/*
	 * Destroy the node.  Record pertinent data because the node
	 * becomes stale the instant we flush it.
	 */
	hammer_unref(&node->lock);
	hammer_flush_node(node);
	/* node is stale */
	hammer_rel_buffer(buffer, 0);
}
971
972 /*
973  * Passively cache a referenced hammer_node in *cache.  The caller may
974  * release the node on return.
975  */
void
hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
{
	hammer_node_t old;

	/*
	 * If the node is being deleted, don't cache it!
	 */
	if (node->flags & HAMMER_NODE_DELETED)
		return;

	/*
	 * Cache the node.  If we previously cached a different node we
	 * have to give HAMMER a chance to destroy it.
	 */
again:
	if (node->cache1 != cache) {
		if (node->cache2 != cache) {
			/* evict any node already occupying this slot */
			if ((old = *cache) != NULL) {
				KKASSERT(node->lock.refs != 0);
				hammer_uncache_node(cache);
				goto again;
			}
			/*
			 * Install in cache1, demoting the previous cache1
			 * slot to cache2 (cache2's old slot is cleared).
			 */
			if (node->cache2)
				*node->cache2 = NULL;
			node->cache2 = node->cache1;
			node->cache1 = cache;
			*cache = node;
		} else {
			/* already in cache2; promote by swapping the slots */
			struct hammer_node **tmp;
			tmp = node->cache1;
			node->cache1 = node->cache2;
			node->cache2 = tmp;
		}
	}
}
1012
1013 void
1014 hammer_uncache_node(struct hammer_node **cache)
1015 {
1016         hammer_node_t node;
1017
1018         if ((node = *cache) != NULL) {
1019                 *cache = NULL;
1020                 if (node->cache1 == cache) {
1021                         node->cache1 = node->cache2;
1022                         node->cache2 = NULL;
1023                 } else if (node->cache2 == cache) {
1024                         node->cache2 = NULL;
1025                 } else {
1026                         panic("hammer_uncache_node: missing cache linkage");
1027                 }
1028                 if (node->cache1 == NULL && node->cache2 == NULL)
1029                         hammer_flush_node(node);
1030         }
1031 }
1032
1033 /*
1034  * Remove a node's cache references and destroy the node if it has no
1035  * other references or backing store.
1036  */
void
hammer_flush_node(hammer_node_t node)
{
	hammer_buffer_t buffer;

	/* sever passive cache back-links first */
	if (node->cache1)
		*node->cache1 = NULL;
	if (node->cache2)
		*node->cache2 = NULL;
	/* only destroy if unreferenced and not backed by loaded data */
	if (node->lock.refs == 0 && node->ondisk == NULL) {
		RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node);
		if ((buffer = node->buffer) != NULL) {
			node->buffer = NULL;
			TAILQ_REMOVE(&buffer->clist, node, entry);
			/* buffer is unreferenced because ondisk is NULL */
		}
		--hammer_count_nodes;
		kfree(node, M_HAMMER);
	}
}
1057
1058 /*
1059  * Flush passively cached B-Tree nodes associated with this buffer.
1060  * This is only called when the buffer is about to be destroyed, so
1061  * none of the nodes should have any references.
1062  */
1063 void
1064 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1065 {
1066         hammer_node_t node;
1067
1068         while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1069                 KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
1070                 hammer_ref(&node->lock);
1071                 node->flags |= HAMMER_NODE_FLUSH;
1072                 hammer_rel_node(node);
1073         }
1074 }
1075
1076
1077 /************************************************************************
1078  *                              ALLOCATORS                              *
1079  ************************************************************************/
1080
1081 /*
1082  * Allocate a B-Tree node.
1083  */
1084 hammer_node_t
1085 hammer_alloc_btree(hammer_mount_t hmp, int *errorp)
1086 {
1087         hammer_buffer_t buffer = NULL;
1088         hammer_node_t node = NULL;
1089         hammer_off_t node_offset;
1090
1091         node_offset = hammer_blockmap_alloc(hmp, HAMMER_ZONE_BTREE_INDEX,
1092                                             sizeof(struct hammer_node_ondisk),
1093                                             errorp);
1094         if (*errorp == 0) {
1095                 node = hammer_get_node(hmp, node_offset, errorp);
1096                 hammer_modify_node(node);
1097                 bzero(node->ondisk, sizeof(*node->ondisk));
1098         }
1099         if (buffer)
1100                 hammer_rel_buffer(buffer, 0);
1101         return(node);
1102 }
1103
1104 /*
1105  * The returned buffers are already appropriately marked as being modified.
1106  * If the caller marks them again unnecessary undo records may be generated.
1107  *
1108  * In-band data is indicated by data_bufferp == NULL.  Pass a data_len of 0
 * for zero-fill (caller modifies data_len afterwards).
1110  */
void *
hammer_alloc_record(hammer_mount_t hmp,
		    hammer_off_t *rec_offp, u_int8_t rec_type,
		    struct hammer_buffer **rec_bufferp,
		    int32_t data_len, void **datap,
		    struct hammer_buffer **data_bufferp, int *errorp)
{
	hammer_record_ondisk_t rec;
	hammer_off_t rec_offset;
	hammer_off_t data_offset;
	int32_t reclen;

	if (datap)
		*datap = NULL;

	/*
	 * Allocate the record
	 */
	rec_offset = hammer_blockmap_alloc(hmp, HAMMER_ZONE_RECORD_INDEX,
					   HAMMER_RECORD_SIZE, errorp);
	if (*errorp)
		return(NULL);

	/*
	 * Allocate data
	 */
	if (data_len) {
		if (data_bufferp == NULL) {
			/*
			 * In-band data: it lives inside the record itself,
			 * immediately after the fixed header portion.  Only
			 * certain record types support this.
			 */
			switch(rec_type) {
			case HAMMER_RECTYPE_DATA:
				reclen = offsetof(struct hammer_data_record,
						  data[0]);
				break;
			case HAMMER_RECTYPE_DIRENTRY:
				reclen = offsetof(struct hammer_entry_record,
						  name[0]);
				break;
			default:
				panic("hammer_alloc_record: illegal "
				      "in-band data");
				/* NOT REACHED */
				reclen = 0;
				break;
			}
			KKASSERT(reclen + data_len <= HAMMER_RECORD_SIZE);
			data_offset = rec_offset + reclen;
		} else if (data_len < HAMMER_BUFSIZE) {
			/* out-of-band, small-data zone */
			data_offset = hammer_blockmap_alloc(hmp,
						HAMMER_ZONE_SMALL_DATA_INDEX,
						data_len, errorp);
		} else {
			/* out-of-band, large-data zone */
			data_offset = hammer_blockmap_alloc(hmp,
						HAMMER_ZONE_LARGE_DATA_INDEX,
						data_len, errorp);
		}
	} else {
		data_offset = 0;
	}
	if (*errorp) {
		/* data allocation failed; release the record allocation */
		hammer_blockmap_free(hmp, rec_offset, HAMMER_RECORD_SIZE);
		return(NULL);
	}

	/*
	 * Basic return values.
	 */
	*rec_offp = rec_offset;
	rec = hammer_bread(hmp, rec_offset, errorp, rec_bufferp);
	KKASSERT(*errorp == 0);
	rec->base.data_off = data_offset;
	rec->base.data_len = data_len;
	hammer_modify_buffer(*rec_bufferp, NULL, 0);

	if (data_bufferp) {
		if (data_len) {
			/* map the out-of-band data for the caller */
			*datap = hammer_bread(hmp, data_offset, errorp,
					      data_bufferp);
			KKASSERT(*errorp == 0);
			hammer_modify_buffer(*data_bufferp, NULL, 0);
		} else {
			*datap = NULL;
		}
	} else if (data_len) {
		/* in-band data must fit entirely inside the record */
		KKASSERT(data_offset + data_len - rec_offset <=
			 HAMMER_RECORD_SIZE);
		if (datap) {
			*datap = (void *)((char *)rec +
					  (int32_t)(data_offset - rec_offset));
		}
	} else {
		KKASSERT(datap == NULL);
	}
	KKASSERT(*errorp == 0);
	return(rec);
}
1206
1207 /*
1208  * Generate an undo fifo entry and return the buffer to the caller (XXX).
 * The caller must create a dependency to ensure that the undo record is
1210  * flushed before the modified buffer is flushed.
1211  */
int
hammer_generate_undo(hammer_mount_t hmp, hammer_off_t off, void *base, int len)
{
	/* XXX currently a no-op stub; the fifo-based implementation below
	 * is disabled (#if 0) pending the undo subsystem rework. */
	return(0);
#if 0
	hammer_off_t rec_offset;
	hammer_fifo_undo_t undo;
	hammer_buffer_t buffer = NULL;
	int error;

	rec_offset = hammer_alloc_fifo(hmp, sizeof(*undo), len,
				       &buffer, HAMMER_HEAD_TYPE_UNDO,
				       0, NULL, &error);
	if (error == 0) {
		undo = (void *)((char *)buffer->ondisk +
				((int32_t)rec_offset & HAMMER_BUFMASK));
		undo->undo_offset = off;
		bcopy(base, undo + 1, len);
	}
	if (buffer)
		hammer_rel_buffer(buffer, 0);
	return(error);
#endif
}
1236
1237 #if 0
1238
1239 /*
1240  * Allocate space from the FIFO.  The first rec_len bytes will be zero'd.
1241  * The entire space is marked modified (the caller should not remark it as
1242  * that will cause unnecessary undo records to be added).
1243  */
static
hammer_off_t
hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len, int32_t data_len,
		  struct hammer_buffer **rec_bufferp, u_int16_t hdr_type,
		  int can_cross,
		  struct hammer_buffer **data2_bufferp, int *errorp)
{
	hammer_volume_t root_volume;
	hammer_volume_t end_volume;
	hammer_volume_ondisk_t ondisk;
	hammer_fifo_head_t head;
	hammer_fifo_tail_t tail;
	hammer_off_t end_off = 0;
	hammer_off_t tmp_off = 0;
	int32_t end_vol_no;
	int32_t tmp_vol_no;
	int32_t xoff;
	int32_t aligned_bytes;
	int must_pad;

	/* total footprint: record + data + tail, aligned up */
	aligned_bytes = (rec_len + data_len + HAMMER_TAIL_ONDISK_SIZE +
			 HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK;

	root_volume = hammer_get_root_volume(hmp, errorp);
	if (root_volume)
		hammer_modify_volume(root_volume, NULL, 0);

	/* the while is really a retry loop; normal exit is via break/done */
	while (root_volume) {
		ondisk = root_volume->ondisk;

		end_off = ondisk->vol0_fifo_end;
		end_vol_no = HAMMER_VOL_DECODE(end_off);

		end_volume = hammer_get_volume(hmp, end_vol_no, errorp);
		if (*errorp)
			goto done;

		/*
		 * Check to see if we ran out of space.  Include some extra
		 * room.
		 *
		 * vol0_fifo_end cannot be advanced into the same buffer
		 * that vol0_fifo_beg resides in.  This allows us to
		 * instantiate a new buffer without reading it in.
		 *
		 * XXX messy.
		 */
		tmp_off = ondisk->vol0_fifo_beg & ~HAMMER_BUFMASK64;
		tmp_vol_no = HAMMER_VOL_DECODE(tmp_off);
		if ((tmp_off & HAMMER_OFF_SHORT_MASK) == 0) {
			if (end_vol_no + 1 == tmp_vol_no) {
				tmp_vol_no = end_vol_no;
				tmp_off = end_volume->maxbuf_off;
			} else if (end_vol_no + 1 == hmp->nvolumes &&
				   tmp_vol_no == 0) {
				tmp_vol_no = end_vol_no;
				tmp_off = end_volume->maxbuf_off;
			}
		}
		hammer_rel_volume(end_volume, 0);

		/*
		 * XXX dummy head at end of fifo
		 */
		if (end_vol_no == tmp_vol_no &&
		    end_off < tmp_off &&
		    end_off + aligned_bytes + sizeof(*head) >= tmp_off) {
			*errorp = ENOSPC;
			goto done;
		}

		/* read mid-buffer positions; a fresh buffer needs no read */
		if ((int32_t)end_off & HAMMER_BUFMASK)
			head = hammer_bread(hmp, end_off, errorp, rec_bufferp);
		else
			head = hammer_bnew(hmp, end_off, errorp, rec_bufferp);
		if (*errorp)
			goto done;

		/*
		 * Load the buffer, retry if someone else squeaked in
		 * while we were blocked.
		 */

		if (ondisk->vol0_fifo_end != end_off)
			continue;

		/*
		 * Ok, we're gonna do something.  Modify the buffer
		 */
		hammer_modify_buffer(*rec_bufferp, NULL, 0);
		if (ondisk->vol0_fifo_end != end_off)
			continue;
		xoff = (int32_t)end_off & HAMMER_BUFMASK;

		/*
		 * The non-data portion of the fifo record cannot cross
		 * a buffer boundary.
		 *
		 * The entire record cannot cross a buffer boundary if
		 * can_cross is 0.
		 *
		 * The entire record cannot cover more than two whole buffers
		 * regardless.  Even if the data portion is 16K, this case
		 * can occur due to the addition of the fifo_tail.
		 *
		 * It is illegal for a record to cross a volume boundary.
		 *
		 * It is illegal for a record to cross a recovery boundary
		 * (this is so recovery code is guaranteed a record rather
		 * then data at certain points).
		 *
		 * Add a pad record and loop if it does.
		 */
		must_pad = 0;
		if (xoff + rec_len > HAMMER_BUFSIZE)
			must_pad = 1;
		if (can_cross == 0) {
			if (xoff + aligned_bytes > HAMMER_BUFSIZE)
				must_pad = 1;
		} else {
			if (xoff + aligned_bytes > HAMMER_BUFSIZE &&
			    (end_off + aligned_bytes) >=
			    (*rec_bufferp)->volume->maxbuf_off) {
				must_pad = 1;
			}
			if ((end_off ^ (end_off + aligned_bytes)) &
			    HAMMER_OFF_SHORT_REC_MASK) {
				must_pad = 1;
			}
			if (xoff + aligned_bytes - HAMMER_BUFSIZE >
			    HAMMER_BUFSIZE) {
				KKASSERT(xoff != 0);
				must_pad = 1;
			}
		}

		/*
		 * Pad to end of the buffer if necessary.  PADs can be
		 * squeezed into as little as 8 bytes (hence our alignment
		 * requirement).  The crc, reserved, and sequence number
		 * fields are not used, but initialize them anyway if there
		 * is enough room.
		 */
		if (must_pad) {
			xoff = HAMMER_BUFSIZE - xoff;
			head->hdr_signature = HAMMER_HEAD_SIGNATURE;
			head->hdr_type = HAMMER_HEAD_TYPE_PAD;
			head->hdr_size = xoff;
			if (xoff >= HAMMER_HEAD_ONDISK_SIZE +
				    HAMMER_TAIL_ONDISK_SIZE) {
				head->hdr_crc = 0;
				head->hdr_reserved02 = 0;
				head->hdr_seq = 0;
			}

			tail = (void *)((char *)head + xoff -
					HAMMER_TAIL_ONDISK_SIZE);
			if ((void *)head != (void *)tail) {
				tail->tail_signature = HAMMER_TAIL_SIGNATURE;
				tail->tail_type = HAMMER_HEAD_TYPE_PAD;
				tail->tail_size = xoff;
			}
			KKASSERT((xoff & HAMMER_HEAD_ALIGN_MASK) == 0);
			ondisk->vol0_fifo_end =
				hammer_advance_fifo((*rec_bufferp)->volume,
						    end_off, xoff);
			/* padded out this buffer; retry in the next one */
			continue;
		}

		if (xoff + aligned_bytes > HAMMER_BUFSIZE) {
			/* record crosses into a second buffer; map its tail */
			xoff = xoff + aligned_bytes - HAMMER_BUFSIZE;

			KKASSERT(xoff <= HAMMER_BUFSIZE);
			tail = hammer_bnew(hmp, end_off + aligned_bytes -
						HAMMER_TAIL_ONDISK_SIZE,
					   errorp, data2_bufferp);
			hammer_modify_buffer(*data2_bufferp, NULL, 0);
			if (*errorp)
				goto done;

			/*
			 * Retry if someone else appended to the fifo while
			 * we were blocked.
			 */
			if (ondisk->vol0_fifo_end != end_off)
				continue;
		} else {
			tail = (void *)((char *)head + aligned_bytes -
					HAMMER_TAIL_ONDISK_SIZE);
		}

		bzero(head, rec_len);
		head->hdr_signature = HAMMER_HEAD_SIGNATURE;
		head->hdr_type = hdr_type;
		head->hdr_size = aligned_bytes;
		head->hdr_crc = 0;
		head->hdr_seq = root_volume->ondisk->vol0_next_seq++;

		tail->tail_signature = HAMMER_TAIL_SIGNATURE;
		tail->tail_type = hdr_type;
		tail->tail_size = aligned_bytes;

		ondisk->vol0_fifo_end =
			hammer_advance_fifo((*rec_bufferp)->volume,
					    end_off, aligned_bytes);
done:
		hammer_rel_volume(root_volume, 0);
		break;
	}
	if (*errorp)
		end_off = 0;
	return(end_off);
}
1457
1458 /*
1459  * Mark a fifo record as having been freed.  XXX needs undo.
1460  */
void
hammer_free_fifo(hammer_mount_t hmp, hammer_off_t fifo_offset)
{
	hammer_buffer_t buffer = NULL;
	hammer_fifo_head_t head;
	int error;

	head = hammer_bread(hmp, fifo_offset, &error, &buffer);
	if (head) {
		/* only the hdr_type field is modified (sets the FREE flag) */
		hammer_modify_buffer(buffer, &head->hdr_type,
				     sizeof(head->hdr_type));
		head->hdr_type |= HAMMER_HEAD_FLAG_FREE;
	}
	if (buffer)
		hammer_rel_buffer(buffer, 0);
}
1477
1478 /*
1479  * Attempt to rewind the FIFO
1480  *
1481  * This routine is allowed to do nothing.
1482  */
void
hammer_unwind_fifo(hammer_mount_t hmp, hammer_off_t rec_offset)
{
	/* intentionally a no-op; see the comment above */
}
1487
1488 /*
1489  * Advance the FIFO a certain number of bytes.
1490  */
static
hammer_off_t
hammer_advance_fifo(hammer_volume_t volume, hammer_off_t off, int32_t bytes)
{
	int32_t vol_no;

	off += bytes;
	KKASSERT(off <= volume->maxbuf_off);
	KKASSERT((off & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
	/* wrap to the start of the next volume (circular across volumes) */
	if (off == volume->maxbuf_off) {
		vol_no = volume->vol_no + 1;
		if (vol_no == volume->hmp->nvolumes)
			vol_no = 0;
		off = HAMMER_ENCODE_RAW_BUFFER(vol_no, 0);
	}
	return(off);
}
1508 #endif
1509
1510 /*
1511  * Sync dirty buffers to the media
1512  */
1513
1514 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
1515 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
1516
1517 int
1518 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
1519 {
1520         struct hammer_sync_info info;
1521
1522         info.error = 0;
1523         info.waitfor = waitfor;
1524
1525         vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
1526                       hammer_sync_scan1, hammer_sync_scan2, &info);
1527
1528         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1529                 hammer_sync_volume, &info);
1530         return(info.error);
1531 }
1532
1533 static int
1534 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
1535 {
1536         struct hammer_inode *ip;
1537
1538         ip = VTOI(vp);
1539         if (vp->v_type == VNON || ip == NULL ||
1540             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1541              RB_EMPTY(&vp->v_rbdirty_tree))) {
1542                 return(-1);
1543         }
1544         return(0);
1545 }
1546
1547 static int
1548 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1549 {
1550         struct hammer_sync_info *info = data;
1551         struct hammer_inode *ip;
1552         int error;
1553
1554         ip = VTOI(vp);
1555         if (vp->v_type == VNON || vp->v_type == VBAD ||
1556             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1557              RB_EMPTY(&vp->v_rbdirty_tree))) {
1558                 return(0);
1559         }
1560         error = VOP_FSYNC(vp, info->waitfor);
1561         if (error)
1562                 info->error = error;
1563         return(0);
1564 }
1565
1566 int
1567 hammer_sync_volume(hammer_volume_t volume, void *data)
1568 {
1569         struct hammer_sync_info *info = data;
1570
1571         hammer_ref(&volume->io.lock);
1572         RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
1573                 hammer_sync_buffer, info);
1574         hammer_rel_volume(volume, 1);
1575         return(0);
1576 }
1577
int
hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
{
	/* ref then release with flush=1 to push the buffer to the media */
	hammer_ref(&buffer->io.lock);
	hammer_rel_buffer(buffer, 1);
	return(0);
}
1585
1586 #if 0
1587 /*
1588  * Generic buffer initialization.  Initialize the A-list into an all-allocated
1589  * state with the free block limit properly set.
1590  *
1591  * Note that alloc_new_buffer() will free the appropriate block range via
 * the appropriate cluster alist, so the free count is properly propagated.
1593  */
void
hammer_init_fifo(hammer_fifo_head_t head, u_int16_t type)
{
	/* initialize an empty fifo head of the given type */
	head->hdr_signature = HAMMER_HEAD_SIGNATURE;
	head->hdr_type = type;
	head->hdr_size = 0;
	head->hdr_crc = 0;
	head->hdr_seq = 0;
}
1603
1604 #endif
1605