HAMMER 28/many: Implement zoned blockmap
sys/vfs/hammer/hammer_ondisk.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_ondisk.c,v 1.29 2008/02/10 09:51:01 dillon Exp $
35  */
36 /*
37  * Manage HAMMER's on-disk structures.  These routines are primarily
38  * responsible for interfacing with the kernel's I/O subsystem and for
39  * managing in-memory structures.
40  */
41
42 #include "hammer.h"
43 #include <sys/fcntl.h>
44 #include <sys/nlookup.h>
45 #include <sys/buf.h>
46 #include <sys/buf2.h>
47
48 static void hammer_free_volume(hammer_volume_t volume);
49 static int hammer_load_volume(hammer_volume_t volume);
50 static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
51 static int hammer_load_node(hammer_node_t node);
52 #if 0
53 static hammer_off_t hammer_advance_fifo(hammer_volume_t volume,
54                 hammer_off_t off, int32_t bytes);
55
56 static hammer_off_t hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len,
57                 int32_t data_len, struct hammer_buffer **rec_bufferp,
58                 u_int16_t hdr_type, int can_cross, 
59                 struct hammer_buffer **data2_bufferp, int *errorp);
60 #endif
61
62 /*
63  * Red-Black tree support for various structures
64  */
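/*
 * Inodes are sorted by object id and then by their as-of transaction id
 * (obj_asof), so lookups can be qualified by both keys.
 */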
65 static int
66 hammer_ino_rb_compare(hammer_inode_t ip1, hammer_inode_t ip2)
67 {
68         if (ip1->obj_id < ip2->obj_id)
69                 return(-1);
70         if (ip1->obj_id > ip2->obj_id)
71                 return(1);
72         if (ip1->obj_asof < ip2->obj_asof)
73                 return(-1);
74         if (ip1->obj_asof > ip2->obj_asof)
75                 return(1);
76         return(0);
77 }
78
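/*
 * Compare a lookup info structure against an inode, using the same
 * (obj_id, obj_asof) ordering as hammer_ino_rb_compare().
 */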
79 static int
80 hammer_inode_info_cmp(hammer_inode_info_t info, hammer_inode_t ip)
81 {
82         if (info->obj_id < ip->obj_id)
83                 return(-1);
84         if (info->obj_id > ip->obj_id)
85                 return(1);
86         if (info->obj_asof < ip->obj_asof)
87                 return(-1);
88         if (info->obj_asof > ip->obj_asof)
89                 return(1);
90         return(0);
91 }
92
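/*
 * Volumes, buffers, and B-Tree nodes are keyed by vol_no, buf_offset,
 * and node_offset respectively in the comparators below.
 */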
93 static int
94 hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
95 {
96         if (vol1->vol_no < vol2->vol_no)
97                 return(-1);
98         if (vol1->vol_no > vol2->vol_no)
99                 return(1);
100         return(0);
101 }
102
103 static int
104 hammer_buf_rb_compare(hammer_buffer_t buf1, hammer_buffer_t buf2)
105 {
106         if (buf1->buf_offset < buf2->buf_offset)
107                 return(-1);
108         if (buf1->buf_offset > buf2->buf_offset)
109                 return(1);
110         return(0);
111 }
112
113 static int
114 hammer_nod_rb_compare(hammer_node_t node1, hammer_node_t node2)
115 {
116         if (node1->node_offset < node2->node_offset)
117                 return(-1);
118         if (node1->node_offset > node2->node_offset)
119                 return(1);
120         return(0);
121 }
122
123 /*
124  * Note: The lookup function for hammer_ino_rb_tree winds up being named
125  * hammer_ino_rb_tree_RB_LOOKUP_INFO(root, info).  The other lookup
126  * functions are normal, e.g. hammer_buf_rb_tree_RB_LOOKUP(root, buf_offset).
127  */
128 RB_GENERATE(hammer_ino_rb_tree, hammer_inode, rb_node, hammer_ino_rb_compare);
129 RB_GENERATE_XLOOKUP(hammer_ino_rb_tree, INFO, hammer_inode, rb_node,
130                 hammer_inode_info_cmp, hammer_inode_info_t);
131 RB_GENERATE2(hammer_vol_rb_tree, hammer_volume, rb_node,
132              hammer_vol_rb_compare, int32_t, vol_no);
133 RB_GENERATE2(hammer_buf_rb_tree, hammer_buffer, rb_node,
134              hammer_buf_rb_compare, hammer_off_t, buf_offset);
135 RB_GENERATE2(hammer_nod_rb_tree, hammer_node, rb_node,
136              hammer_nod_rb_compare, hammer_off_t, node_offset);
137
138 /************************************************************************
139  *                              VOLUMES                                 *
140  ************************************************************************
141  *
142  * Load a HAMMER volume by name.  Returns 0 on success or a positive error
143  * code on failure.  Volumes must be loaded at mount time; get_volume() will
144  * not load a new volume.
145  *
146  * Calls made to hammer_load_volume() are single-threaded.
147  */
148 int
149 hammer_install_volume(struct hammer_mount *hmp, const char *volname)
150 {
151         struct mount *mp;
152         hammer_volume_t volume;
153         struct hammer_volume_ondisk *ondisk;
154         struct nlookupdata nd;
155         struct buf *bp = NULL;
156         int error;
157         int ronly;
158
159         mp = hmp->mp;
160         ronly = ((mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
161
162         /*
163          * Allocate a volume structure
164          */
165         ++hammer_count_volumes;
166         volume = kmalloc(sizeof(*volume), M_HAMMER, M_WAITOK|M_ZERO);
167         volume->vol_name = kstrdup(volname, M_HAMMER);
168         volume->hmp = hmp;
169         hammer_io_init(&volume->io, HAMMER_STRUCTURE_VOLUME);
170         volume->io.offset = 0LL;
171
172         /*
173          * Get the device vnode
174          */
175         error = nlookup_init(&nd, volume->vol_name, UIO_SYSSPACE, NLC_FOLLOW);
176         if (error == 0)
177                 error = nlookup(&nd);
178         if (error == 0)
179                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &volume->devvp);
180         nlookup_done(&nd);
181         if (error == 0) {
182                 if (vn_isdisk(volume->devvp, &error)) {
183                         error = vfs_mountedon(volume->devvp);
184                 }
185         }
186         if (error == 0 &&
187             count_udev(volume->devvp->v_umajor, volume->devvp->v_uminor) > 0) {
188                 error = EBUSY;
189         }
190         if (error == 0) {
191                 vn_lock(volume->devvp, LK_EXCLUSIVE | LK_RETRY);
192                 error = vinvalbuf(volume->devvp, V_SAVE, 0, 0);
193                 if (error == 0) {
194                         error = VOP_OPEN(volume->devvp, 
195                                          (ronly ? FREAD : FREAD|FWRITE),
196                                          FSCRED, NULL);
197                 }
198                 vn_unlock(volume->devvp);
199         }
200         if (error) {
201                 hammer_free_volume(volume);
202                 return(error);
203         }
204         volume->devvp->v_rdev->si_mountpoint = mp;
205
206         /*
207          * Extract the volume number from the volume header and do various
208          * sanity checks.
209          */
210         error = bread(volume->devvp, 0LL, HAMMER_BUFSIZE, &bp);
211         if (error)
212                 goto late_failure;
213         ondisk = (void *)bp->b_data;
214         if (ondisk->vol_signature != HAMMER_FSBUF_VOLUME) {
215                 kprintf("hammer_mount: volume %s has an invalid header\n",
216                         volume->vol_name);
217                 error = EFTYPE;
218                 goto late_failure;
219         }
220         volume->vol_no = ondisk->vol_no;
221         volume->buffer_base = ondisk->vol_buf_beg;
222         volume->vol_flags = ondisk->vol_flags;
223         volume->nblocks = ondisk->vol_nblocks; 
224         volume->maxbuf_off = HAMMER_ENCODE_RAW_BUFFER(volume->vol_no,
225                                     ondisk->vol_buf_end - ondisk->vol_buf_beg);
226         RB_INIT(&volume->rb_bufs_root);
227
228         hmp->mp->mnt_stat.f_blocks += volume->nblocks;
229
230         if (RB_EMPTY(&hmp->rb_vols_root)) {
231                 hmp->fsid = ondisk->vol_fsid;
232         } else if (bcmp(&hmp->fsid, &ondisk->vol_fsid, sizeof(uuid_t))) {
233                 kprintf("hammer_mount: volume %s's fsid does not match "
234                         "other volumes\n", volume->vol_name);
235                 error = EFTYPE;
236                 goto late_failure;
237         }
238
239         /*
240          * Insert the volume structure into the red-black tree.
241          */
242         if (RB_INSERT(hammer_vol_rb_tree, &hmp->rb_vols_root, volume)) {
243                 kprintf("hammer_mount: volume %s has a duplicate vol_no %d\n",
244                         volume->vol_name, volume->vol_no);
245                 error = EEXIST;
246         }
247
248         /*
249          * Set the root volume.  HAMMER special-cases the rootvol structure.
250          * We do not hold a ref because this would prevent related I/O
251          * from being flushed.
252          */
253         if (error == 0 && ondisk->vol_rootvol == ondisk->vol_no) {
254                 hmp->rootvol = volume;
255                 if (bp) {
256                         brelse(bp);
257                         bp = NULL;
258                 }
259                 hmp->fsid_udev = dev2udev(vn_todev(volume->devvp));
260         }
261 late_failure:
262         if (bp)
263                 brelse(bp);
264         if (error) {
265                 /*vinvalbuf(volume->devvp, V_SAVE, 0, 0);*/
266                 VOP_CLOSE(volume->devvp, ronly ? FREAD : FREAD|FWRITE);
267                 hammer_free_volume(volume);
268         }
269         return (error);
270 }
271
272 /*
273  * Unload and free a HAMMER volume.  Must return >= 0 to continue the
274  * scan; -1 is returned on failure to abort the scan.
275  */
276 int
277 hammer_unload_volume(hammer_volume_t volume, void *data __unused)
278 {
279         struct hammer_mount *hmp = volume->hmp;
280         int ronly = ((hmp->mp->mnt_flag & MNT_RDONLY) ? 1 : 0);
281
282         /*
283          * Sync clusters, sync volume
284          */
285
286         hmp->mp->mnt_stat.f_blocks -= volume->nblocks;
287
288         /*
289          * Clean up the root volume pointer, which is held unlocked in hmp.
290          */
291         if (hmp->rootvol == volume)
292                 hmp->rootvol = NULL;
293
294         /*
295          * Unload and release all buffers attached to this volume, then
296          * wait for any dependent I/O to complete before continuing with
297          * the teardown.
298          */
299         RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
300                         hammer_unload_buffer, NULL);
301         hammer_io_waitdep(&volume->io);
302
303         /*
304          * Release our buffer and flush anything left in the buffer cache.
305          */
306         hammer_io_release(&volume->io, 2);
307
308         /*
309          * There should be no references remaining on the volume and no
310          * buffers left in its RB tree.
311          */
312         KKASSERT(volume->io.lock.refs == 0);
313         KKASSERT(RB_EMPTY(&volume->rb_bufs_root));
314
315         volume->ondisk = NULL;
316         if (volume->devvp) {
317                 if (ronly) {
318                         vinvalbuf(volume->devvp, 0, 0, 0);
319                         VOP_CLOSE(volume->devvp, FREAD);
320                 } else {
321                         vinvalbuf(volume->devvp, V_SAVE, 0, 0);
322                         VOP_CLOSE(volume->devvp, FREAD|FWRITE);
323                 }
324         }
325
326         /*
327          * Destroy the structure
328          */
329         RB_REMOVE(hammer_vol_rb_tree, &hmp->rb_vols_root, volume);
330         hammer_free_volume(volume);
331         return(0);
332 }
333
334 static
335 void
336 hammer_free_volume(hammer_volume_t volume)
337 {
338         if (volume->vol_name) {
339                 kfree(volume->vol_name, M_HAMMER);
340                 volume->vol_name = NULL;
341         }
342         if (volume->devvp) {
343                 if (vn_isdisk(volume->devvp, NULL) &&
344                     volume->devvp->v_rdev &&
345                     volume->devvp->v_rdev->si_mountpoint == volume->hmp->mp
346                 ) {
347                         volume->devvp->v_rdev->si_mountpoint = NULL;
348                 }
349                 vrele(volume->devvp);
350                 volume->devvp = NULL;
351         }
352         --hammer_count_volumes;
353         kfree(volume, M_HAMMER);
354 }
355
356 /*
357  * Get a HAMMER volume.  The volume must already exist.
358  */
359 hammer_volume_t
360 hammer_get_volume(struct hammer_mount *hmp, int32_t vol_no, int *errorp)
361 {
362         struct hammer_volume *volume;
363
364         /*
365          * Locate the volume structure
366          */
367         volume = RB_LOOKUP(hammer_vol_rb_tree, &hmp->rb_vols_root, vol_no);
368         if (volume == NULL) {
369                 *errorp = ENOENT;
370                 return(NULL);
371         }
372         hammer_ref(&volume->io.lock);
373
374         /*
375          * Deal with on-disk info
376          */
377         if (volume->ondisk == NULL || volume->io.loading) {
378                 *errorp = hammer_load_volume(volume);
379                 if (*errorp) {
380                         hammer_rel_volume(volume, 1);
381                         volume = NULL;
382                 }
383         } else {
384                 *errorp = 0;
385         }
386         return(volume);
387 }
388
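/*
 * Add a reference to an already-resident volume, loading its on-disk
 * structure if it is not already present.  Returns 0 on success or an
 * error code.
 */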
389 int
390 hammer_ref_volume(hammer_volume_t volume)
391 {
392         int error;
393
394         hammer_ref(&volume->io.lock);
395
396         /*
397          * Deal with on-disk info
398          */
399         if (volume->ondisk == NULL || volume->io.loading) {
400                 error = hammer_load_volume(volume);
401                 if (error)
402                         hammer_rel_volume(volume, 1);
403         } else {
404                 error = 0;
405         }
406         return (error);
407 }
408
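/*
 * Return a referenced root volume.  The root volume pointer is cached
 * unlocked in the mount structure and must already exist.
 */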
409 hammer_volume_t
410 hammer_get_root_volume(struct hammer_mount *hmp, int *errorp)
411 {
412         hammer_volume_t volume;
413
414         volume = hmp->rootvol;
415         KKASSERT(volume != NULL);
416         hammer_ref(&volume->io.lock);
417
418         /*
419          * Deal with on-disk info
420          */
421         if (volume->ondisk == NULL || volume->io.loading) {
422                 *errorp = hammer_load_volume(volume);
423                 if (*errorp) {
424                         hammer_rel_volume(volume, 1);
425                         volume = NULL;
426                 }
427         } else {
428                 *errorp = 0;
429         }
430         return (volume);
431 }
432
433 /*
434  * Load a volume's on-disk information.  The volume must be referenced and
435  * not locked.  We temporarily acquire an exclusive lock to interlock
436  * against releases or multiple gets.
437  */
438 static int
439 hammer_load_volume(hammer_volume_t volume)
440 {
441         struct hammer_volume_ondisk *ondisk;
442         int error;
443
444         hammer_lock_ex(&volume->io.lock);
445         KKASSERT(volume->io.loading == 0);
446         volume->io.loading = 1;
447
448         if (volume->ondisk == NULL) {
449                 error = hammer_io_read(volume->devvp, &volume->io);
450                 if (error) {
451                         volume->io.loading = 0;
452                         hammer_unlock(&volume->io.lock);
453                         return (error);
454                 }
455                 volume->ondisk = ondisk = (void *)volume->io.bp->b_data;
456         } else {
457                 error = 0;
458         }
459         volume->io.loading = 0;
460         hammer_unlock(&volume->io.lock);
461         return(0);
462 }
463
464 /*
465  * Release a volume.  Call hammer_io_release on the last reference.  We have
466  * to acquire an exclusive lock to interlock against volume->ondisk tests
467  * in hammer_load_volume(), and hammer_io_release() also expects an exclusive
468  * lock to be held.
469  *
470  * Volumes are not unloaded from memory during normal operation.
471  */
472 void
473 hammer_rel_volume(hammer_volume_t volume, int flush)
474 {
475         if (volume->io.lock.refs == 1) {
476                 hammer_lock_ex(&volume->io.lock);
477                 if (volume->io.lock.refs == 1) {
478                         volume->ondisk = NULL;
479                         hammer_io_release(&volume->io, flush);
480                 } else if (flush) {
481                         hammer_io_flush(&volume->io);
482                 }
483                 hammer_unlock(&volume->io.lock);
484         }
485         hammer_unref(&volume->io.lock);
486 }
487
488 /************************************************************************
489  *                              BUFFERS                                 *
490  ************************************************************************
491  *
492  * Manage buffers.  Currently all blockmap-backed zones are translated
493  * to zone-2 buffer offsets.
494  */
495 hammer_buffer_t
496 hammer_get_buffer(hammer_mount_t hmp, hammer_off_t buf_offset,
497                   int isnew, int *errorp)
498 {
499         hammer_buffer_t buffer;
500         hammer_volume_t volume;
501         int vol_no;
502         int zone;
503
504         zone = HAMMER_ZONE_DECODE(buf_offset);
505         if (zone > HAMMER_ZONE_RAW_BUFFER_INDEX) {
506                 buf_offset = hammer_blockmap_lookup(hmp, buf_offset, errorp);
507                 KKASSERT(*errorp == 0);
508         }
509         buf_offset &= ~HAMMER_BUFMASK64;
510         KKASSERT((buf_offset & HAMMER_ZONE_RAW_BUFFER) ==
511                  HAMMER_ZONE_RAW_BUFFER);
512         vol_no = HAMMER_VOL_DECODE(buf_offset);
513         volume = hammer_get_volume(hmp, vol_no, errorp);
514         if (volume == NULL)
515                 return(NULL);
516
517         /*
518          * NOTE: buf_offset and maxbuf_off are both full offset
519          * specifications.
520          */
521         KKASSERT(buf_offset < volume->maxbuf_off);
522
523         /*
524          * Locate and lock the buffer structure, creating one if necessary.
525          */
526 again:
527         buffer = RB_LOOKUP(hammer_buf_rb_tree, &volume->rb_bufs_root,
528                            buf_offset);
529         if (buffer == NULL) {
530                 ++hammer_count_buffers;
531                 buffer = kmalloc(sizeof(*buffer), M_HAMMER, M_WAITOK|M_ZERO);
532                 buffer->buf_offset = buf_offset;
533                 buffer->volume = volume;
534                 hammer_io_init(&buffer->io, HAMMER_STRUCTURE_BUFFER);
535                 buffer->io.offset = volume->ondisk->vol_buf_beg +
536                                     (buf_offset & HAMMER_OFF_SHORT_MASK);
537                 TAILQ_INIT(&buffer->clist);
538                 hammer_ref(&buffer->io.lock);
539
540                 /*
541                  * Insert the buffer into the RB tree and handle late
542                  * collisions.
543                  */
544                 if (RB_INSERT(hammer_buf_rb_tree, &volume->rb_bufs_root, buffer)) {
545                         hammer_unref(&buffer->io.lock);
546                         --hammer_count_buffers;
547                         kfree(buffer, M_HAMMER);
548                         goto again;
549                 }
550                 hammer_ref(&volume->io.lock);
551         } else {
552                 hammer_ref(&buffer->io.lock);
553         }
554
555         /*
556          * Deal with on-disk info
557          */
558         if (buffer->ondisk == NULL || buffer->io.loading) {
559                 *errorp = hammer_load_buffer(buffer, isnew);
560                 if (*errorp) {
561                         hammer_rel_buffer(buffer, 1);
562                         buffer = NULL;
563                 }
564         } else {
565                 *errorp = 0;
566         }
567         hammer_rel_volume(volume, 0);
568         return(buffer);
569 }
570
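/*
 * Load a buffer's on-disk data.  If isnew is set a new buffer cache
 * buffer is instantiated instead of being read from the media.  An
 * exclusive lock interlocks concurrent loads of the same buffer.
 */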
571 static int
572 hammer_load_buffer(hammer_buffer_t buffer, int isnew)
573 {
574         hammer_volume_t volume;
575         void *ondisk;
576         int error;
577
578         /*
579          * Load the buffer's on-disk info
580          */
581         volume = buffer->volume;
582         hammer_lock_ex(&buffer->io.lock);
583         KKASSERT(buffer->io.loading == 0);
584         buffer->io.loading = 1;
585
586         if (buffer->ondisk == NULL) {
587                 if (isnew) {
588                         error = hammer_io_new(volume->devvp, &buffer->io);
589                 } else {
590                         error = hammer_io_read(volume->devvp, &buffer->io);
591                 }
592                 if (error) {
593                         buffer->io.loading = 0;
594                         hammer_unlock(&buffer->io.lock);
595                         return (error);
596                 }
597                 buffer->ondisk = ondisk = (void *)buffer->io.bp->b_data;
598         } else if (isnew) {
599                 error = hammer_io_new(volume->devvp, &buffer->io);
600         } else {
601                 error = 0;
602         }
603         if (error == 0 && isnew) {
604                 hammer_modify_buffer(buffer, NULL, 0);
605                 /* additional initialization goes here */
606         }
607         buffer->io.loading = 0;
608         hammer_unlock(&buffer->io.lock);
609         return (error);
610 }
611
612 /*
613  * NOTE: Called from RB_SCAN, must return >= 0 for scan to continue.
614  */
615 int
616 hammer_unload_buffer(hammer_buffer_t buffer, void *data __unused)
617 {
618         hammer_ref(&buffer->io.lock);
619         hammer_flush_buffer_nodes(buffer);
620         KKASSERT(buffer->io.lock.refs == 1);
621         hammer_rel_buffer(buffer, 2);
622         return(0);
623 }
624
625 /*
626  * Reference a buffer that is either already referenced or is reached via
627  * a specially handled pointer (aka cursor->buffer).
628  */
629 int
630 hammer_ref_buffer(hammer_buffer_t buffer)
631 {
632         int error;
633
634         hammer_ref(&buffer->io.lock);
635         if (buffer->ondisk == NULL || buffer->io.loading) {
636                 error = hammer_load_buffer(buffer, 0);
637                 if (error) {
638                         hammer_rel_buffer(buffer, 1);
639                         /*
640                          * NOTE: buffer pointer can become stale after
641                          * the above release.
642                          */
643                 }
644         } else {
645                 error = 0;
646         }
647         return(error);
648 }
649
650 /*
651  * Release a buffer.  We have to deal with several places where
652  * another thread can ref the buffer.
653  *
654  * Only destroy the structure itself if the related buffer cache buffer
655  * was disassociated from it.  This ties the management of the structure
656  * to the buffer cache subsystem.  buffer->ondisk determines whether the
657  * embedded io is referenced or not.
658  */
659 void
660 hammer_rel_buffer(hammer_buffer_t buffer, int flush)
661 {
662         hammer_volume_t volume;
663
664         if (buffer->io.lock.refs == 1) {
665                 hammer_lock_ex(&buffer->io.lock);
666                 if (buffer->io.lock.refs == 1) {
667                         hammer_io_release(&buffer->io, flush);
668
669                         if (buffer->io.bp == NULL &&
670                             buffer->io.lock.refs == 1) {
671                                 hammer_flush_buffer_nodes(buffer);
672                                 KKASSERT(TAILQ_EMPTY(&buffer->clist));
673                                 volume = buffer->volume;
674                                 RB_REMOVE(hammer_buf_rb_tree,
675                                           &volume->rb_bufs_root, buffer);
676                                 buffer->volume = NULL; /* sanity */
677                                 --hammer_count_buffers;
678                                 kfree(buffer, M_HAMMER);
679                                 hammer_rel_volume(volume, 0);
680                                 return;
681                         }
682                 } else if (flush) {
683                         hammer_io_flush(&buffer->io);
684                 }
685                 hammer_unlock(&buffer->io.lock);
686         }
687         hammer_unref(&buffer->io.lock);
688 }
689
690 /*
691  * Access the filesystem buffer containing the specified hammer offset.
692  * buf_offset is a conglomeration of the volume number and vol_buf_beg
693  * relative buffer offset.  It must also have bit 55 set to be valid.
694  * (see hammer_off_t in hammer_disk.h).
695  *
696  * Any prior buffer in *bufferp will be released and replaced by the
697  * requested buffer.
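 *
 * A typical calling pattern (see e.g. hammer_free_fifo() below): the
 * caller passes a hammer_buffer_t initialized to NULL and releases it
 * when finished:
 *
 *	hammer_buffer_t buffer = NULL;
 *
 *	ptr = hammer_bread(hmp, buf_offset, &error, &buffer);
 *	if (ptr) {
 *		... use the returned data ...
 *	}
 *	if (buffer)
 *		hammer_rel_buffer(buffer, 0);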
698  */
699 void *
700 hammer_bread(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp, 
701              struct hammer_buffer **bufferp)
702 {
703         hammer_buffer_t buffer;
704         int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
705
706         buf_offset &= ~HAMMER_BUFMASK64;
707
708         buffer = *bufferp;
709         if (buffer == NULL || buffer->buf_offset != buf_offset) {
710                 if (buffer)
711                         hammer_rel_buffer(buffer, 0);
712                 buffer = hammer_get_buffer(hmp, buf_offset, 0, errorp);
713                 *bufferp = buffer;
714         } else {
715                 *errorp = 0;
716         }
717
718         /*
719          * Return a pointer to the buffer data.
720          */
721         if (buffer == NULL)
722                 return(NULL);
723         else
724                 return((char *)buffer->ondisk + xoff);
725 }
726
727 /*
728  * Access the filesystem buffer containing the specified hammer offset.
729  * No disk read operation occurs.  The result buffer may contain garbage.
730  *
731  * Any prior buffer in *bufferp will be released and replaced by the
732  * requested buffer.
733  */
734 void *
735 hammer_bnew(hammer_mount_t hmp, hammer_off_t buf_offset, int *errorp, 
736              struct hammer_buffer **bufferp)
737 {
738         hammer_buffer_t buffer;
739         int32_t xoff = (int32_t)buf_offset & HAMMER_BUFMASK;
740
741         buf_offset &= ~HAMMER_BUFMASK64;
742
743         buffer = *bufferp;
744         if (buffer == NULL || buffer->buf_offset != buf_offset) {
745                 if (buffer)
746                         hammer_rel_buffer(buffer, 0);
747                 buffer = hammer_get_buffer(hmp, buf_offset, 1, errorp);
748                 *bufferp = buffer;
749         } else {
750                 *errorp = 0;
751         }
752
753         /*
754          * Return a pointer to the buffer data.
755          */
756         if (buffer == NULL)
757                 return(NULL);
758         else
759                 return((char *)buffer->ondisk + xoff);
760 }
761
762 /************************************************************************
763  *                              NODES                                   *
764  ************************************************************************
765  *
766  * Manage B-Tree nodes.  B-Tree nodes represent the primary indexing
767  * method used by the HAMMER filesystem.
768  *
769  * Unlike other HAMMER structures, a hammer_node can be PASSIVELY
770  * associated with its buffer, and will only reference the buffer while
771  * the node itself is referenced.
772  *
773  * A hammer_node can also be passively associated with other HAMMER
774  * structures, such as inodes, while retaining 0 references.  These
775  * associations can be cleared backwards using a pointer-to-pointer in
776  * the hammer_node.
777  *
778  * This allows the HAMMER implementation to cache hammer_nodes long-term
779  * and short-cut a great deal of the infrastructure's complexity.  In
780  * most cases a cached node can be reacquired without having to dip into
781  * either the buffer or cluster management code.
782  *
783  * The caller receives a referenced node on return and is responsible
784  * for releasing that reference.  The node will acquire its own
785  * additional references, if necessary.
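 *
 * A consumer may passively cache a node with hammer_cache_node(node,
 * &cache) and later reacquire it via hammer_ref_node_safe(hmp, &cache,
 * &error); hammer_ref_node_safe() returns ENOENT if the cached pointer
 * has since been cleared.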
786  */
787 hammer_node_t
788 hammer_get_node(hammer_mount_t hmp, hammer_off_t node_offset, int *errorp)
789 {
790         hammer_node_t node;
791
792         KKASSERT((node_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_BTREE);
793
794         /*
795          * Locate the structure, allocating one if necessary.
796          */
797 again:
798         node = RB_LOOKUP(hammer_nod_rb_tree, &hmp->rb_nods_root, node_offset);
799         if (node == NULL) {
800                 ++hammer_count_nodes;
801                 node = kmalloc(sizeof(*node), M_HAMMER, M_WAITOK|M_ZERO);
802                 node->node_offset = node_offset;
803                 node->hmp = hmp;
804                 if (RB_INSERT(hammer_nod_rb_tree, &hmp->rb_nods_root, node)) {
805                         --hammer_count_nodes;
806                         kfree(node, M_HAMMER);
807                         goto again;
808                 }
809         }
810         hammer_ref(&node->lock);
811         *errorp = hammer_load_node(node);
812         if (*errorp) {
813                 hammer_rel_node(node);
814                 node = NULL;
815         }
816         return(node);
817 }
818
819 /*
820  * Reference an already-referenced node.
821  */
822 int
823 hammer_ref_node(hammer_node_t node)
824 {
825         int error;
826
827         KKASSERT(node->lock.refs > 0);
828         hammer_ref(&node->lock);
829         if ((error = hammer_load_node(node)) != 0)
830                 hammer_rel_node(node);
831         return(error);
832 }
833
834 /*
835  * Load a node's on-disk data reference.
836  */
837 static int
838 hammer_load_node(hammer_node_t node)
839 {
840         hammer_buffer_t buffer;
841         int error;
842
843         if (node->ondisk)
844                 return(0);
845         error = 0;
846         hammer_lock_ex(&node->lock);
847         if (node->ondisk == NULL) {
848                 /*
849                  * This is a little confusing but the gist is that
850                  * node->buffer determines whether the node is on
851                  * the buffer's clist and node->ondisk determines
852                  * whether the buffer is referenced.
853                  */
854                 if ((buffer = node->buffer) != NULL) {
855                         error = hammer_ref_buffer(buffer);
856                 } else {
857                         buffer = hammer_get_buffer(node->hmp,
858                                                    node->node_offset, 0,
859                                                    &error);
860                         if (buffer) {
861                                 KKASSERT(error == 0);
862                                 TAILQ_INSERT_TAIL(&buffer->clist,
863                                                   node, entry);
864                                 node->buffer = buffer;
865                         }
866                 }
867                 if (error == 0) {
868                         node->ondisk = (void *)((char *)buffer->ondisk +
869                                (node->node_offset & HAMMER_BUFMASK));
870                 }
871         }
872         hammer_unlock(&node->lock);
873         return (error);
874 }
875
876 /*
877  * Safely reference a node, interlock against flushes via the IO subsystem.
878  */
879 hammer_node_t
880 hammer_ref_node_safe(struct hammer_mount *hmp, struct hammer_node **cache,
881                      int *errorp)
882 {
883         hammer_node_t node;
884
885         if ((node = *cache) != NULL)
886                 hammer_ref(&node->lock);
887         if (node) {
888                 *errorp = hammer_load_node(node);
889                 if (*errorp) {
890                         hammer_rel_node(node);
891                         node = NULL;
892                 }
893         } else {
894                 *errorp = ENOENT;
895         }
896         return(node);
897 }
898
899 /*
900  * Release a hammer_node.  On the last release the node dereferences
901  * its underlying buffer and may or may not be destroyed.
902  */
903 void
904 hammer_rel_node(hammer_node_t node)
905 {
906         hammer_buffer_t buffer;
907
908         /*
909          * If this isn't the last ref just decrement the ref count and
910          * return.
911          */
912         if (node->lock.refs > 1) {
913                 hammer_unref(&node->lock);
914                 return;
915         }
916
917         /*
918          * If there is no ondisk info (or no buffer) the node failed to
919          * load; remove the last reference and destroy the node.
920          */
921         if (node->ondisk == NULL) {
922                 hammer_unref(&node->lock);
923                 hammer_flush_node(node);
924                 /* node is stale now */
925                 return;
926         }
927
928         /*
929          * Do final cleanups and then either destroy the node or leave it
930          * passively cached.  The buffer reference is removed regardless.
931          */
932         buffer = node->buffer;
933         node->ondisk = NULL;
934
935         if ((node->flags & (HAMMER_NODE_DELETED|HAMMER_NODE_FLUSH)) == 0) {
936                 hammer_unref(&node->lock);
937                 hammer_rel_buffer(buffer, 0);
938                 return;
939         }
940
941         /*
942          * Destroy the node if it has been marked for deletion.  We mark
943          * it as being free.  Note that the disk space is physically
944          * freed when the fifo cycles back through the node.
945          */
946         if (node->flags & HAMMER_NODE_DELETED) {
947                 hammer_blockmap_free(node->hmp, node->node_offset,
948                                      sizeof(*node->ondisk));
949         }
950
951         /*
952          * Destroy the node.  Record pertinent data because the node
953          * becomes stale the instant we flush it.
954          */
955         hammer_unref(&node->lock);
956         hammer_flush_node(node);
957         /* node is stale */
958         hammer_rel_buffer(buffer, 0);
959 }
960
961 /*
962  * Passively cache a referenced hammer_node in *cache.  The caller may
963  * release the node on return.
964  */
965 void
966 hammer_cache_node(hammer_node_t node, struct hammer_node **cache)
967 {
968         hammer_node_t old;
969
970         /*
971          * If the node is being deleted, don't cache it!
972          */
973         if (node->flags & HAMMER_NODE_DELETED)
974                 return;
975
976         /*
977          * Cache the node.  If we previously cached a different node we
978          * have to give HAMMER a chance to destroy it.
979          */
980 again:
981         if (node->cache1 != cache) {
982                 if (node->cache2 != cache) {
983                         if ((old = *cache) != NULL) {
984                                 KKASSERT(node->lock.refs != 0);
985                                 hammer_uncache_node(cache);
986                                 goto again;
987                         }
988                         if (node->cache2)
989                                 *node->cache2 = NULL;
990                         node->cache2 = node->cache1;
991                         node->cache1 = cache;
992                         *cache = node;
993                 } else {
994                         struct hammer_node **tmp;
995                         tmp = node->cache1;
996                         node->cache1 = node->cache2;
997                         node->cache2 = tmp;
998                 }
999         }
1000 }
1001
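/*
 * Remove a passive cache reference previously installed by
 * hammer_cache_node().  If the node is no longer cached anywhere it is
 * handed to hammer_flush_node() for possible destruction.
 */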
1002 void
1003 hammer_uncache_node(struct hammer_node **cache)
1004 {
1005         hammer_node_t node;
1006
1007         if ((node = *cache) != NULL) {
1008                 *cache = NULL;
1009                 if (node->cache1 == cache) {
1010                         node->cache1 = node->cache2;
1011                         node->cache2 = NULL;
1012                 } else if (node->cache2 == cache) {
1013                         node->cache2 = NULL;
1014                 } else {
1015                         panic("hammer_uncache_node: missing cache linkage");
1016                 }
1017                 if (node->cache1 == NULL && node->cache2 == NULL)
1018                         hammer_flush_node(node);
1019         }
1020 }
1021
1022 /*
1023  * Remove a node's cache references and destroy the node if it has no
1024  * other references or backing store.
1025  */
1026 void
1027 hammer_flush_node(hammer_node_t node)
1028 {
1029         hammer_buffer_t buffer;
1030
1031         if (node->cache1)
1032                 *node->cache1 = NULL;
1033         if (node->cache2)
1034                 *node->cache2 = NULL;
1035         if (node->lock.refs == 0 && node->ondisk == NULL) {
1036                 RB_REMOVE(hammer_nod_rb_tree, &node->hmp->rb_nods_root, node);
1037                 if ((buffer = node->buffer) != NULL) {
1038                         node->buffer = NULL;
1039                         TAILQ_REMOVE(&buffer->clist, node, entry);
1040                         /* buffer is unreferenced because ondisk is NULL */
1041                 }
1042                 --hammer_count_nodes;
1043                 kfree(node, M_HAMMER);
1044         }
1045 }
1046
1047 /*
1048  * Flush passively cached B-Tree nodes associated with this buffer.
1049  * This is only called when the buffer is about to be destroyed, so
1050  * none of the nodes should have any references.
1051  */
1052 void
1053 hammer_flush_buffer_nodes(hammer_buffer_t buffer)
1054 {
1055         hammer_node_t node;
1056
1057         while ((node = TAILQ_FIRST(&buffer->clist)) != NULL) {
1058                 KKASSERT(node->lock.refs == 0 && node->ondisk == NULL);
1059                 hammer_ref(&node->lock);
1060                 node->flags |= HAMMER_NODE_FLUSH;
1061                 hammer_rel_node(node);
1062         }
1063 }
1064
1065
1066 /************************************************************************
1067  *                              ALLOCATORS                              *
1068  ************************************************************************/
1069
1070 /*
1071  * Allocate a B-Tree node.
1072  */
1073 hammer_node_t
1074 hammer_alloc_btree(hammer_mount_t hmp, int *errorp)
1075 {
1076         hammer_buffer_t buffer = NULL;
1077         hammer_node_t node = NULL;
1078         hammer_off_t node_offset;
1079
1080         node_offset = hammer_blockmap_alloc(hmp, HAMMER_ZONE_BTREE_INDEX,
1081                                             sizeof(struct hammer_node_ondisk),
1082                                             errorp);
1083         if (*errorp == 0) {
1084                 node = hammer_get_node(hmp, node_offset, errorp);
1085                 hammer_modify_node(node);
1086                 bzero(node->ondisk, sizeof(*node->ondisk));
1087         }
1088         if (buffer)
1089                 hammer_rel_buffer(buffer, 0);
1090         return(node);
1091 }
1092
1093 /*
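 * Allocate an on-disk record, and optionally data space for it, and
 * return a pointer to the record.  *rec_offp receives the record's
 * blockmap offset.
 *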
1094  * The returned buffers are already appropriately marked as being modified.
1095  * If the caller marks them again, unnecessary undo records may be generated.
1096  *
1097  * In-band data is indicated by data_bufferp == NULL.  Pass a data_len of 0
1098  * for zero-fill (the caller modifies data_len afterwards).
1099  */
1100 void *
1101 hammer_alloc_record(hammer_mount_t hmp, 
1102                     hammer_off_t *rec_offp, u_int8_t rec_type, 
1103                     struct hammer_buffer **rec_bufferp,
1104                     int32_t data_len, void **datap,
1105                     struct hammer_buffer **data_bufferp, int *errorp)
1106 {
1107         hammer_record_ondisk_t rec;
1108         hammer_off_t rec_offset;
1109         hammer_off_t data_offset;
1110         int32_t reclen;
1111
1112         if (datap)
1113                 *datap = NULL;
1114
1115         /*
1116          * Allocate the record
1117          */
1118         rec_offset = hammer_blockmap_alloc(hmp, HAMMER_ZONE_RECORD_INDEX,
1119                                            HAMMER_RECORD_SIZE, errorp);
1120         if (*errorp)
1121                 return(NULL);
1122
1123         /*
1124          * Allocate data
1125          */
1126         if (data_len) {
1127                 if (data_bufferp == NULL) {
1128                         switch(rec_type) {
1129                         case HAMMER_RECTYPE_DATA:
1130                                 reclen = offsetof(struct hammer_data_record,
1131                                                   data[0]);
1132                                 break;
1133                         case HAMMER_RECTYPE_DIRENTRY:
1134                                 reclen = offsetof(struct hammer_entry_record,
1135                                                   name[0]);
1136                                 break;
1137                         default:
1138                                 panic("hammer_alloc_record: illegal "
1139                                       "in-band data");
1140                                 /* NOT REACHED */
1141                                 reclen = 0;
1142                                 break;
1143                         }
1144                         KKASSERT(reclen + data_len <= HAMMER_RECORD_SIZE);
1145                         data_offset = rec_offset + reclen;
1146                 } else if (data_len < HAMMER_BUFSIZE) {
1147                         data_offset = hammer_blockmap_alloc(hmp,
1148                                                 HAMMER_ZONE_SMALL_DATA_INDEX,
1149                                                 data_len, errorp);
1150                 } else {
1151                         data_offset = hammer_blockmap_alloc(hmp,
1152                                                 HAMMER_ZONE_LARGE_DATA_INDEX,
1153                                                 data_len, errorp);
1154                 }
1155         } else {
1156                 data_offset = 0;
1157         }
1158         if (*errorp) {
1159                 hammer_blockmap_free(hmp, rec_offset, HAMMER_RECORD_SIZE);
1160                 return(NULL);
1161         }
1162
1163         /*
1164          * Basic return values.
1165          */
1166         *rec_offp = rec_offset;
1167         rec = hammer_bread(hmp, rec_offset, errorp, rec_bufferp);
1168         KKASSERT(*errorp == 0);
1169         rec->base.data_off = data_offset;
1170         rec->base.data_len = data_len;
1171         hammer_modify_buffer(*rec_bufferp, NULL, 0);
1172
1173         if (data_bufferp) {
1174                 if (data_len) {
1175                         *datap = hammer_bread(hmp, data_offset, errorp,
1176                                               data_bufferp);
1177                         KKASSERT(*errorp == 0);
1178                         hammer_modify_buffer(*data_bufferp, NULL, 0);
1179                 } else {
1180                         *datap = NULL;
1181                 }
1182         } else if (data_len) {
1183                 KKASSERT(data_offset + data_len - rec_offset <=
1184                          HAMMER_RECORD_SIZE); 
1185                 if (datap) {
1186                         *datap = (void *)((char *)rec +
1187                                           (int32_t)(data_offset - rec_offset));
1188                 }
1189         } else {
1190                 KKASSERT(datap == NULL);
1191         }
1192         KKASSERT(*errorp == 0);
1193         return(rec);
1194 }
1195
1196 /*
1197  * Generate an undo fifo entry and return the buffer to the caller (XXX).
1198  * The caller must create a dependency to ensure that the undo record is
1199  * flushed before the modified buffer is flushed.
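 *
 * (This is currently a no-op stub; the FIFO-based implementation below
 * is disabled.)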
1200  */
1201 int
1202 hammer_generate_undo(hammer_mount_t hmp, hammer_off_t off, void *base, int len)
1203 {
1204         return(0);
1205 #if 0
1206         hammer_off_t rec_offset;
1207         hammer_fifo_undo_t undo;
1208         hammer_buffer_t buffer = NULL;
1209         int error;
1210
1211         rec_offset = hammer_alloc_fifo(hmp, sizeof(*undo), len,
1212                                        &buffer, HAMMER_HEAD_TYPE_UNDO,
1213                                        0, NULL, &error);
1214         if (error == 0) {
1215                 undo = (void *)((char *)buffer->ondisk + 
1216                                 ((int32_t)rec_offset & HAMMER_BUFMASK));
1217                 undo->undo_offset = off;
1218                 bcopy(base, undo + 1, len);
1219         }
1220         if (buffer)
1221                 hammer_rel_buffer(buffer, 0);
1222         return(error);
1223 #endif
1224 }
1225
1226 #if 0
1227
1228 /*
1229  * Allocate space from the FIFO.  The first rec_len bytes will be zeroed.
1230  * The entire space is marked modified (the caller should not re-mark it,
1231  * as that will cause unnecessary undo records to be added).
1232  */
1233 static
1234 hammer_off_t
1235 hammer_alloc_fifo(hammer_mount_t hmp, int32_t rec_len, int32_t data_len,
1236                   struct hammer_buffer **rec_bufferp, u_int16_t hdr_type,
1237                   int can_cross, 
1238                   struct hammer_buffer **data2_bufferp, int *errorp)
1239 {
1240         hammer_volume_t root_volume;
1241         hammer_volume_t end_volume;
1242         hammer_volume_ondisk_t ondisk;
1243         hammer_fifo_head_t head;
1244         hammer_fifo_tail_t tail;
1245         hammer_off_t end_off = 0;
1246         hammer_off_t tmp_off = 0;
1247         int32_t end_vol_no;
1248         int32_t tmp_vol_no;
1249         int32_t xoff;
1250         int32_t aligned_bytes;
1251         int must_pad;
1252
1253         aligned_bytes = (rec_len + data_len + HAMMER_TAIL_ONDISK_SIZE +
1254                          HAMMER_HEAD_ALIGN_MASK) & ~HAMMER_HEAD_ALIGN_MASK;
1255
1256         root_volume = hammer_get_root_volume(hmp, errorp);
1257         if (root_volume)
1258                 hammer_modify_volume(root_volume, NULL, 0);
1259
1260         while (root_volume) {
1261                 ondisk = root_volume->ondisk;
1262
1263                 end_off = ondisk->vol0_fifo_end;
1264                 end_vol_no = HAMMER_VOL_DECODE(end_off);
1265
1266                 end_volume = hammer_get_volume(hmp, end_vol_no, errorp);
1267                 if (*errorp)
1268                         goto done;
1269
1270                 /*
1271                  * Check to see if we ran out of space.  Include some extra
1272                  * room.
1273                  *
1274                  * vol0_fifo_end cannot be advanced into the same buffer
1275                  * that vol0_fifo_beg resides in.  This allows us to
1276                  * instantiate a new buffer without reading it in.
1277                  *
1278                  * XXX messy.
1279                  */
1280                 tmp_off = ondisk->vol0_fifo_beg & ~HAMMER_BUFMASK64;
1281                 tmp_vol_no = HAMMER_VOL_DECODE(tmp_off);
1282                 if ((tmp_off & HAMMER_OFF_SHORT_MASK) == 0) {
1283                         if (end_vol_no + 1 == tmp_vol_no) {
1284                                 tmp_vol_no = end_vol_no;
1285                                 tmp_off = end_volume->maxbuf_off;
1286                         } else if (end_vol_no + 1 == hmp->nvolumes &&
1287                                    tmp_vol_no == 0) {
1288                                 tmp_vol_no = end_vol_no;
1289                                 tmp_off = end_volume->maxbuf_off;
1290                         }
1291                 }
1292                 hammer_rel_volume(end_volume, 0);
1293
1294                 /*
1295                  * XXX dummy head at end of fifo
1296                  */
1297                 if (end_vol_no == tmp_vol_no &&
1298                     end_off < tmp_off &&
1299                     end_off + aligned_bytes + sizeof(*head) >= tmp_off) {
1300                         *errorp = ENOSPC;
1301                         goto done;
1302                 }
1303
1304                 if ((int32_t)end_off & HAMMER_BUFMASK)
1305                         head = hammer_bread(hmp, end_off, errorp, rec_bufferp);
1306                 else
1307                         head = hammer_bnew(hmp, end_off, errorp, rec_bufferp);
1308                 if (*errorp)
1309                         goto done;
1310
1311                 /*
1312                  * Load the buffer; retry if someone else snuck in
1313                  * while we were blocked.
1314                  */
1315
1316                 if (ondisk->vol0_fifo_end != end_off)
1317                         continue;
1318
1319                 /*
1320                  * Ok, we are going to do something.  Mark the buffer modified.
1321                  */
1322                 hammer_modify_buffer(*rec_bufferp, NULL, 0);
1323                 if (ondisk->vol0_fifo_end != end_off)
1324                         continue;
1325                 xoff = (int32_t)end_off & HAMMER_BUFMASK;
1326
1327                 /*
1328                  * The non-data portion of the fifo record cannot cross
1329                  * a buffer boundary.
1330                  *
1331                  * The entire record cannot cross a buffer boundary if
1332                  * can_cross is 0.
1333                  *
1334                  * The entire record cannot cover more than two whole buffers
1335                  * regardless.  Even if the data portion is 16K, this case
1336                  * can occur due to the addition of the fifo_tail.
1337                  *
1338                  * It is illegal for a record to cross a volume boundary.
1339                  *
1340                  * It is illegal for a record to cross a recovery boundary
1341                  * (this is so the recovery code is guaranteed a record rather
1342                  * than data at certain points).
1343                  *
1344                  * Add a pad record and loop if it does.
1345                  */
1346                 must_pad = 0;
1347                 if (xoff + rec_len > HAMMER_BUFSIZE)
1348                         must_pad = 1;
1349                 if (can_cross == 0) {
1350                         if (xoff + aligned_bytes > HAMMER_BUFSIZE)
1351                                 must_pad = 1;
1352                 } else {
1353                         if (xoff + aligned_bytes > HAMMER_BUFSIZE &&
1354                             (end_off + aligned_bytes) >=
1355                             (*rec_bufferp)->volume->maxbuf_off) {
1356                                 must_pad = 1;
1357                         }
1358                         if ((end_off ^ (end_off + aligned_bytes)) &
1359                             HAMMER_OFF_SHORT_REC_MASK) {
1360                                 must_pad = 1;
1361                         }
1362                         if (xoff + aligned_bytes - HAMMER_BUFSIZE >
1363                             HAMMER_BUFSIZE) {
1364                                 KKASSERT(xoff != 0);
1365                                 must_pad = 1;
1366                         }
1367                 }
1368
1369                 /*
1370                  * Pad to end of the buffer if necessary.  PADs can be
1371                  * squeezed into as little as 8 bytes (hence our alignment
1372                  * requirement).  The crc, reserved, and sequence number
1373                  * fields are not used, but initialize them anyway if there
1374                  * is enough room.
1375                  */
1376                 if (must_pad) {
1377                         xoff = HAMMER_BUFSIZE - xoff;
1378                         head->hdr_signature = HAMMER_HEAD_SIGNATURE;
1379                         head->hdr_type = HAMMER_HEAD_TYPE_PAD;
1380                         head->hdr_size = xoff;
1381                         if (xoff >= HAMMER_HEAD_ONDISK_SIZE +
1382                                     HAMMER_TAIL_ONDISK_SIZE) {
1383                                 head->hdr_crc = 0;
1384                                 head->hdr_reserved02 = 0;
1385                                 head->hdr_seq = 0;
1386                         }
1387
1388                         tail = (void *)((char *)head + xoff -
1389                                         HAMMER_TAIL_ONDISK_SIZE);
1390                         if ((void *)head != (void *)tail) {
1391                                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
1392                                 tail->tail_type = HAMMER_HEAD_TYPE_PAD;
1393                                 tail->tail_size = xoff;
1394                         }
1395                         KKASSERT((xoff & HAMMER_HEAD_ALIGN_MASK) == 0);
1396                         ondisk->vol0_fifo_end =
1397                                 hammer_advance_fifo((*rec_bufferp)->volume,
1398                                                     end_off, xoff);
1399                         continue;
1400                 }
1401
1402                 if (xoff + aligned_bytes > HAMMER_BUFSIZE) {
1403                         xoff = xoff + aligned_bytes - HAMMER_BUFSIZE;
1404
1405                         KKASSERT(xoff <= HAMMER_BUFSIZE);
1406                         tail = hammer_bnew(hmp, end_off + aligned_bytes -
1407                                                 HAMMER_TAIL_ONDISK_SIZE,
1408                                            errorp, data2_bufferp);
1409                         hammer_modify_buffer(*data2_bufferp, NULL, 0);
1410                         if (*errorp)
1411                                 goto done;
1412
1413                         /*
1414                          * Retry if someone else appended to the fifo while
1415                          * we were blocked.
1416                          */
1417                         if (ondisk->vol0_fifo_end != end_off)
1418                                 continue;
1419                 } else {
1420                         tail = (void *)((char *)head + aligned_bytes -
1421                                         HAMMER_TAIL_ONDISK_SIZE);
1422                 }
1423
1424                 bzero(head, rec_len);
1425                 head->hdr_signature = HAMMER_HEAD_SIGNATURE;
1426                 head->hdr_type = hdr_type;
1427                 head->hdr_size = aligned_bytes;
1428                 head->hdr_crc = 0;
1429                 head->hdr_seq = root_volume->ondisk->vol0_next_seq++;
1430
1431                 tail->tail_signature = HAMMER_TAIL_SIGNATURE;
1432                 tail->tail_type = hdr_type;
1433                 tail->tail_size = aligned_bytes;
1434
1435                 ondisk->vol0_fifo_end =
1436                         hammer_advance_fifo((*rec_bufferp)->volume,
1437                                             end_off, aligned_bytes);
1438 done:
1439                 hammer_rel_volume(root_volume, 0);
1440                 break;
1441         }
1442         if (*errorp)
1443                 end_off = 0;
1444         return(end_off);
1445 }
1446
1447 /*
1448  * Mark a fifo record as having been freed.  XXX needs undo.
1449  */
1450 void
1451 hammer_free_fifo(hammer_mount_t hmp, hammer_off_t fifo_offset)
1452 {
1453         hammer_buffer_t buffer = NULL;
1454         hammer_fifo_head_t head;
1455         int error;
1456
1457         head = hammer_bread(hmp, fifo_offset, &error, &buffer);
1458         if (head) {
1459                 hammer_modify_buffer(buffer, &head->hdr_type,
1460                                      sizeof(head->hdr_type));
1461                 head->hdr_type |= HAMMER_HEAD_FLAG_FREE;
1462         }
1463         if (buffer)
1464                 hammer_rel_buffer(buffer, 0);
1465 }
1466
1467 /*
1468  * Attempt to rewind the FIFO
1469  *
1470  * This routine is allowed to do nothing.
1471  */
1472 void
1473 hammer_unwind_fifo(hammer_mount_t hmp, hammer_off_t rec_offset)
1474 {
1475 }
1476
1477 /*
1478  * Advance the FIFO a certain number of bytes.
1479  */
1480 static
1481 hammer_off_t
1482 hammer_advance_fifo(hammer_volume_t volume, hammer_off_t off, int32_t bytes)
1483 {
1484         int32_t vol_no;
1485
1486         off += bytes;
1487         KKASSERT(off <= volume->maxbuf_off);
1488         KKASSERT((off & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER);
1489         if (off == volume->maxbuf_off) {
1490                 vol_no = volume->vol_no + 1;
1491                 if (vol_no == volume->hmp->nvolumes)
1492                         vol_no = 0;
1493                 off = HAMMER_ENCODE_RAW_BUFFER(vol_no, 0);
1494         }
1495         return(off);
1496 }
1497 #endif
1498
1499 /*
1500  * Sync dirty buffers to the media
1501  */
1502
1503 static int hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
1504 static int hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
1505
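/*
 * Sync all dirty vnodes belonging to the mount and then sync each
 * volume's buffers.  Returns the last error encountered, if any.
 */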
1506 int
1507 hammer_sync_hmp(hammer_mount_t hmp, int waitfor)
1508 {
1509         struct hammer_sync_info info;
1510
1511         info.error = 0;
1512         info.waitfor = waitfor;
1513
1514         vmntvnodescan(hmp->mp, VMSC_GETVP|VMSC_NOWAIT,
1515                       hammer_sync_scan1, hammer_sync_scan2, &info);
1516
1517         RB_SCAN(hammer_vol_rb_tree, &hmp->rb_vols_root, NULL,
1518                 hammer_sync_volume, &info);
1519         return(info.error);
1520 }
1521
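/*
 * Called by vmntvnodescan() as a pre-filter; return -1 to skip vnodes
 * with no HAMMER inode or no dirty state to flush.
 */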
1522 static int
1523 hammer_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
1524 {
1525         struct hammer_inode *ip;
1526
1527         ip = VTOI(vp);
1528         if (vp->v_type == VNON || ip == NULL ||
1529             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1530              RB_EMPTY(&vp->v_rbdirty_tree))) {
1531                 return(-1);
1532         }
1533         return(0);
1534 }
1535
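/*
 * Called by vmntvnodescan() with the vnode resolved; fsync any vnode
 * carrying dirty HAMMER inode state and record any error in the sync
 * info structure.
 */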
1536 static int
1537 hammer_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1538 {
1539         struct hammer_sync_info *info = data;
1540         struct hammer_inode *ip;
1541         int error;
1542
1543         ip = VTOI(vp);
1544         if (vp->v_type == VNON || vp->v_type == VBAD ||
1545             ((ip->flags & HAMMER_INODE_MODMASK) == 0 &&
1546              RB_EMPTY(&vp->v_rbdirty_tree))) {
1547                 return(0);
1548         }
1549         error = VOP_FSYNC(vp, info->waitfor);
1550         if (error)
1551                 info->error = error;
1552         return(0);
1553 }
1554
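/*
 * Called via RB_SCAN from hammer_sync_hmp().  Sync all buffers belonging
 * to the volume, then release the volume with the flush flag set.
 */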
1555 int
1556 hammer_sync_volume(hammer_volume_t volume, void *data)
1557 {
1558         struct hammer_sync_info *info = data;
1559
1560         hammer_ref(&volume->io.lock);
1561         RB_SCAN(hammer_buf_rb_tree, &volume->rb_bufs_root, NULL,
1562                 hammer_sync_buffer, info);
1563         hammer_rel_volume(volume, 1);
1564         return(0);
1565 }
1566
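/*
 * Called via RB_SCAN from hammer_sync_volume().  Referencing and then
 * releasing the buffer with the flush flag set pushes any dirty data
 * to the media.
 */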
1567 int
1568 hammer_sync_buffer(hammer_buffer_t buffer, void *data __unused)
1569 {
1570         hammer_ref(&buffer->io.lock);
1571         hammer_rel_buffer(buffer, 1);
1572         return(0);
1573 }
1574
1575 #if 0
1576 /*
1577  * Generic buffer initialization.  Initialize the A-list into an all-allocated
1578  * state with the free block limit properly set.
1579  *
1580  * Note that alloc_new_buffer() will free the appropriate block range via
1581  * the appropriate cluster alist, so the free count is properly propagated.
1582  */
1583 void
1584 hammer_init_fifo(hammer_fifo_head_t head, u_int16_t type)
1585 {
1586         head->hdr_signature = HAMMER_HEAD_SIGNATURE;
1587         head->hdr_type = type;
1588         head->hdr_size = 0;
1589         head->hdr_crc = 0;
1590         head->hdr_seq = 0;
1591 }
1592
1593 #endif
1594