/*
- * Copyright (c) 2004 The DragonFly Project. All rights reserved.
+ * Copyright (c) 2004-2006 The DragonFly Project. All rights reserved.
*
* This code is derived from software contributed to The DragonFly Project
* by Matthew Dillon <dillon@backplane.com>
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $DragonFly: src/sys/kern/vfs_journal.c,v 1.7 2005/02/28 17:41:00 dillon Exp $
+ * $DragonFly: src/sys/kern/vfs_journal.c,v 1.33 2007/05/09 00:53:34 dillon Exp $
*/
/*
- * Each mount point may have zero or more independantly configured journals
- * attached to it. Each journal is represented by a memory FIFO and worker
- * thread. Journal events are streamed through the FIFO to the thread,
- * batched up (typically on one-second intervals), and written out by the
- * thread.
- *
- * Journal vnode ops are executed instead of mnt_vn_norm_ops when one or
- * more journals have been installed on a mount point. It becomes the
- * responsibility of the journal op to call the underlying normal op as
- * appropriate.
- *
* The journaling protocol is intended to evolve into a two-way stream
* whereby transaction IDs can be acknowledged by the journaling target
* when the data has been committed to hard storage. Both implicit and
#include <sys/journal.h>
#include <sys/file.h>
#include <sys/proc.h>
+#include <sys/xio.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
#include <machine/limits.h>
#include <sys/file2.h>
#include <sys/thread2.h>
+#include <sys/mplock2.h>
+#include <sys/spinlock2.h>
+
+static void journal_wthread(void *info);
+static void journal_rthread(void *info);
-static int journal_attach(struct mount *mp);
-static void journal_detach(struct mount *mp);
-static int journal_install_vfs_journal(struct mount *mp, struct file *fp,
- const struct mountctl_install_journal *info);
-static int journal_remove_vfs_journal(struct mount *mp,
- const struct mountctl_remove_journal *info);
-static int journal_resync_vfs_journal(struct mount *mp, const void *ctl);
-static int journal_status_vfs_journal(struct mount *mp,
- const struct mountctl_status_journal *info,
- struct mountctl_journal_ret_status *rstat,
- int buflen, int *res);
-static void journal_thread(void *info);
-
-static void *journal_reserve(struct journal *jo,
- struct journal_rawrecbeg **rawpp,
- int16_t streamid, int bytes);
+static void *journal_reserve(struct journal *jo,
+ struct journal_rawrecbeg **rawpp,
+ int16_t streamid, int bytes);
static void *journal_extend(struct journal *jo,
- struct journal_rawrecbeg **rawpp,
- int truncbytes, int bytes, int *newstreamrecp);
-static void journal_abort(struct journal *jo,
- struct journal_rawrecbeg **rawpp);
-static void journal_commit(struct journal *jo,
- struct journal_rawrecbeg **rawpp,
- int bytes, int closeout);
-
-static void jrecord_init(struct journal *jo,
- struct jrecord *jrec, int16_t streamid);
-static struct journal_subrecord *jrecord_push(
- struct jrecord *jrec, int16_t rectype);
-static void jrecord_pop(struct jrecord *jrec, struct journal_subrecord *parent);
-static struct journal_subrecord *jrecord_write(struct jrecord *jrec,
- int16_t rectype, int bytes);
-static void jrecord_data(struct jrecord *jrec, const void *buf, int bytes);
-static void jrecord_done(struct jrecord *jrec, int abortit);
-
-static int journal_setattr(struct vop_setattr_args *ap);
-static int journal_write(struct vop_write_args *ap);
-static int journal_fsync(struct vop_fsync_args *ap);
-static int journal_putpages(struct vop_putpages_args *ap);
-static int journal_setacl(struct vop_setacl_args *ap);
-static int journal_setextattr(struct vop_setextattr_args *ap);
-static int journal_ncreate(struct vop_ncreate_args *ap);
-static int journal_nmknod(struct vop_nmknod_args *ap);
-static int journal_nlink(struct vop_nlink_args *ap);
-static int journal_nsymlink(struct vop_nsymlink_args *ap);
-static int journal_nwhiteout(struct vop_nwhiteout_args *ap);
-static int journal_nremove(struct vop_nremove_args *ap);
-static int journal_nmkdir(struct vop_nmkdir_args *ap);
-static int journal_nrmdir(struct vop_nrmdir_args *ap);
-static int journal_nrename(struct vop_nrename_args *ap);
-
-static struct vnodeopv_entry_desc journal_vnodeop_entries[] = {
- { &vop_default_desc, vop_journal_operate_ap },
- { &vop_mountctl_desc, (void *)journal_mountctl },
- { &vop_setattr_desc, (void *)journal_setattr },
- { &vop_write_desc, (void *)journal_write },
- { &vop_fsync_desc, (void *)journal_fsync },
- { &vop_putpages_desc, (void *)journal_putpages },
- { &vop_setacl_desc, (void *)journal_setacl },
- { &vop_setextattr_desc, (void *)journal_setextattr },
- { &vop_ncreate_desc, (void *)journal_ncreate },
- { &vop_nmknod_desc, (void *)journal_nmknod },
- { &vop_nlink_desc, (void *)journal_nlink },
- { &vop_nsymlink_desc, (void *)journal_nsymlink },
- { &vop_nwhiteout_desc, (void *)journal_nwhiteout },
- { &vop_nremove_desc, (void *)journal_nremove },
- { &vop_nmkdir_desc, (void *)journal_nmkdir },
- { &vop_nrmdir_desc, (void *)journal_nrmdir },
- { &vop_nrename_desc, (void *)journal_nrename },
- { NULL, NULL }
-};
-
-static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures");
-static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO");
-
-int
-journal_mountctl(struct vop_mountctl_args *ap)
-{
- struct mount *mp;
- int error = 0;
-
- mp = ap->a_head.a_ops->vv_mount;
- KKASSERT(mp);
-
- if (mp->mnt_vn_journal_ops == NULL) {
- switch(ap->a_op) {
- case MOUNTCTL_INSTALL_VFS_JOURNAL:
- error = journal_attach(mp);
- if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal))
- error = EINVAL;
- if (error == 0 && ap->a_fp == NULL)
- error = EBADF;
- if (error == 0)
- error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
- if (TAILQ_EMPTY(&mp->mnt_jlist))
- journal_detach(mp);
- break;
- case MOUNTCTL_REMOVE_VFS_JOURNAL:
- case MOUNTCTL_RESYNC_VFS_JOURNAL:
- case MOUNTCTL_STATUS_VFS_JOURNAL:
- error = ENOENT;
- break;
- default:
- error = EOPNOTSUPP;
- break;
- }
- } else {
- switch(ap->a_op) {
- case MOUNTCTL_INSTALL_VFS_JOURNAL:
- if (ap->a_ctllen != sizeof(struct mountctl_install_journal))
- error = EINVAL;
- if (error == 0 && ap->a_fp == NULL)
- error = EBADF;
- if (error == 0)
- error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
- break;
- case MOUNTCTL_REMOVE_VFS_JOURNAL:
- if (ap->a_ctllen != sizeof(struct mountctl_remove_journal))
- error = EINVAL;
- if (error == 0)
- error = journal_remove_vfs_journal(mp, ap->a_ctl);
- if (TAILQ_EMPTY(&mp->mnt_jlist))
- journal_detach(mp);
- break;
- case MOUNTCTL_RESYNC_VFS_JOURNAL:
- if (ap->a_ctllen != 0)
- error = EINVAL;
- error = journal_resync_vfs_journal(mp, ap->a_ctl);
- break;
- case MOUNTCTL_STATUS_VFS_JOURNAL:
- if (ap->a_ctllen != sizeof(struct mountctl_status_journal))
- error = EINVAL;
- if (error == 0) {
- error = journal_status_vfs_journal(mp, ap->a_ctl,
- ap->a_buf, ap->a_buflen, ap->a_res);
- }
- break;
- default:
- error = EOPNOTSUPP;
- break;
- }
- }
- return (error);
-}
+ struct journal_rawrecbeg **rawpp,
+ int truncbytes, int bytes, int *newstreamrecp);
+static void journal_abort(struct journal *jo,
+ struct journal_rawrecbeg **rawpp);
+static void journal_commit(struct journal *jo,
+ struct journal_rawrecbeg **rawpp,
+ int bytes, int closeout);
+static void jrecord_data(struct jrecord *jrec,
+ void *buf, int bytes, int dtype);
-/*
- * High level mount point setup. When a
- */
-static int
-journal_attach(struct mount *mp)
-{
- vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops, journal_vnodeop_entries);
- return(0);
-}
-static void
-journal_detach(struct mount *mp)
-{
- if (mp->mnt_vn_journal_ops)
- vfs_rm_vnodeops(&mp->mnt_vn_journal_ops);
-}
+MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures");
+MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO");
-/*
- * Install a journal on a mount point. Each journal has an associated worker
- * thread which is responsible for buffering and spooling the data to the
- * target. A mount point may have multiple journals attached to it. An
- * initial start record is generated when the journal is associated.
- */
-static int
-journal_install_vfs_journal(struct mount *mp, struct file *fp,
- const struct mountctl_install_journal *info)
+void
+journal_create_threads(struct journal *jo)
{
- struct journal *jo;
- struct jrecord jrec;
- int error = 0;
- int size;
-
- jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK|M_ZERO);
- bcopy(info->id, jo->id, sizeof(jo->id));
- jo->flags = info->flags & ~(MC_JOURNAL_ACTIVE | MC_JOURNAL_STOP_REQ);
-
- /*
- * Memory FIFO size, round to nearest power of 2
- */
- if (info->membufsize) {
- if (info->membufsize < 65536)
- size = 65536;
- else if (info->membufsize > 128 * 1024 * 1024)
- size = 128 * 1024 * 1024;
- else
- size = (int)info->membufsize;
- } else {
- size = 1024 * 1024;
- }
- jo->fifo.size = 1;
- while (jo->fifo.size < size)
- jo->fifo.size <<= 1;
-
- /*
- * Other parameters. If not specified the starting transaction id
- * will be the current date.
- */
- if (info->transid) {
- jo->transid = info->transid;
- } else {
- struct timespec ts;
- getnanotime(&ts);
- jo->transid = ((int64_t)ts.tv_sec << 30) | ts.tv_nsec;
- }
-
- jo->fp = fp;
-
- /*
- * Allocate the memory FIFO
- */
- jo->fifo.mask = jo->fifo.size - 1;
- jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK|M_ZERO|M_NULLOK);
- if (jo->fifo.membase == NULL)
- error = ENOMEM;
-
- /*
- * Create the worker thread and generate the association record.
- */
- if (error) {
- free(jo, M_JOURNAL);
- } else {
- fhold(fp);
- jo->flags |= MC_JOURNAL_ACTIVE;
- lwkt_create(journal_thread, jo, NULL, &jo->thread,
- TDF_STOPREQ, -1, "journal %.*s", JIDMAX, jo->id);
- lwkt_setpri(&jo->thread, TDPRI_KERN_DAEMON);
- lwkt_schedule(&jo->thread);
-
- jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT);
- jrecord_write(&jrec, JTYPE_ASSOCIATE, 0);
- jrecord_done(&jrec, 0);
- TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry);
- }
- return(error);
+ jo->flags &= ~(MC_JOURNAL_STOP_REQ | MC_JOURNAL_STOP_IMM);
+ jo->flags |= MC_JOURNAL_WACTIVE;
+ lwkt_create(journal_wthread, jo, NULL, &jo->wthread,
+ TDF_NOSTART, -1,
+ "journal w:%.*s", JIDMAX, jo->id);
+ lwkt_setpri(&jo->wthread, TDPRI_KERN_DAEMON);
+ lwkt_schedule(&jo->wthread);
+
+ if (jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) {
+ jo->flags |= MC_JOURNAL_RACTIVE;
+ lwkt_create(journal_rthread, jo, NULL, &jo->rthread,
+ TDF_NOSTART, -1,
+ "journal r:%.*s", JIDMAX, jo->id);
+ lwkt_setpri(&jo->rthread, TDPRI_KERN_DAEMON);
+ lwkt_schedule(&jo->rthread);
+ }
}
-/*
- * Disassociate a journal from a mount point and terminate its worker thread.
- * A final termination record is written out before the file pointer is
- * dropped.
- */
-static int
-journal_remove_vfs_journal(struct mount *mp,
- const struct mountctl_remove_journal *info)
+void
+journal_destroy_threads(struct journal *jo, int flags)
{
- struct journal *jo;
- struct jrecord jrec;
- int error;
-
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0)
- break;
- }
- if (jo) {
- error = 0;
- TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry);
-
- jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT);
- jrecord_write(&jrec, JTYPE_DISASSOCIATE, 0);
- jrecord_done(&jrec, 0);
-
- jo->flags |= MC_JOURNAL_STOP_REQ | (info->flags & MC_JOURNAL_STOP_IMM);
- wakeup(&jo->fifo);
- while (jo->flags & MC_JOURNAL_ACTIVE) {
- tsleep(jo, 0, "jwait", 0);
+ int wcount;
+
+ jo->flags |= MC_JOURNAL_STOP_REQ | (flags & MC_JOURNAL_STOP_IMM);
+ wakeup(&jo->fifo);
+ wcount = 0;
+ while (jo->flags & (MC_JOURNAL_WACTIVE | MC_JOURNAL_RACTIVE)) {
+ tsleep(jo, 0, "jwait", hz);
+ if (++wcount % 10 == 0) {
+ kprintf("Warning: journal %s waiting for descriptors to close\n",
+ jo->id);
}
- lwkt_free_thread(&jo->thread); /* XXX SMP */
- if (jo->fp)
- fdrop(jo->fp, curthread);
- if (jo->fifo.membase)
- free(jo->fifo.membase, M_JFIFO);
- free(jo, M_JOURNAL);
- } else {
- error = EINVAL;
}
- return (error);
-}
-static int
-journal_resync_vfs_journal(struct mount *mp, const void *ctl)
-{
- return(EINVAL);
+ /*
+ * XXX SMP - threads should move to cpu requesting the restart or
+ * termination before finishing up to properly interlock.
+ */
+ tsleep(jo, 0, "jwait", hz);
+ lwkt_free_thread(&jo->wthread);
+ if (jo->flags & MC_JOURNAL_WANT_FULLDUPLEX)
+ lwkt_free_thread(&jo->rthread);
}
-static int
-journal_status_vfs_journal(struct mount *mp,
- const struct mountctl_status_journal *info,
- struct mountctl_journal_ret_status *rstat,
- int buflen, int *res)
-{
- struct journal *jo;
- int error = 0;
- int index;
-
- index = 0;
- *res = 0;
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- if (info->index == MC_JOURNAL_INDEX_ID) {
- if (bcmp(jo->id, info->id, sizeof(jo->id)) != 0)
- continue;
- } else if (info->index >= 0) {
- if (info->index < index)
- continue;
- } else if (info->index != MC_JOURNAL_INDEX_ALL) {
- continue;
- }
- if (buflen < sizeof(*rstat)) {
- if (*res)
- rstat[-1].flags |= MC_JOURNAL_STATUS_MORETOCOME;
- else
- error = EINVAL;
- break;
- }
- bzero(rstat, sizeof(*rstat));
- rstat->recsize = sizeof(*rstat);
- bcopy(jo->id, rstat->id, sizeof(jo->id));
- rstat->index = index;
- rstat->membufsize = jo->fifo.size;
- rstat->membufused = jo->fifo.xindex - jo->fifo.rindex;
- rstat->membufiopend = jo->fifo.windex - jo->fifo.rindex;
- rstat->bytessent = jo->total_acked;
- ++rstat;
- ++index;
- *res += sizeof(*rstat);
- buflen -= sizeof(*rstat);
- }
- return(error);
-}
/*
* The per-journal worker thread is responsible for writing out the
* journal's FIFO to the target stream.
*/
static void
-journal_thread(void *info)
+journal_wthread(void *info)
{
struct journal *jo = info;
struct journal_rawrecbeg *rawp;
- int bytes;
int error;
- int avail;
- int res;
+ size_t avail;
+ size_t bytes;
+ size_t res;
+
+ /* not MPSAFE yet */
+ get_mplock();
for (;;) {
/*
tsleep(&jo->fifo, 0, "jfifo", hz);
continue;
}
+
+ /*
+ * Sleep if we cannot go any further due to hitting an incomplete
+ * record. This case should occur rarely but may have to be better
+ * optimized XXX.
+ */
rawp = (void *)(jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask));
if (rawp->begmagic == JREC_INCOMPLETEMAGIC) {
tsleep(&jo->fifo, 0, "jpad", hz);
continue;
}
+
+ /*
+ * Skip any pad records. We do not write out pad records if we can
+ * help it.
+ */
if (rawp->streamid == JREC_STREAMID_PAD) {
+ if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) {
+ if (jo->fifo.rindex == jo->fifo.xindex) {
+ jo->fifo.xindex += (rawp->recsize + 15) & ~15;
+ jo->total_acked += (rawp->recsize + 15) & ~15;
+ }
+ }
jo->fifo.rindex += (rawp->recsize + 15) & ~15;
- KKASSERT(jo->fifo.windex - jo->fifo.rindex > 0);
+ jo->total_acked += bytes;
+ KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0);
continue;
}
/*
- * Figure out how much we can write out, beware the buffer wrap
- * case.
+ * 'bytes' is the amount of data that can potentially be written out.
+ * Calculate 'res', the amount of data that can actually be written
+ * out. res is bounded either by hitting the end of the physical
+ * memory buffer or by hitting an incomplete record. Incomplete
+ * records often occur due to the way the space reservation model
+ * works.
*/
res = 0;
avail = jo->fifo.size - (jo->fifo.rindex & jo->fifo.mask);
KKASSERT(res == avail);
break;
}
+ rawp = (void *)((char *)rawp + ((rawp->recsize + 15) & ~15));
}
/*
* For now assume blocking I/O. Since we are record-aware the
* code cannot yet handle partial writes.
*
+ * We bump rindex prior to issuing the write to avoid racing
+ * the acknowledgement coming back (which could prevent the ack
+ * from bumping xindex). Restarts are always based on xindex so
+ * we do not try to undo the rindex if an error occurs.
+ *
* XXX EWOULDBLOCK/NBIO
* XXX notification on failure
+ * XXX permanent versus temporary failures
* XXX two-way acknowledgement stream in the return direction / xindex
*/
- printf("write @%d,%d\n", jo->fifo.rindex & jo->fifo.mask, bytes);
bytes = res;
+ jo->fifo.rindex += bytes;
error = fp_write(jo->fp,
- jo->fifo.membase + (jo->fifo.rindex & jo->fifo.mask),
- bytes, &res);
+ jo->fifo.membase +
+ ((jo->fifo.rindex - bytes) & jo->fifo.mask),
+ bytes, &res, UIO_SYSSPACE);
if (error) {
- printf("journal_thread(%s) write, error %d\n", jo->id, error);
+ kprintf("journal_thread(%s) write, error %d\n", jo->id, error);
/* XXX */
} else {
KKASSERT(res == bytes);
- printf("journal_thread(%s) write %d\n", jo->id, res);
}
/*
- * Advance rindex. XXX for now also advance xindex, which will
- * eventually be advanced when the target acknowledges the sequence
- * space.
+ * Advance rindex. If the journal stream is not full duplex we also
+ * advance xindex, otherwise the rjournal thread is responsible for
+ * advancing xindex.
*/
- jo->fifo.rindex += bytes;
- jo->fifo.xindex += bytes;
- jo->total_acked += bytes;
- if (jo->flags & MC_JOURNAL_WWAIT) {
- jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
- wakeup(&jo->fifo.windex);
+ if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) {
+ jo->fifo.xindex += bytes;
+ jo->total_acked += bytes;
+ }
+ KKASSERT(jo->fifo.windex - jo->fifo.rindex >= 0);
+ if ((jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) == 0) {
+ if (jo->flags & MC_JOURNAL_WWAIT) {
+ jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
+ wakeup(&jo->fifo.windex);
+ }
}
}
- jo->flags &= ~MC_JOURNAL_ACTIVE;
+ fp_shutdown(jo->fp, SHUT_WR);
+ jo->flags &= ~MC_JOURNAL_WACTIVE;
wakeup(jo);
wakeup(&jo->fifo.windex);
+ rel_mplock();
}
-static __inline
+/*
+ * A second per-journal worker thread is created for two-way journaling
+ * streams to deal with the return acknowledgement stream.
+ */
+static void
+journal_rthread(void *info)
+{
+ struct journal_rawrecbeg *rawp;
+ struct journal_ackrecord ack;
+ struct journal *jo = info;
+ int64_t transid;
+ int error;
+ size_t count;
+ size_t bytes;
+
+ transid = 0;
+ error = 0;
+
+ /* not MPSAFE yet */
+ get_mplock();
+
+ for (;;) {
+ /*
+ * We have been asked to stop
+ */
+ if (jo->flags & MC_JOURNAL_STOP_REQ)
+ break;
+
+ /*
+ * If we have no active transaction id, get one from the return
+ * stream.
+ */
+ if (transid == 0) {
+ error = fp_read(jo->fp, &ack, sizeof(ack), &count,
+ 1, UIO_SYSSPACE);
+#if 0
+ kprintf("fp_read ack error %d count %d\n", error, count);
+#endif
+ if (error || count != sizeof(ack))
+ break;
+ if (error) {
+ kprintf("read error %d on receive stream\n", error);
+ break;
+ }
+ if (ack.rbeg.begmagic != JREC_BEGMAGIC ||
+ ack.rend.endmagic != JREC_ENDMAGIC
+ ) {
+ kprintf("bad begmagic or endmagic on receive stream\n");
+ break;
+ }
+ transid = ack.rbeg.transid;
+ }
+
+ /*
+ * Calculate the number of unacknowledged bytes. If there are no
+ * unacknowledged bytes then unsent data was acknowledged, report,
+ * sleep a bit, and loop in that case. This should not happen
+ * normally. The ack record is thrown away.
+ */
+ bytes = jo->fifo.rindex - jo->fifo.xindex;
+
+ if (bytes == 0) {
+ kprintf("warning: unsent data acknowledged transid %08llx\n",
+ (long long)transid);
+ tsleep(&jo->fifo.xindex, 0, "jrseq", hz);
+ transid = 0;
+ continue;
+ }
+
+ /*
+ * Since rindex has advanced, the record pointed to by xindex
+ * must be a valid record.
+ */
+ rawp = (void *)(jo->fifo.membase + (jo->fifo.xindex & jo->fifo.mask));
+ KKASSERT(rawp->begmagic == JREC_BEGMAGIC);
+ KKASSERT(rawp->recsize <= bytes);
+
+ /*
+ * The target can acknowledge several records at once.
+ */
+ if (rawp->transid < transid) {
+#if 1
+ kprintf("ackskip %08llx/%08llx\n",
+ (long long)rawp->transid,
+ (long long)transid);
+#endif
+ jo->fifo.xindex += (rawp->recsize + 15) & ~15;
+ jo->total_acked += (rawp->recsize + 15) & ~15;
+ if (jo->flags & MC_JOURNAL_WWAIT) {
+ jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
+ wakeup(&jo->fifo.windex);
+ }
+ continue;
+ }
+ if (rawp->transid == transid) {
+#if 1
+ kprintf("ackskip %08llx/%08llx\n",
+ (long long)rawp->transid,
+ (long long)transid);
+#endif
+ jo->fifo.xindex += (rawp->recsize + 15) & ~15;
+ jo->total_acked += (rawp->recsize + 15) & ~15;
+ if (jo->flags & MC_JOURNAL_WWAIT) {
+ jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
+ wakeup(&jo->fifo.windex);
+ }
+ transid = 0;
+ continue;
+ }
+ kprintf("warning: unsent data(2) acknowledged transid %08llx\n",
+ (long long)transid);
+ transid = 0;
+ }
+ jo->flags &= ~MC_JOURNAL_RACTIVE;
+ wakeup(jo);
+ wakeup(&jo->fifo.windex);
+ rel_mplock();
+}
+
+/*
+ * This builds a pad record which the journaling thread will skip over. Pad
+ * records are required when we are unable to reserve sufficient stream space
+ * due to insufficient space at the end of the physical memory fifo.
+ *
+ * Even though the record is not transmitted, a normal transid must be
+ * assigned to it so link recovery operations after a failure work properly.
+ */
+static
void
-journal_build_pad(struct journal_rawrecbeg *rawp, int recsize)
+journal_build_pad(struct journal_rawrecbeg *rawp, int recsize, int64_t transid)
{
struct journal_rawrecend *rendp;
KKASSERT((recsize & 15) == 0 && recsize >= 16);
- rawp->begmagic = JREC_BEGMAGIC;
rawp->streamid = JREC_STREAMID_PAD;
rawp->recsize = recsize; /* must be 16-byte aligned */
- rawp->seqno = 0;
+ rawp->transid = transid;
/*
- * WARNING, rendp may overlap rawp->seqno. This is necessary to
- * allow PAD records to fit in 16 bytes. Use cpu_mb1() to
+ * WARNING, rendp may overlap rawp->transid. This is necessary to
+ * allow PAD records to fit in 16 bytes. Use cpu_ccfence() to
* hopefully cause the compiler to not make any assumptions.
*/
- cpu_mb1();
rendp = (void *)((char *)rawp + rawp->recsize - sizeof(*rendp));
rendp->endmagic = JREC_ENDMAGIC;
rendp->check = 0;
rendp->recsize = rawp->recsize;
+
+ /*
+ * Set the begin magic last. This is what will allow the journal
+ * thread to write the record out. Use a store fence to prevent
+ * compiler and cpu reordering of the writes.
+ */
+ cpu_sfence();
+ rawp->begmagic = JREC_BEGMAGIC;
}
/*
* Wake up the worker thread if the FIFO is more then half full or if
* someone is waiting for space to be freed up. Otherwise let the
* heartbeat deal with it. Being able to avoid waking up the worker
- * is the key to the journal's cpu efficiency.
+ * is the key to the journal's cpu performance.
*/
static __inline
void
* specified amount of payload space. *rawpp will be set to point to the
* base of the new stream record and a pointer to the base of the payload
* space will be returned. *rawpp does not need to be pre-NULLd prior to
- * making this call.
+ * making this call. The raw record header will be partially initialized.
*
* A stream can be extended, aborted, or committed by other API calls
* below. This may result in a sequence of potentially disconnected
* of the FIFO's buffer. Calculate 'req' which is the actual number
* of bytes being reserved, including wrap-around dead space.
*
+ * Neither 'bytes' nor 'req' is aligned.
+ *
* Note that availtoend is not truncated to avail and so cannot be
* used to determine whether the reservation is possible by itself.
* Also, since all fifo ops are 16-byte aligned, we can check
* the size before calculating the aligned size.
*/
availtoend = jo->fifo.size - (jo->fifo.windex & jo->fifo.mask);
+ KKASSERT((availtoend & 15) == 0);
if (bytes > availtoend)
req = bytes + availtoend; /* add pad to end */
else
if (avail < req) {
/* XXX MC_JOURNAL_STOP_IMM */
jo->flags |= MC_JOURNAL_WWAIT;
+ ++jo->fifostalls;
tsleep(&jo->fifo.windex, 0, "jwrite", 0);
continue;
}
* journaling code must also be aware the reserved sections occuring
* after this one will also not be written out even if completed
* until this one is completed.
+ *
+ * The transaction id must accommodate real and potential pad creation.
*/
rawp = (void *)(jo->fifo.membase + (jo->fifo.windex & jo->fifo.mask));
if (req != bytes) {
- journal_build_pad(rawp, req - bytes);
+ journal_build_pad(rawp, availtoend, jo->transid);
+ ++jo->transid;
rawp = (void *)jo->fifo.membase;
}
rawp->begmagic = JREC_INCOMPLETEMAGIC; /* updated by abort/commit */
rawp->recsize = bytes; /* (unaligned size) */
rawp->streamid = streamid | JREC_STREAMCTL_BEGIN;
- rawp->seqno = 0; /* set by caller */
+ rawp->transid = jo->transid;
+ jo->transid += 2;
/*
* Issue a memory barrier to guarentee that the record data has been
*
* Note that stream records are always 16-byte aligned.
*/
- cpu_mb1();
+ cpu_sfence();
jo->fifo.windex += (req + 15) & ~15;
*rawpp = rawp;
return(rawp + 1);
}
/*
- * Extend a previous reservation by the specified number of payload bytes.
- * If it is not possible to extend the existing reservation due to either
- * another thread having reserved space after us or due to a boundary
- * condition, the current reservation will be committed and possibly
- * truncated and a new reservation with the specified payload size will
- * be created. *rawpp is set to the new reservation in this case but the
- * caller cannot depend on a comparison with the old rawp to determine if
- * this case occurs because we could end up using the same memory FIFO
- * offset for the new stream record.
+ * Attempt to extend the stream record by <bytes> worth of payload space.
+ *
+ * If it is possible to extend the existing stream record no truncation
+ * occurs and the record is extended as specified. A pointer to the
+ * truncation offset within the payload space is returned.
*
- * In either case this function will return a pointer to the base of the
- * extended payload space.
+ * If it is not possible to do this the existing stream record is truncated
+ * and committed, and a new stream record of size <bytes> is created. A
+ * pointer to the base of the new stream record's payload space is returned.
*
- * If a new stream block is created the caller needs to recalculate payload
- * byte counts, if the same stream block is used the caller needs to extend
- * its current notion of the payload byte count.
+ * *rawpp is set to the new reservation in the case of a new record but
+ * the caller cannot depend on a comparison with the old rawp to determine if
+ * this case occurs because we could end up using the same memory FIFO
+ * offset for the new stream record. Use *newstreamrecp instead.
*/
static void *
journal_extend(struct journal *jo, struct journal_rawrecbeg **rawpp,
wbase = (char *)rawp - jo->fifo.membase;
/*
- * If the aligned record size does not change we can trivially extend
- * the record.
+ * If the aligned record size does not change we can trivially adjust
+ * the record size.
*/
if (nsize == osize) {
rawp->recsize += bytes;
- return((char *)rawp + rawp->recsize - bytes);
+ return((char *)(rawp + 1) + truncbytes);
}
/*
* If the fifo's write index hasn't been modified since we made the
* reservation and we do not hit any boundary conditions, we can
- * trivially extend the record.
+ * trivially make the record smaller or larger.
*/
if ((jo->fifo.windex & jo->fifo.mask) == wbase + osize) {
availtoend = jo->fifo.size - wbase;
if (nsize <= avail && nsize <= availtoend) {
jo->fifo.windex += nsize - osize;
rawp->recsize += bytes;
- return((char *)rawp + rawp->recsize - bytes);
+ return((char *)(rawp + 1) + truncbytes);
}
}
KKASSERT(((intptr_t)rawp & 15) == 0);
/*
- * Truncate the record if requested. If the FIFO write index as still
+ * Truncate the record if necessary.  If the FIFO write index is still
* at the end of our record we can optimally backindex it. Otherwise
- * we have to insert a pad record.
+ * we have to insert a pad record to cover the dead space.
*
* We calculate osize which is the 16-byte-aligned original recsize.
* We calculate nsize which is the 16-byte-aligned new recsize.
*
* Due to alignment issues or in case the passed truncation bytes is
- * the same as the original payload, windex will be equal to nindex.
+ * the same as the original payload, nsize may be equal to osize even
+ * if the committed bytes is less than the originally reserved bytes.
*/
if (bytes >= 0) {
KKASSERT(bytes >= 0 && bytes <= rawp->recsize - sizeof(struct journal_rawrecbeg) - sizeof(struct journal_rawrecend));
rawp->recsize = bytes + sizeof(struct journal_rawrecbeg) +
sizeof(struct journal_rawrecend);
nsize = (rawp->recsize + 15) & ~15;
+ KKASSERT(nsize <= osize);
if (osize == nsize) {
/* do nothing */
} else if ((jo->fifo.windex & jo->fifo.mask) == (char *)rawp - jo->fifo.membase + osize) {
jo->fifo.windex -= osize - nsize;
} else {
/* we cannot backindex the fifo, emplace a pad in the dead space */
- journal_build_pad((void *)((char *)rawp + osize), osize - nsize);
+ journal_build_pad((void *)((char *)rawp + nsize), osize - nsize,
+ rawp->transid + 1);
}
}
*/
if (closeout)
rawp->streamid |= JREC_STREAMCTL_END;
- cpu_mb1(); /* memory barrier */
+ cpu_sfence(); /* memory and compiler barrier */
rawp->begmagic = JREC_BEGMAGIC;
journal_commit_wakeup(jo);
* in the logical streams managed by the journal_*() routines.
*/
-static int16_t sid = JREC_STREAMID_JMIN;
-
/*
* Initialize the passed jrecord structure and start a new stream transaction
* by reserving an initial build space in the journal's memory FIFO.
*/
-static void
+void
jrecord_init(struct journal *jo, struct jrecord *jrec, int16_t streamid)
{
bzero(jrec, sizeof(*jrec));
jrec->jo = jo;
- if (streamid < 0) {
- streamid = sid++; /* XXX need to track stream ids! */
- if (sid == JREC_STREAMID_JMAX)
- sid = JREC_STREAMID_JMIN;
- }
jrec->streamid = streamid;
jrec->stream_residual = JREC_DEFAULTSIZE;
jrec->stream_reserved = jrec->stream_residual;
* record, so the caller should not mess with the returned pointer in
* any way other then to save it.
*/
-static
struct journal_subrecord *
jrecord_push(struct jrecord *jrec, int16_t rectype)
{
* and if not valid may or may not be NULL, depending on the value
* of pushptrgood.
*/
-static void
+void
jrecord_pop(struct jrecord *jrec, struct journal_subrecord *save)
{
struct journal_subrecord *last;
/*
* Write out a leaf record, including associated data.
*/
-static
void
jrecord_leaf(struct jrecord *jrec, int16_t rectype, void *ptr, int bytes)
{
jrecord_write(jrec, rectype, bytes);
- jrecord_data(jrec, ptr, bytes);
- jrecord_done(jrec, 0);
+ jrecord_data(jrec, ptr, bytes, JDATA_KERN);
+}
+
+void
+jrecord_leaf_uio(struct jrecord *jrec, int16_t rectype,
+ struct uio *uio)
+{
+ struct iovec *iov;
+ int i;
+
+ for (i = 0; i < uio->uio_iovcnt; ++i) {
+ iov = &uio->uio_iov[i];
+ if (iov->iov_len == 0)
+ continue;
+ if (uio->uio_segflg == UIO_SYSSPACE) {
+ jrecord_write(jrec, rectype, iov->iov_len);
+ jrecord_data(jrec, iov->iov_base, iov->iov_len, JDATA_KERN);
+ } else { /* UIO_USERSPACE */
+ jrecord_write(jrec, rectype, iov->iov_len);
+ jrecord_data(jrec, iov->iov_base, iov->iov_len, JDATA_USER);
+ }
+ }
+}
+
+void
+jrecord_leaf_xio(struct jrecord *jrec, int16_t rectype, xio_t xio)
+{
+ int bytes = xio->xio_npages * PAGE_SIZE;
+
+ jrecord_write(jrec, rectype, bytes);
+ jrecord_data(jrec, xio, bytes, JDATA_XIO);
}
/*
* CALL AND MAY BECOME INVALID AT ANY TIME. ONLY THE PUSH/POP CODE SHOULD
* USE THE RETURN VALUE.
*/
-static
struct journal_subrecord *
jrecord_write(struct jrecord *jrec, int16_t rectype, int bytes)
{
jrec->stream_reserved - jrec->stream_residual,
JREC_DEFAULTSIZE, &pusheditout);
if (pusheditout) {
+ /*
+ * If a pushout occurred, the pushed out stream record was
+ * truncated as specified and the new record is exactly the
+ * extension size specified.
+ */
jrec->stream_reserved = JREC_DEFAULTSIZE;
jrec->stream_residual = JREC_DEFAULTSIZE;
jrec->parent = NULL; /* no longer accessible */
jrec->pushptrgood = 0; /* restored parents in pops no good */
} else {
+ /*
+ * If no pushout occurred the stream record is NOT truncated and
+ * IS extended.
+ */
jrec->stream_reserved += JREC_DEFAULTSIZE;
jrec->stream_residual += JREC_DEFAULTSIZE;
}
last = (void *)jrec->stream_ptr;
last->rectype = rectype;
last->reserved = 0;
- last->recsize = sizeof(struct journal_subrecord) + bytes;
+
+ /*
+ * We may not know the record size for recursive records and the
+ * header may become unavailable due to limited FIFO space. Write
+ * -1 to indicate this special case.
+ */
+ if ((rectype & JMASK_NESTED) && bytes == 0)
+ last->recsize = -1;
+ else
+ last->recsize = sizeof(struct journal_subrecord) + bytes;
jrec->last = last;
jrec->residual = bytes; /* remaining data to be posted */
jrec->residual_align = -bytes & 7; /* post-data alignment required */
+ jrec->stream_ptr += sizeof(*last); /* current write pointer */
+ jrec->stream_residual -= sizeof(*last); /* space remaining in stream */
return(last);
}
* subrecord header may become inaccessible due to stream record pushouts.
*/
static void
-jrecord_data(struct jrecord *jrec, const void *buf, int bytes)
+jrecord_data(struct jrecord *jrec, void *buf, int bytes, int dtype)
{
int pusheditout;
int extsize;
+ int xio_offset = 0;
KKASSERT(bytes >= 0 && bytes <= jrec->residual);
/*
* Fill in any remaining space in the current stream record.
*/
- bcopy(buf, jrec->stream_ptr, jrec->stream_residual);
- buf = (const char *)buf + jrec->stream_residual;
+ switch (dtype) {
+ case JDATA_KERN:
+ bcopy(buf, jrec->stream_ptr, jrec->stream_residual);
+ break;
+ case JDATA_USER:
+ copyin(buf, jrec->stream_ptr, jrec->stream_residual);
+ break;
+ case JDATA_XIO:
+ xio_copy_xtok((xio_t)buf, xio_offset, jrec->stream_ptr,
+ jrec->stream_residual);
+ xio_offset += jrec->stream_residual;
+ break;
+ }
+ if (dtype != JDATA_XIO)
+ buf = (char *)buf + jrec->stream_residual;
bytes -= jrec->stream_residual;
/*jrec->stream_ptr += jrec->stream_residual;*/
- jrec->stream_residual = 0;
jrec->residual -= jrec->stream_residual;
+ jrec->stream_residual = 0;
/*
* Try to extend the current stream record, but no more then 1/4
* Push out any remaining bytes into the current stream record.
*/
if (bytes) {
- bcopy(buf, jrec->stream_ptr, bytes);
+ switch (dtype) {
+ case JDATA_KERN:
+ bcopy(buf, jrec->stream_ptr, bytes);
+ break;
+ case JDATA_USER:
+ copyin(buf, jrec->stream_ptr, bytes);
+ break;
+ case JDATA_XIO:
+ xio_copy_xtok((xio_t)buf, xio_offset, jrec->stream_ptr, bytes);
+ break;
+ }
jrec->stream_ptr += bytes;
jrec->stream_residual -= bytes;
jrec->residual -= bytes;
}
/*
- * We are finished with a transaction. If abortit is not set then we must
- * be at the top level with no residual subrecord data left to output.
- * If abortit is set then we can be in any state.
+ * We are finished with the transaction. This closes the transaction created
+ * by jrecord_init().
+ *
+ * NOTE: If abortit is not set then we must be at the top level with no
+ * residual subrecord data left to output.
+ *
+ * If abortit is set then we can be in any state, all pushes will be
+ * popped and it is ok for there to be residual data. This works
+ * because the virtual stream itself is truncated. Scanners must deal
+ * with this situation.
*
* The stream record will be committed or aborted as specified and jrecord
* resources will be cleaned up.
*/
-static void
+void
jrecord_done(struct jrecord *jrec, int abortit)
{
KKASSERT(jrec->rawp != NULL);
* Write out a filename path relative to the base of the mount point.
* rectype is typically JLEAF_PATH{1,2,3,4}.
*/
-static void
+void
jrecord_write_path(struct jrecord *jrec, int16_t rectype, struct namecache *ncp)
{
char buf[64]; /* local buffer if it fits, else malloced */
/*
* Pass 1 - figure out the number of bytes required. Include terminating
* \0 on last element and '/' separator on other elements.
+ *
+ * The namecache topology terminates at the root of the filesystem
+ * (the normal lookup code would then continue by using the mount
+ * structure to figure out what it was mounted on).
*/
again:
pathlen = 0;
- for (scan = ncp;
- scan && (scan->nc_flag & NCF_MOUNTPT) == 0;
- scan = scan->nc_parent
- ) {
- pathlen += scan->nc_nlen + 1;
+ for (scan = ncp; scan; scan = scan->nc_parent) {
+ if (scan->nc_nlen > 0)
+ pathlen += scan->nc_nlen + 1;
}
if (pathlen <= sizeof(buf))
base = buf;
else
- base = malloc(pathlen, M_TEMP, M_INTWAIT);
+ base = kmalloc(pathlen, M_TEMP, M_INTWAIT);
/*
* Pass 2 - generate the path buffer
*/
index = pathlen;
- for (scan = ncp;
- scan && (scan->nc_flag & NCF_MOUNTPT) == 0;
- scan = scan->nc_parent
- ) {
+ for (scan = ncp; scan; scan = scan->nc_parent) {
+ if (scan->nc_nlen == 0)
+ continue;
if (scan->nc_nlen >= index) {
if (base != buf)
- free(base, M_TEMP);
+ kfree(base, M_TEMP);
goto again;
}
if (index == pathlen)
}
jrecord_leaf(jrec, rectype, base + index, pathlen - index);
if (base != buf)
- free(base, M_TEMP);
+ kfree(base, M_TEMP);
}
/*
* Write out a file attribute structure. While somewhat inefficient, using
* a recursive data structure is the most portable and extensible way.
*/
-static void
+void
jrecord_write_vattr(struct jrecord *jrec, struct vattr *vat)
{
void *save;
save = jrecord_push(jrec, JTYPE_VATTR);
if (vat->va_type != VNON)
- jrecord_leaf(jrec, JLEAF_UID, &vat->va_type, sizeof(vat->va_type));
- if (vat->va_uid != VNOVAL)
- jrecord_leaf(jrec, JLEAF_UID, &vat->va_mode, sizeof(vat->va_mode));
+ jrecord_leaf(jrec, JLEAF_VTYPE, &vat->va_type, sizeof(vat->va_type));
+ if (vat->va_mode != (mode_t)VNOVAL)
+ jrecord_leaf(jrec, JLEAF_MODES, &vat->va_mode, sizeof(vat->va_mode));
if (vat->va_nlink != VNOVAL)
jrecord_leaf(jrec, JLEAF_NLINK, &vat->va_nlink, sizeof(vat->va_nlink));
if (vat->va_uid != VNOVAL)
jrecord_leaf(jrec, JLEAF_GEN, &vat->va_gen, sizeof(vat->va_gen));
if (vat->va_flags != VNOVAL)
jrecord_leaf(jrec, JLEAF_FLAGS, &vat->va_flags, sizeof(vat->va_flags));
- if (vat->va_rdev != VNOVAL)
- jrecord_leaf(jrec, JLEAF_UDEV, &vat->va_rdev, sizeof(vat->va_rdev));
+ if (vat->va_rmajor != VNOVAL) {
+ udev_t rdev = makeudev(vat->va_rmajor, vat->va_rminor);
+ jrecord_leaf(jrec, JLEAF_UDEV, &rdev, sizeof(rdev));
+ jrecord_leaf(jrec, JLEAF_UMAJOR, &vat->va_rmajor, sizeof(vat->va_rmajor));
+ jrecord_leaf(jrec, JLEAF_UMINOR, &vat->va_rminor, sizeof(vat->va_rminor));
+ }
#if 0
if (vat->va_filerev != VNOVAL)
jrecord_leaf(jrec, JLEAF_FILEREV, &vat->va_filerev, sizeof(vat->va_filerev));
#endif
jrecord_pop(jrec, save);
- jrecord_done(jrec, 0);
}
/*
* XXX additional tracking info
* XXX tty line info
*/
-static void
+void
jrecord_write_cred(struct jrecord *jrec, struct thread *td, struct ucred *cred)
{
void *save;
jrecord_leaf(jrec, JLEAF_COMM, p->p_comm, sizeof(p->p_comm));
}
jrecord_pop(jrec, save);
- jrecord_done(jrec, 0);
}
/*
* Write out information required to identify a vnode
- */
-static void
-jrecord_write_vnode_ref(struct jrecord *jrec, struct vnode *vp)
-{
- /* XXX */
-}
-
-/*
- * Write out the data associated with a UIO
- */
-static void
-jrecord_write_uio(struct jrecord *jrec, int16_t rectype, struct uio *uio)
-{
- /* XXX */
-}
-
-/************************************************************************
- * JOURNAL VNOPS *
- ************************************************************************
- *
- * These are function shims replacing the normal filesystem ops. We become
- * responsible for calling the underlying filesystem ops. We have the choice
- * of executing the underlying op first and then generating the journal entry,
- * or starting the journal entry, executing the underlying op, and then
- * either completing or aborting it.
- *
- * The journal is supposed to be a high-level entity, which generally means
- * identifying files by name rather then by inode. Supplying both allows
- * the journal to be used both for inode-number-compatible 'mirrors' and
- * for simple filesystem replication.
- *
- * Writes are particularly difficult to deal with because a single write may
- * represent a hundred megabyte buffer or more, and both writes and truncations
- * require the 'old' data to be written out as well as the new data if the
- * log is reversable. Other issues:
*
- * - How to deal with operations on unlinked files (no path available),
- * but which may still be filesystem visible due to hard links.
- *
- * - How to deal with modifications made via a memory map.
- *
- * - Future cache coherency support will require cache coherency API calls
- * both prior to and after the call to the underlying VFS.
- *
- * ALSO NOTE: We do not have to shim compatibility VOPs like MKDIR which have
- * new VFS equivalents (NMKDIR).
- */
-
-/*
- * Journal vop_settattr { a_vp, a_vap, a_cred, a_td }
- */
-static
-int
-journal_setattr(struct vop_setattr_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_SETATTR);
- jrecord_write_cred(&jrec, ap->a_td, ap->a_cred);
- jrecord_write_vnode_ref(&jrec, ap->a_vp);
- jrecord_write_vattr(&jrec, ap->a_vap);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
- }
- return (error);
-}
-
-/*
- * Journal vop_write { a_vp, a_uio, a_ioflag, a_cred }
- */
-static
-int
-journal_write(struct vop_write_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_WRITE);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_vnode_ref(&jrec, ap->a_vp);
- jrecord_write_uio(&jrec, JLEAF_FILEDATA, ap->a_uio);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
- }
- return (error);
-}
-
-/*
- * Journal vop_fsync { a_vp, a_waitfor, a_td }
- */
-static
-int
-journal_fsync(struct vop_fsync_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- /* XXX synchronize pending journal records */
- }
- }
- return (error);
-}
-
-/*
- * Journal vop_putpages { a_vp, a_m, a_count, a_sync, a_rtvals, a_offset }
+ * XXX this needs work. We should write out the inode number as well,
+ * and in fact avoid writing out the file path for sequential writes
+ * occurring within e.g. a certain period of time.
*/
-static
-int
-journal_putpages(struct vop_putpages_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_PUTPAGES);
- jrecord_write_vnode_ref(&jrec, ap->a_vp);
- /* XXX pagelist */
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
- }
- return (error);
-}
-
-/*
- * Journal vop_setacl { a_vp, a_type, a_aclp, a_cred, a_td }
- */
-static
-int
-journal_setacl(struct vop_setacl_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_SETACL);
- jrecord_write_cred(&jrec, ap->a_td, ap->a_cred);
- jrecord_write_vnode_ref(&jrec, ap->a_vp);
- /* XXX type, aclp */
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
- }
- return (error);
-}
-
-/*
- * Journal vop_setextattr { a_vp, a_name, a_uio, a_cred, a_td }
- */
-static
-int
-journal_setextattr(struct vop_setextattr_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_SETEXTATTR);
- jrecord_write_cred(&jrec, ap->a_td, ap->a_cred);
- jrecord_write_vnode_ref(&jrec, ap->a_vp);
- jrecord_leaf(&jrec, JLEAF_ATTRNAME, ap->a_name, strlen(ap->a_name));
- jrecord_write_uio(&jrec, JLEAF_FILEDATA, ap->a_uio);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
- }
- return (error);
-}
-
-/*
- * Journal vop_ncreate { a_ncp, a_vpp, a_cred, a_vap }
- */
-static
-int
-journal_ncreate(struct vop_ncreate_args *ap)
+void
+jrecord_write_vnode_ref(struct jrecord *jrec, struct vnode *vp)
{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
+ struct nchandle nch;
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_CREATE);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
- if (*ap->a_vpp)
- jrecord_write_vnode_ref(&jrec, *ap->a_vpp);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
+ nch.mount = vp->v_mount;
+ spin_lock(&vp->v_spin);
+ TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) {
+ if ((nch.ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0)
+ break;
}
- return (error);
-}
-
-/*
- * Journal vop_nmknod { a_ncp, a_vpp, a_cred, a_vap }
- */
-static
-int
-journal_nmknod(struct vop_nmknod_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_MKNOD);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
- jrecord_write_vattr(&jrec, ap->a_vap);
- if (*ap->a_vpp)
- jrecord_write_vnode_ref(&jrec, *ap->a_vpp);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
+ if (nch.ncp) {
+ cache_hold(&nch);
+ spin_unlock(&vp->v_spin);
+ jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp);
+ cache_drop(&nch);
+ } else {
+ spin_unlock(&vp->v_spin);
}
- return (error);
}
-/*
- * Journal vop_nlink { a_ncp, a_vp, a_cred }
- */
-static
-int
-journal_nlink(struct vop_nlink_args *ap)
+void
+jrecord_write_vnode_link(struct jrecord *jrec, struct vnode *vp,
+ struct namecache *notncp)
{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
+ struct nchandle nch;
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_LINK);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
- jrecord_write_vnode_ref(&jrec, ap->a_vp);
- /* XXX PATH to VP and inode number */
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
+ nch.mount = vp->v_mount;
+ spin_lock(&vp->v_spin);
+ TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) {
+ if (nch.ncp == notncp)
+ continue;
+ if ((nch.ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0)
+ break;
}
- return (error);
-}
-
-/*
- * Journal vop_symlink { a_ncp, a_vpp, a_cred, a_vap, a_target }
- */
-static
-int
-journal_nsymlink(struct vop_nsymlink_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_SYMLINK);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
- jrecord_leaf(&jrec, JLEAF_SYMLINKDATA,
- ap->a_target, strlen(ap->a_target));
- if (*ap->a_vpp)
- jrecord_write_vnode_ref(&jrec, *ap->a_vpp);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
+ if (nch.ncp) {
+ cache_hold(&nch);
+ spin_unlock(&vp->v_spin);
+ jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp);
+ cache_drop(&nch);
+ } else {
+ spin_unlock(&vp->v_spin);
}
- return (error);
}
/*
- * Journal vop_nwhiteout { a_ncp, a_cred, a_flags }
+ * Write out the data represented by a pagelist
*/
-static
-int
-journal_nwhiteout(struct vop_nwhiteout_args *ap)
+void
+jrecord_write_pagelist(struct jrecord *jrec, int16_t rectype,
+ struct vm_page **pglist, int *rtvals, int pgcount,
+ off_t offset)
{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
+ struct xio xio;
int error;
+ int b;
+ int i;
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_WHITEOUT);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
+ i = 0;
+ xio_init(&xio);
+ while (i < pgcount) {
+ /*
+ * Find the next valid section. Skip any invalid elements
+ */
+ if (rtvals[i] != VM_PAGER_OK) {
+ ++i;
+ offset += PAGE_SIZE;
+ continue;
}
- }
- return (error);
-}
-/*
- * Journal vop_nremove { a_ncp, a_cred }
- */
-static
-int
-journal_nremove(struct vop_nremove_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_REMOVE);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
+ /*
+ * Figure out how big the valid section is, capping I/O at what the
+ * MSFBUF can represent.
+ */
+ b = i;
+ while (i < pgcount && i - b != XIO_INTERNAL_PAGES &&
+ rtvals[i] == VM_PAGER_OK
+ ) {
+ ++i;
}
- }
- return (error);
-}
-/*
- * Journal vop_nmkdir { a_ncp, a_vpp, a_cred, a_vap }
- */
-static
-int
-journal_nmkdir(struct vop_nmkdir_args *ap)
-{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- if (jo->flags & MC_JOURNAL_WANT_REVERSABLE) {
- save = jrecord_push(&jrec, JTYPE_UNDO);
- /* XXX undo operations */
- jrecord_pop(&jrec, save);
- }
-#if 0
- if (jo->flags & MC_JOURNAL_WANT_AUDIT) {
- jrecord_write_audit(&jrec);
+ /*
+ * And write it out.
+ */
+ if (i - b) {
+ error = xio_init_pages(&xio, pglist + b, i - b, XIOF_READ);
+ if (error == 0) {
+ jrecord_leaf(jrec, JLEAF_SEEKPOS, &offset, sizeof(offset));
+ jrecord_leaf_xio(jrec, rectype, &xio);
+ } else {
+ kprintf("jrecord_write_pagelist: xio init failure\n");
}
-#endif
- save = jrecord_push(&jrec, JTYPE_MKDIR);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_vattr(&jrec, ap->a_vap);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
- if (*ap->a_vpp)
- jrecord_write_vnode_ref(&jrec, *ap->a_vpp);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
+ xio_release(&xio);
+ offset += (off_t)(i - b) << PAGE_SHIFT;
}
}
- return (error);
}
/*
- * Journal vop_nrmdir { a_ncp, a_cred }
+ * Write out the data represented by a UIO.
*/
-static
-int
-journal_nrmdir(struct vop_nrmdir_args *ap)
+void
+jrecord_write_uio(struct jrecord *jrec, int16_t rectype, struct uio *uio)
{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_RMDIR);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_ncp);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
- }
+ if (uio->uio_segflg != UIO_NOCOPY) {
+ jrecord_leaf(jrec, JLEAF_SEEKPOS, &uio->uio_offset,
+ sizeof(uio->uio_offset));
+ jrecord_leaf_uio(jrec, rectype, uio);
}
- return (error);
}
-/*
- * Journal vop_nrename { a_fncp, a_tncp, a_cred }
- */
-static
-int
-journal_nrename(struct vop_nrename_args *ap)
+void
+jrecord_file_data(struct jrecord *jrec, struct vnode *vp,
+ off_t off, off_t bytes)
{
- struct mount *mp;
- struct journal *jo;
- struct jrecord jrec;
- void *save; /* warning, save pointers do not always remain valid */
+ const int bufsize = 8192;
+ char *buf;
int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- jrecord_init(jo, &jrec, -1);
- save = jrecord_push(&jrec, JTYPE_RENAME);
- jrecord_write_cred(&jrec, NULL, ap->a_cred);
- jrecord_write_path(&jrec, JLEAF_PATH1, ap->a_fncp);
- jrecord_write_path(&jrec, JLEAF_PATH2, ap->a_tncp);
- jrecord_pop(&jrec, save);
- jrecord_done(&jrec, 0);
+ int n;
+
+ buf = kmalloc(bufsize, M_JOURNAL, M_WAITOK);
+ jrecord_leaf(jrec, JLEAF_SEEKPOS, &off, sizeof(off));
+ while (bytes) {
+ n = (bytes > bufsize) ? bufsize : (int)bytes;
+ error = vn_rdwr(UIO_READ, vp, buf, n, off, UIO_SYSSPACE, IO_NODELOCKED,
+ proc0.p_ucred, NULL);
+ if (error) {
+ jrecord_leaf(jrec, JLEAF_ERROR, &error, sizeof(error));
+ break;
}
+ jrecord_leaf(jrec, JLEAF_FILEDATA, buf, n);
+ bytes -= n;
+ off += n;
}
- return (error);
+ kfree(buf, M_JOURNAL);
}