/*
- * Copyright (c) 2004 The DragonFly Project. All rights reserved.
+ * Copyright (c) 2004-2006 The DragonFly Project. All rights reserved.
*
* This code is derived from software contributed to The DragonFly Project
* by Matthew Dillon <dillon@backplane.com>
* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * $DragonFly: src/sys/kern/vfs_journal.c,v 1.23 2005/09/17 07:43:00 dillon Exp $
+ * $DragonFly: src/sys/kern/vfs_journal.c,v 1.33 2007/05/09 00:53:34 dillon Exp $
*/
/*
- * Each mount point may have zero or more independantly configured journals
- * attached to it. Each journal is represented by a memory FIFO and worker
- * thread. Journal events are streamed through the FIFO to the thread,
- * batched up (typically on one-second intervals), and written out by the
- * thread.
- *
- * Journal vnode ops are executed instead of mnt_vn_norm_ops when one or
- * more journals have been installed on a mount point. It becomes the
- * responsibility of the journal op to call the underlying normal op as
- * appropriate.
- *
* The journaling protocol is intended to evolve into a two-way stream
* whereby transaction IDs can be acknowledged by the journaling target
* when the data has been committed to hard storage. Both implicit and
#include <sys/journal.h>
#include <sys/file.h>
#include <sys/proc.h>
-#include <sys/msfbuf.h>
+#include <sys/xio.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/file2.h>
#include <sys/thread2.h>
+#include <sys/mplock2.h>
+#include <sys/spinlock2.h>
-static int journal_attach(struct mount *mp);
-static void journal_detach(struct mount *mp);
-static int journal_install_vfs_journal(struct mount *mp, struct file *fp,
- const struct mountctl_install_journal *info);
-static int journal_restart_vfs_journal(struct mount *mp, struct file *fp,
- const struct mountctl_restart_journal *info);
-static int journal_remove_vfs_journal(struct mount *mp,
- const struct mountctl_remove_journal *info);
-static int journal_restart(struct mount *mp, struct file *fp,
- struct journal *jo, int flags);
-static int journal_destroy(struct mount *mp, struct journal *jo, int flags);
-static int journal_resync_vfs_journal(struct mount *mp, const void *ctl);
-static int journal_status_vfs_journal(struct mount *mp,
- const struct mountctl_status_journal *info,
- struct mountctl_journal_ret_status *rstat,
- int buflen, int *res);
-static void journal_create_threads(struct journal *jo);
-static void journal_destroy_threads(struct journal *jo, int flags);
static void journal_wthread(void *info);
static void journal_rthread(void *info);
-static void *journal_reserve(struct journal *jo,
- struct journal_rawrecbeg **rawpp,
- int16_t streamid, int bytes);
+static void *journal_reserve(struct journal *jo,
+ struct journal_rawrecbeg **rawpp,
+ int16_t streamid, int bytes);
static void *journal_extend(struct journal *jo,
- struct journal_rawrecbeg **rawpp,
- int truncbytes, int bytes, int *newstreamrecp);
-static void journal_abort(struct journal *jo,
- struct journal_rawrecbeg **rawpp);
-static void journal_commit(struct journal *jo,
- struct journal_rawrecbeg **rawpp,
- int bytes, int closeout);
-
-static void jrecord_init(struct journal *jo,
- struct jrecord *jrec, int16_t streamid);
-static struct journal_subrecord *jrecord_push(
- struct jrecord *jrec, int16_t rectype);
-static void jrecord_pop(struct jrecord *jrec, struct journal_subrecord *parent);
-static struct journal_subrecord *jrecord_write(struct jrecord *jrec,
- int16_t rectype, int bytes);
-static void jrecord_data(struct jrecord *jrec, const void *buf, int bytes);
-static void jrecord_done(struct jrecord *jrec, int abortit);
-static void jrecord_undo_file(struct jrecord *jrec, struct vnode *vp,
- int jrflags, off_t off, off_t bytes);
-
-static int journal_setattr(struct vop_setattr_args *ap);
-static int journal_write(struct vop_write_args *ap);
-static int journal_fsync(struct vop_fsync_args *ap);
-static int journal_putpages(struct vop_putpages_args *ap);
-static int journal_setacl(struct vop_setacl_args *ap);
-static int journal_setextattr(struct vop_setextattr_args *ap);
-static int journal_ncreate(struct vop_ncreate_args *ap);
-static int journal_nmknod(struct vop_nmknod_args *ap);
-static int journal_nlink(struct vop_nlink_args *ap);
-static int journal_nsymlink(struct vop_nsymlink_args *ap);
-static int journal_nwhiteout(struct vop_nwhiteout_args *ap);
-static int journal_nremove(struct vop_nremove_args *ap);
-static int journal_nmkdir(struct vop_nmkdir_args *ap);
-static int journal_nrmdir(struct vop_nrmdir_args *ap);
-static int journal_nrename(struct vop_nrename_args *ap);
-
-#define JRUNDO_SIZE 0x00000001
-#define JRUNDO_UID 0x00000002
-#define JRUNDO_GID 0x00000004
-#define JRUNDO_FSID 0x00000008
-#define JRUNDO_MODES 0x00000010
-#define JRUNDO_INUM 0x00000020
-#define JRUNDO_ATIME 0x00000040
-#define JRUNDO_MTIME 0x00000080
-#define JRUNDO_CTIME 0x00000100
-#define JRUNDO_GEN 0x00000200
-#define JRUNDO_FLAGS 0x00000400
-#define JRUNDO_UDEV 0x00000800
-#define JRUNDO_NLINK 0x00001000
-#define JRUNDO_FILEDATA 0x00010000
-#define JRUNDO_GETVP 0x00020000
-#define JRUNDO_CONDLINK 0x00040000 /* write file data if link count 1 */
-#define JRUNDO_VATTR (JRUNDO_SIZE|JRUNDO_UID|JRUNDO_GID|JRUNDO_FSID|\
- JRUNDO_MODES|JRUNDO_INUM|JRUNDO_ATIME|JRUNDO_MTIME|\
- JRUNDO_CTIME|JRUNDO_GEN|JRUNDO_FLAGS|JRUNDO_UDEV|\
- JRUNDO_NLINK)
-#define JRUNDO_ALL (JRUNDO_VATTR|JRUNDO_FILEDATA)
-
-static struct vnodeopv_entry_desc journal_vnodeop_entries[] = {
- { &vop_default_desc, vop_journal_operate_ap },
- { &vop_mountctl_desc, (void *)journal_mountctl },
- { &vop_setattr_desc, (void *)journal_setattr },
- { &vop_write_desc, (void *)journal_write },
- { &vop_fsync_desc, (void *)journal_fsync },
- { &vop_putpages_desc, (void *)journal_putpages },
- { &vop_setacl_desc, (void *)journal_setacl },
- { &vop_setextattr_desc, (void *)journal_setextattr },
- { &vop_ncreate_desc, (void *)journal_ncreate },
- { &vop_nmknod_desc, (void *)journal_nmknod },
- { &vop_nlink_desc, (void *)journal_nlink },
- { &vop_nsymlink_desc, (void *)journal_nsymlink },
- { &vop_nwhiteout_desc, (void *)journal_nwhiteout },
- { &vop_nremove_desc, (void *)journal_nremove },
- { &vop_nmkdir_desc, (void *)journal_nmkdir },
- { &vop_nrmdir_desc, (void *)journal_nrmdir },
- { &vop_nrename_desc, (void *)journal_nrename },
- { NULL, NULL }
-};
-
-static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures");
-static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO");
-
-int
-journal_mountctl(struct vop_mountctl_args *ap)
-{
- struct mount *mp;
- int error = 0;
-
- mp = ap->a_head.a_ops->vv_mount;
- KKASSERT(mp);
-
- if (mp->mnt_vn_journal_ops == NULL) {
- switch(ap->a_op) {
- case MOUNTCTL_INSTALL_VFS_JOURNAL:
- error = journal_attach(mp);
- if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal))
- error = EINVAL;
- if (error == 0 && ap->a_fp == NULL)
- error = EBADF;
- if (error == 0)
- error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
- if (TAILQ_EMPTY(&mp->mnt_jlist))
- journal_detach(mp);
- break;
- case MOUNTCTL_RESTART_VFS_JOURNAL:
- case MOUNTCTL_REMOVE_VFS_JOURNAL:
- case MOUNTCTL_RESYNC_VFS_JOURNAL:
- case MOUNTCTL_STATUS_VFS_JOURNAL:
- error = ENOENT;
- break;
- default:
- error = EOPNOTSUPP;
- break;
- }
- } else {
- switch(ap->a_op) {
- case MOUNTCTL_INSTALL_VFS_JOURNAL:
- if (ap->a_ctllen != sizeof(struct mountctl_install_journal))
- error = EINVAL;
- if (error == 0 && ap->a_fp == NULL)
- error = EBADF;
- if (error == 0)
- error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
- break;
- case MOUNTCTL_RESTART_VFS_JOURNAL:
- if (ap->a_ctllen != sizeof(struct mountctl_restart_journal))
- error = EINVAL;
- if (error == 0 && ap->a_fp == NULL)
- error = EBADF;
- if (error == 0)
- error = journal_restart_vfs_journal(mp, ap->a_fp, ap->a_ctl);
- break;
- case MOUNTCTL_REMOVE_VFS_JOURNAL:
- if (ap->a_ctllen != sizeof(struct mountctl_remove_journal))
- error = EINVAL;
- if (error == 0)
- error = journal_remove_vfs_journal(mp, ap->a_ctl);
- if (TAILQ_EMPTY(&mp->mnt_jlist))
- journal_detach(mp);
- break;
- case MOUNTCTL_RESYNC_VFS_JOURNAL:
- if (ap->a_ctllen != 0)
- error = EINVAL;
- error = journal_resync_vfs_journal(mp, ap->a_ctl);
- break;
- case MOUNTCTL_STATUS_VFS_JOURNAL:
- if (ap->a_ctllen != sizeof(struct mountctl_status_journal))
- error = EINVAL;
- if (error == 0) {
- error = journal_status_vfs_journal(mp, ap->a_ctl,
- ap->a_buf, ap->a_buflen, ap->a_res);
- }
- break;
- default:
- error = EOPNOTSUPP;
- break;
- }
- }
- return (error);
-}
+ struct journal_rawrecbeg **rawpp,
+ int truncbytes, int bytes, int *newstreamrecp);
+static void journal_abort(struct journal *jo,
+ struct journal_rawrecbeg **rawpp);
+static void journal_commit(struct journal *jo,
+ struct journal_rawrecbeg **rawpp,
+ int bytes, int closeout);
+static void jrecord_data(struct jrecord *jrec,
+ void *buf, int bytes, int dtype);
-/*
- * High level mount point setup. When a
- */
-static int
-journal_attach(struct mount *mp)
-{
- vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops,
- journal_vnodeop_entries, 0);
- return(0);
-}
-static void
-journal_detach(struct mount *mp)
-{
- if (mp->mnt_vn_journal_ops)
- vfs_rm_vnodeops(&mp->mnt_vn_journal_ops);
-}
-
-/*
- * Install a journal on a mount point. Each journal has an associated worker
- * thread which is responsible for buffering and spooling the data to the
- * target. A mount point may have multiple journals attached to it. An
- * initial start record is generated when the journal is associated.
- */
-static int
-journal_install_vfs_journal(struct mount *mp, struct file *fp,
- const struct mountctl_install_journal *info)
-{
- struct journal *jo;
- struct jrecord jrec;
- int error = 0;
- int size;
+MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structures");
+MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO");
- jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK|M_ZERO);
- bcopy(info->id, jo->id, sizeof(jo->id));
- jo->flags = info->flags & ~(MC_JOURNAL_WACTIVE | MC_JOURNAL_RACTIVE |
- MC_JOURNAL_STOP_REQ);
-
- /*
- * Memory FIFO size, round to nearest power of 2
- */
- if (info->membufsize) {
- if (info->membufsize < 65536)
- size = 65536;
- else if (info->membufsize > 128 * 1024 * 1024)
- size = 128 * 1024 * 1024;
- else
- size = (int)info->membufsize;
- } else {
- size = 1024 * 1024;
- }
- jo->fifo.size = 1;
- while (jo->fifo.size < size)
- jo->fifo.size <<= 1;
-
- /*
- * Other parameters. If not specified the starting transaction id
- * will be the current date.
- */
- if (info->transid) {
- jo->transid = info->transid;
- } else {
- struct timespec ts;
- getnanotime(&ts);
- jo->transid = ((int64_t)ts.tv_sec << 30) | ts.tv_nsec;
- }
-
- jo->fp = fp;
-
- /*
- * Allocate the memory FIFO
- */
- jo->fifo.mask = jo->fifo.size - 1;
- jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK|M_ZERO|M_NULLOK);
- if (jo->fifo.membase == NULL)
- error = ENOMEM;
-
- /*
- * Create the worker threads and generate the association record.
- */
- if (error) {
- free(jo, M_JOURNAL);
- } else {
- fhold(fp);
- journal_create_threads(jo);
- jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT);
- jrecord_write(&jrec, JTYPE_ASSOCIATE, 0);
- jrecord_done(&jrec, 0);
- TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry);
- }
- return(error);
-}
-
-/*
- * Restart a journal with a new descriptor. The existing reader and writer
- * threads are terminated and a new descriptor is associated with the
- * journal. The FIFO rindex is reset to xindex and the threads are then
- * restarted.
- */
-static int
-journal_restart_vfs_journal(struct mount *mp, struct file *fp,
- const struct mountctl_restart_journal *info)
-{
- struct journal *jo;
- int error;
-
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0)
- break;
- }
- if (jo)
- error = journal_restart(mp, fp, jo, info->flags);
- else
- error = EINVAL;
- return (error);
-}
-
-static int
-journal_restart(struct mount *mp, struct file *fp,
- struct journal *jo, int flags)
-{
- /*
- * XXX lock the jo
- */
-
-#if 0
- /*
- * Record the fact that we are doing a restart in the journal.
- * XXX it isn't safe to do this if the journal is being restarted
- * because it was locked up and the writer thread has already exited.
- */
- jrecord_init(jo, &jrec, JREC_STREAMID_RESTART);
- jrecord_write(&jrec, JTYPE_DISASSOCIATE, 0);
- jrecord_done(&jrec, 0);
-#endif
-
- /*
- * Stop the reader and writer threads and clean up the current
- * descriptor.
- */
- printf("RESTART WITH FP %p KILLING %p\n", fp, jo->fp);
- journal_destroy_threads(jo, flags);
-
- if (jo->fp)
- fdrop(jo->fp, curthread);
-
- /*
- * Associate the new descriptor, reset the FIFO index, and recreate
- * the threads.
- */
- fhold(fp);
- jo->fp = fp;
- jo->fifo.rindex = jo->fifo.xindex;
- journal_create_threads(jo);
-
- return(0);
-}
-
-/*
- * Disassociate a journal from a mount point and terminate its worker thread.
- * A final termination record is written out before the file pointer is
- * dropped.
- */
-static int
-journal_remove_vfs_journal(struct mount *mp,
- const struct mountctl_remove_journal *info)
-{
- struct journal *jo;
- int error;
-
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0)
- break;
- }
- if (jo)
- error = journal_destroy(mp, jo, info->flags);
- else
- error = EINVAL;
- return (error);
-}
-
-/*
- * Remove all journals associated with a mount point. Usually called
- * by the umount code.
- */
void
-journal_remove_all_journals(struct mount *mp, int flags)
-{
- struct journal *jo;
-
- while ((jo = TAILQ_FIRST(&mp->mnt_jlist)) != NULL) {
- journal_destroy(mp, jo, flags);
- }
-}
-
-static int
-journal_destroy(struct mount *mp, struct journal *jo, int flags)
-{
- struct jrecord jrec;
-
- TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry);
-
- jrecord_init(jo, &jrec, JREC_STREAMID_DISCONT);
- jrecord_write(&jrec, JTYPE_DISASSOCIATE, 0);
- jrecord_done(&jrec, 0);
-
- journal_destroy_threads(jo, flags);
-
- if (jo->fp)
- fdrop(jo->fp, curthread);
- if (jo->fifo.membase)
- free(jo->fifo.membase, M_JFIFO);
- free(jo, M_JOURNAL);
- return(0);
-}
-
-static int
-journal_resync_vfs_journal(struct mount *mp, const void *ctl)
-{
- return(EINVAL);
-}
-
-static int
-journal_status_vfs_journal(struct mount *mp,
- const struct mountctl_status_journal *info,
- struct mountctl_journal_ret_status *rstat,
- int buflen, int *res)
-{
- struct journal *jo;
- int error = 0;
- int index;
-
- index = 0;
- *res = 0;
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- if (info->index == MC_JOURNAL_INDEX_ID) {
- if (bcmp(jo->id, info->id, sizeof(jo->id)) != 0)
- continue;
- } else if (info->index >= 0) {
- if (info->index < index)
- continue;
- } else if (info->index != MC_JOURNAL_INDEX_ALL) {
- continue;
- }
- if (buflen < sizeof(*rstat)) {
- if (*res)
- rstat[-1].flags |= MC_JOURNAL_STATUS_MORETOCOME;
- else
- error = EINVAL;
- break;
- }
- bzero(rstat, sizeof(*rstat));
- rstat->recsize = sizeof(*rstat);
- bcopy(jo->id, rstat->id, sizeof(jo->id));
- rstat->index = index;
- rstat->membufsize = jo->fifo.size;
- rstat->membufused = jo->fifo.windex - jo->fifo.xindex;
- rstat->membufunacked = jo->fifo.rindex - jo->fifo.xindex;
- rstat->bytessent = jo->total_acked;
- rstat->fifostalls = jo->fifostalls;
- ++rstat;
- ++index;
- *res += sizeof(*rstat);
- buflen -= sizeof(*rstat);
- }
- return(error);
-}
-
-static void
journal_create_threads(struct journal *jo)
{
jo->flags &= ~(MC_JOURNAL_STOP_REQ | MC_JOURNAL_STOP_IMM);
jo->flags |= MC_JOURNAL_WACTIVE;
lwkt_create(journal_wthread, jo, NULL, &jo->wthread,
- TDF_STOPREQ, -1, "journal w:%.*s", JIDMAX, jo->id);
+ TDF_NOSTART, -1,
+ "journal w:%.*s", JIDMAX, jo->id); /* writer thread; it is scheduled explicitly by lwkt_schedule() below */
lwkt_setpri(&jo->wthread, TDPRI_KERN_DAEMON);
lwkt_schedule(&jo->wthread);
if (jo->flags & MC_JOURNAL_WANT_FULLDUPLEX) {
jo->flags |= MC_JOURNAL_RACTIVE;
lwkt_create(journal_rthread, jo, NULL, &jo->rthread,
- TDF_STOPREQ, -1, "journal r:%.*s", JIDMAX, jo->id);
+ TDF_NOSTART, -1,
+ "journal r:%.*s", JIDMAX, jo->id); /* reader thread; only created when MC_JOURNAL_WANT_FULLDUPLEX is set */
lwkt_setpri(&jo->rthread, TDPRI_KERN_DAEMON);
lwkt_schedule(&jo->rthread);
}
}
-static void
+void
journal_destroy_threads(struct journal *jo, int flags)
{
int wcount;
while (jo->flags & (MC_JOURNAL_WACTIVE | MC_JOURNAL_RACTIVE)) {
tsleep(jo, 0, "jwait", hz);
if (++wcount % 10 == 0) {
- printf("Warning: journal %s waiting for descriptors to close\n",
+ kprintf("Warning: journal %s waiting for descriptors to close\n",
jo->id);
}
}
{
struct journal *jo = info;
struct journal_rawrecbeg *rawp;
- int bytes;
int error;
- int avail;
- int res;
+ size_t avail;
+ size_t bytes;
+ size_t res;
+
+ /* not MPSAFE yet */
+ get_mplock();
for (;;) {
/*
bytes = res;
jo->fifo.rindex += bytes;
error = fp_write(jo->fp,
- jo->fifo.membase + ((jo->fifo.rindex - bytes) & jo->fifo.mask),
- bytes, &res);
+ jo->fifo.membase +
+ ((jo->fifo.rindex - bytes) & jo->fifo.mask),
+ bytes, &res, UIO_SYSSPACE);
if (error) {
- printf("journal_thread(%s) write, error %d\n", jo->id, error);
+ kprintf("journal_thread(%s) write, error %d\n", jo->id, error);
/* XXX */
} else {
KKASSERT(res == bytes);
jo->flags &= ~MC_JOURNAL_WACTIVE;
wakeup(jo);
wakeup(&jo->fifo.windex);
+ rel_mplock();
}
/*
struct journal *jo = info;
int64_t transid;
int error;
- int count;
- int bytes;
+ size_t count;
+ size_t bytes;
transid = 0;
error = 0;
+ /* not MPSAFE yet */
+ get_mplock();
+
for (;;) {
/*
* We have been asked to stop
* stream.
*/
if (transid == 0) {
- error = fp_read(jo->fp, &ack, sizeof(ack), &count, 1);
+ error = fp_read(jo->fp, &ack, sizeof(ack), &count,
+ 1, UIO_SYSSPACE);
#if 0
- printf("fp_read ack error %d count %d\n", error, count);
+ kprintf("fp_read ack error %d count %d\n", error, count);
#endif
if (error || count != sizeof(ack))
break;
if (error) {
- printf("read error %d on receive stream\n", error);
+ kprintf("read error %d on receive stream\n", error);
break;
}
if (ack.rbeg.begmagic != JREC_BEGMAGIC ||
ack.rend.endmagic != JREC_ENDMAGIC
) {
- printf("bad begmagic or endmagic on receive stream\n");
+ kprintf("bad begmagic or endmagic on receive stream\n");
break;
}
transid = ack.rbeg.transid;
bytes = jo->fifo.rindex - jo->fifo.xindex;
if (bytes == 0) {
- printf("warning: unsent data acknowledged transid %08llx\n", transid);
+ kprintf("warning: unsent data acknowledged transid %08llx\n",
+ (long long)transid);
tsleep(&jo->fifo.xindex, 0, "jrseq", hz);
transid = 0;
continue;
*/
if (rawp->transid < transid) {
#if 1
- printf("ackskip %08llx/%08llx\n", rawp->transid, transid);
+ kprintf("ackskip %08llx/%08llx\n",
+ (long long)rawp->transid,
+ (long long)transid);
#endif
jo->fifo.xindex += (rawp->recsize + 15) & ~15;
jo->total_acked += (rawp->recsize + 15) & ~15;
}
if (rawp->transid == transid) {
#if 1
- printf("ackskip %08llx/%08llx\n", rawp->transid, transid);
+ kprintf("ackskip %08llx/%08llx\n",
+ (long long)rawp->transid,
+ (long long)transid);
#endif
jo->fifo.xindex += (rawp->recsize + 15) & ~15;
jo->total_acked += (rawp->recsize + 15) & ~15;
transid = 0;
continue;
}
- printf("warning: unsent data(2) acknowledged transid %08llx\n", transid);
+ kprintf("warning: unsent data(2) acknowledged transid %08llx\n",
+ (long long)transid);
transid = 0;
}
jo->flags &= ~MC_JOURNAL_RACTIVE;
wakeup(jo);
wakeup(&jo->fifo.windex);
+ rel_mplock();
}
/*
}
/************************************************************************
- * PARALLEL TRANSACTION SUPPORT ROUTINES *
- ************************************************************************
- *
- * JRECLIST_*() - routines which create and iterate over jrecord structures,
- * because a mount point may have multiple attached journals.
- */
-
-/*
- * Initialize the passed jrecord_list and create a jrecord for each
- * journal we need to write to. Unnecessary mallocs are avoided by
- * using the passed jrecord structure as the first jrecord in the list.
- * A starting transaction is pushed for each jrecord.
- *
- * Returns non-zero if any of the journals require undo records.
- */
-static
-int
-jreclist_init(struct mount *mp, struct jrecord_list *jreclist,
- struct jrecord *jreccache, int16_t rectype)
-{
- struct journal *jo;
- struct jrecord *jrec;
- int wantrev = 0;
- int count = 0;
-
- TAILQ_INIT(jreclist);
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- if (count == 0)
- jrec = jreccache;
- else
- jrec = malloc(sizeof(*jrec), M_JOURNAL, M_WAITOK);
- jrecord_init(jo, jrec, -1);
- jrec->user_save = jrecord_push(jrec, rectype);
- TAILQ_INSERT_TAIL(jreclist, jrec, user_entry);
- if (jo->flags & MC_JOURNAL_WANT_REVERSABLE)
- wantrev = 1;
- ++count;
- }
- return(wantrev);
-}
-
-/*
- * Terminate the journaled transactions started by jreclist_init(). If
- * an error occured, the transaction records will be aborted.
- */
-static
-void
-jreclist_done(struct jrecord_list *jreclist, int error)
-{
- struct jrecord *jrec;
- int count;
-
- TAILQ_FOREACH(jrec, jreclist, user_entry) {
- jrecord_pop(jrec, jrec->user_save);
- jrecord_done(jrec, error);
- }
- count = 0;
- while ((jrec = TAILQ_FIRST(jreclist)) != NULL) {
- TAILQ_REMOVE(jreclist, jrec, user_entry);
- if (count)
- free(jrec, M_JOURNAL);
- ++count;
- }
-}
-
-/*
- * This procedure writes out UNDO records for available reversable
- * journals.
- *
- * XXX could use improvement. There is no need to re-read the file
- * for each journal.
- */
-static
-void
-jreclist_undo_file(struct jrecord_list *jreclist, struct vnode *vp,
- int jrflags, off_t off, off_t bytes)
-{
- struct jrecord *jrec;
- int error;
-
- error = 0;
- if (jrflags & JRUNDO_GETVP)
- error = vget(vp, LK_SHARED, curthread);
- if (error == 0) {
- TAILQ_FOREACH(jrec, jreclist, user_entry) {
- if (jrec->jo->flags & MC_JOURNAL_WANT_REVERSABLE) {
- jrecord_undo_file(jrec, vp, jrflags, off, bytes);
- }
- }
- }
- if (error == 0 && jrflags & JRUNDO_GETVP)
- vput(vp);
-}
-
-/************************************************************************
* TRANSACTION SUPPORT ROUTINES *
************************************************************************
*
* in the logical streams managed by the journal_*() routines.
*/
-static int16_t sid = JREC_STREAMID_JMIN;
-
/*
* Initialize the passed jrecord structure and start a new stream transaction
* by reserving an initial build space in the journal's memory FIFO.
*/
-static void
+void
jrecord_init(struct journal *jo, struct jrecord *jrec, int16_t streamid)
{
bzero(jrec, sizeof(*jrec));
jrec->jo = jo;
- if (streamid < 0) {
- streamid = sid++; /* XXX need to track stream ids! */
- if (sid == JREC_STREAMID_JMAX)
- sid = JREC_STREAMID_JMIN;
- }
jrec->streamid = streamid;
jrec->stream_residual = JREC_DEFAULTSIZE;
jrec->stream_reserved = jrec->stream_residual;
* record, so the caller should not mess with the returned pointer in
- * any way other then to save it.
+ * any way other than to save it.
*/
-static
struct journal_subrecord *
jrecord_push(struct jrecord *jrec, int16_t rectype)
{
* and if not valid may or may not be NULL, depending on the value
* of pushptrgood.
*/
-static void
+void
jrecord_pop(struct jrecord *jrec, struct journal_subrecord *save)
{
struct journal_subrecord *last;
/*
* Write out a leaf record, including associated data.
*/
-static
void
jrecord_leaf(struct jrecord *jrec, int16_t rectype, void *ptr, int bytes)
{
jrecord_write(jrec, rectype, bytes);
- jrecord_data(jrec, ptr, bytes);
+ jrecord_data(jrec, ptr, bytes, JDATA_KERN); /* JDATA_KERN: jrecord_data() copies the payload from ptr with bcopy() */
+}
+
+/*
+ * Write out one leaf record of the given type for every non-empty iovec
+ * in the passed uio.  A UIO_SYSSPACE uio is handed to jrecord_data() as
+ * JDATA_KERN; any other segment type is handed over as JDATA_USER.
+ */
+void
+jrecord_leaf_uio(struct jrecord *jrec, int16_t rectype,
+                 struct uio *uio)
+{
+    int dtype;
+    int n;
+
+    /* The segment type applies to the whole uio, so pick the copy mode once. */
+    dtype = (uio->uio_segflg == UIO_SYSSPACE) ? JDATA_KERN : JDATA_USER;
+
+    for (n = 0; n < uio->uio_iovcnt; ++n) {
+        struct iovec *vec = &uio->uio_iov[n];
+
+        if (vec->iov_len != 0) {
+            jrecord_write(jrec, rectype, vec->iov_len);
+            jrecord_data(jrec, vec->iov_base, vec->iov_len, dtype);
+        }
+    }
+}
+
+/*
+ * Write out a leaf record whose payload is taken from an XIO.  The record
+ * length is the XIO's page count multiplied by PAGE_SIZE, and the XIO
+ * itself is passed to jrecord_data() as the JDATA_XIO source.
+ */
+void
+jrecord_leaf_xio(struct jrecord *jrec, int16_t rectype, xio_t xio)
+{
+    int nbytes;
+
+    nbytes = xio->xio_npages * PAGE_SIZE;
+    jrecord_write(jrec, rectype, nbytes);
+    jrecord_data(jrec, xio, nbytes, JDATA_XIO);
}
/*
* CALL AND MAY BECOME INVALID AT ANY TIME. ONLY THE PUSH/POP CODE SHOULD
* USE THE RETURN VALUE.
*/
-static
struct journal_subrecord *
jrecord_write(struct jrecord *jrec, int16_t rectype, int bytes)
{
* subrecord header may become inaccessible due to stream record pushouts.
*/
static void
-jrecord_data(struct jrecord *jrec, const void *buf, int bytes)
+jrecord_data(struct jrecord *jrec, void *buf, int bytes, int dtype)
{
int pusheditout;
int extsize;
+ int xio_offset = 0;
KKASSERT(bytes >= 0 && bytes <= jrec->residual);
/*
* Fill in any remaining space in the current stream record.
*/
- bcopy(buf, jrec->stream_ptr, jrec->stream_residual);
- buf = (const char *)buf + jrec->stream_residual;
+ switch (dtype) {
+ case JDATA_KERN:
+ bcopy(buf, jrec->stream_ptr, jrec->stream_residual);
+ break;
+ case JDATA_USER:
+ copyin(buf, jrec->stream_ptr, jrec->stream_residual);
+ break;
+ case JDATA_XIO:
+ xio_copy_xtok((xio_t)buf, xio_offset, jrec->stream_ptr,
+ jrec->stream_residual);
+ xio_offset += jrec->stream_residual;
+ break;
+ }
+ if (dtype != JDATA_XIO)
+ buf = (char *)buf + jrec->stream_residual;
bytes -= jrec->stream_residual;
/*jrec->stream_ptr += jrec->stream_residual;*/
jrec->residual -= jrec->stream_residual;
* Push out any remaining bytes into the current stream record.
*/
if (bytes) {
- bcopy(buf, jrec->stream_ptr, bytes);
+ switch (dtype) {
+ case JDATA_KERN:
+ bcopy(buf, jrec->stream_ptr, bytes);
+ break;
+ case JDATA_USER:
+ copyin(buf, jrec->stream_ptr, bytes);
+ break;
+ case JDATA_XIO:
+ xio_copy_xtok((xio_t)buf, xio_offset, jrec->stream_ptr, bytes);
+ break;
+ }
jrec->stream_ptr += bytes;
jrec->stream_residual -= bytes;
jrec->residual -= bytes;
* The stream record will be committed or aborted as specified and jrecord
* resources will be cleaned up.
*/
-static void
+void
jrecord_done(struct jrecord *jrec, int abortit)
{
KKASSERT(jrec->rawp != NULL);
* Write out a filename path relative to the base of the mount point.
* rectype is typically JLEAF_PATH{1,2,3,4}.
*/
-static void
+void
jrecord_write_path(struct jrecord *jrec, int16_t rectype, struct namecache *ncp)
{
char buf[64]; /* local buffer if it fits, else malloced */
/*
* Pass 1 - figure out the number of bytes required. Include terminating
* \0 on last element and '/' separator on other elements.
+ *
+ * The namecache topology terminates at the root of the filesystem
+ * (the normal lookup code would then continue by using the mount
+ * structure to figure out what it was mounted on).
*/
again:
pathlen = 0;
- for (scan = ncp;
- scan && (scan->nc_flag & NCF_MOUNTPT) == 0;
- scan = scan->nc_parent
- ) {
- pathlen += scan->nc_nlen + 1;
+ for (scan = ncp; scan; scan = scan->nc_parent) {
+ if (scan->nc_nlen > 0)
+ pathlen += scan->nc_nlen + 1;
}
if (pathlen <= sizeof(buf))
base = buf;
else
- base = malloc(pathlen, M_TEMP, M_INTWAIT);
+ base = kmalloc(pathlen, M_TEMP, M_INTWAIT);
/*
* Pass 2 - generate the path buffer
*/
index = pathlen;
- for (scan = ncp;
- scan && (scan->nc_flag & NCF_MOUNTPT) == 0;
- scan = scan->nc_parent
- ) {
+ for (scan = ncp; scan; scan = scan->nc_parent) {
+ if (scan->nc_nlen == 0)
+ continue;
if (scan->nc_nlen >= index) {
if (base != buf)
- free(base, M_TEMP);
+ kfree(base, M_TEMP);
goto again;
}
if (index == pathlen)
}
jrecord_leaf(jrec, rectype, base + index, pathlen - index);
if (base != buf)
- free(base, M_TEMP);
+ kfree(base, M_TEMP);
}
/*
* Write out a file attribute structure. While somewhat inefficient, using
* a recursive data structure is the most portable and extensible way.
*/
-static void
+void
jrecord_write_vattr(struct jrecord *jrec, struct vattr *vat)
{
void *save;
jrecord_leaf(jrec, JLEAF_GEN, &vat->va_gen, sizeof(vat->va_gen));
if (vat->va_flags != VNOVAL)
jrecord_leaf(jrec, JLEAF_FLAGS, &vat->va_flags, sizeof(vat->va_flags));
- if (vat->va_rdev != VNOVAL)
- jrecord_leaf(jrec, JLEAF_UDEV, &vat->va_rdev, sizeof(vat->va_rdev));
+ if (vat->va_rmajor != VNOVAL) {
+ udev_t rdev = makeudev(vat->va_rmajor, vat->va_rminor);
+ jrecord_leaf(jrec, JLEAF_UDEV, &rdev, sizeof(rdev));
+ jrecord_leaf(jrec, JLEAF_UMAJOR, &vat->va_rmajor, sizeof(vat->va_rmajor));
+ jrecord_leaf(jrec, JLEAF_UMINOR, &vat->va_rminor, sizeof(vat->va_rminor));
+ }
#if 0
if (vat->va_filerev != VNOVAL)
jrecord_leaf(jrec, JLEAF_FILEREV, &vat->va_filerev, sizeof(vat->va_filerev));
* XXX additional tracking info
* XXX tty line info
*/
-static void
+void
jrecord_write_cred(struct jrecord *jrec, struct thread *td, struct ucred *cred)
{
void *save;
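- * and in fact avoid writing out the file path for seqential writes
- * occuring within e.g. a certain period of time.
+ * and in fact avoid writing out the file path for sequential writes
+ * occurring within e.g. a certain period of time.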
*/
-static void
+void
jrecord_write_vnode_ref(struct jrecord *jrec, struct vnode *vp)
{
- struct namecache *ncp;
+ struct nchandle nch;
- TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
- if ((ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0)
+ nch.mount = vp->v_mount;
+ spin_lock(&vp->v_spin); /* v_namecache is scanned with the vnode spinlock held */
+ TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) {
+ if ((nch.ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0) /* stop at the first entry that is neither unresolved nor destroyed */
break;
}
- if (ncp)
- jrecord_write_path(jrec, JLEAF_PATH_REF, ncp);
+ if (nch.ncp) {
+ cache_hold(&nch); /* NOTE(review): presumably references the entry so it stays valid once the spinlock is dropped - confirm */
+ spin_unlock(&vp->v_spin);
+ jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp);
+ cache_drop(&nch); /* pairs with the cache_hold() above */
+ } else {
+ spin_unlock(&vp->v_spin); /* no usable namecache entry: nothing is written */
+ }
}
-static void
+void
jrecord_write_vnode_link(struct jrecord *jrec, struct vnode *vp,
struct namecache *notncp)
{
- struct namecache *ncp;
+ struct nchandle nch;
- TAILQ_FOREACH(ncp, &vp->v_namecache, nc_vnode) {
- if (ncp == notncp)
+ nch.mount = vp->v_mount;
+ spin_lock(&vp->v_spin); /* v_namecache is scanned with the vnode spinlock held */
+ TAILQ_FOREACH(nch.ncp, &vp->v_namecache, nc_vnode) {
+ if (nch.ncp == notncp) /* skip the entry the caller asked to exclude */
continue;
- if ((ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0)
+ if ((nch.ncp->nc_flag & (NCF_UNRESOLVED|NCF_DESTROYED)) == 0) /* stop at the first entry that is neither unresolved nor destroyed */
break;
}
- if (ncp)
- jrecord_write_path(jrec, JLEAF_PATH_REF, ncp);
-}
-
-#if 0
-/*
- * Write out the current contents of the file within the specified
- * range. This is typically called from within an UNDO section. A
- * locked vnode must be passed.
- */
-static int
-jrecord_write_filearea(struct jrecord *jrec, struct vnode *vp,
- off_t begoff, off_t endoff)
-{
+ if (nch.ncp) {
+ cache_hold(&nch); /* NOTE(review): presumably references the entry so it stays valid once the spinlock is dropped - confirm */
+ spin_unlock(&vp->v_spin);
+ jrecord_write_path(jrec, JLEAF_PATH_REF, nch.ncp);
+ cache_drop(&nch); /* pairs with the cache_hold() above */
+ } else {
+ spin_unlock(&vp->v_spin); /* no other usable namecache entry: nothing is written */
+ }
}
-#endif
/*
* Write out the data represented by a pagelist
*/
-static void
+void
jrecord_write_pagelist(struct jrecord *jrec, int16_t rectype,
struct vm_page **pglist, int *rtvals, int pgcount,
off_t offset)
{
- struct msf_buf *msf;
+ struct xio xio;
int error;
int b;
int i;
i = 0;
+ xio_init(&xio);
while (i < pgcount) {
/*
* Find the next valid section. Skip any invalid elements
* And write it out.
*/
if (i - b) {
- error = msf_map_pagelist(&msf, pglist + b, i - b, 0);
+ error = xio_init_pages(&xio, pglist + b, i - b, XIOF_READ);
if (error == 0) {
- printf("RECORD PUTPAGES %d\n", msf_buf_bytes(msf));
jrecord_leaf(jrec, JLEAF_SEEKPOS, &offset, sizeof(offset));
- jrecord_leaf(jrec, rectype,
- msf_buf_kva(msf), msf_buf_bytes(msf));
- msf_buf_free(msf);
+ jrecord_leaf_xio(jrec, rectype, &xio);
} else {
- printf("jrecord_write_pagelist: mapping failure\n");
+ kprintf("jrecord_write_pagelist: xio init failure\n");
}
+ xio_release(&xio);
offset += (off_t)(i - b) << PAGE_SHIFT;
}
}
/*
* Write out the data represented by a UIO.
*/
-struct jwuio_info {
- struct jrecord *jrec;
- int16_t rectype;
-};
-
-static int jrecord_write_uio_callback(void *info, char *buf, int bytes);
-
-static void
+void
jrecord_write_uio(struct jrecord *jrec, int16_t rectype, struct uio *uio)
{
- struct jwuio_info info = { jrec, rectype };
- int error;
-
if (uio->uio_segflg != UIO_NOCOPY) {
jrecord_leaf(jrec, JLEAF_SEEKPOS, &uio->uio_offset,
sizeof(uio->uio_offset));
- error = msf_uio_iterate(uio, jrecord_write_uio_callback, &info);
- if (error)
- printf("XXX warning uio iterate failed %d\n", error);
+ jrecord_leaf_uio(jrec, rectype, uio); /* emits one leaf record per non-empty iovec, after the seek position */
}
}
-static int
-jrecord_write_uio_callback(void *info_arg, char *buf, int bytes)
-{
- struct jwuio_info *info = info_arg;
-
- jrecord_leaf(info->jrec, info->rectype, buf, bytes);
- return(0);
-}
-
-static void
+void
jrecord_file_data(struct jrecord *jrec, struct vnode *vp,
off_t off, off_t bytes)
{
int error;
int n;
- buf = malloc(bufsize, M_JOURNAL, M_WAITOK);
+ buf = kmalloc(bufsize, M_JOURNAL, M_WAITOK);
jrecord_leaf(jrec, JLEAF_SEEKPOS, &off, sizeof(off));
while (bytes) {
n = (bytes > bufsize) ? bufsize : (int)bytes;
error = vn_rdwr(UIO_READ, vp, buf, n, off, UIO_SYSSPACE, IO_NODELOCKED,
- proc0.p_ucred, NULL, curthread);
+ proc0.p_ucred, NULL);
if (error) {
jrecord_leaf(jrec, JLEAF_ERROR, &error, sizeof(error));
break;
bytes -= n;
off += n;
}
- free(buf, M_JOURNAL);
-}
-
-/************************************************************************
- * LOW LEVEL UNDO SUPPORT ROUTINE *
- ************************************************************************
- *
- * This function is used to support UNDO records. It will generate an
- * appropriate record with the requested portion of the file data. Note
- * that file data is only recorded if JRUNDO_FILEDATA is passed. If bytes
- * is -1, it will be set to the size of the file.
- */
-static void
-jrecord_undo_file(struct jrecord *jrec, struct vnode *vp, int jrflags,
- off_t off, off_t bytes)
-{
- struct vattr attr;
- void *save1; /* warning, save pointers do not always remain valid */
- void *save2;
- int error;
-
- /*
- * Setup. Start the UNDO record, obtain a shared lock on the vnode,
- * and retrieve attribute info.
- */
- save1 = jrecord_push(jrec, JTYPE_UNDO);
- error = VOP_GETATTR(vp, &attr, curthread);
- if (error)
- goto done;
-
- /*
- * Generate UNDO records as requested.
- */
- if (jrflags & JRUNDO_VATTR) {
- save2 = jrecord_push(jrec, JTYPE_VATTR);
- jrecord_leaf(jrec, JLEAF_VTYPE, &attr.va_type, sizeof(attr.va_type));
- if ((jrflags & JRUNDO_NLINK) && attr.va_nlink != VNOVAL)
- jrecord_leaf(jrec, JLEAF_NLINK, &attr.va_nlink, sizeof(attr.va_nlink));
- if ((jrflags & JRUNDO_SIZE) && attr.va_size != VNOVAL)
- jrecord_leaf(jrec, JLEAF_SIZE, &attr.va_size, sizeof(attr.va_size));
- if ((jrflags & JRUNDO_UID) && attr.va_uid != VNOVAL)
- jrecord_leaf(jrec, JLEAF_UID, &attr.va_uid, sizeof(attr.va_uid));
- if ((jrflags & JRUNDO_GID) && attr.va_gid != VNOVAL)
- jrecord_leaf(jrec, JLEAF_GID, &attr.va_gid, sizeof(attr.va_gid));
- if ((jrflags & JRUNDO_FSID) && attr.va_fsid != VNOVAL)
- jrecord_leaf(jrec, JLEAF_FSID, &attr.va_fsid, sizeof(attr.va_fsid));
- if ((jrflags & JRUNDO_MODES) && attr.va_mode != (mode_t)VNOVAL)
- jrecord_leaf(jrec, JLEAF_MODES, &attr.va_mode, sizeof(attr.va_mode));
- if ((jrflags & JRUNDO_INUM) && attr.va_fileid != VNOVAL)
- jrecord_leaf(jrec, JLEAF_INUM, &attr.va_fileid, sizeof(attr.va_fileid));
- if ((jrflags & JRUNDO_ATIME) && attr.va_atime.tv_sec != VNOVAL)
- jrecord_leaf(jrec, JLEAF_ATIME, &attr.va_atime, sizeof(attr.va_atime));
- if ((jrflags & JRUNDO_MTIME) && attr.va_mtime.tv_sec != VNOVAL)
- jrecord_leaf(jrec, JLEAF_MTIME, &attr.va_mtime, sizeof(attr.va_mtime));
- if ((jrflags & JRUNDO_CTIME) && attr.va_ctime.tv_sec != VNOVAL)
- jrecord_leaf(jrec, JLEAF_CTIME, &attr.va_ctime, sizeof(attr.va_ctime));
- if ((jrflags & JRUNDO_GEN) && attr.va_gen != VNOVAL)
- jrecord_leaf(jrec, JLEAF_GEN, &attr.va_gen, sizeof(attr.va_gen));
- if ((jrflags & JRUNDO_FLAGS) && attr.va_flags != VNOVAL)
- jrecord_leaf(jrec, JLEAF_FLAGS, &attr.va_flags, sizeof(attr.va_flags));
- if ((jrflags & JRUNDO_UDEV) && attr.va_rdev != VNOVAL)
- jrecord_leaf(jrec, JLEAF_UDEV, &attr.va_rdev, sizeof(attr.va_rdev));
- jrecord_pop(jrec, save2);
- }
-
- /*
- * Output the file data being overwritten by reading the file and
- * writing it out to the journal prior to the write operation. We
- * do not need to write out data past the current file EOF.
- *
- * XXX support JRUNDO_CONDLINK - do not write out file data for files
- * with a link count > 1. The undo code needs to locate the inode and
- * regenerate the hardlink.
- */
- if ((jrflags & JRUNDO_FILEDATA) && attr.va_type == VREG) {
- if (attr.va_size != VNOVAL) {
- if (bytes == -1)
- bytes = attr.va_size - off;
- if (off + bytes > attr.va_size)
- bytes = attr.va_size - off;
- if (bytes > 0)
- jrecord_file_data(jrec, vp, off, bytes);
- } else {
- error = EINVAL;
- }
- }
- if ((jrflags & JRUNDO_FILEDATA) && attr.va_type == VLNK) {
- struct iovec aiov;
- struct uio auio;
- char *buf;
-
- buf = malloc(PATH_MAX, M_JOURNAL, M_WAITOK);
- aiov.iov_base = buf;
- aiov.iov_len = PATH_MAX;
- auio.uio_iov = &aiov;
- auio.uio_iovcnt = 1;
- auio.uio_offset = 0;
- auio.uio_rw = UIO_READ;
- auio.uio_segflg = UIO_SYSSPACE;
- auio.uio_td = curthread;
- auio.uio_resid = PATH_MAX;
- error = VOP_READLINK(vp, &auio, proc0.p_ucred);
- if (error == 0) {
- jrecord_leaf(jrec, JLEAF_SYMLINKDATA, buf,
- PATH_MAX - auio.uio_resid);
- }
- free(buf, M_JOURNAL);
- }
-done:
- if (error)
- jrecord_leaf(jrec, JLEAF_ERROR, &error, sizeof(error));
- jrecord_pop(jrec, save1);
-}
-
-/************************************************************************
- * JOURNAL VNOPS *
- ************************************************************************
- *
- * These are function shims replacing the normal filesystem ops. We become
- * responsible for calling the underlying filesystem ops. We have the choice
- * of executing the underlying op first and then generating the journal entry,
- * or starting the journal entry, executing the underlying op, and then
- * either completing or aborting it.
- *
- * The journal is supposed to be a high-level entity, which generally means
- * identifying files by name rather then by inode. Supplying both allows
- * the journal to be used both for inode-number-compatible 'mirrors' and
- * for simple filesystem replication.
- *
- * Writes are particularly difficult to deal with because a single write may
- * represent a hundred megabyte buffer or more, and both writes and truncations
- * require the 'old' data to be written out as well as the new data if the
- * log is reversable. Other issues:
- *
- * - How to deal with operations on unlinked files (no path available),
- * but which may still be filesystem visible due to hard links.
- *
- * - How to deal with modifications made via a memory map.
- *
- * - Future cache coherency support will require cache coherency API calls
- * both prior to and after the call to the underlying VFS.
- *
- * ALSO NOTE: We do not have to shim compatibility VOPs like MKDIR which have
- * new VFS equivalents (NMKDIR).
- */
-
-/*
- * Journal vop_settattr { a_vp, a_vap, a_cred, a_td }
- */
-static
-int
-journal_setattr(struct vop_setattr_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- void *save;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_SETATTR)) {
- jreclist_undo_file(&jreclist, ap->a_vp, JRUNDO_VATTR, 0, 0);
- }
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, ap->a_td, ap->a_cred);
- jrecord_write_vnode_ref(jrec, ap->a_vp);
- save = jrecord_push(jrec, JTYPE_REDO);
- jrecord_write_vattr(jrec, ap->a_vap);
- jrecord_pop(jrec, save);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_write { a_vp, a_uio, a_ioflag, a_cred }
- */
-static
-int
-journal_write(struct vop_write_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- struct uio uio_copy;
- struct iovec uio_one_iovec;
- void *save;
- int error;
-
- /*
- * This is really nasty. UIO's don't retain sufficient information to
- * be reusable once they've gone through the VOP chain. The iovecs get
- * cleared, so we have to copy the UIO.
- *
- * XXX fix the UIO code to not destroy iov's during a scan so we can
- * reuse the uio over and over again.
- *
- * XXX UNDO code needs to journal the old data prior to the write.
- */
- uio_copy = *ap->a_uio;
- if (uio_copy.uio_iovcnt == 1) {
- uio_one_iovec = ap->a_uio->uio_iov[0];
- uio_copy.uio_iov = &uio_one_iovec;
- } else {
- uio_copy.uio_iov = malloc(uio_copy.uio_iovcnt * sizeof(struct iovec),
- M_JOURNAL, M_WAITOK);
- bcopy(ap->a_uio->uio_iov, uio_copy.uio_iov,
- uio_copy.uio_iovcnt * sizeof(struct iovec));
- }
-
- /*
- * Write out undo data. Note that uio_offset is incorrect if
- * IO_APPEND is set, but fortunately we have no undo file data to
- * write out in that case.
- */
- mp = ap->a_head.a_ops->vv_mount;
- if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_WRITE)) {
- if (ap->a_ioflag & IO_APPEND) {
- jreclist_undo_file(&jreclist, ap->a_vp, JRUNDO_SIZE|JRUNDO_MTIME, 0, 0);
- } else {
- jreclist_undo_file(&jreclist, ap->a_vp,
- JRUNDO_FILEDATA|JRUNDO_SIZE|JRUNDO_MTIME,
- uio_copy.uio_offset, uio_copy.uio_resid);
- }
- }
- error = vop_journal_operate_ap(&ap->a_head);
-
- /*
- * XXX bad hack to figure out the offset for O_APPEND writes (note:
- * uio field state after the VFS operation).
- */
- uio_copy.uio_offset = ap->a_uio->uio_offset -
- (uio_copy.uio_resid - ap->a_uio->uio_resid);
-
- /*
- * Output the write data to the journal.
- */
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_vnode_ref(jrec, ap->a_vp);
- save = jrecord_push(jrec, JTYPE_REDO);
- jrecord_write_uio(jrec, JLEAF_FILEDATA, &uio_copy);
- jrecord_pop(jrec, save);
- }
- }
- jreclist_done(&jreclist, error);
-
- if (uio_copy.uio_iov != &uio_one_iovec)
- free(uio_copy.uio_iov, M_JOURNAL);
- return (error);
-}
-
-/*
- * Journal vop_fsync { a_vp, a_waitfor, a_td }
- */
-static
-int
-journal_fsync(struct vop_fsync_args *ap)
-{
-#if 0
- struct mount *mp;
- struct journal *jo;
-#endif
- int error;
-
- error = vop_journal_operate_ap(&ap->a_head);
-#if 0
- mp = ap->a_head.a_ops->vv_mount;
- if (error == 0) {
- TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
- /* XXX synchronize pending journal records */
- }
- }
-#endif
- return (error);
-}
-
-/*
- * Journal vop_putpages { a_vp, a_m, a_count, a_sync, a_rtvals, a_offset }
- *
- * note: a_count is in bytes.
- */
-static
-int
-journal_putpages(struct vop_putpages_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- void *save;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_PUTPAGES) &&
- ap->a_count > 0
- ) {
- jreclist_undo_file(&jreclist, ap->a_vp,
- JRUNDO_FILEDATA|JRUNDO_SIZE|JRUNDO_MTIME,
- ap->a_offset, btoc(ap->a_count));
- }
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0 && ap->a_count > 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_vnode_ref(jrec, ap->a_vp);
- save = jrecord_push(jrec, JTYPE_REDO);
- jrecord_write_pagelist(jrec, JLEAF_FILEDATA, ap->a_m, ap->a_rtvals,
- btoc(ap->a_count), ap->a_offset);
- jrecord_pop(jrec, save);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_setacl { a_vp, a_type, a_aclp, a_cred, a_td }
- */
-static
-int
-journal_setacl(struct vop_setacl_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- jreclist_init(mp, &jreclist, &jreccache, JTYPE_SETACL);
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
-#if 0
- if ((jo->flags & MC_JOURNAL_WANT_REVERSABLE))
- jrecord_undo_file(jrec, ap->a_vp, JRUNDO_XXX, 0, 0);
-#endif
- jrecord_write_cred(jrec, ap->a_td, ap->a_cred);
- jrecord_write_vnode_ref(jrec, ap->a_vp);
-#if 0
- save = jrecord_push(jrec, JTYPE_REDO);
- /* XXX type, aclp */
- jrecord_pop(jrec, save);
-#endif
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_setextattr { a_vp, a_name, a_uio, a_cred, a_td }
- */
-static
-int
-journal_setextattr(struct vop_setextattr_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- void *save;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- jreclist_init(mp, &jreclist, &jreccache, JTYPE_SETEXTATTR);
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
-#if 0
- if ((jo->flags & MC_JOURNAL_WANT_REVERSABLE))
- jrecord_undo_file(jrec, ap->a_vp, JRUNDO_XXX, 0, 0);
-#endif
- jrecord_write_cred(jrec, ap->a_td, ap->a_cred);
- jrecord_write_vnode_ref(jrec, ap->a_vp);
- jrecord_leaf(jrec, JLEAF_ATTRNAME, ap->a_name, strlen(ap->a_name));
- save = jrecord_push(jrec, JTYPE_REDO);
- jrecord_write_uio(jrec, JLEAF_FILEDATA, ap->a_uio);
- jrecord_pop(jrec, save);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_ncreate { a_ncp, a_vpp, a_cred, a_vap }
- */
-static
-int
-journal_ncreate(struct vop_ncreate_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- void *save;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- jreclist_init(mp, &jreclist, &jreccache, JTYPE_CREATE);
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
- if (*ap->a_vpp)
- jrecord_write_vnode_ref(jrec, *ap->a_vpp);
- save = jrecord_push(jrec, JTYPE_REDO);
- jrecord_write_vattr(jrec, ap->a_vap);
- jrecord_pop(jrec, save);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_nmknod { a_ncp, a_vpp, a_cred, a_vap }
- */
-static
-int
-journal_nmknod(struct vop_nmknod_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- void *save;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- jreclist_init(mp, &jreclist, &jreccache, JTYPE_MKNOD);
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
- save = jrecord_push(jrec, JTYPE_REDO);
- jrecord_write_vattr(jrec, ap->a_vap);
- jrecord_pop(jrec, save);
- if (*ap->a_vpp)
- jrecord_write_vnode_ref(jrec, *ap->a_vpp);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_nlink { a_ncp, a_vp, a_cred }
- */
-static
-int
-journal_nlink(struct vop_nlink_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- void *save;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- jreclist_init(mp, &jreclist, &jreccache, JTYPE_LINK);
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
- /* XXX PATH to VP and inode number */
- /* XXX this call may not record the correct path when
- * multiple paths are available */
- save = jrecord_push(jrec, JTYPE_REDO);
- jrecord_write_vnode_link(jrec, ap->a_vp, ap->a_ncp);
- jrecord_pop(jrec, save);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_symlink { a_ncp, a_vpp, a_cred, a_vap, a_target }
- */
-static
-int
-journal_nsymlink(struct vop_nsymlink_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- void *save;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- jreclist_init(mp, &jreclist, &jreccache, JTYPE_SYMLINK);
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
- save = jrecord_push(jrec, JTYPE_REDO);
- jrecord_leaf(jrec, JLEAF_SYMLINKDATA,
- ap->a_target, strlen(ap->a_target));
- jrecord_pop(jrec, save);
- if (*ap->a_vpp)
- jrecord_write_vnode_ref(jrec, *ap->a_vpp);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_nwhiteout { a_ncp, a_cred, a_flags }
- */
-static
-int
-journal_nwhiteout(struct vop_nwhiteout_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- jreclist_init(mp, &jreclist, &jreccache, JTYPE_WHITEOUT);
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_nremove { a_ncp, a_cred }
- */
-static
-int
-journal_nremove(struct vop_nremove_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_REMOVE) &&
- ap->a_ncp->nc_vp
- ) {
- jreclist_undo_file(&jreclist, ap->a_ncp->nc_vp,
- JRUNDO_ALL|JRUNDO_GETVP|JRUNDO_CONDLINK, 0, -1);
- }
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_nmkdir { a_ncp, a_vpp, a_cred, a_vap }
- */
-static
-int
-journal_nmkdir(struct vop_nmkdir_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- jreclist_init(mp, &jreclist, &jreccache, JTYPE_MKDIR);
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
-#if 0
- if (jo->flags & MC_JOURNAL_WANT_AUDIT) {
- jrecord_write_audit(jrec);
- }
-#endif
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_vattr(jrec, ap->a_vap);
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
- if (*ap->a_vpp)
- jrecord_write_vnode_ref(jrec, *ap->a_vpp);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_nrmdir { a_ncp, a_cred }
- */
-static
-int
-journal_nrmdir(struct vop_nrmdir_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_RMDIR)) {
- jreclist_undo_file(&jreclist, ap->a_ncp->nc_vp,
- JRUNDO_VATTR|JRUNDO_GETVP, 0, 0);
- }
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_ncp);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
-}
-
-/*
- * Journal vop_nrename { a_fncp, a_tncp, a_cred }
- */
-static
-int
-journal_nrename(struct vop_nrename_args *ap)
-{
- struct jrecord_list jreclist;
- struct jrecord jreccache;
- struct jrecord *jrec;
- struct mount *mp;
- int error;
-
- mp = ap->a_head.a_ops->vv_mount;
- if (jreclist_init(mp, &jreclist, &jreccache, JTYPE_RENAME) &&
- ap->a_tncp->nc_vp
- ) {
- jreclist_undo_file(&jreclist, ap->a_tncp->nc_vp,
- JRUNDO_ALL|JRUNDO_GETVP|JRUNDO_CONDLINK, 0, -1);
- }
- error = vop_journal_operate_ap(&ap->a_head);
- if (error == 0) {
- TAILQ_FOREACH(jrec, &jreclist, user_entry) {
- jrecord_write_cred(jrec, NULL, ap->a_cred);
- jrecord_write_path(jrec, JLEAF_PATH1, ap->a_fncp);
- jrecord_write_path(jrec, JLEAF_PATH2, ap->a_tncp);
- }
- }
- jreclist_done(&jreclist, error);
- return (error);
+ kfree(buf, M_JOURNAL);
}