#include <sys/queue.h>
#include <sys/ktr.h>
#include <sys/globaldata.h>
+#include <sys/limits.h>
#include <sys/buf2.h>
#include <sys/signal2.h>
off_t save_trunc_off; /* write optimization */
struct hammer_btree_leaf_elm sync_ino_leaf; /* to-sync cache */
struct hammer_inode_data sync_ino_data; /* to-sync cache */
+ size_t redo_count;
};
typedef struct hammer_inode *hammer_inode_t;
#define VTOI(vp) ((struct hammer_inode *)(vp)->v_data)
-#define HAMMER_INODE_DDIRTY 0x0001 /* in-memory ino_data is dirty */
+/*
+ * NOTE: DDIRTY does not include atime or mtime and does not include
+ * write-append size changes. SDIRTY handles write-append size
+ * changes.
+ */
/* (not including atime/mtime) */
+#define HAMMER_INODE_DDIRTY 0x0001 /* in-memory ino_data is dirty */
#define HAMMER_INODE_RSV_INODES 0x0002 /* hmp->rsv_inodes bumped */
#define HAMMER_INODE_CONN_DOWN 0x0004 /* include in downward recursion */
#define HAMMER_INODE_XDIRTY 0x0008 /* in-memory records */
#define HAMMER_INODE_MTIME 0x00200000 /* in-memory mtime modified */
#define HAMMER_INODE_WOULDBLOCK 0x00400000 /* re-issue to new flush group */
#define HAMMER_INODE_DUMMY 0x00800000 /* dummy inode covering bad file */
-#define HAMMER_INODE_CLOSESYNC 0x01000000 /* synchronously fsync on close */
-#define HAMMER_INODE_CLOSEASYNC 0x02000000 /* asynchronously fsync on close */
+#define HAMMER_INODE_SDIRTY 0x01000000 /* in-memory ino_data.size is dirty*/
-#define HAMMER_INODE_MODMASK (HAMMER_INODE_DDIRTY| \
+#define HAMMER_INODE_MODMASK (HAMMER_INODE_DDIRTY|HAMMER_INODE_SDIRTY| \
HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS| \
HAMMER_INODE_ATIME|HAMMER_INODE_MTIME| \
HAMMER_INODE_TRUNCATED|HAMMER_INODE_DELETING)
-#define HAMMER_INODE_MODMASK_NOXDIRTY \
+#define HAMMER_INODE_MODMASK_NOXDIRTY \
(HAMMER_INODE_MODMASK & ~HAMMER_INODE_XDIRTY)
+#define HAMMER_INODE_MODMASK_NOREDO \
+ (HAMMER_INODE_DDIRTY| \
+ HAMMER_INODE_XDIRTY| \
+ HAMMER_INODE_TRUNCATED|HAMMER_INODE_DELETING)
+
#define HAMMER_FLUSH_SIGNAL 0x0001
#define HAMMER_FLUSH_RECURSION 0x0002
struct hammer_flusher_info_list ready_list;
};
+#define HAMMER_FLUSH_UNDOS_RELAXED 0
+#define HAMMER_FLUSH_UNDOS_FORCED 1
+#define HAMMER_FLUSH_UNDOS_AUTO 2
/*
* Internal hammer mount data structure
*/
extern int hammer_limit_recs;
extern int hammer_limit_inode_recs;
extern int hammer_limit_reclaim;
+extern int hammer_limit_redo;
extern int hammer_bio_count;
extern int hammer_verify_zone;
extern int hammer_verify_data;
* Flush data buffers. This can occur asynchronously and at any
* time. We must interlock against the frontend direct-data write
* but do not have to acquire the sync-lock yet.
+ *
+ * These data buffers have already been collected prior to the
+ * related inode(s) getting queued to the flush group.
*/
count = 0;
while ((io = TAILQ_FIRST(&hmp->data_list)) != NULL) {
* Flush UNDOs. This also waits for I/Os to complete and flushes
* the cache on the target disk.
*/
- hammer_flusher_flush_undos(hmp, 1);
+ hammer_flusher_flush_undos(hmp, HAMMER_FLUSH_UNDOS_FORCED);
if (hmp->flags & HAMMER_MOUNT_CRITICAL_ERROR)
goto failed;
}
/*
- * Flush UNDOs. If already_flushed is non-zero we force a disk sync
- * even if no UNDOs are present.
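+ * Flush UNDOs. The mode selects whether hammer_io_wait_all() is
+ * called afterwards: HAMMER_FLUSH_UNDOS_FORCED always calls it,
+ * HAMMER_FLUSH_UNDOS_AUTO calls it only if count is non-zero, and
+ * HAMMER_FLUSH_UNDOS_RELAXED never calls it.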
*/
void
-hammer_flusher_flush_undos(hammer_mount_t hmp, int already_flushed)
+hammer_flusher_flush_undos(hammer_mount_t hmp, int mode)
{
hammer_io_t io;
int count;
- if (already_flushed == 0 && TAILQ_EMPTY(&hmp->undo_list))
- return;
-
count = 0;
while ((io = TAILQ_FIRST(&hmp->undo_list)) != NULL) {
if (io->ioerror)
++count;
}
hammer_flusher_clean_loose_ios(hmp);
- hammer_io_wait_all(hmp, "hmrfl1");
+ if (mode == HAMMER_FLUSH_UNDOS_FORCED ||
+ (mode == HAMMER_FLUSH_UNDOS_AUTO && count)) {
+ hammer_io_wait_all(hmp, "hmrfl1");
+ }
}
/*
ip->cache[1].ip = ip;
ip->cache[2].ip = ip;
ip->cache[3].ip = ip;
+ ip->redo_count = SIZE_T_MAX;
if (hmp->ronly)
ip->flags |= HAMMER_INODE_RO;
ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
ip->cache[1].ip = ip;
ip->cache[2].ip = ip;
ip->cache[3].ip = ip;
+ ip->redo_count = SIZE_T_MAX;
ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
0x7FFFFFFFFFFFFFFFLL;
RB_INIT(&ip->rec_tree);
ip->cache[1].ip = ip;
ip->cache[2].ip = ip;
ip->cache[3].ip = ip;
+ ip->redo_count = SIZE_T_MAX;
ip->trunc_off = 0x7FFFFFFFFFFFFFFFLL;
/* ip->save_trunc_off = 0; (already zero) */
if (hammer_debug_inode)
kprintf("CLEANDELOND %p %08x\n", ip, ip->flags);
ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
+ HAMMER_INODE_SDIRTY |
HAMMER_INODE_ATIME |
HAMMER_INODE_MTIME);
ip->flags &= ~HAMMER_INODE_DELONDISK;
*/
if (error == 0 && (ip->flags & HAMMER_INODE_DELETED)) {
ip->sync_flags &= ~(HAMMER_INODE_DDIRTY |
+ HAMMER_INODE_SDIRTY |
HAMMER_INODE_ATIME |
HAMMER_INODE_MTIME);
}
* A transaction has modified an inode, requiring updates as specified by
* the passed flags.
*
- * HAMMER_INODE_DDIRTY: Inode data has been updated
+ * HAMMER_INODE_DDIRTY: Inode data has been updated, not incl mtime/atime,
+ * and not including size changes due to write-append
+ * (but other size changes are included).
+ * HAMMER_INODE_SDIRTY: Inode data has been updated, size changes due to
+ * write-append.
* HAMMER_INODE_XDIRTY: Dirty in-memory records
* HAMMER_INODE_BUFS: Dirty buffer cache buffers
* HAMMER_INODE_DELETED: Inode record/data must be deleted
*/
KKASSERT(ip->hmp->ronly != 1 ||
(flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
+ HAMMER_INODE_SDIRTY |
HAMMER_INODE_BUFS | HAMMER_INODE_DELETED |
HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) == 0);
if ((ip->flags & HAMMER_INODE_RSV_INODES) == 0) {
* Clear flags which may have been set by the frontend.
*/
ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
+ HAMMER_INODE_SDIRTY |
HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
HAMMER_INODE_DELETING);
break;
* Clear flags which may have been set by the frontend.
*/
ip->sync_flags &= ~(HAMMER_INODE_DDIRTY | HAMMER_INODE_XDIRTY |
+ HAMMER_INODE_SDIRTY |
HAMMER_INODE_ATIME | HAMMER_INODE_MTIME |
HAMMER_INODE_DELETING);
while (RB_ROOT(&ip->rec_tree)) {
}
/*
- * If RDIRTY or DDIRTY is set, write out a new record. If the inode
- * is already on-disk the old record is marked as deleted.
+ * If RDIRTY, DDIRTY, or SDIRTY is set, write out a new record.
+ * If the inode is already on-disk the old record is marked as
+ * deleted.
*
* If DELETED is set hammer_update_inode() will delete the existing
* record without writing out a new one.
if (ip->flags & HAMMER_INODE_DELETED) {
error = hammer_update_inode(&cursor, ip);
} else
- if ((ip->sync_flags & HAMMER_INODE_DDIRTY) == 0 &&
+ if (!(ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY)) &&
(ip->sync_flags & (HAMMER_INODE_ATIME | HAMMER_INODE_MTIME))) {
error = hammer_update_itimes(&cursor, ip);
} else
- if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
+ if (ip->sync_flags & (HAMMER_INODE_DDIRTY | HAMMER_INODE_SDIRTY |
+ HAMMER_INODE_ATIME | HAMMER_INODE_MTIME)) {
error = hammer_update_inode(&cursor, ip);
}
done:
hammer_vop_fsync(struct vop_fsync_args *ap)
{
hammer_inode_t ip = VTOI(ap->a_vp);
+ hammer_mount_t hmp = ip->hmp;
int waitfor = ap->a_waitfor;
+ int mode;
/*
- * Fsync rule relaxation (default disabled)
+ * Fsync rule relaxation (default is either full synchronous flush
+ * or REDO semantics with synchronous flush).
*/
if (ap->a_flags & VOP_FSYNC_SYSCALL) {
switch(hammer_fsync_mode) {
case 0:
- /* full semantics */
- break;
+mode0:
+ /* disable REDO, full synchronous flush */
+ ip->redo_count = SIZE_T_MAX;
+ goto skip;
case 1:
- /* asynchronous */
+mode1:
+ /* disable REDO, full asynchronous flush */
+ ip->redo_count = SIZE_T_MAX;
if (waitfor == MNT_WAIT)
waitfor = MNT_NOWAIT;
- break;
+ goto skip;
case 2:
- /* synchronous fsync on close */
- ip->flags |= HAMMER_INODE_CLOSESYNC;
- return(0);
+ /* REDO semantics, synchronous flush */
+ if (hmp->version < HAMMER_VOL_VERSION_FOUR)
+ goto mode0;
+ mode = HAMMER_FLUSH_UNDOS_AUTO;
+ break;
case 3:
- /* asynchronous fsync on close */
- ip->flags |= HAMMER_INODE_CLOSEASYNC;
+ /* REDO semantics, relaxed asynchronous flush */
+ if (hmp->version < HAMMER_VOL_VERSION_FOUR)
+ goto mode1;
+ mode = HAMMER_FLUSH_UNDOS_RELAXED;
+ if (waitfor == MNT_WAIT)
+ waitfor = MNT_NOWAIT;
+ break;
+ case 4:
+ /* ignore the fsync() system call */
return(0);
default:
- /* ignore the fsync() system call */
+ /* we have to do something */
+ mode = HAMMER_FLUSH_UNDOS_RELAXED;
+ if (waitfor == MNT_WAIT)
+ waitfor = MNT_NOWAIT;
+ break;
+ }
+
+ /*
+ * redo_count is initialized to a maximal value and set
+ * to 0 after the first fsync() on a file, which enables
+ * REDO logging on the inode unless the number of bytes
+ * written exceeds the limit.
+ */
+ if (ip->redo_count < hammer_limit_redo &&
+ (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
+ ) {
+ ++hammer_count_fsyncs;
+ hammer_flusher_flush_undos(hmp, mode);
+ ip->redo_count = 0;
return(0);
}
+ ip->redo_count = 0;
}
+skip:
/*
- * Go do it
+ * Do a full flush sequence.
*/
++hammer_count_fsyncs;
vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
* If reading or writing a huge amount of data we have to break
* atomicy and allow the operation to be interrupted by a signal
* or it can DOS the machine.
+ *
+ * Adjust redo_count early to avoid generating unnecessary redos.
*/
bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
+ if (ip->redo_count < hammer_limit_redo)
+ ip->redo_count += uio->uio_resid;
/*
* Access the data typically in HAMMER_BUFSIZE blocks via the
if (error == 0)
bheavy(bp);
}
- if (error == 0) {
- error = uiomove((char *)bp->b_data + offset,
- n, uio);
+ if (error == 0)
+ error = uiomove(bp->b_data + offset, n, uio);
+
+ /*
+ * Generate REDO records while redo_count has not exceeded
+ * the limit. Note that redo_count is initialized to a
+ * maximal value until the first fsync(), and zeroed on every
+ * fsync(). Thus at least one fsync() is required before we
+ * start generating REDO records for the ip.
+ */
+ if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
+ ip->redo_count < hammer_limit_redo &&
+ error == 0) {
+ hammer_sync_lock_sh(&trans);
+ error = hammer_generate_redo(&trans, ip,
+ base_offset + offset,
+ bp->b_data + offset,
+ (size_t)n);
+ hammer_sync_unlock(&trans);
}
/*
/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
if (ip->ino_data.size < uio->uio_offset) {
ip->ino_data.size = uio->uio_offset;
- flags = HAMMER_INODE_DDIRTY;
+ flags = HAMMER_INODE_SDIRTY;
vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
} else {
flags = 0;
int
hammer_vop_close(struct vop_close_args *ap)
{
+#if 0
struct vnode *vp = ap->a_vp;
hammer_inode_t ip = VTOI(vp);
int waitfor;
-
if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
if (vn_islocked(vp) == LK_EXCLUSIVE &&
(vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
}
}
+#endif
return (vop_stdclose(ap));
}