Data CRC errors should now generate EIO instead of panic()ing the system.
B-Tree CRC errors might still panic() and freemap CRC errors WILL still
panic().
Continuing from DDB on a B-Tree node CRC error when debugging is enabled
no longer marks the B-Tree node as good.
The mirror-read command will now transfer data records with bad CRCs
instead of aborting the transfer, identifying them with a new type field.
The mirror-write ioctl currently ignores such records.
If a directory entry is encountered and the related inode cannot be
looked up, generate a dummy in-memory inode of type FIFO to placemark
the bad directory entry, allowing it to be removed. Currently it is
possible for a directory entry to be synced to the media in a different
transaction than the related inode (a bug which needs to be fixed).
If a crash occurs at the wrong time the recovery code can leave the media
in a state where the directory entry exists but the inode does not. This
change allows the bad directory entry to be removed.
Reported-by: Antonio Huete Jimenez
#define HAMMER_TRANSF_NEWINODE 0x0001
#define HAMMER_TRANSF_DIDIO 0x0002
+#define HAMMER_TRANSF_CRCDOM 0x0004 /* EDOM on CRC error, less critical */
/*
* HAMMER locks
#define HAMMER_INODE_ATIME 0x00100000 /* in-memory atime modified */
#define HAMMER_INODE_MTIME 0x00200000 /* in-memory mtime modified */
#define HAMMER_INODE_WOULDBLOCK 0x00400000 /* re-issue to new flush group */
+#define HAMMER_INODE_DUMMY 0x00800000 /* dummy inode covering bad file */
#define HAMMER_INODE_MODMASK (HAMMER_INODE_DDIRTY| \
HAMMER_INODE_XDIRTY|HAMMER_INODE_BUFS| \
#define HAMMER_NODE_CRCGOOD 0x0004
#define HAMMER_NODE_NEEDSCRC 0x0008
#define HAMMER_NODE_NEEDSMIRROR 0x0010
+#define HAMMER_NODE_CRCBAD 0x0020
+
+#define HAMMER_NODE_CRCANY (HAMMER_NODE_CRCGOOD | HAMMER_NODE_CRCBAD)
typedef struct hammer_node *hammer_node_t;
hammer_inode_t dip, int64_t obj_id,
hammer_tid_t asof, u_int32_t localization,
int flags, int *errorp);
+struct hammer_inode *hammer_get_dummy_inode(hammer_transaction_t trans,
+ hammer_inode_t dip, int64_t obj_id,
+ hammer_tid_t asof, u_int32_t localization,
+ int flags, int *errorp);
void hammer_scan_inode_snapshots(hammer_mount_t hmp,
hammer_inode_info_t iinfo,
int (*callback)(hammer_inode_t ip, void *data),
hammer_node_t hammer_get_node(hammer_transaction_t trans,
hammer_off_t node_offset, int isnew, int *errorp);
void hammer_ref_node(hammer_node_t node);
-hammer_node_t hammer_ref_node_safe(struct hammer_mount *hmp,
+hammer_node_t hammer_ref_node_safe(hammer_transaction_t trans,
hammer_node_cache_t cache, int *errorp);
void hammer_rel_node(hammer_node_t node);
void hammer_delete_node(hammer_transaction_t trans,
/*
 * Modify a B-Tree node by calling hammer_modify_buffer() on the node's
 * buffer with a NULL base and zero length (presumably meaning that no undo
 * range is recorded -- confirm against hammer_modify_buffer()).
 */
static __inline void
hammer_modify_node_noundo(hammer_transaction_t trans, hammer_node_t node)
{
+ KKASSERT((node->flags & HAMMER_NODE_CRCBAD) == 0); /* refuse to modify a node flagged CRCBAD */
hammer_modify_buffer(trans, node->buffer, NULL, 0);
}
/*
 * Modify a B-Tree node by calling hammer_modify_buffer() on the node's
 * buffer, passing the entire on-disk node image (node->ondisk,
 * sizeof(*node->ondisk)) as the affected range.
 */
static __inline void
hammer_modify_node_all(hammer_transaction_t trans, struct hammer_node *node)
{
+ KKASSERT((node->flags & HAMMER_NODE_CRCBAD) == 0); /* refuse to modify a node flagged CRCBAD */
hammer_modify_buffer(trans, node->buffer,
node->ondisk, sizeof(*node->ondisk));
}
KKASSERT((char *)base >= (char *)node->ondisk &&
(char *)base + len <=
(char *)node->ondisk + sizeof(*node->ondisk));
+ KKASSERT((node->flags & HAMMER_NODE_CRCBAD) == 0);
hammer_modify_buffer(trans, node->buffer, base, len);
crcptr = &node->ondisk->crc;
hammer_modify_buffer(trans, node->buffer, crcptr, sizeof(hammer_crc_t));
if (hammer_crc_test_leaf(cursor->data, &elm->leaf) == 0) {
kprintf("CRC DATA @ %016llx/%d FAILED\n",
elm->leaf.data_offset, elm->leaf.data_len);
- Debugger("CRC FAILED: DATA");
+ if (hammer_debug_debug & 0x0001)
+ Debugger("CRC FAILED: DATA");
+ if (cursor->trans->flags & HAMMER_TRANSF_CRCDOM)
+ error = EDOM; /* less critical (mirroring) */
+ else
+ error = EIO; /* critical */
}
return(error);
}
* Step 1 - acquire a locked node from the cache if possible
*/
if (cache && cache->node) {
- node = hammer_ref_node_safe(trans->hmp, cache, &error);
+ node = hammer_ref_node_safe(trans, cache, &error);
if (error == 0) {
hammer_lock_sh(&node->lock);
if (node->flags & HAMMER_NODE_DELETED) {
* of inodes. Otherwise we can continue to * add new inodes
* faster then they can be disposed of, even with the tsleep
* delay.
+ *
+ * If we find a dummy inode we return a failure so dounlink
+ * (which does another lookup) doesn't try to mess with the
+ * link count. hammer_vop_nresolve() uses hammer_get_dummy_inode()
+ * to ref dummy inodes.
*/
iinfo.obj_id = obj_id;
iinfo.obj_asof = asof;
loop:
ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
if (ip) {
-#if 0
- if (ip->vp == NULL)
- trans->flags |= HAMMER_TRANSF_NEWINODE;
-#endif
+ if (ip->flags & HAMMER_INODE_DUMMY) {
+ *errorp = ENOENT;
+ return(NULL);
+ }
hammer_ref(&ip->lock);
*errorp = 0;
return(ip);
}
/*
+ * Get a dummy inode to placemark a broken directory entry.
+ *
+ * Looks up (obj_id, asof, localization) in the in-memory inode tree. If
+ * a dummy inode is already cached it is referenced and returned. If a
+ * non-dummy inode is cached, NULL is returned with *errorp set to ENOENT.
+ * Otherwise a new in-memory inode flagged HAMMER_INODE_RO and
+ * HAMMER_INODE_DUMMY is allocated, given object type FIFO, inserted into
+ * the tree and returned with a reference held.
+ *
+ * dip may be NULL. When it is non-NULL and has the same localization as
+ * the new inode, its pseudofs structure is shared (and referenced) instead
+ * of calling hammer_load_pseudofs().
+ */
+struct hammer_inode *
+hammer_get_dummy_inode(hammer_transaction_t trans, hammer_inode_t dip,
+ int64_t obj_id, hammer_tid_t asof, u_int32_t localization,
+ int flags, int *errorp)
+{
+ hammer_mount_t hmp = trans->hmp;
+ struct hammer_inode_info iinfo;
+ struct hammer_inode *ip;
+
+ /*
+ * Determine if we already have an inode cached. If we do then
+ * we are golden.
+ *
+ * If we find an inode with no vnode we have to mark the
+ * transaction such that hammer_inode_waitreclaims() is
+ * called later on to avoid building up an infinite number
+ * of inodes. Otherwise we can continue to add new inodes
+ * faster than they can be disposed of, even with the tsleep
+ * delay.
+ *
+ * NOTE(review): the cache-hit path below does not touch
+ * trans->flags; HAMMER_TRANSF_NEWINODE is only set after a new
+ * inode has been allocated.
+ *
+ * If we find a non-dummy inode we return an error. Only dummy
+ * inodes can be returned by this routine.
+ */
+ iinfo.obj_id = obj_id;
+ iinfo.obj_asof = asof;
+ iinfo.obj_localization = localization;
+loop:
+ *errorp = 0;
+ ip = hammer_ino_rb_tree_RB_LOOKUP_INFO(&hmp->rb_inos_root, &iinfo);
+ if (ip) {
+ if ((ip->flags & HAMMER_INODE_DUMMY) == 0) {
+ *errorp = ENOENT;
+ return(NULL);
+ }
+ hammer_ref(&ip->lock);
+ return(ip);
+ }
+
+ /*
+ * Allocate a new inode structure and deal with races later.
+ * The caller's flags are combined with HAMMER_INODE_RO and
+ * HAMMER_INODE_DUMMY, and one reference is taken for the caller.
+ */
+ ip = kmalloc(sizeof(*ip), hmp->m_inodes, M_WAITOK|M_ZERO);
+ ++hammer_count_inodes;
+ ++hmp->count_inodes;
+ ip->obj_id = obj_id;
+ ip->obj_asof = iinfo.obj_asof;
+ ip->obj_localization = localization;
+ ip->hmp = hmp;
+ ip->flags = flags | HAMMER_INODE_RO | HAMMER_INODE_DUMMY;
+ ip->cache[0].ip = ip;
+ ip->cache[1].ip = ip;
+ ip->sync_trunc_off = ip->trunc_off = ip->save_trunc_off =
+ 0x7FFFFFFFFFFFFFFFLL;
+ RB_INIT(&ip->rec_tree);
+ TAILQ_INIT(&ip->target_list);
+ hammer_ref(&ip->lock);
+
+ /*
+ * Populate the dummy inode. Leave everything zero'd out.
+ *
+ * (ip->ino_leaf and ip->ino_data)
+ *
+ * Make the dummy inode a FIFO object which most copy programs
+ * will properly ignore.
+ *
+ * NOTE(review): save_trunc_off was set to 0x7FFFFFFFFFFFFFFFLL
+ * above and is overwritten here with ino_data.size, which has not
+ * been assigned since the M_ZERO allocation.
+ */
+ ip->save_trunc_off = ip->ino_data.size;
+ ip->ino_data.obj_type = HAMMER_OBJTYPE_FIFO;
+
+ /*
+ * Locate and assign the pseudofs management structure to
+ * the inode.
+ *
+ * Any error reported by hammer_load_pseudofs() is deliberately
+ * discarded by resetting *errorp to 0.
+ *
+ * NOTE(review): ip->pfsm is not checked for NULL here -- confirm
+ * that hammer_load_pseudofs() always returns a structure.
+ */
+ if (dip && dip->obj_localization == ip->obj_localization) {
+ ip->pfsm = dip->pfsm;
+ hammer_ref(&ip->pfsm->lock);
+ } else {
+ ip->pfsm = hammer_load_pseudofs(trans, ip->obj_localization,
+ errorp);
+ *errorp = 0; /* ignore ENOENT */
+ }
+
+ /*
+ * The inode is placed on the red-black tree. If this races
+ * another instantiation/lookup the insertion will fail, in which
+ * case our copy is freed and the lookup is retried.
+ *
+ * NOTE: Do not set HAMMER_INODE_ONDISK. The inode is a fake.
+ *
+ * NOTE(review): *errorp is always 0 at this point (it is cleared
+ * at loop: and again after hammer_load_pseudofs()), so the else
+ * branch below appears to be unreachable as written.
+ */
+ if (*errorp == 0) {
+ if (RB_INSERT(hammer_ino_rb_tree, &hmp->rb_inos_root, ip)) {
+ hammer_free_inode(ip);
+ goto loop;
+ }
+ } else {
+ if (ip->flags & HAMMER_INODE_RSV_INODES) {
+ ip->flags &= ~HAMMER_INODE_RSV_INODES; /* sanity */
+ --hmp->rsv_inodes;
+ }
+ hammer_free_inode(ip);
+ ip = NULL;
+ }
+ trans->flags |= HAMMER_TRANSF_NEWINODE;
+ return (ip);
+}
+
+/*
* Create a new filesystem object, returning the inode in *ipp. The
* returned inode will be referenced. The inode is created in-memory.
*
* out from under us.
*/
if (error == 0) {
- tmp_node = hammer_ref_node_safe(ip->hmp, &ip->cache[0], &error);
+ tmp_node = hammer_ref_node_safe(trans, &ip->cache[0], &error);
if (tmp_node) {
hammer_cursor_downgrade(&cursor);
hammer_lock_sh(&tmp_node->lock);
typedef union hammer_ioc_mrecord_any *hammer_ioc_mrecord_any_t;
+/*
+ * MREC types. Flags are in the upper 16 bits but some are also included
+ * in the type mask to force them into any switch() on the type.
+ *
+ * NOTE: Any record whose data is CRC-errored will have
+ * HAMMER_MRECF_CRC_ERROR set, and the bit is also part of the type mask.
+ */
#define HAMMER_MREC_TYPE_RESERVED 0
#define HAMMER_MREC_TYPE_REC 1 /* record w/ data */
#define HAMMER_MREC_TYPE_PFSD 2 /* (userland only) */
#define HAMMER_MREC_TYPE_TERM 7 /* (userland only) */
#define HAMMER_MREC_TYPE_IDLE 8 /* (userland only) */
+#define HAMMER_MREC_TYPE_REC_BADCRC (HAMMER_MREC_TYPE_REC | \
+ HAMMER_MRECF_CRC_ERROR)
+
+#define HAMMER_MRECF_TYPE_LOMASK 0x000000FF
+#define HAMMER_MRECF_TYPE_MASK 0x800000FF
+#define HAMMER_MRECF_CRC_ERROR 0x80000000
+
+#define HAMMER_MRECF_DATA_CRC_BAD 0x40000000
+#define HAMMER_MRECF_RECD_CRC_BAD 0x20000000
+
#define HAMMER_MREC_CRCOFF (offsetof(struct hammer_ioc_mrecord_head, rec_size))
#define HAMMER_MREC_HEADSIZE sizeof(struct hammer_ioc_mrecord_head)
int data_len;
int bytes;
int eatdisk;
+ int mrec_flags;
u_int32_t localization;
u_int32_t rec_crc;
bzero(&mrec, sizeof(mrec));
bzero(&cmirror, sizeof(cmirror));
+ /*
+ * Make CRC errors non-fatal (at least on data), causing an EDOM
+ * error instead of EIO.
+ */
+ trans->flags |= HAMMER_TRANSF_CRCDOM;
+
retry:
error = hammer_init_cursor(trans, &cursor, NULL, NULL);
if (error) {
/*
* The core code exports the data to userland.
+ *
+ * CRC errors on data are reported but passed through,
+ * but the data must be washed by the user program.
*/
+ mrec_flags = 0;
data_len = (elm->data_offset) ? elm->data_len : 0;
if (data_len) {
error = hammer_btree_extract(&cursor,
HAMMER_CURSOR_GET_DATA);
- if (error)
- break;
+ if (error) {
+ if (error != EDOM)
+ break;
+ mrec_flags |= HAMMER_MRECF_CRC_ERROR |
+ HAMMER_MRECF_DATA_CRC_BAD;
+ }
}
bytes = sizeof(mrec.rec) + data_len;
* userland and delete_tid is cleared.
*/
mrec.head.signature = HAMMER_IOC_MIRROR_SIGNATURE;
- mrec.head.type = HAMMER_MREC_TYPE_REC;
+ mrec.head.type = HAMMER_MREC_TYPE_REC | mrec_flags;
mrec.head.rec_size = bytes;
mrec.rec.leaf = *elm;
+
if (elm->base.delete_tid > mirror->tid_end)
mrec.rec.leaf.base.delete_tid = 0;
rec_crc = crc32(&mrec.head.rec_size,
break;
}
- switch(mrec.head.type) {
+ switch(mrec.head.type & HAMMER_MRECF_TYPE_MASK) {
case HAMMER_MREC_TYPE_SKIP:
if (mrec.head.rec_size != sizeof(mrec.skip))
error = EINVAL;
if (error == 0)
error = hammer_ioc_mirror_write_rec(&cursor, &mrec.rec, mirror, localization, uptr + sizeof(mrec.rec));
break;
+ case HAMMER_MREC_TYPE_REC_BADCRC:
+ /*
+ * Records with bad data payloads are ignored XXX.
+ */
+ if (mrec.head.rec_size < sizeof(mrec.rec))
+ error = EINVAL;
+ break;
case HAMMER_MREC_TYPE_PASS:
if (mrec.head.rec_size != sizeof(mrec.rec))
error = EINVAL;
* cursor must be seeked to the directory entry record being deleted.
*
* The related inode should be share-locked by the caller. The caller is
- * on the frontend.
+ * on the frontend. It could also be NULL indicating that the directory
+ * entry being removed has no related inode.
*
* This function can return EDEADLK requiring the caller to terminate
* the cursor, any locks, wait on the returned record, and retry.
record->type = HAMMER_MEM_RECORD_DEL;
record->leaf.base = cursor->leaf->base;
+ /*
+ * ip may be NULL, indicating the deletion of a directory
+ * entry which has no related inode.
+ */
record->target_ip = ip;
- record->flush_state = HAMMER_FST_SETUP;
- TAILQ_INSERT_TAIL(&ip->target_list, record, target_entry);
+ if (ip) {
+ record->flush_state = HAMMER_FST_SETUP;
+ TAILQ_INSERT_TAIL(&ip->target_list, record,
+ target_entry);
+ } else {
+ record->flush_state = HAMMER_FST_IDLE;
+ }
/*
* The inode now has a dependancy and must be taken out of
* reflush when the dependancies are disposed of if someone
* is waiting on the inode.
*/
- if (ip->flush_state == HAMMER_FST_IDLE) {
+ if (ip && ip->flush_state == HAMMER_FST_IDLE) {
hammer_ref(&ip->lock);
ip->flush_state = HAMMER_FST_SETUP;
if (ip->flags & HAMMER_INODE_FLUSHW)
* on-media until we unmount.
*/
if (error == 0) {
- --ip->ino_data.nlinks;
+ if (ip)
+ --ip->ino_data.nlinks; /* do before we might block */
dip->ino_data.mtime = trans->time;
hammer_modify_inode(dip, HAMMER_INODE_MTIME);
- hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
- if (ip->ino_data.nlinks == 0 &&
- (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
- hammer_done_cursor(cursor);
- hammer_inode_unloadable_check(ip, 1);
- hammer_flush_inode(ip, 0);
+ if (ip) {
+ hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
+ if (ip->ino_data.nlinks == 0 &&
+ (ip->vp == NULL || (ip->vp->v_flag & VINACTIVE))) {
+ hammer_done_cursor(cursor);
+ hammer_inode_unloadable_check(ip, 1);
+ hammer_flush_inode(ip, 0);
+ }
}
}
static void hammer_free_volume(hammer_volume_t volume);
static int hammer_load_volume(hammer_volume_t volume);
static int hammer_load_buffer(hammer_buffer_t buffer, int isnew);
-static int hammer_load_node(hammer_node_t node, int isnew);
+static int hammer_load_node(hammer_transaction_t trans,
+ hammer_node_t node, int isnew);
static int
hammer_vol_rb_compare(hammer_volume_t vol1, hammer_volume_t vol2)
if (node->ondisk) {
*errorp = 0;
} else {
- *errorp = hammer_load_node(node, isnew);
+ *errorp = hammer_load_node(trans, node, isnew);
trans->flags |= HAMMER_TRANSF_DIDIO;
}
if (*errorp) {
* Load a node's on-disk data reference.
*/
static int
-hammer_load_node(hammer_node_t node, int isnew)
+hammer_load_node(hammer_transaction_t trans, hammer_node_t node, int isnew)
{
hammer_buffer_t buffer;
hammer_off_t buf_offset;
goto failed;
node->ondisk = (void *)((char *)buffer->ondisk +
(node->node_offset & HAMMER_BUFMASK));
+
+ /*
+ * Check CRC. NOTE: Neither flag is set and the CRC is not
+ * generated on new B-Tree nodes.
+ */
if (isnew == 0 &&
- (node->flags & HAMMER_NODE_CRCGOOD) == 0) {
- if (hammer_crc_test_btree(node->ondisk) == 0)
- Debugger("CRC FAILED: B-TREE NODE");
- node->flags |= HAMMER_NODE_CRCGOOD;
+ (node->flags & HAMMER_NODE_CRCANY) == 0) {
+ if (hammer_crc_test_btree(node->ondisk) == 0) {
+ if (hammer_debug_debug & 0x0002)
+ Debugger("CRC FAILED: B-TREE NODE");
+ node->flags |= HAMMER_NODE_CRCBAD;
+ } else {
+ node->flags |= HAMMER_NODE_CRCGOOD;
+ }
}
}
+ if (node->flags & HAMMER_NODE_CRCBAD) {
+ if (trans->flags & HAMMER_TRANSF_CRCDOM)
+ error = EDOM;
+ else
+ error = EIO;
+ }
failed:
--node->loading;
hammer_unlock(&node->lock);
* Safely reference a node, interlock against flushes via the IO subsystem.
*/
hammer_node_t
-hammer_ref_node_safe(struct hammer_mount *hmp, hammer_node_cache_t cache,
+hammer_ref_node_safe(hammer_transaction_t trans, hammer_node_cache_t cache,
int *errorp)
{
hammer_node_t node;
node = cache->node;
if (node != NULL) {
hammer_ref(&node->lock);
- if (node->ondisk)
- *errorp = 0;
- else
- *errorp = hammer_load_node(node, 0);
+ if (node->ondisk) {
+ if (node->flags & HAMMER_NODE_CRCBAD) {
+ if (trans->flags & HAMMER_TRANSF_CRCDOM)
+ *errorp = EDOM;
+ else
+ *errorp = EIO;
+ } else {
+ *errorp = 0;
+ }
+ } else {
+ *errorp = hammer_load_node(trans, node, 0);
+ }
if (*errorp) {
hammer_rel_node(node);
node = NULL;
}
}
hammer_done_cursor(&cursor);
+
+ /*
+ * Lookup the obj_id. This should always succeed. If it does not
+ * the filesystem may be damaged and we return a dummy inode.
+ */
if (error == 0) {
ip = hammer_get_inode(&trans, dip, obj_id,
asof, localization,
flags, &error);
+ if (error == ENOENT) {
+ kprintf("HAMMER: WARNING: Missing "
+ "inode for dirent \"%s\"\n"
+ "\tobj_id = %016llx\n",
+ ncp->nc_name, (long long)obj_id);
+ error = 0;
+ ip = hammer_get_dummy_inode(&trans, dip, obj_id,
+ asof, localization,
+ flags, &error);
+ }
if (error == 0) {
error = hammer_get_vnode(ip, &vp);
hammer_rel_inode(ip, 0);
/*
* Handle artificial entries
+ *
+ * It should be noted that the minimum value for a directory
+ * hash key on-media is 0x0000000100000000, so we can use anything
+ * less than that to represent our 'special' key space.
*/
error = 0;
if (saveoff == 0) {
0, &error);
hammer_lock_sh(&cursor.ip->lock);
if (error == ENOENT) {
- kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
- Debugger("ENOENT unlinking object that should exist");
+ kprintf("HAMMER: WARNING: Removing "
+ "dirent w/missing inode \"%s\"\n"
+ "\tobj_id = %016llx\n",
+ ncp->nc_name,
+ (long long)cursor.data->entry.obj_id);
+ error = 0;
}
/*
* If isdir >= 0 we validate that the entry is or is not a
* directory. If isdir < 0 we don't care.
*/
- if (error == 0 && isdir >= 0) {
+ if (error == 0 && isdir >= 0 && ip) {
if (isdir &&
ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
error = ENOTDIR;
* If any changes whatsoever have been made to the cursor
* set EDEADLK and retry.
*/
- if (error == 0 && ip->ino_data.obj_type ==
- HAMMER_OBJTYPE_DIRECTORY) {
+ if (error == 0 && ip && ip->ino_data.obj_type ==
+ HAMMER_OBJTYPE_DIRECTORY) {
hammer_unlock_cursor(&cursor);
error = hammer_ip_check_directory_empty(trans, ip);
hammer_lock_cursor(&cursor);
cache_setunresolved(nch);
cache_setvp(nch, NULL);
/* XXX locking */
- if (ip->vp) {
+ if (ip && ip->vp) {
hammer_knote(ip->vp, NOTE_DELETE);
cache_inval_vp(ip->vp, CINV_DESTROY);
}