From: Matthew Dillon Date: Thu, 24 Aug 2017 10:38:37 +0000 (-0700) Subject: hammer2 - Refactor dedup, fixes, optimizations X-Git-Tag: v5.1.0~206 X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/3d4f397a6161517fc47af871d6b9cbcc5e6b9443 hammer2 - Refactor dedup, fixes, optimizations * Refactor the dedup code, reducing tracking complexity. Also note that we cannot really depend on VM page caching for dedup tests. Document why using bread() is better. * Use a larger dedup heuristic table to improve dedup matching. * Improve hammer2_io_getquick(), fix a bug where the dedup_delete code was improperly using it and could sometimes miss a dio structure due to the underlying buffer not being fully cached. * Cap out at 16KB indirect blocks, instead of 64KB indirect blocks. This significantly reduces meta-data overheads. * For now remove IO invalidation, it was causing corruption due to bugs. This means that deleted meta-data will be flushed. However, a certain amount of meta-data does not get immediately instantiated and file data chains are not instantiated unless the buffer cache gets flushed, so temporary files are still pretty cheap. * Try to improve DIO's LRU recycling. * Fix a brelse() that was supposed to be a bqrelse(). This improves meta-data caching which is desirable for dedup. * Implement the 'always_compress' sysctl which disables the H2 compression heuristic that tries to detect uncompressable data. If set, H2 will always try to compress. --- diff --git a/sbin/hammer2/hammer2.8 b/sbin/hammer2/hammer2.8 index e17057d132..da1498a3ae 100644 --- a/sbin/hammer2/hammer2.8 +++ b/sbin/hammer2/hammer2.8 @@ -286,6 +286,14 @@ data. Hammer2 compression is only effective when it can reduce the size of dataset (typically a 64KB block) by one or more powers of 2. A 64K block which only compresses to 40K will not yield any storage improvement. 
+.Pp +Generally speaking you do not want to set the compression mode to 'none', +as this will cause blocks of all-zeros to be written as all-zero blocks, +instead of holes. The 'autozero' compression mode detects blocks of all-zeros +and writes them as holes. However, HAMMER2 will rewrite data in-place if +the compression mode is set to 'none' and the check code is set to +'disabled'. Formal snapshots will still snapshot such files. However, +de-duplication will no longer function on the data blocks. .\" ==== setcheck ==== .It Cm setcheck Ar check Op path... Set the check code as specified for any newly created elements at or under @@ -313,6 +321,26 @@ You can specify any PFS for the mount, the bulkfree pass is run on the entire partition. Note that it takes two passes to actually free space. .El +.Sh SYSCTLS +.Bl -tag -width indent +.It Va vfs.hammer2.dedup_enable (default on) +Enables live de-duplication. Any recently read data that is on-media +(already synchronized to media) is tested against pending writes for +compatibility. If a match is found, the write will reference the +existing on-media data instead of writing new data. +.It Va vfs.hammer2.always_compress (default off) +This disables the H2 compression heuristic and forces H2 to always +try to compress data blocks, even if they look uncompressable. +Enabling this option reduces performance but has higher de-duplication +repeatability. +.It Va vfs.hammer2.cluster_read (default 4) +Set the amount of read-ahead clustering to perform. +.It Va vfs.hammer2.cluster_write (default 0) +Set the amount of write-behind clustering to perform. This is disabled by +default in order to give temporary files a chance to be deleted before +media writes are committed. Enabling this reduces buffer cache stress +but causes file writes to flush to media more quickly. 
+.El .Sh SETTING UP /etc/hammer2 The 'rsainit' directive will create the .Pa /etc/hammer2 diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h index 3357686777..92a92adc14 100644 --- a/sys/vfs/hammer2/hammer2.h +++ b/sys/vfs/hammer2/hammer2.h @@ -292,8 +292,9 @@ typedef struct hammer2_iocb hammer2_iocb_t; /* * DIO - Management structure wrapping system buffer cache. * - * Used for multiple purposes including concurrent management - * if small requests by chains into larger DIOs. + * HAMMER2 uses an I/O abstraction that allows it to cache and manipulate + * fixed-sized filesystem buffers fronted by variable-sized hammer2_chain + * structures. */ struct hammer2_io { RB_ENTRY(hammer2_io) rbnode; /* indexed by device offset */ @@ -306,7 +307,9 @@ struct hammer2_io { int psize; int act; /* activity */ int btype; /* approximate BREF_TYPE_* */ - int unused01; + int ticks; + uint64_t invalid_mask; /* area that is invalid on-disk */ + uint64_t dedup_ok_mask; /* ok to dedup */ }; typedef struct hammer2_io hammer2_io_t; @@ -315,13 +318,9 @@ typedef struct hammer2_io hammer2_io_t; #define HAMMER2_DIO_GOOD 0x4000000000000000LLU /* dio->bp is stable */ #define HAMMER2_DIO_WAITING 0x2000000000000000LLU /* wait on INPROG */ #define HAMMER2_DIO_DIRTY 0x1000000000000000LLU /* flush last drop */ -#define HAMMER2_DIO_INVALOK 0x0800000000000000LLU /* ok to inval */ -#define HAMMER2_DIO_INVAL 0x0400000000000000LLU /* inval request */ #define HAMMER2_DIO_MASK 0x00FFFFFFFFFFFFFFLLU -#define HAMMER2_DIO_INVALBITS (HAMMER2_DIO_INVAL | HAMMER2_DIO_INVALOK) - /* * Primary chain structure keeps track of the topology in-memory. 
*/ @@ -379,7 +378,7 @@ RB_PROTOTYPE(hammer2_chain_tree, hammer2_chain, rbnode, hammer2_chain_cmp); #define HAMMER2_CHAIN_MODIFIED 0x00000001 /* dirty chain data */ #define HAMMER2_CHAIN_ALLOCATED 0x00000002 /* kmalloc'd chain */ #define HAMMER2_CHAIN_DESTROY 0x00000004 -#define HAMMER2_CHAIN_DEDUP 0x00000008 /* recorded for dedup */ +#define HAMMER2_CHAIN_UNUSED0008 0x00000008 #define HAMMER2_CHAIN_DELETED 0x00000010 /* deleted chain */ #define HAMMER2_CHAIN_INITIAL 0x00000020 /* initial create */ #define HAMMER2_CHAIN_UPDATE 0x00000040 /* need parent update */ @@ -783,7 +782,7 @@ typedef struct hammer2_trans hammer2_trans_t; #define HAMMER2_FREEMAP_HEUR_SIZE (HAMMER2_FREEMAP_HEUR_NRADIX * \ HAMMER2_FREEMAP_HEUR_TYPES) -#define HAMMER2_DEDUP_HEUR_SIZE 65536 +#define HAMMER2_DEDUP_HEUR_SIZE (65536 * 4) #define HAMMER2_DEDUP_HEUR_MASK (HAMMER2_DEDUP_HEUR_SIZE - 1) #define HAMMER2_FLUSH_TOP 0x0001 @@ -1076,7 +1075,7 @@ struct hammer2_dev { int nipstacks; int maxipstacks; kdmsg_iocom_t iocom; /* volume-level dmsg interface */ - struct spinlock io_spin; /* iotree access */ + struct spinlock io_spin; /* iotree, iolruq access */ struct hammer2_io_tree iotree; int iofree_count; hammer2_chain_t vchain; /* anchor chain (topology) */ @@ -1287,10 +1286,12 @@ extern int hammer2_debug; extern int hammer2_cluster_read; extern int hammer2_cluster_write; extern int hammer2_dedup_enable; +extern int hammer2_always_compress; extern int hammer2_inval_enable; extern int hammer2_flush_pipe; extern int hammer2_synchronous_flush; extern int hammer2_dio_count; +extern int hammer2_limit_dio; extern long hammer2_chain_allocs; extern long hammer2_chain_frees; extern long hammer2_limit_dirty_chains; @@ -1494,7 +1495,12 @@ hammer2_tid_t hammer2_trans_sub(hammer2_pfs_t *pmp); void hammer2_trans_done(hammer2_pfs_t *pmp); hammer2_tid_t hammer2_trans_newinum(hammer2_pfs_t *pmp); void hammer2_trans_assert_strategy(hammer2_pfs_t *pmp); -void hammer2_dedup_record(hammer2_chain_t *chain, char *data); 
+void hammer2_dedup_record(hammer2_chain_t *chain, hammer2_io_t *dio, + char *data); +void hammer2_dedup_delete(hammer2_dev_t *hmp, hammer2_off_t data_off, + u_int bytes); +void hammer2_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, + u_int bytes); /* * hammer2_ioctl.c @@ -1506,10 +1512,11 @@ int hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data, * hammer2_io.c */ void hammer2_io_putblk(hammer2_io_t **diop); +void hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes); void hammer2_io_cleanup(hammer2_dev_t *hmp, struct hammer2_io_tree *tree); char *hammer2_io_data(hammer2_io_t *dio, off_t lbase); -hammer2_io_t *hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize); -void hammer2_io_resetinval(hammer2_dev_t *hmp, off_t lbase); +hammer2_io_t *hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize, + int notgood); void hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, hammer2_iocb_t *iocb); void hammer2_io_complete(hammer2_iocb_t *iocb); @@ -1527,7 +1534,6 @@ void hammer2_io_bdwrite(hammer2_io_t **diop); int hammer2_io_bwrite(hammer2_io_t **diop); int hammer2_io_isdirty(hammer2_io_t *dio); void hammer2_io_setdirty(hammer2_io_t *dio); -void hammer2_io_setinval(hammer2_io_t *dio, hammer2_off_t off, u_int bytes); void hammer2_io_brelse(hammer2_io_t **diop); void hammer2_io_bqrelse(hammer2_io_t **diop); int hammer2_io_crc_good(hammer2_chain_t *chain, uint64_t *maskp); diff --git a/sys/vfs/hammer2/hammer2_bulkfree.c b/sys/vfs/hammer2/hammer2_bulkfree.c index e6aca816c1..b5132e69ab 100644 --- a/sys/vfs/hammer2/hammer2_bulkfree.c +++ b/sys/vfs/hammer2/hammer2_bulkfree.c @@ -792,21 +792,6 @@ next: } } -/* - * When bulkfree is finally able to free a block it must make sure that - * the INVALOK bit in any cached DIO is cleared prior to the block being - * reused. 
- */ -static -void -fixup_dio(hammer2_dev_t *hmp, hammer2_off_t data_off, int bindex, int scount) -{ - data_off += (scount >> 1) * HAMMER2_FREEMAP_BLOCK_SIZE; - data_off += bindex * - (HAMMER2_FREEMAP_BLOCK_SIZE * HAMMER2_BMAP_BLOCKS_PER_ELEMENT); - hammer2_io_resetinval(hmp, data_off); -} - /* * Merge the bulkfree bitmap against the existing bitmap. * @@ -821,14 +806,19 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, { int bindex; int scount; + hammer2_off_t tmp_off; hammer2_bitmap_t lmask; hammer2_bitmap_t mmask; + tmp_off = data_off; + for (bindex = 0; bindex < HAMMER2_BMAP_ELEMENTS; ++bindex) { lmask = live->bitmapq[bindex]; /* live */ mmask = bmap->bitmapq[bindex]; /* snapshotted bulkfree */ - if (lmask == mmask) + if (lmask == mmask) { + tmp_off += HAMMER2_BMAP_INDEX_SIZE; continue; + } for (scount = 0; scount < HAMMER2_BMAP_BITS_PER_ELEMENT; @@ -864,15 +854,21 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, cbinfo->adj_free += HAMMER2_FREEMAP_BLOCK_SIZE; ++cbinfo->count_10_00; - fixup_dio(cbinfo->hmp, data_off, - bindex, scount); + hammer2_dedup_assert( + cbinfo->hmp, + tmp_off | + HAMMER2_FREEMAP_BLOCK_RADIX, + HAMMER2_FREEMAP_BLOCK_SIZE); break; case 3: /* 11 -> 10 */ live->bitmapq[bindex] &= ~((hammer2_bitmap_t)1 << scount); ++cbinfo->count_11_10; - fixup_dio(cbinfo->hmp, data_off, - bindex, scount); + hammer2_dedup_delete( + cbinfo->hmp, + tmp_off | + HAMMER2_FREEMAP_BLOCK_RADIX, + HAMMER2_FREEMAP_BLOCK_SIZE); break; } } else if ((mmask & 3) == 3) { @@ -905,11 +901,10 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, } live->bitmapq[bindex] |= ((hammer2_bitmap_t)3 << scount); - fixup_dio(cbinfo->hmp, data_off, - bindex, scount); } mmask >>= 2; lmask >>= 2; + tmp_off += HAMMER2_FREEMAP_BLOCK_SIZE; } } @@ -929,10 +924,25 @@ h2_bulkfree_sync_adjust(hammer2_bulkfree_info_t *cbinfo, live->class = 0; live->linear = 0; ++cbinfo->count_l0cleans; +#if 0 + hammer2_dedup_assert(cbinfo->hmp, + data_off | + 
HAMMER2_FREEMAP_LEVEL0_RADIX, + HAMMER2_FREEMAP_LEVEL0_SIZE); +#endif } else if (bindex < 7) { + int32_t nlinear; + ++bindex; + if (live->linear > bindex * HAMMER2_FREEMAP_BLOCK_SIZE) { - live->linear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE; + nlinear = bindex * HAMMER2_FREEMAP_BLOCK_SIZE; +#if 0 + hammer2_dedup_assert(cbinfo->hmp, + data_off + nlinear, + live->linear - nlinear); +#endif + live->linear = nlinear; ++cbinfo->count_linadjusts; } diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c index 644544c4ac..d6011137e7 100644 --- a/sys/vfs/hammer2/hammer2_chain.c +++ b/sys/vfs/hammer2/hammer2_chain.c @@ -488,18 +488,16 @@ hammer2_chain_lastdrop(hammer2_chain_t *chain) atomic_clear_int(&chain->flags, HAMMER2_CHAIN_UPDATE); /* - * If the chain has children or if it has been MODIFIED and - * also recorded for DEDUP, we must still flush the chain. + * If the chain has children we must still flush the chain. + * Any dedup is already handled by the underlying DIO, so + * we do not have to specifically flush it here. * * In the case where it has children, the DESTROY flag test * in the flush code will prevent unnecessary flushes of * MODIFIED chains that are not flagged DEDUP so don't worry * about that here. */ - if (chain->core.chain_count || - (chain->flags & (HAMMER2_CHAIN_MODIFIED | - HAMMER2_CHAIN_DEDUP)) == - (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_DEDUP)) { + if (chain->core.chain_count) { /* * Put on flushq (should ensure refs > 1), retry * the drop. @@ -1425,8 +1423,6 @@ static __inline int modified_needs_new_allocation(hammer2_chain_t *chain) { - hammer2_io_t *dio; - /* * We only live-dedup data, we do not live-dedup meta-data. */ @@ -1441,6 +1437,10 @@ modified_needs_new_allocation(hammer2_chain_t *chain) if (chain->bytes == 0) return 0; + return 0; + +#if 0 + hammer2_io_t *dio; /* * If this flag is not set the current modification has not been * recorded for dedup so a new allocation is not needed. 
The @@ -1474,6 +1474,7 @@ modified_needs_new_allocation(hammer2_chain_t *chain) } } return 1; +#endif } /* @@ -1592,7 +1593,9 @@ hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid, * containing the caller's desired data. The dedup offset is * allowed to be in a partially free state and we must be sure * to reset it to a fully allocated state to force two bulkfree - * passes to free it again. + * passes to free it again. The chain will not be marked MODIFIED + * in the dedup case, as the dedup data cannot be changed without + * a new allocation. * * NOTE: Only applicable when chain->bytes != 0. * @@ -1608,14 +1611,16 @@ hammer2_chain_modify(hammer2_chain_t *chain, hammer2_tid_t mtid, chain->bref.data_off = dedup_off; chain->bytes = 1 << (dedup_off & HAMMER2_OFF_MASK_RADIX); - atomic_set_int(&chain->flags, - HAMMER2_CHAIN_DEDUP); + atomic_clear_int(&chain->flags, + HAMMER2_CHAIN_MODIFIED); + atomic_add_long(&hammer2_count_modified_chains, + -1); + if (chain->pmp) + hammer2_pfs_memory_wakeup(chain->pmp); hammer2_freemap_adjust(hmp, &chain->bref, HAMMER2_FREEMAP_DORECOVER); } else { hammer2_freemap_alloc(chain, chain->bytes); - atomic_clear_int(&chain->flags, - HAMMER2_CHAIN_DEDUP); } /* XXX failed allocation */ } @@ -3440,7 +3445,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent, /* * How big should our new indirect block be? It has to be at least - * as large as its parent. + * as large as its parent for splits to work properly. * * The freemap uses a specific indirect block size. The number of * levels are built dynamically and ultimately depend on the size @@ -3449,16 +3454,22 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent, * much to save disk space. * * The first indirect block level for a directory usually uses - * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries). - * (the 4 entries built-into the inode can handle 4 directory - * entries) + * HAMMER2_IND_BYTES_MIN (4KB = 32 directory entries). 
Due to + * the hash mechanism, this typically gives us a nominal + * 32 * 4 entries with one level of indirection. * - * The first indirect block level for a file usually uses - * HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs = ~8MB file). - * (the 4 entries built-into the inode can handle a 256KB file). + * We use HAMMER2_IND_BYTES_NOM (16KB = 128 blockrefs) for FILE + * indirect blocks. The initial 4 entries in the inode gives us + * 256KB. Up to 4 indirect blocks gives us 32MB. Three levels + * of indirection gives us 137GB, and so forth. H2 can support + * huge file sizes but they are not typical, so we try to stick + * with compactness and do not use a larger indirect block size. * - * The first indirect block level down from an inode typically - * uses LBUFSIZE (16384), else it uses PBUFSIZE (65536). + * We could use 64KB (PBUFSIZE), giving us 512 blockrefs, but + * due to the way indirect blocks are created this usually winds + * up being extremely inefficient for small files. Even though + * 16KB requires more levels of indirection for very large files, + * the 16KB records can be ganged together into 64KB DIOs. */ if (for_type == HAMMER2_BREF_TYPE_FREEMAP_NODE || for_type == HAMMER2_BREF_TYPE_FREEMAP_LEAF) { @@ -3471,7 +3482,7 @@ hammer2_chain_create_indirect(hammer2_chain_t *parent, nbytes = HAMMER2_IND_BYTES_NOM; /* 16KB = ~8MB file */ } else { - nbytes = HAMMER2_IND_BYTES_MAX; + nbytes = HAMMER2_IND_BYTES_NOM; } if (nbytes < count * sizeof(hammer2_blockref_t)) { KKASSERT(for_type != HAMMER2_BREF_TYPE_FREEMAP_NODE && diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c index 7924647fd8..f096b6fd10 100644 --- a/sys/vfs/hammer2/hammer2_flush.c +++ b/sys/vfs/hammer2/hammer2_flush.c @@ -868,25 +868,31 @@ hammer2_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain, } /* - * If the chain was destroyed try to avoid unnecessary I/O. - * The DIO system buffer may silently disallow the - * invalidation. 
+ * If the chain was destroyed try to avoid unnecessary I/O + * that might not have yet occurred. Remove the data range + * from dedup candidacy and attempt to invalidate that + * potentially dirty portion of the I/O buffer. */ if (chain->flags & HAMMER2_CHAIN_DESTROY) { + hammer2_dedup_delete(hmp, + chain->bref.data_off, + chain->bytes); +#if 0 hammer2_io_t *dio; - if (chain->dio) { - hammer2_io_setinval(chain->dio, - chain->bref.data_off, - chain->bytes); + hammer2_io_inval(chain->dio, + chain->bref.data_off, + chain->bytes); } else if ((dio = hammer2_io_getquick(hmp, chain->bref.data_off, - chain->bytes)) != NULL) { - hammer2_io_setinval(dio, - chain->bref.data_off, - chain->bytes); + chain->bytes, + 1)) != NULL) { + hammer2_io_inval(dio, + chain->bref.data_off, + chain->bytes); hammer2_io_putblk(&dio); } +#endif } } diff --git a/sys/vfs/hammer2/hammer2_io.c b/sys/vfs/hammer2/hammer2_io.c index c547e4ea9e..46c0540e3d 100644 --- a/sys/vfs/hammer2/hammer2_io.c +++ b/sys/vfs/hammer2/hammer2_io.c @@ -36,13 +36,15 @@ /* * Implements an abstraction layer for synchronous and asynchronous - * buffered device I/O. Can be used for OS-abstraction but the main + * buffered device I/O. Can be used as an OS-abstraction but the main * purpose is to allow larger buffers to be used against hammer2_chain's * using smaller allocations, without causing deadlocks. * + * The DIOs also record temporary state with limited persistence. This + * feature is used to keep track of dedupable blocks. */ static int hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg); -static void dio_write_stats_update(hammer2_io_t *dio); +static void dio_write_stats_update(hammer2_io_t *dio, struct buf *bp); static int hammer2_io_cmp(hammer2_io_t *io1, hammer2_io_t *io2) @@ -101,6 +103,7 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, hammer2_io_t *xio; off_t pbase; off_t pmask; + /* * XXX after free, buffer reuse case w/ different size can clash * with dio cache. 
Lets avoid it for now. Ultimate we need to @@ -141,6 +144,7 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, dio->psize = psize; dio->btype = iocb->btype; dio->refs = 1; + dio->act = 5; hammer2_spin_init(&dio->spin, "h2dio"); TAILQ_INIT(&dio->iocbq); hammer2_spin_ex(&hmp->io_spin); @@ -164,8 +168,9 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, */ iocb->dio = dio; - if (dio->act < 5) /* SMP race ok */ - ++dio->act; + dio->ticks = ticks; + if (dio->act < 10) + ++dio->act; /* SMP race ok */ for (;;) { refs = dio->refs; @@ -226,7 +231,7 @@ hammer2_io_getblk(hammer2_dev_t *hmp, off_t lbase, int lsize, * caches the data. */ hammer2_io_t * -hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize) +hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize, int notgood) { hammer2_iocb_t iocb; hammer2_io_t *dio; @@ -262,8 +267,9 @@ hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize) atomic_add_int(&dio->hmp->iofree_count, -1); hammer2_spin_unsh(&hmp->io_spin); - if (dio->act < 5) /* SMP race ok */ - ++dio->act; + dio->ticks = ticks; + if (dio->act < 10) + ++dio->act; /* SMP race ok */ /* * Obtain/validate the buffer. Do NOT issue I/O. Discard if @@ -300,15 +306,23 @@ hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize) /* * We own DIO_INPROG, try to set DIO_GOOD. * - * For now do not use GETBLK_NOWAIT because + * If (notgood) specified caller just wants the dio and doesn't + * care about the buffer a whole lot. However, if the buffer + * is good (or dirty), we still want to return it. + * + * Otherwise we are trying to resolve a dedup and bread() + * is expected to always be better than building a new buffer + * that will be written. Use bread() for better determinism + * than getblk(). 
*/ bp = dio->bp; dio->bp = NULL; if (bp == NULL) { -#if 0 - bp = getblk(hmp->devvp, dio->pbase, dio->psize, 0, 0); -#endif - bread(hmp->devvp, dio->pbase, dio->psize, &bp); + if (notgood) + bp = getblk(hmp->devvp, dio->pbase, + dio->psize, 0, 0); + else + bread(hmp->devvp, dio->pbase, dio->psize, &bp); } /* @@ -338,38 +352,21 @@ hammer2_io_getquick(hammer2_dev_t *hmp, off_t lbase, int lsize) } /* - * Only return the dio if its buffer is good. If the buffer is not - * good be sure to clear INVALOK, meaning that invalidation is no - * longer acceptable + * Only return the dio if its buffer is good. If notgood != 0, + * we return the buffer regardless (so ephemeral dedup bits can be + * cleared). */ - if ((dio->refs & HAMMER2_DIO_GOOD) == 0) { + if (notgood == 0 && (dio->refs & HAMMER2_DIO_GOOD) == 0) { hammer2_io_putblk(&dio); } return dio; } -/* - * Make sure that all invalidation flags are cleared on the dio associated - * with the specified data offset, if the dio exists. - * - * Called from bulkfree when a block becomes reusable to ensure that new - * allocations do not accidently discard the buffer later on. - */ -void -hammer2_io_resetinval(hammer2_dev_t *hmp, off_t data_off) -{ - hammer2_io_t *dio; - - data_off &= ~HAMMER2_PBUFMASK64; - hammer2_spin_sh(&hmp->io_spin); - dio = RB_LOOKUP(hammer2_io_tree, &hmp->iotree, data_off); - if (dio) - atomic_clear_64(&dio->refs, HAMMER2_DIO_INVALBITS); - hammer2_spin_unsh(&hmp->io_spin); -} - /* * The originator of the iocb is finished with it. + * + * WARNING: iocb may be partially initialized with only iocb->dio and + * iocb->flags. 
*/ void hammer2_io_complete(hammer2_iocb_t *iocb) @@ -518,6 +515,7 @@ hammer2_io_putblk(hammer2_io_t **diop) off_t peof; off_t pbase; int psize; + int limit_dio; uint64_t orefs; uint64_t nrefs; @@ -525,9 +523,7 @@ hammer2_io_putblk(hammer2_io_t **diop) *diop = NULL; hmp = dio->hmp; - while (dio->unused01) { - tsleep(&dio->unused01, 0, "h2DEBUG", hz); - } + KKASSERT((dio->refs & HAMMER2_DIO_MASK) != 0); /* * Drop refs. @@ -550,7 +546,6 @@ hammer2_io_putblk(hammer2_io_t **diop) * Lastdrop case, INPROG can be set. */ nrefs &= ~(HAMMER2_DIO_GOOD | HAMMER2_DIO_DIRTY); - nrefs &= ~(HAMMER2_DIO_INVAL); nrefs |= HAMMER2_DIO_INPROG; if (atomic_cmpset_64(&dio->refs, orefs, nrefs)) break; @@ -575,7 +570,9 @@ hammer2_io_putblk(hammer2_io_t **diop) /* * Lastdrop (1->0 transition). INPROG has been set, GOOD and DIRTY - * have been cleared. + * have been cleared. iofree_count has not yet been incremented, + * note that another accessor race will decrement iofree_count so + * we have to increment it regardless. 
* * We can now dispose of the buffer, and should do it before calling * io_complete() in case there's a race against a new reference @@ -588,7 +585,7 @@ hammer2_io_putblk(hammer2_io_t **diop) if (orefs & HAMMER2_DIO_GOOD) { KKASSERT(bp != NULL); -#if 1 +#if 0 if (hammer2_inval_enable && (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) { ++hammer2_iod_invals; @@ -599,7 +596,7 @@ hammer2_io_putblk(hammer2_io_t **diop) if (orefs & HAMMER2_DIO_DIRTY) { int hce; - dio_write_stats_update(dio); + dio_write_stats_update(dio, bp); if ((hce = hammer2_cluster_write) > 0) { /* * Allows write-behind to keep the buffer @@ -624,7 +621,7 @@ hammer2_io_putblk(hammer2_io_t **diop) bqrelse(bp); } } else if (bp) { -#if 1 +#if 0 if (hammer2_inval_enable && (orefs & HAMMER2_DIO_INVALBITS) == HAMMER2_DIO_INVALBITS) { ++hammer2_iod_invals; @@ -633,10 +630,10 @@ hammer2_io_putblk(hammer2_io_t **diop) } else #endif if (orefs & HAMMER2_DIO_DIRTY) { - dio_write_stats_update(dio); + dio_write_stats_update(dio, bp); bdwrite(bp); } else { - brelse(bp); + bqrelse(bp); } } @@ -660,13 +657,19 @@ hammer2_io_putblk(hammer2_io_t **diop) * We cache free buffers so re-use cases can use a shared lock, but * if too many build up we have to clean them out. 
*/ - if (hmp->iofree_count > 65536) { + limit_dio = hammer2_limit_dio; + if (limit_dio < 256) + limit_dio = 256; + if (limit_dio > 1024*1024) + limit_dio = 1024*1024; + if (hmp->iofree_count > limit_dio) { struct hammer2_cleanupcb_info info; + kprintf("x"); RB_INIT(&info.tmptree); hammer2_spin_ex(&hmp->io_spin); - if (hmp->iofree_count > 65536) { - info.count = hmp->iofree_count / 4; + if (hmp->iofree_count > limit_dio) { + info.count = hmp->iofree_count / 5; RB_SCAN(hammer2_io_tree, &hmp->iotree, NULL, hammer2_io_cleanup_callback, &info); } @@ -690,15 +693,22 @@ hammer2_io_cleanup_callback(hammer2_io_t *dio, void *arg) if ((dio->refs & (HAMMER2_DIO_MASK | HAMMER2_DIO_INPROG)) == 0) { if (dio->act > 0) { - --dio->act; - return 0; + int act; + + act = dio->act - (ticks - dio->ticks) / hz - 1; + if (act > 0) { + dio->act = act; + return 0; + } + dio->act = 0; } KKASSERT(dio->bp == NULL); - RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio); - xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio); - KKASSERT(xio == NULL); - if (--info->count <= 0) /* limit scan */ - return(-1); + if (info->count > 0) { + RB_REMOVE(hammer2_io_tree, &dio->hmp->iotree, dio); + xio = RB_INSERT(hammer2_io_tree, &info->tmptree, dio); + KKASSERT(xio == NULL); + --info->count; + } } return 0; } @@ -837,25 +847,6 @@ hammer2_iocb_new_callback(hammer2_iocb_t *iocb) vfs_bio_clrbuf(dio->bp); dio->bp->b_flags |= B_CACHE; } - - /* - * Invalidation is ok on newly allocated - * buffers which cover the entire buffer. - * Flag will be cleared on use by the de-dup - * code. - * - * hammer2_chain_modify() also checks this flag. - * - * QUICK mode is used by the freemap code to - * pre-validate a junk buffer to prevent an - * unnecessary read I/O. We do NOT want - * to set INVALOK in that situation as the - * underlying allocations may be smaller. 
- */ - if ((iocb->flags & HAMMER2_IOCB_QUICK) == 0) { - atomic_set_64(&dio->refs, - HAMMER2_DIO_INVALOK); - } } else if (iocb->flags & HAMMER2_IOCB_QUICK) { /* * Partial buffer, quick mode. Do nothing. @@ -879,7 +870,8 @@ hammer2_iocb_new_callback(hammer2_iocb_t *iocb) */ if (dio->bp) { if (dio->refs & HAMMER2_DIO_DIRTY) { - dio_write_stats_update(dio); + dio_write_stats_update(dio, + dio->bp); bdwrite(dio->bp); } else { bqrelse(dio->bp); @@ -1081,15 +1073,24 @@ hammer2_io_setdirty(hammer2_io_t *dio) } /* - * Request an invalidation. The hammer2_io code will oblige only if - * DIO_INVALOK is also set. INVALOK is cleared if the dio is used - * in a dedup lookup and prevents invalidation of the dirty buffer. + * This routine is called when a MODIFIED chain is being DESTROYED, + * in an attempt to allow the related buffer cache buffer to be + * invalidated and discarded instead of flushing it to disk. + * + * At the moment this case is only really useful for file meta-data. + * File data is already handled via the logical buffer cache associated + * with the vnode, and will be discarded if it was never flushed to disk. + * File meta-data may include inodes, directory entries, and indirect blocks. + * + * XXX + * However, our DIO buffers are PBUFSIZE'd (64KB), and the area being + * invalidated might be smaller. Most of the meta-data structures above + * are in the 'smaller' category. For now, don't try to invalidate the + * data areas. 
*/ void -hammer2_io_setinval(hammer2_io_t *dio, hammer2_off_t off, u_int bytes) +hammer2_io_inval(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes) { - if ((u_int)dio->psize == bytes) - atomic_set_64(&dio->refs, HAMMER2_DIO_INVAL); } void @@ -1112,10 +1113,13 @@ hammer2_io_isdirty(hammer2_io_t *dio) static void -dio_write_stats_update(hammer2_io_t *dio) +dio_write_stats_update(hammer2_io_t *dio, struct buf *bp) { long *counterp; + if (bp->b_flags & B_DELWRI) + return; + switch(dio->btype) { case 0: return; diff --git a/sys/vfs/hammer2/hammer2_strategy.c b/sys/vfs/hammer2/hammer2_strategy.c index 7a96e970b4..39dce35893 100644 --- a/sys/vfs/hammer2/hammer2_strategy.c +++ b/sys/vfs/hammer2/hammer2_strategy.c @@ -451,8 +451,11 @@ hammer2_strategy_read_completion(hammer2_chain_t *chain, char *data, * block device behind us. This leaves more room in the * LRU chain cache for meta-data chains which we really * want to retain. + * + * NOTE: Deduplication cannot be safely recorded for + * records without a check code. */ - hammer2_dedup_record(chain, data); + hammer2_dedup_record(chain, NULL, data); atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE); /* @@ -919,13 +922,19 @@ hammer2_compress_and_write(char *data, hammer2_inode_t *ip, /* * Compression requested. Try to compress the block. We store * the data normally if we cannot sufficiently compress it. + * + * We have a heuristic to detect files which are mostly + * uncompressable and avoid the compression attempt in that + * case. If the compression heuristic is turned off, we always + * try to compress. */ comp_size = 0; comp_buffer = NULL; KKASSERT(pblksize / 2 <= 32768); - if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) { + if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0 || + hammer2_always_compress) { z_stream strm_compress; int comp_level; int ret; @@ -1116,7 +1125,6 @@ hammer2_compress_and_write(char *data, hammer2_inode_t *ip, * so we do it here. 
*/ hammer2_chain_setcheck(chain, bdata); - hammer2_dedup_record(chain, bdata); /* * Device buffer is now valid, chain is no longer in @@ -1125,6 +1133,7 @@ hammer2_compress_and_write(char *data, hammer2_inode_t *ip, * (No blockref table worries with file data) */ atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); + hammer2_dedup_record(chain, dio, bdata); /* Now write the related bdp. */ if (ioflag & IO_SYNC) { @@ -1325,7 +1334,6 @@ hammer2_write_bp(hammer2_chain_t *chain, char *data, int ioflag, * so we do it here. */ hammer2_chain_setcheck(chain, bdata); - hammer2_dedup_record(chain, bdata); /* * Device buffer is now valid, chain is no longer in @@ -1334,6 +1342,7 @@ hammer2_write_bp(hammer2_chain_t *chain, char *data, int ioflag, * (No blockref table worries with file data) */ atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL); + hammer2_dedup_record(chain, dio, bdata); if (ioflag & IO_SYNC) { /* @@ -1362,19 +1371,50 @@ hammer2_write_bp(hammer2_chain_t *chain, char *data, int ioflag, *errorp = error; } +#define HAMMER2_DEDUP_FRAG (HAMMER2_PBUFSIZE / 64) +#define HAMMER2_DEDUP_FRAGRADIX (HAMMER2_PBUFRADIX - 6) + +static __inline +uint64_t +hammer2_dedup_mask(hammer2_io_t *dio, hammer2_off_t data_off, u_int bytes) +{ + int bbeg; + int bits; + uint64_t mask; + + bbeg = (int)((data_off & ~HAMMER2_OFF_MASK_RADIX) - dio->pbase) >> + HAMMER2_DEDUP_FRAGRADIX; + bits = (int)((bytes + (HAMMER2_DEDUP_FRAG - 1)) >> + HAMMER2_DEDUP_FRAGRADIX); + mask = ((uint64_t)1 << bbeg) - 1; + if (bbeg + bits == 64) + mask = (uint64_t)-1; + else + mask = ((uint64_t)1 << (bbeg + bits)) - 1; + + mask &= ~(((uint64_t)1 << bbeg) - 1); + + return mask; +} + /* - * LIVE DEDUP HEURISTIC + * LIVE DEDUP HEURISTICS + * + * Record that the media data area is available for dedup operation. This + * will set the appropriate dedup bits in the DIO. These bits will be cleared + * if the dedup area becomes unavailable. * * WARNING! This code is SMP safe but the heuristic allows SMP collisions. 
* All fields must be loaded into locals and validated. * - * WARNING! Should only be used for file data, hammer2_chain_modify() only - * checks for the dedup case on data chains. Also, dedup data can - * only be recorded for committed chains (so NOT strategy writes - * which can undergo further modification after the fact!). + * WARNING! Should only be used for file data and directory entries, + * hammer2_chain_modify() only checks for the dedup case on data + * chains. Also, dedup data can only be recorded for committed + * chains (so NOT strategy writes which can undergo further + * modification after the fact!). */ void -hammer2_dedup_record(hammer2_chain_t *chain, char *data) +hammer2_dedup_record(hammer2_chain_t *chain, hammer2_io_t *dio, char *data) { hammer2_dev_t *hmp; hammer2_dedup_t *dedup; @@ -1383,9 +1423,21 @@ hammer2_dedup_record(hammer2_chain_t *chain, char *data) int i; int dticks; + /* + * We can only record a dedup if we have media data to test against. + * If dedup is not enabled, return early, which allows a chain to + * remain marked MODIFIED (which might have benefits in special + * situations, though typically it does not). + */ if (hammer2_dedup_enable == 0) return; + if (dio == NULL) { + dio = chain->dio; + if (dio == NULL) + return; + } +#if 0 /* * Only committed data can be recorded for de-duplication, otherwise * the contents may change out from under us. 
So, on read if the @@ -1395,7 +1447,7 @@ hammer2_dedup_record(hammer2_chain_t *chain, char *data) (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_INITIAL)) == 0) { return; } - +#endif hmp = chain->hmp; @@ -1458,7 +1510,65 @@ hammer2_dedup_record(hammer2_chain_t *chain, char *data) dedup->ticks = ticks; dedup->data_off = chain->bref.data_off; dedup->data_crc = crc; - atomic_set_int(&chain->flags, HAMMER2_CHAIN_DEDUP); + + atomic_set_64(&dio->dedup_ok_mask, + hammer2_dedup_mask(dio, chain->bref.data_off, + chain->bytes)); + + /* + * Once we record the dedup the chain must be marked clean to + * prevent reuse of the underlying block. Remember that this + * write occurs when the buffer cache is flushed (i.e. on sync(), + * fsync(), filesystem periodic sync, or when the kernel needs to + * flush a buffer), and not whenever the user write()s. + */ + if (chain->flags & HAMMER2_CHAIN_MODIFIED) { + atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MODIFIED); + atomic_add_long(&hammer2_count_modified_chains, -1); + if (chain->pmp) + hammer2_pfs_memory_wakeup(chain->pmp); + } +} + +/* + * Remove the data range from dedup consideration. This has no effect on + * any dedups which have already occurred. We do not need a valid buffer + * for this operation and must clean out dedup_ok_mask even if the dio is + * cached without any buffer available. + */ +void +hammer2_dedup_delete(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes) +{ + hammer2_io_t *dio; + + dio = hammer2_io_getquick(hmp, data_off, bytes, 1); + if (dio) { + if (data_off < dio->pbase || + (data_off & ~HAMMER2_OFF_MASK_RADIX) + bytes > + dio->pbase + dio->psize) { + panic("DATAOFF BAD %016jx/%d %016jx\n", + data_off, bytes, dio->pbase); + } + atomic_clear_64(&dio->dedup_ok_mask, + hammer2_dedup_mask(dio, data_off, bytes)); + hammer2_io_putblk(&dio); + } +} + +/* + * Assert that the data range is not considered for dedup operation. 
+ */ +void +hammer2_dedup_assert(hammer2_dev_t *hmp, hammer2_off_t data_off, u_int bytes) +{ + hammer2_io_t *dio; + + dio = hammer2_io_getquick(hmp, data_off, bytes, 1); + if (dio) { + KKASSERT((dio->dedup_ok_mask & + hammer2_dedup_mask(dio, data_off, bytes)) == 0); + hammer2_io_putblk(&dio); + } } static @@ -1469,7 +1579,9 @@ hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize) hammer2_io_t *dio; hammer2_off_t off; uint64_t crc; + uint64_t mask; char *data; + char *dtmp; int i; if (hammer2_dedup_enable == 0) @@ -1499,28 +1611,26 @@ hammer2_dedup_lookup(hammer2_dev_t *hmp, char **datap, int pblksize) continue; if ((1 << (int)(off & HAMMER2_OFF_MASK_RADIX)) != pblksize) continue; - dio = hammer2_io_getquick(hmp, off, pblksize); - if (dio && - bcmp(data, hammer2_io_data(dio, off), pblksize) == 0) { - /* - * Make sure the INVALOK flag is cleared to prevent - * the possibly-dirty bp from being invalidated now - * that we are using it as part of a de-dup operation. - */ - if (hammer2_debug & 0x40000) { - kprintf("DEDUP SUCCESS %016jx\n", - (intmax_t)off); + dio = hammer2_io_getquick(hmp, off, pblksize, 0); + if (dio) { + dtmp = hammer2_io_data(dio, off), + mask = hammer2_dedup_mask(dio, off, pblksize); + if ((dio->dedup_ok_mask & mask) == mask && + bcmp(data, dtmp, pblksize) == 0) { + if (hammer2_debug & 0x40000) { + kprintf("DEDUP SUCCESS %016jx\n", + (intmax_t)off); + } + hammer2_io_putblk(&dio); + *datap = NULL; + dedup[i].ticks = ticks; /* update use */ + atomic_add_long(&hammer2_iod_file_wdedup, + pblksize); + + return off; /* RETURN */ } - atomic_clear_64(&dio->refs, HAMMER2_DIO_INVALOK); hammer2_io_putblk(&dio); - *datap = NULL; - dedup[i].ticks = ticks; /* update use */ - ++hammer2_iod_file_wdedup; - - return off; /* RETURN */ } - if (dio) - hammer2_io_putblk(&dio); } return 0; } diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c index c0a834c4dd..65614131f7 100644 --- a/sys/vfs/hammer2/hammer2_vfsops.c +++ 
b/sys/vfs/hammer2/hammer2_vfsops.c @@ -81,10 +81,12 @@ int hammer2_debug; int hammer2_cluster_read = 4; /* physical read-ahead */ int hammer2_cluster_write = 0; /* bdwrite() so later inval works */ int hammer2_dedup_enable = 1; +int hammer2_always_compress = 0; /* always try to compress */ int hammer2_inval_enable = 0; int hammer2_flush_pipe = 100; int hammer2_synchronous_flush = 1; int hammer2_dio_count; +int hammer2_limit_dio = 256; long hammer2_chain_allocs; long hammer2_chain_frees; long hammer2_limit_dirty_chains; @@ -122,6 +124,8 @@ SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_write, CTLFLAG_RW, &hammer2_cluster_write, 0, ""); SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW, &hammer2_dedup_enable, 0, ""); +SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW, + &hammer2_always_compress, 0, ""); SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW, &hammer2_inval_enable, 0, ""); SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW, @@ -138,6 +142,8 @@ SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW, &hammer2_count_modified_chains, 0, ""); SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD, &hammer2_dio_count, 0, ""); +SYSCTL_INT(_vfs_hammer2, OID_AUTO, limit_dio, CTLFLAG_RW, + &hammer2_limit_dio, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW, &hammer2_iod_invals, 0, ""); @@ -239,6 +245,8 @@ hammer2_vfs_init(struct vfsconf *conf) error = 0; + hammer2_limit_dio = nbuf * 2; + if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref)) error = EINVAL; if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))