From: Matthew Dillon Date: Tue, 4 Jun 2013 21:29:20 +0000 (-0700) Subject: hammer2 - freemap part 3 - group by allocation size X-Git-Tag: v3.7.0~1015 X-Git-Url: https://gitweb.dragonflybsd.org/dragonfly.git/commitdiff_plain/a98aa0b0cae958124b59082290f0dfabc8423768 hammer2 - freemap part 3 - group by allocation size * Each freemap leaf represents ~2MB worth of storage. Assign a radix to each leaf, limiting allocations from that leaf to that radix. This primarily results in inodes being grouped together, improving the performance for find, ls or other topological scans. We could improve this but for now we'll stick with it as-is. This mechanic also allows us to use cluster_read(). This function is used for everything except volume-header and freemap elements. * More formally handle logical sizes vs allocation sizes vs device I/O sizes. For example, a 1KB inode allocates 1KB using 16KB device I/O's. * Beef up the sysctl I/O counters. --- diff --git a/sys/vfs/hammer2/DESIGN b/sys/vfs/hammer2/DESIGN index 47ff7bd47a..5d9aab5de2 100644 --- a/sys/vfs/hammer2/DESIGN +++ b/sys/vfs/hammer2/DESIGN @@ -184,6 +184,17 @@ but doesn't complicate anything else. The inode number is stored in the inode itself, an absolutely necessary feature in order to support the hugely flexible snapshots that we want to have in HAMMER2. + DISK I/O OPTIMIZATIONS + +The freemap implements a 1KB allocation resolution. The minimum I/O size +is 16KB. HAMMER2 typically implements 16KB and 64KB physical I/O sizes +and will cluster larger I/O's. + +Each 2MB segment managed by the freemap handles just one particular +physical I/O size. Typically this means that inodes, small data, and +initial (small) indirect blocks get clustered together. Also large 64KB +file-data and indirect blocks get clustered together. 
+ HARDLINKS Hardlinks are a particularly sticky problem for HAMMER2 due to the lack of diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h index 707aec95c4..55a14380d4 100644 --- a/sys/vfs/hammer2/hammer2.h +++ b/sys/vfs/hammer2/hammer2.h @@ -415,7 +415,7 @@ struct hammer2_mount { hammer2_trans_t *curflush; /* current flush in progress */ hammer2_tid_t topo_flush_tid; /* currently synchronizing flush pt */ hammer2_tid_t free_flush_tid; /* currently synchronizing flush pt */ - hammer2_off_t heur_last_alloc; + hammer2_off_t heur_freemap[HAMMER2_MAX_RADIX+1]; int flushcnt; /* #of flush trans on the list */ int volhdrno; /* last volhdrno written */ @@ -454,6 +454,30 @@ MALLOC_DECLARE(M_HAMMER2); #define VTOI(vp) ((hammer2_inode_t *)(vp)->v_data) #define ITOV(ip) ((ip)->vp) +static __inline +int +hammer2_devblkradix(int radix) +{ + int cluster_radix; + + if (radix <= HAMMER2_LBUFRADIX) + cluster_radix = HAMMER2_LBUFRADIX; + else + cluster_radix = HAMMER2_PBUFRADIX; + return(cluster_radix); +} + +static __inline +size_t +hammer2_devblksize(size_t bytes) +{ + if (bytes <= HAMMER2_LBUFSIZE) + return(HAMMER2_LBUFSIZE); + else + return(HAMMER2_PBUFSIZE); +} + + static __inline hammer2_pfsmount_t * MPTOPMP(struct mount *mp) @@ -498,6 +522,8 @@ extern int hammer2_hardlink_enable; extern long hammer2_iod_file_read; extern long hammer2_iod_meta_read; extern long hammer2_iod_indr_read; +extern long hammer2_iod_fmap_read; +extern long hammer2_iod_volu_read; extern long hammer2_iod_file_write; extern long hammer2_iod_meta_write; extern long hammer2_iod_indr_write; @@ -506,6 +532,8 @@ extern long hammer2_iod_volu_write; extern long hammer2_ioa_file_read; extern long hammer2_ioa_meta_read; extern long hammer2_ioa_indr_read; +extern long hammer2_ioa_fmap_read; +extern long hammer2_ioa_volu_read; extern long hammer2_ioa_file_write; extern long hammer2_ioa_meta_write; extern long hammer2_ioa_indr_write; diff --git a/sys/vfs/hammer2/hammer2_chain.c 
b/sys/vfs/hammer2/hammer2_chain.c index a479fad522..36863886e0 100644 --- a/sys/vfs/hammer2/hammer2_chain.c +++ b/sys/vfs/hammer2/hammer2_chain.c @@ -72,6 +72,7 @@ static int hammer2_indirect_optimize; /* XXX SYSCTL */ static hammer2_chain_t *hammer2_chain_create_indirect( hammer2_trans_t *trans, hammer2_chain_t *parent, hammer2_key_t key, int keybits, int for_type, int *errorp); +static void adjreadcounter(hammer2_blockref_t *bref, size_t bytes); /* * We use a red-black tree to guarantee safe lookups under shared locks. @@ -99,6 +100,20 @@ hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2) return(0); } +static __inline +int +hammer2_isclusterable(hammer2_chain_t *chain) +{ + if (hammer2_cluster_enable) { + if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT || + chain->bref.type == HAMMER2_BREF_TYPE_INODE || + chain->bref.type == HAMMER2_BREF_TYPE_DATA) { + return(1); + } + } + return(0); +} + /* * Recursively set the SUBMODIFIED flag up to the root starting at chain's * parent. SUBMODIFIED is not set in chain itself. 
@@ -493,10 +508,11 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how) hammer2_chain_core_t *core; hammer2_blockref_t *bref; hammer2_off_t pbase; + hammer2_off_t pmask; hammer2_off_t peof; ccms_state_t ostate; size_t boff; - size_t bbytes; + size_t psize; int error; char *bdata; @@ -574,27 +590,29 @@ hammer2_chain_lock(hammer2_chain_t *chain, int how) */ bref = &chain->bref; - if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE) - bbytes = HAMMER2_MINIOSIZE; - pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1); - peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64; - boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1); + psize = hammer2_devblksize(chain->bytes); + pmask = (hammer2_off_t)psize - 1; + pbase = bref->data_off & ~pmask; + boff = bref->data_off & (HAMMER2_OFF_MASK & pmask); KKASSERT(pbase != 0); + peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64; /* * The getblk() optimization can only be used on newly created * elements if the physical block size matches the request. 
*/ if ((chain->flags & HAMMER2_CHAIN_INITIAL) && - chain->bytes == bbytes) { - chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0); + chain->bytes == psize) { + chain->bp = getblk(hmp->devvp, pbase, psize, 0, 0); error = 0; - } else if (hammer2_cluster_enable) { - error = cluster_read(hmp->devvp, peof, pbase, bbytes, - HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE, + } else if (hammer2_isclusterable(chain)) { + error = cluster_read(hmp->devvp, peof, pbase, psize, + psize, HAMMER2_PBUFSIZE*4, &chain->bp); + adjreadcounter(&chain->bref, chain->bytes); } else { - error = bread(hmp->devvp, pbase, bbytes, &chain->bp); + error = bread(hmp->devvp, pbase, psize, &chain->bp); + adjreadcounter(&chain->bref, chain->bytes); } if (error) { @@ -788,7 +806,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain) counterp = &hammer2_ioa_volu_write; break; } - ++*counterp; + *counterp += chain->bytes; } else { switch(chain->bref.type) { case HAMMER2_BREF_TYPE_DATA: @@ -808,7 +826,7 @@ hammer2_chain_unlock(hammer2_chain_t *chain) counterp = &hammer2_iod_volu_write; break; } - ++*counterp; + *counterp += chain->bytes; } /* @@ -1006,11 +1024,13 @@ hammer2_chain_modify(hammer2_trans_t *trans, hammer2_chain_t **chainp, hammer2_mount_t *hmp = trans->hmp; hammer2_chain_t *chain; hammer2_off_t pbase; + hammer2_off_t pmask; + hammer2_off_t peof; hammer2_tid_t flush_tid; struct buf *nbp; int error; int wasinitial; - size_t bbytes; + size_t psize; size_t boff; void *bdata; @@ -1177,29 +1197,33 @@ skipxx: /* XXX */ */ KKASSERT(chain != &hmp->vchain && chain != &hmp->fchain); - /* - * The device buffer may be larger than the allocation size. 
- */ - if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE) - bbytes = HAMMER2_MINIOSIZE; - pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1); - boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1); + psize = hammer2_devblksize(chain->bytes); + pmask = (hammer2_off_t)psize - 1; + pbase = chain->bref.data_off & ~pmask; + boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask); + KKASSERT(pbase != 0); + peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64; /* - * Buffer aliasing is possible, check for the case. - * * The getblk() optimization can only be used if the - * physical block size matches the request. + * chain element size matches the physical block size. */ if (chain->bp && chain->bp->b_loffset == pbase) { nbp = chain->bp; - } else if (chain->bytes == bbytes) { - nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0); error = 0; + } else if (chain->bytes == psize) { + nbp = getblk(hmp->devvp, pbase, psize, 0, 0); + error = 0; + } else if (hammer2_isclusterable(chain)) { + error = cluster_read(hmp->devvp, peof, pbase, psize, + psize, HAMMER2_PBUFSIZE*4, + &nbp); + adjreadcounter(&chain->bref, chain->bytes); } else { - error = bread(hmp->devvp, pbase, bbytes, &nbp); - KKASSERT(error == 0); + error = bread(hmp->devvp, pbase, psize, &nbp); + adjreadcounter(&chain->bref, chain->bytes); } + KKASSERT(error == 0); bdata = (char *)nbp->b_data + boff; /* @@ -1226,6 +1250,7 @@ skipxx: /* XXX */ if (chain->bp != nbp) { if (chain->bp) { if (chain->flags & HAMMER2_CHAIN_DIRTYBP) { + chain->bp->b_flags |= B_CLUSTEROK; bdwrite(chain->bp); } else { chain->bp->b_flags |= B_RELBUF; @@ -3353,3 +3378,30 @@ hammer2_chain_wait(hammer2_chain_t *chain) { tsleep(chain, 0, "chnflw", 1); } + +static +void +adjreadcounter(hammer2_blockref_t *bref, size_t bytes) +{ + long *counterp; + + switch(bref->type) { + case HAMMER2_BREF_TYPE_DATA: + counterp = &hammer2_iod_file_read; + break; + case HAMMER2_BREF_TYPE_INODE: + counterp = &hammer2_iod_meta_read; + break; + case 
HAMMER2_BREF_TYPE_INDIRECT: + counterp = &hammer2_iod_indr_read; + break; + case HAMMER2_BREF_TYPE_FREEMAP_NODE: + case HAMMER2_BREF_TYPE_FREEMAP_LEAF: + counterp = &hammer2_iod_fmap_read; + break; + default: + counterp = &hammer2_iod_volu_read; + break; + } + *counterp += bytes; +} diff --git a/sys/vfs/hammer2/hammer2_disk.h b/sys/vfs/hammer2/hammer2_disk.h index faf8fe66be..cbe2ae86e9 100644 --- a/sys/vfs/hammer2/hammer2_disk.h +++ b/sys/vfs/hammer2/hammer2_disk.h @@ -102,9 +102,10 @@ * blocks except the block straddling EOF. * * HAMMER2_SEGSIZE - Allocation map segment size, typically 2MB + * (space represented by a level0 bitmap). */ -#define HAMMER2_SEGSIZE (65536 * 8) +#define HAMMER2_SEGSIZE (1 << HAMMER2_FREEMAP_LEVEL0_RADIX) #define HAMMER2_PBUFRADIX 16 /* physical buf (1<<16) bytes */ #define HAMMER2_PBUFSIZE 65536 @@ -114,13 +115,8 @@ /* * Generally speaking we want to use 16K and 64K I/Os */ -#if 1 #define HAMMER2_MINIORADIX HAMMER2_LBUFRADIX #define HAMMER2_MINIOSIZE HAMMER2_LBUFSIZE -#else -#define HAMMER2_MINIORADIX 10 -#define HAMMER2_MINIOSIZE 1024 -#endif #define HAMMER2_IND_BYTES_MIN HAMMER2_LBUFSIZE #define HAMMER2_IND_BYTES_MAX HAMMER2_PBUFSIZE @@ -403,7 +399,7 @@ struct hammer2_blockref { /* MUST BE EXACTLY 64 BYTES */ /* * Freemap hints are embedded in addition to the icrc32. * - * biggest - largest possible allocation 2^N within sub-tree. + * biggest - Largest possible allocation 2^N within sub-tree. * typically initialized to 64 in freemap_blockref * and reduced as-needed when a request fails. * @@ -412,11 +408,15 @@ struct hammer2_blockref { /* MUST BE EXACTLY 64 BYTES */ * biggest hint will be adjusted downward. * * Used when allocating space. + * + * radix - (Leaf only) once assigned, radix for clustering. + * All device I/O can cluster within the 2MB + * segment. 
*/ struct { uint32_t icrc32; uint8_t biggest; - uint8_t reserved05; + uint8_t radix; /* 0, LBUFRADIX, PBUFRADIX */ uint8_t reserved06; uint8_t reserved07; uint64_t avail; /* total available bytes */ diff --git a/sys/vfs/hammer2/hammer2_flush.c b/sys/vfs/hammer2/hammer2_flush.c index 8f1a1fec0e..805480e33c 100644 --- a/sys/vfs/hammer2/hammer2_flush.c +++ b/sys/vfs/hammer2/hammer2_flush.c @@ -327,10 +327,11 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain) hammer2_mount_t *hmp; hammer2_blockref_t *bref; hammer2_off_t pbase; + hammer2_off_t pmask; hammer2_tid_t saved_sync; hammer2_trans_t *trans = info->trans; hammer2_chain_core_t *core; - size_t bbytes; + size_t psize; size_t boff; char *bdata; struct buf *bp; @@ -660,11 +661,12 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain) * Make sure any device buffer(s) have been flushed out here. * (there aren't usually any to flush). */ - bbytes = chain->bytes; - pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1); - boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1); + psize = hammer2_devblksize(chain->bytes); + pmask = (hammer2_off_t)psize - 1; + pbase = chain->bref.data_off & ~pmask; + boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask); - bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0); + bp = getblk(hmp->devvp, pbase, psize, GETBLK_NOWAIT, 0); if (bp) { if ((bp->b_flags & (B_CACHE | B_DIRTY)) == (B_CACHE | B_DIRTY)) { @@ -730,22 +732,18 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain) * The data is embedded, we have to acquire the * buffer cache buffer and copy the data into it. 
*/ - if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE) - bbytes = HAMMER2_MINIOSIZE; - pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1); - boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1); + psize = hammer2_devblksize(chain->bytes); + pmask = (hammer2_off_t)psize - 1; + pbase = bref->data_off & ~pmask; + boff = bref->data_off & (HAMMER2_OFF_MASK & pmask); /* * The getblk() optimization can only be used if the * physical block size matches the request. */ - if (chain->bytes == bbytes) { - bp = getblk(hmp->devvp, pbase, bbytes, 0, 0); - error = 0; - } else { - error = bread(hmp->devvp, pbase, bbytes, &bp); - KKASSERT(error == 0); - } + error = bread(hmp->devvp, pbase, psize, &bp); + KKASSERT(error == 0); + bdata = (char *)bp->b_data + boff; /* @@ -756,6 +754,7 @@ hammer2_chain_flush_core(hammer2_flush_info_t *info, hammer2_chain_t *chain) bp->b_flags |= B_CLUSTEROK; bdwrite(bp); bp = NULL; + switch(HAMMER2_DEC_CHECK(chain->bref.methods)) { case HAMMER2_CHECK_FREEMAP: chain->bref.check.freemap.icrc32 = diff --git a/sys/vfs/hammer2/hammer2_freemap.c b/sys/vfs/hammer2/hammer2_freemap.c index b55a158c84..d14205ca90 100644 --- a/sys/vfs/hammer2/hammer2_freemap.c +++ b/sys/vfs/hammer2/hammer2_freemap.c @@ -58,6 +58,13 @@ static int hammer2_freemap_iterate(hammer2_trans_t *trans, #endif +static __inline +int +hammer2_freemapradix(int radix) +{ + return(radix); +} + /* * Calculate the device offset for the specified FREEMAP_NODE or FREEMAP_LEAF * bref. Return a combined media offset and physical size radix. Freemap @@ -90,13 +97,11 @@ hammer2_freemap_reserve(hammer2_mount_t *hmp, hammer2_blockref_t *bref, /* * Adjust by HAMMER2_ZONE_FREEMAP_{A,B,C,D} using the existing - * offset as a basis. + * offset as a basis. Start in zone A if previously unallocated. 
*/ if ((bref->data_off & ~HAMMER2_OFF_MASK_RADIX) == 0) { off = HAMMER2_ZONE_FREEMAP_A; } else { - off = HAMMER2_ZONE_FREEMAP_A; -#if 0 off = bref->data_off & ~HAMMER2_OFF_MASK_RADIX & (((hammer2_off_t)1 << HAMMER2_FREEMAP_LEVEL1_RADIX) - 1); off = off / HAMMER2_PBUFSIZE; @@ -111,7 +116,6 @@ hammer2_freemap_reserve(hammer2_mount_t *hmp, hammer2_blockref_t *bref, off = HAMMER2_ZONE_FREEMAP_C; else off = HAMMER2_ZONE_FREEMAP_B; -#endif } off = off * HAMMER2_PBUFSIZE; @@ -293,6 +297,7 @@ hammer2_freemap_alloc(hammer2_trans_t *trans, hammer2_chain_t *parent; hammer2_off_t bpref; hammer2_off_t bnext; + int freemap_radix; int radix; int error; @@ -315,17 +320,25 @@ hammer2_freemap_alloc(hammer2_trans_t *trans, return (hammer2_freemap_simple_alloc(hmp, bref, radix)); #else - /* - * Calculate actual allocation in bytes, and radix. This ensures - * a minimum 1KB allocation. - */ KKASSERT(bytes >= HAMMER2_MIN_ALLOC && bytes <= HAMMER2_MAX_ALLOC); -#if 0 /* - * Calculate starting point + * Calculate the starting point for our allocation search. + * + * Each freemap leaf is dedicated to a specific freemap_radix. + * The freemap_radix can be more fine-grained than the device buffer + * radix which results in inodes being grouped together in their + * own segment, terminal-data (16K or less) and initial indirect + * block being grouped together, and then full-indirect and full-data + * blocks (64K) being grouped together. + * + * The single most important aspect of this is the inode grouping + * because that is what allows 'find' and 'ls' and other filesystem + * topology operations to run fast. 
*/ + freemap_radix = hammer2_freemapradix(radix); +#if 0 if (bref->data_off & ~HAMMER2_OFF_MASK_RADIX) bpref = bref->data_off & ~HAMMER2_OFF_MASK_RADIX; else if (trans->tmp_bpref) @@ -334,7 +347,8 @@ hammer2_freemap_alloc(hammer2_trans_t *trans, bpref = trans->tmp_ip->chain->bref.data_off; else #endif - bpref = hmp->heur_last_alloc; /* SMP race ok, heuristic */ + KKASSERT(radix >= 0 && radix <= HAMMER2_MAX_RADIX); + bpref = hmp->heur_freemap[freemap_radix]; /* * Make sure bpref is in-bounds. It's ok if bpref covers a zone's @@ -355,7 +369,7 @@ hammer2_freemap_alloc(hammer2_trans_t *trans, error = hammer2_freemap_try_alloc(trans, &parent, bref, radix, bpref, &bnext); } - hmp->heur_last_alloc = bnext; /* XXX */ + hmp->heur_freemap[freemap_radix] = bnext; hammer2_chain_unlock(parent); return (error); @@ -412,6 +426,8 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp, int index; int count; int subindex; + int freemap_radix; + int devblk_radix; /* * Calculate the number of bytes being allocated, the number @@ -425,6 +441,9 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp, bits = 1 << (radix - HAMMER2_MIN_RADIX); mask = (bits == 64) ? (uint64_t)-1 : (((uint64_t)1 << bits) - 1); + devblk_radix = hammer2_devblkradix(radix); + freemap_radix = hammer2_freemapradix(radix); + /* * Lookup the level0 freemap chain, creating and initializing one * if necessary. Intermediate levels will be created automatically @@ -459,6 +478,7 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp, chain->bref.check.freemap.biggest = HAMMER2_FREEMAP_LEVEL0_RADIX; chain->bref.check.freemap.avail = l0size; + chain->bref.check.freemap.radix = freemap_radix; /* * Preset bitmap for existing static allocations. 
@@ -508,6 +528,11 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp, * Already flagged as not having enough space */ error = ENOSPC; + } else if (chain->bref.check.freemap.radix != freemap_radix) { + /* + * Wrong cluster radix, cannot allocate from this leaf. + */ + error = ENOSPC; } else { /* * Modify existing chain to setup for adjustment. @@ -532,45 +557,23 @@ hammer2_freemap_try_alloc(hammer2_trans_t *trans, hammer2_chain_t **parentp, * Allocate data and meta-data from the beginning and inodes * from the end. */ - if (bref->type != HAMMER2_BREF_TYPE_INODE) { - for (index = 0; index < count; ++index) { - if (data[index] == (uint64_t)-1) /* all allocated */ - continue; - tmp_mask = mask; /* iterate */ - for (subindex = 0; subindex < 64; subindex += bits) { - if ((data[index] & tmp_mask) == 0) - break; - tmp_mask <<= bits; - } - if (subindex != 64) { - key += HAMMER2_MIN_ALLOC * 64 * index; - key += HAMMER2_MIN_ALLOC * subindex; + for (index = 0; index < count; ++index) { + if (data[index] == (uint64_t)-1) /* all allocated */ + continue; + tmp_mask = mask; /* iterate */ + for (subindex = 0; subindex < 64; subindex += bits) { + if ((data[index] & tmp_mask) == 0) break; - } + tmp_mask <<= bits; } - if (index == count) - error = ENOSPC; - } else { - for (index = count - 1; index >= 0; --index) { - if (data[index] == (uint64_t)-1) /* all allocated */ - continue; - tmp_mask = mask << (64 - bits); - for (subindex = 64 - bits; - subindex >= 0; - subindex -= bits) { - if ((data[index] & tmp_mask) == 0) - break; - tmp_mask >>= bits; - } - if (subindex != -bits) { - key += HAMMER2_MIN_ALLOC * 64 * index; - key += HAMMER2_MIN_ALLOC * subindex; - break; - } + if (subindex != 64) { + key += HAMMER2_MIN_ALLOC * 64 * index; + key += HAMMER2_MIN_ALLOC * subindex; + break; } - if (index == -1) - error = ENOSPC; } + if (index == count) + error = ENOSPC; skip: if (error == 0) { @@ -589,20 +592,15 @@ skip: /* * Modify the chain and set the bitmap 
appropriately. * - * Determine if we can massage the buffer cache buffer - * to avoid a read. If the allocation is smaller than - * the minimum IO size we look at the bitmap mask covering - * the allocation at the minimum IO size. If it is - * unallocated we instantiate and clear the buffer which - * marks it B_CACHE and validates it without issuing a read. - * - * For allocation requests >= MINIOSIZE other code will deal - * with the read-avoidance when the chain is locked. + * For smaller allocations try to avoid a read-before-write + * by priming the buffer cache buffer. The caller handles + * read-avoidance for larger allocations (or more properly, + * when the chain is locked). */ prebuf = 0; hammer2_chain_modify(trans, &chain, 0); data = &chain->data->bmdata.array[0]; - if (radix < HAMMER2_MINIORADIX) { + if (radix != devblk_radix) { uint64_t iomask; int iobmradix = HAMMER2_MINIORADIX - HAMMER2_MIN_RADIX; int ioindex; @@ -631,11 +629,15 @@ skip: if (prebuf) { struct buf *bp; hammer2_off_t pbase; + hammer2_off_t csize; + hammer2_off_t cmask; - pbase = key & ~(hammer2_off_t)(HAMMER2_MINIOSIZE - 1); + csize = (hammer2_off_t)1 << devblk_radix; + cmask = csize - 1; + pbase = key & ~cmask; /* device-buffer aligned base */ - bp = getblk(hmp->devvp, pbase, - HAMMER2_MINIOSIZE, GETBLK_NOWAIT, 0); + bp = getblk(hmp->devvp, pbase, csize, + GETBLK_NOWAIT, 0); if (bp) { if ((bp->b_flags & B_CACHE) == 0) vfs_bio_clrbuf(bp); @@ -668,24 +670,6 @@ skip: return (error); } -#if 0 - /* - * When making meta-data allocations smaller than LBUFSIZE we will - * use a LBUFSIZE'd buffer. The first chunk allocated from such a - * buffer instantiates a device buffer and marks it clean to avoid - * unnecessary read-before-write ops. XXX buffer cache buffer - * sharing. XXX mixed data/meta-data issues. 
- */ - if (bytes < HAMMER2_MINIOSIZE && - (data_off & (HAMMER2_MINIOSIZE - 1)) == 0 && - (bitmap shows this is the initial allocation)) { - bp = getblk(hmp->devvp, data_off, HAMMER2_MINIOSIZE, 0, 0); - bp->b_flags |= B_CACHE; - bp->b_resid = 0; - bqrelse(bp); - } -#endif - static int hammer2_freemap_iterate(hammer2_trans_t *trans, hammer2_chain_t **parentp, hammer2_chain_t **chainp, diff --git a/sys/vfs/hammer2/hammer2_vfsops.c b/sys/vfs/hammer2/hammer2_vfsops.c index 4117579a26..074ee2ef21 100644 --- a/sys/vfs/hammer2/hammer2_vfsops.c +++ b/sys/vfs/hammer2/hammer2_vfsops.c @@ -61,11 +61,13 @@ static struct hammer2_mntlist hammer2_mntlist; static struct lock hammer2_mntlk; int hammer2_debug; -int hammer2_cluster_enable = 0; /* XXX temporary until layout ironed out */ +int hammer2_cluster_enable = 1; int hammer2_hardlink_enable = 1; long hammer2_iod_file_read; long hammer2_iod_meta_read; long hammer2_iod_indr_read; +long hammer2_iod_fmap_read; +long hammer2_iod_volu_read; long hammer2_iod_file_write; long hammer2_iod_meta_write; long hammer2_iod_indr_write; @@ -74,6 +76,8 @@ long hammer2_iod_volu_write; long hammer2_ioa_file_read; long hammer2_ioa_meta_read; long hammer2_ioa_indr_read; +long hammer2_ioa_fmap_read; +long hammer2_ioa_volu_read; long hammer2_ioa_fmap_write; long hammer2_ioa_file_write; long hammer2_ioa_meta_write; @@ -88,32 +92,48 @@ SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW, &hammer2_cluster_enable, 0, ""); SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW, &hammer2_hardlink_enable, 0, ""); + SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW, &hammer2_iod_file_read, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW, &hammer2_iod_meta_read, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW, &hammer2_iod_indr_read, 0, ""); +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW, + &hammer2_iod_fmap_read, 0, ""); +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, 
CTLFLAG_RW, + &hammer2_iod_volu_read, 0, ""); + SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW, &hammer2_iod_file_write, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW, &hammer2_iod_meta_write, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW, &hammer2_iod_indr_write, 0, ""); +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW, + &hammer2_iod_fmap_write, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW, &hammer2_iod_volu_write, 0, ""); + SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW, &hammer2_ioa_file_read, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW, &hammer2_ioa_meta_read, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW, &hammer2_ioa_indr_read, 0, ""); +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW, + &hammer2_ioa_fmap_read, 0, ""); +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW, + &hammer2_ioa_volu_read, 0, ""); + SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW, &hammer2_ioa_file_write, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW, &hammer2_ioa_meta_write, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW, &hammer2_ioa_indr_write, 0, ""); +SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW, + &hammer2_ioa_fmap_write, 0, ""); SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW, &hammer2_ioa_volu_write, 0, "");