From d84f6fa13c474015da98ec341ce61ed6d737bcae Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Mon, 7 Nov 2016 18:56:34 -0800 Subject: [PATCH] kernel - Attempt to fix cluster pbuf deadlock on recursive filesystems * Change global pbuf count limits (used primarily for clustered I/O) to per-mount and per-device limits. The per-mount / per-device limit is set to nswbuf_kva / 10, allowing 10 different entities to obtain pbufs concurrently without interference. * This change goes a long way towards fixing deadlocks that could occur with the old global system (a global limit of nswbuf_kva / 2) when the I/O system recurses through a virtual block device or filesystem. Two examples of virtual block devices are the 'vn' device and the crypto layer. * We also note that even normal filesystem read and write I/O strategy calls will recurse at least once to dive the underlying block device. DFly also had issues with pbuf hogging by one mount causing unnecessary stalls in other mounts. This fix also prevents pbuf hogging. * Remove unused internal O_MAPONREAD flag. Reported-by: htse, multiple Testing-by: htse, dillon --- sys/kern/vfs_cluster.c | 43 +++++++++++++++++++++++++++++++------- sys/kern/vfs_conf.c | 1 + sys/kern/vfs_lock.c | 1 + sys/kern/vfs_mount.c | 2 ++ sys/kern/vfs_syscalls.c | 1 + sys/kern/vfs_vnops.c | 2 +- sys/sys/buf.h | 6 ++++++ sys/sys/fcntl.h | 10 +++++---- sys/sys/mount.h | 2 +- sys/sys/vnode.h | 1 + sys/vfs/ufs/ffs_rawread.c | 20 +++++++----------- sys/vm/vm_pager.c | 44 +++++++++++++-------------------------- 12 files changed, 76 insertions(+), 57 deletions(-) diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index afcc3d0649..8e18b83c36 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -121,8 +121,6 @@ SYSCTL_INT(_vfs, OID_AUTO, max_readahead, CTLFLAG_RW, &max_readahead, 0, extern vm_page_t bogus_page; -extern int cluster_pbuf_freecnt; - /* * nblks is our cluster_rbuild request size. The approximate number of * physical read-ahead requests is maxra / nblks. The physical request @@ -909,10 +907,18 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset, return tbp; } - bp = trypbuf_kva(&cluster_pbuf_freecnt); - if (bp == NULL) { + /* + * Get a pbuf, limit cluster I/O on a per-device basis. If + * doing cluster I/O for a file, limit cluster I/O on a + * per-mount basis. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + bp = trypbuf_kva(&vp->v_pbuf_count); + else + bp = trypbuf_kva(&vp->v_mount->mnt_pbuf_count); + + if (bp == NULL) return tbp; - } /* * We are synthesizing a buffer out of vm_page_t's, but @@ -920,6 +926,7 @@ cluster_rbuild(struct vnode *vp, off_t filesize, off_t loffset, off_t doffset, * address may not be either. Inherit the b_data offset * from the original buffer. */ + bp->b_vp = vp; bp->b_data = (char *)((vm_offset_t)bp->b_data | ((vm_offset_t)tbp->b_data & PAGE_MASK)); bp->b_flags |= B_CLUSTER | B_VMIO; @@ -1096,6 +1103,7 @@ cluster_callback(struct bio *bio) { struct buf *bp = bio->bio_buf; struct buf *tbp; + struct vnode *vp; int error = 0; /* @@ -1143,7 +1151,12 @@ cluster_callback(struct bio *bio) } biodone(&tbp->b_bio1); } - relpbuf(bp, &cluster_pbuf_freecnt); + vp = bp->b_vp; + bp->b_vp = NULL; + if (vp->v_type == VCHR || vp->v_type == VBLK) + relpbuf(bp, &vp->v_pbuf_count); + else + relpbuf(bp, &vp->v_mount->mnt_pbuf_count); } /* @@ -1455,8 +1468,7 @@ cluster_wbuild(struct vnode *vp, struct buf **bpp, if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || (tbp->b_bcount != tbp->b_bufsize) || (tbp->b_bcount != blksize) || - (bytes == blksize) || - ((bp = getpbuf_kva(&cluster_pbuf_freecnt)) == NULL)) { + (bytes == blksize)) { totalwritten += tbp->b_bufsize; bawrite(tbp); start_loffset += blksize; @@ -1464,6 +1476,20 @@ cluster_wbuild(struct vnode *vp, struct buf **bpp, continue; } + /* + * Get a pbuf, limit cluster I/O on a per-device basis. If + * doing cluster I/O for a file, limit cluster I/O on a + * per-mount basis. + * + * HAMMER and other filesystems may attempt to queue a massive + * amount of write I/O, using trypbuf() here easily results in + * situation where the I/O stream becomes non-clustered. + */ + if (vp->v_type == VCHR || vp->v_type == VBLK) + bp = getpbuf_kva(&vp->v_pbuf_count); + else + bp = getpbuf_kva(&vp->v_mount->mnt_pbuf_count); + /* * Set up the pbuf. Track our append point with b_bcount * and b_bufsize. b_bufsize is not used by the device but @@ -1475,6 +1501,7 @@ cluster_wbuild(struct vnode *vp, struct buf **bpp, bp->b_xio.xio_npages = 0; bp->b_loffset = tbp->b_loffset; bp->b_bio2.bio_offset = tbp->b_bio2.bio_offset; + bp->b_vp = vp; /* * We are synthesizing a buffer out of vm_page_t's, but diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c index d352aec4e1..fd468d2d1f 100644 --- a/sys/kern/vfs_conf.c +++ b/sys/kern/vfs_conf.c @@ -299,6 +299,7 @@ vfs_mountroot_devfs(void) vfs_busy(mp, LK_NOWAIT); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; + mp->mnt_pbuf_count = nswbuf_kva / NSWBUF_SPLIT; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; diff --git a/sys/kern/vfs_lock.c b/sys/kern/vfs_lock.c index 6d915b60f6..0ae048d4bf 100644 --- a/sys/kern/vfs_lock.c +++ b/sys/kern/vfs_lock.c @@ -881,6 +881,7 @@ allocvnode(int lktimeout, int lkflags) atomic_add_int(&numvnodes, 1); vp->v_refcnt = 1; vp->v_flag = VAGE0 | VAGE1; + vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT; KKASSERT(TAILQ_EMPTY(&vp->v_namecache)); /* exclusive lock still held */ diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c index 4fb583925f..a5d2b2bbf7 100644 --- a/sys/kern/vfs_mount.c +++ b/sys/kern/vfs_mount.c @@ -178,6 +178,7 @@ getnewvnode(enum vtagtype tag, struct mount *mp, * operations vector. */ vp->v_ops = &mp->mnt_vn_use_ops; + vp->v_pbuf_count = nswbuf_kva / NSWBUF_SPLIT; /* * Placing the vnode on the mount point's queue makes it visible. @@ -308,6 +309,7 @@ vfs_rootmountalloc(char *fstypename, char *devname, struct mount **mpp) vfs_busy(mp, 0); mp->mnt_vfc = vfsp; mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_pbuf_count = nswbuf_kva / NSWBUF_SPLIT; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= MNT_RDONLY; diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 5062ed7b5a..3885f9add8 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -336,6 +336,7 @@ sys_mount(struct mount_args *uap) vfs_busy(mp, LK_NOWAIT); mp->mnt_op = vfsp->vfc_vfsops; mp->mnt_vfc = vfsp; + mp->mnt_pbuf_count = nswbuf_kva / NSWBUF_SPLIT; vfsp->vfc_refcount++; mp->mnt_stat.f_type = vfsp->vfc_typenum; mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index ccb19b5daf..4ea4fd6257 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -617,7 +617,7 @@ vn_rdwr_inchunks(enum uio_rw rw, struct vnode *vp, caddr_t base, int len, if (chunk > len) chunk = len; - if (vp->v_type == VREG) { + if (vp->v_type == VREG && (ioflg & IO_RECURSE) == 0) { switch(rw) { case UIO_READ: bwillread(chunk); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index dcc5c19263..3008cbc6a6 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -374,6 +374,12 @@ struct cluster_save { #define B_CLRBUF 0x01 /* Cleared invalid areas of buffer. */ #define B_SYNC 0x02 /* Do all allocations synchronously. */ +/* + * Split nswbuf_kva by N, allowing each block device or mounted filesystem + * to use up to (nswbuf_kva / N) pbufs. + */ +#define NSWBUF_SPLIT 10 /* split nswbuf_kva by N */ + #ifdef _KERNEL extern long nbuf; /* The number of buffer headers */ extern long maxswzone; /* Max KVA for swap structures */ diff --git a/sys/sys/fcntl.h b/sys/sys/fcntl.h index fcac54ce3c..0d8e685be3 100644 --- a/sys/sys/fcntl.h +++ b/sys/sys/fcntl.h @@ -106,9 +106,11 @@ #define O_FOFFSET 0x00200000 /* force specific offset */ #define O_FSYNCWRITE 0x00400000 /* force synchronous write */ #define O_FASYNCWRITE 0x00800000 /* force asynchronous write */ +#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) #define O_UNUSED24 0x01000000 #define O_UNUSED25 0x02000000 -#define O_MAPONREAD 0x04000000 /* memory map read buffer */ +#define O_UNUSED26 0x04000000 +#endif #if __POSIX_VISIBLE >= 200809 #define O_DIRECTORY 0x08000000 /* error if not a directory */ @@ -122,7 +124,7 @@ #endif #define O_FMASK (O_FBLOCKING|O_FNONBLOCKING|O_FAPPEND|O_FOFFSET|\ - O_FSYNCWRITE|O_FASYNCWRITE|O_MAPONREAD) + O_FSYNCWRITE|O_FASYNCWRITE) #ifdef _KERNEL /* convert from open() flags to/from fflags; convert O_RD/WR to FREAD/FWRITE */ @@ -133,10 +135,10 @@ * Bits to save after open from the ap. Remaining bits are retained. */ #define FMASK (FREAD|FWRITE|FAPPEND|FASYNC|FFSYNC|FNONBLOCK|\ - FAPPENDONLY|FREVOKED|O_DIRECT|O_MAPONREAD) + FAPPENDONLY|FREVOKED|O_DIRECT) /* bits settable by fcntl(F_SETFL, ...) */ #define FCNTLFLAGS (FAPPEND|FASYNC|FFSYNC|FNONBLOCK|FPOSIXSHM|\ - O_DIRECT|O_MAPONREAD) + O_DIRECT) #endif /* diff --git a/sys/sys/mount.h b/sys/sys/mount.h index 1bab65c4ff..0ded258491 100644 --- a/sys/sys/mount.h +++ b/sys/sys/mount.h @@ -208,7 +208,7 @@ struct mount { struct vfsops *mnt_op; /* operations on fs */ struct vfsconf *mnt_vfc; /* configuration info */ u_int mnt_namecache_gen; /* ++ to clear negative hits */ - u_int mnt_unused01; + u_int mnt_pbuf_count; /* pbuf usage limit */ struct vnode *mnt_syncer; /* syncer vnode */ struct syncer_ctx *mnt_syncer_ctx; /* syncer process context */ struct vnodelst mnt_nvnodelist; /* list of vnodes this mount */ diff --git a/sys/sys/vnode.h b/sys/sys/vnode.h index 2ddef90198..8696da91ad 100644 --- a/sys/sys/vnode.h +++ b/sys/sys/vnode.h @@ -152,6 +152,7 @@ struct vnode { int v_opencount; /* number of explicit opens */ int v_auxrefs; /* auxiliary references */ int v_refcnt; + int v_pbuf_count; /* (device nodes only) */ struct bio_track v_track_read; /* track I/O's in progress */ struct bio_track v_track_write; /* track I/O's in progress */ struct mount *v_mount; /* ptr to vfs we are in */ diff --git a/sys/vfs/ufs/ffs_rawread.c b/sys/vfs/ufs/ffs_rawread.c index 20b639bb27..d2cdaeeffa 100644 --- a/sys/vfs/ufs/ffs_rawread.c +++ b/sys/vfs/ufs/ffs_rawread.c @@ -64,10 +64,6 @@ void ffs_rawread_setup(void); SYSCTL_DECL(_vfs_ffs); -static int ffsrawbufcnt = 4; -SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0, - "Buffers available for raw reads"); - static int allowrawread = 1; SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0, "Flag to enable raw reads"); @@ -80,8 +76,6 @@ SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, void ffs_rawread_setup(void) { - ffsrawbufcnt = (nswbuf_kva > 100 ) ? - (nswbuf_kva - (nswbuf_kva >> 4)) : nswbuf_kva - 8; } @@ -232,7 +226,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) if (bp == NULL) { /* Setup first read */ /* XXX: Leave some bufs for swap */ - bp = getpbuf_kva(&ffsrawbufcnt); + bp = getpbuf_kva(&vp->v_mount->mnt_pbuf_count); error = ffs_rawread_readahead(vp, udata, offset, resid, bp); if (error != 0) @@ -241,7 +235,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) if (resid > bp->b_bufsize) { /* Setup fist readahead */ /* XXX: Leave bufs for swap */ if (rawreadahead != 0) - nbp = trypbuf_kva(&ffsrawbufcnt); + nbp = trypbuf_kva(&vp->v_mount->mnt_pbuf_count); else nbp = NULL; if (nbp != NULL) { @@ -252,7 +246,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) resid - bp->b_bufsize, nbp); if (nerror) { - relpbuf(nbp, &ffsrawbufcnt); + relpbuf(nbp, &vp->v_mount->mnt_pbuf_count); nbp = NULL; } } @@ -293,7 +287,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) clearbiocache(&nbp->b_bio2); if (resid <= bp->b_bufsize) { /* No more readaheads */ - relpbuf(nbp, &ffsrawbufcnt); + relpbuf(nbp, &vp->v_mount->mnt_pbuf_count); nbp = NULL; } else { /* Setup next readahead */ nerror = ffs_rawread_readahead( @@ -302,7 +296,7 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) resid - bp->b_bufsize, nbp); if (nerror != 0) { - relpbuf(nbp, &ffsrawbufcnt); + relpbuf(nbp, &vp->v_mount->mnt_pbuf_count); nbp = NULL; } } @@ -317,11 +311,11 @@ ffs_rawread_main(struct vnode *vp, struct uio *uio) } if (bp != NULL) - relpbuf(bp, &ffsrawbufcnt); + relpbuf(bp, &vp->v_mount->mnt_pbuf_count); if (nbp != NULL) { /* Run down readahead buffer */ biowait(&nbp->b_bio1, "rawrd"); vunmapbuf(nbp); - relpbuf(nbp, &ffsrawbufcnt); + relpbuf(nbp, &vp->v_mount->mnt_pbuf_count); } if (error == 0) diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index e03dcd69ce..f435b7e963 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -95,8 +95,6 @@ extern struct pagerops vnodepagerops; extern struct pagerops devicepagerops; extern struct pagerops physpagerops; -int cluster_pbuf_freecnt = -1; /* unlimited to begin with */ - static int dead_pager_getpage (vm_object_t, vm_page_t *, int); static void dead_pager_putpages (vm_object_t, vm_page_t *, int, int, int *); static boolean_t dead_pager_haspage (vm_object_t, vm_pindex_t); @@ -327,11 +325,6 @@ vm_pager_bufferinit(void *dummy __unused) TAILQ_INSERT_HEAD(&bswlist_raw[i & BSWHMASK], bp, b_freelist); atomic_add_int(&pbuf_raw_count, 1); } - - /* - * Allow the clustering code to use half of our pbufs. - */ - cluster_pbuf_freecnt = nswbuf_kva / 2; } SYSINIT(do_vmpg, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, vm_pager_bufferinit, NULL); @@ -402,27 +395,18 @@ initpbuf(struct buf *bp) /* * Allocate a physical buffer * - * There are a limited number of physical buffers. We need to make - * sure that no single subsystem is able to hog all of them, - * so each subsystem implements a counter which is typically initialized - * to 1/2 nswbuf. getpbuf() decrements this counter in allocation and - * increments it on release, and blocks if the counter hits zero. A - * subsystem may initialize the counter to -1 to disable the feature, - * but it must still be sure to match up all uses of getpbuf() with - * relpbuf() using the same variable. + * If (pfreecnt != NULL) then *pfreecnt will be decremented on return and + * the function will block while it is <= 0. * - * NOTE: pfreecnt can be NULL, but this 'feature' will be removed - * relatively soon when the rest of the subsystems get smart about it. XXX - * - * Physical buffers can be with or without KVA space reserved. There - * are severe limitations on the ones with KVA reserved, and fewer - * limitations on the ones without. getpbuf() gets one without, - * getpbuf_kva() gets one with. + * Physical buffers can be with or without KVA space reserved. There + * are severe limitations on the ones with KVA reserved, and fewer + * limitations on the ones without. getpbuf() gets one without, + * getpbuf_kva() gets one with. * * No requirements. */ struct buf * -getpbuf(int *pfreecnt) /* raw */ +getpbuf(int *pfreecnt) { struct buf *bp; int iter; @@ -438,7 +422,7 @@ getpbuf(int *pfreecnt) /* raw */ tsleep_interlock(&pbuf_raw_count, 0); if ((int)atomic_fetchadd_int(&pbuf_raw_count, 0) <= 0) tsleep(&pbuf_raw_count, PINTERLOCKED, - "wswbuf0", 0); + "wswbuf1", 0); continue; } iter = mycpuid & BSWHMASK; @@ -477,13 +461,13 @@ getpbuf_kva(int *pfreecnt) while (pfreecnt && *pfreecnt <= 0) { tsleep_interlock(pfreecnt, 0); if ((int)atomic_fetchadd_int(pfreecnt, 0) <= 0) - tsleep(pfreecnt, PINTERLOCKED, "wswbuf0", 0); + tsleep(pfreecnt, PINTERLOCKED, "wswbuf2", 0); } if (pbuf_kva_count <= 0) { tsleep_interlock(&pbuf_kva_count, 0); if ((int)atomic_fetchadd_int(&pbuf_kva_count, 0) <= 0) tsleep(&pbuf_kva_count, PINTERLOCKED, - "wswbuf0", 0); + "wswbuf3", 0); continue; } iter = mycpuid & BSWHMASK; @@ -526,13 +510,13 @@ getpbuf_mem(int *pfreecnt) while (pfreecnt && *pfreecnt <= 0) { tsleep_interlock(pfreecnt, 0); if ((int)atomic_fetchadd_int(pfreecnt, 0) <= 0) - tsleep(pfreecnt, PINTERLOCKED, "wswbuf0", 0); + tsleep(pfreecnt, PINTERLOCKED, "wswbuf4", 0); } if (pbuf_mem_count <= 0) { tsleep_interlock(&pbuf_mem_count, 0); if ((int)atomic_fetchadd_int(&pbuf_mem_count, 0) <= 0) tsleep(&pbuf_mem_count, PINTERLOCKED, - "wswbuf0", 0); + "wswbuf5", 0); continue; } iter = mycpuid & BSWHMASK; @@ -564,12 +548,12 @@ getpbuf_mem(int *pfreecnt) * Allocate a physical buffer, if one is available. * * Note that there is no NULL hack here - all subsystems using this - * call understand how to use pfreecnt. + * call are required to use a non-NULL pfreecnt. * * No requirements. */ struct buf * -trypbuf(int *pfreecnt) /* raw */ +trypbuf(int *pfreecnt) { struct buf *bp; int iter = mycpuid & BSWHMASK; -- 2.41.0