From 2f0acc22b5ce1dd94f2d01c149a7b1bf0d8eb707 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Sat, 16 Jul 2016 23:15:19 -0700 Subject: [PATCH] kernel - Improve physio performance * See http://apollo.backplane.com/DFlyMisc/nvme_sys03.txt * Hash the pbuf system. This chops down spin-lock collisions at high transaction rates (>150K IOPS) by 1000x. * Implement a pbuf with pre-allocated kernel memory that we copy into, avoiding page table manipulations and thus avoiding system-wide invltlb/invlpg IPIs. * This increases NVMe IOPS tests with three cards from 150K-200K IOPS to 950K IOPS using physio (random read, 4K blocks, from urandom-filled partition, with many process threads, from 3 NVMe cards in parallel). * Further adjustments to the vkernel build. --- sys/dev/raid/vinum/vinum.c | 2 +- sys/kern/kern_physio.c | 62 ++-- sys/kern/subr_param.c | 8 +- sys/platform/pc64/x86_64/machdep.c | 19 +- sys/platform/vkernel64/include/pmap.h | 2 - sys/platform/vkernel64/platform/pmap.c | 23 +- sys/platform/vkernel64/x86_64/autoconf.c | 22 +- sys/sys/buf.h | 11 +- sys/vfs/nfs/nfs_subs.c | 2 +- sys/vfs/smbfs/smbfs_vfsops.c | 2 +- sys/vfs/ufs/ffs_rawread.c | 3 +- sys/vm/swap_pager.c | 8 +- sys/vm/vm_pager.c | 437 ++++++++++++++++------- sys/vm/vnode_pager.c | 2 +- 14 files changed, 402 insertions(+), 201 deletions(-) diff --git a/sys/dev/raid/vinum/vinum.c b/sys/dev/raid/vinum/vinum.c index 4c87c2f963..48d0559a85 100644 --- a/sys/dev/raid/vinum/vinum.c +++ b/sys/dev/raid/vinum/vinum.c @@ -261,7 +261,7 @@ free_vinum(int cleardrive) STATIC void vinum_initconf(void) { - vinum_conf.physbufs = nswbuf / 2 + 1; + vinum_conf.physbufs = nswbuf_kva / 2 + 1; /* allocate space: drives... */ DRIVE = (struct drive *) Malloc(sizeof(struct drive) * INITIAL_DRIVES); diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index af3d2786b3..2a196a0695 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -15,9 +15,6 @@ * John S. Dyson. * 4. Modifications may be freely made to this file if the above conditions * are met. - * - * $FreeBSD: src/sys/kern/kern_physio.c,v 1.46.2.4 2003/11/14 09:51:47 simokawa Exp $ - * $DragonFly: src/sys/kern/kern_physio.c,v 1.27 2008/08/22 08:47:56 swildner Exp $ */ #include @@ -40,11 +37,13 @@ physio(cdev_t dev, struct uio *uio, int ioflag) int saflags; int iolen; int bcount; - int bounceit; caddr_t ubase; struct buf *bp; - bp = getpbuf_kva(NULL); + if (uio->uio_segflg == UIO_USERSPACE) + bp = getpbuf_mem(NULL); + else + bp = getpbuf_kva(NULL); saflags = bp->b_flags; error = 0; @@ -87,18 +86,9 @@ physio(cdev_t dev, struct uio *uio, int ioflag) bcount = dev->si_iosize_max; ubase = uio->uio_iov[i].iov_base; - bounceit = (int)(((vm_offset_t)ubase) & 15); iolen = ((vm_offset_t)ubase) & PAGE_MASK; - if (bounceit) { - if (bcount > bp->b_kvasize) - bcount = bp->b_kvasize; - } else { - if ((bcount + iolen) > bp->b_kvasize) { - bcount = bp->b_kvasize; - if (iolen != 0) - bcount -= PAGE_SIZE; - } - } + if (bcount > bp->b_kvasize) + bcount = bp->b_kvasize; /* * If we have to use a bounce buffer allocate kernel @@ -107,20 +97,19 @@ physio(cdev_t dev, struct uio *uio, int ioflag) * copying. 
*/ if (uio->uio_segflg == UIO_USERSPACE) { - if (bounceit) { - bp->b_data = bp->b_kvabase; - bp->b_bcount = bcount; - vm_hold_load_pages(bp, (vm_offset_t)bp->b_data, (vm_offset_t)bp->b_data + bcount); - if (uio->uio_rw == UIO_WRITE) { - error = copyin(ubase, bp->b_data, bcount); - if (error) { - vm_hold_free_pages(bp, (vm_offset_t)bp->b_data, (vm_offset_t)bp->b_data + bcount); - goto doerror; - } + /* bp->b_data = bp->b_kvabase; */ + bp->b_bcount = bcount; + /* + vm_hold_load_pages(bp, (vm_offset_t)bp->b_data, (vm_offset_t)bp->b_data + bcount); + */ + if (uio->uio_rw == UIO_WRITE) { + error = copyin(ubase, bp->b_data, bcount); + if (error) { + /* + vm_hold_free_pages(bp, (vm_offset_t)bp->b_data, (vm_offset_t)bp->b_data + bcount); + */ + goto doerror; } - } else if (vmapbuf(bp, ubase, bcount) < 0) { - error = EFAULT; - goto doerror; } } else { bp->b_data = uio->uio_iov[i].iov_base; @@ -131,17 +120,12 @@ physio(cdev_t dev, struct uio *uio, int ioflag) iolen = bp->b_bcount - bp->b_resid; if (uio->uio_segflg == UIO_USERSPACE) { - if (bounceit) { - if (uio->uio_rw == UIO_READ && iolen) { - error = copyout(bp->b_data, ubase, iolen); - if (error) { - bp->b_flags |= B_ERROR; - bp->b_error = error; - } + if (uio->uio_rw == UIO_READ && iolen) { + error = copyout(bp->b_data, ubase, iolen); + if (error) { + bp->b_flags |= B_ERROR; + bp->b_error = error; } - vm_hold_free_pages(bp, (vm_offset_t)bp->b_data, (vm_offset_t)bp->b_data + bcount); - } else { - vunmapbuf(bp); } } if (iolen == 0 && !(bp->b_flags & B_ERROR)) diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c index 92a583bc2a..e103b42801 100644 --- a/sys/kern/subr_param.c +++ b/sys/kern/subr_param.c @@ -83,7 +83,9 @@ int maxposixlocksperuid; /* max # POSIX locks per uid */ int ncallout; /* maximum # of timer events */ int mbuf_wait = 32; /* mbuf sleep time in ticks */ long nbuf; -long nswbuf; +long nswbuf_mem; +long nswbuf_kva; +long nswbuf_raw; long maxswzone; /* max swmeta KVA storage */ long maxbcache; /* max buffer cache KVA storage */ enum vmm_guest_type vmm_guest = VMM_GUEST_NONE; /* Running as VM guest? */ @@ -103,7 +105,9 @@ SYSCTL_PROC(_kern, OID_AUTO, vmm_guest, CTLFLAG_RD | CTLTYPE_STRING, * them here forces loader errors if this file is omitted * (if they've been externed everywhere else; hah!). 
*/ -struct buf *swbuf; +struct buf *swbuf_mem; +struct buf *swbuf_kva; +struct buf *swbuf_raw; struct vmm_bname { const char *str; diff --git a/sys/platform/pc64/x86_64/machdep.c b/sys/platform/pc64/x86_64/machdep.c index dbe1f9e4de..99fe54e82d 100644 --- a/sys/platform/pc64/x86_64/machdep.c +++ b/sys/platform/pc64/x86_64/machdep.c @@ -438,16 +438,22 @@ again: "considerations", nbuf); } - nswbuf = lmax(lmin(nbuf / 4, 256), 16); + nswbuf_mem = lmax(lmin(nbuf / 32, 256), 8); #ifdef NSWBUF_MIN - if (nswbuf < NSWBUF_MIN) - nswbuf = NSWBUF_MIN; + if (nswbuf_mem < NSWBUF_MIN) + nswbuf_mem = NSWBUF_MIN; +#endif + nswbuf_kva = lmax(lmin(nbuf / 4, 256), 16); +#ifdef NSWBUF_MIN + if (nswbuf_kva < NSWBUF_MIN) + nswbuf_kva = NSWBUF_MIN; #endif #ifdef DIRECTIO ffs_rawread_setup(); #endif - valloc(swbuf, struct buf, nswbuf); + valloc(swbuf_mem, struct buf, nswbuf_mem); + valloc(swbuf_kva, struct buf, nswbuf_kva); valloc(buf, struct buf, nbuf); /* @@ -475,12 +481,13 @@ again: kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva, ((vm_offset_t)(nbuf + 16) * BKVASIZE) + - (nswbuf * MAXPHYS) + pager_map_size); + ((nswbuf_mem + nswbuf_kva) * MAXPHYS) + pager_map_size); kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva, ((vm_offset_t)(nbuf + 16) * BKVASIZE)); buffer_map.system_map = 1; kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva, - ((vm_offset_t)nswbuf * MAXPHYS) + pager_map_size); + ((vm_offset_t)(nswbuf_mem + nswbuf_kva) * MAXPHYS) + + pager_map_size); pager_map.system_map = 1; kprintf("avail memory = %ju (%ju MB)\n", (uintmax_t)ptoa(vmstats.v_free_count + vmstats.v_dma_pages), diff --git a/sys/platform/vkernel64/include/pmap.h b/sys/platform/vkernel64/include/pmap.h index 81548d43ff..8acec7a245 100644 --- a/sys/platform/vkernel64/include/pmap.h +++ b/sys/platform/vkernel64/include/pmap.h @@ -110,8 +110,6 @@ #ifdef _KERNEL -vm_paddr_t pmap_kextract(vm_offset_t); - /* * XXX */ diff --git a/sys/platform/vkernel64/platform/pmap.c b/sys/platform/vkernel64/platform/pmap.c index f8bed2ccd4..135f36f2c7 100644 --- a/sys/platform/vkernel64/platform/pmap.c +++ b/sys/platform/vkernel64/platform/pmap.c @@ -834,26 +834,41 @@ pmap_kenter_quick(vm_offset_t va, vm_paddr_t pa) npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; ptep = vtopte(va); +#if 1 + res = 1; +#else + /* FUTURE */ res = (*ptep != 0); +#endif if (*ptep & VPTE_V) pmap_inval_pte_quick(ptep, &kernel_pmap, va); *ptep = npte; + return res; } -void +int pmap_kenter_noinval(vm_offset_t va, vm_paddr_t pa) { - pt_entry_t *pte; + pt_entry_t *ptep; pt_entry_t npte; + int res; KKASSERT(va >= KvaStart && va < KvaEnd); npte = (vpte_t)pa | VPTE_RW | VPTE_V | VPTE_U; - pte = vtopte(va); + ptep = vtopte(va); +#if 1 + res = 1; +#else + /* FUTURE */ + res = (*ptep != 0); +#endif - *pte = npte; + *ptep = npte; + + return res; } /* diff --git a/sys/platform/vkernel64/x86_64/autoconf.c b/sys/platform/vkernel64/x86_64/autoconf.c index 2f1aff492f..aa0334f909 100644 --- a/sys/platform/vkernel64/x86_64/autoconf.c +++ b/sys/platform/vkernel64/x86_64/autoconf.c @@ -154,29 +154,37 @@ cpu_startup(void *dummy) kprintf("Warning: nbufs capped at %ld\n", nbuf); } - nswbuf = lmax(lmin(nbuf / 4, 256), 16); + nswbuf_mem = lmax(lmin(nbuf / 32, 32), 4); #ifdef NSWBUF_MIN - if (nswbuf < NSWBUF_MIN) - nswbuf = NSWBUF_MIN; + if (nswbuf_mem < NSWBUF_MIN) + nswbuf_mem = NSWBUF_MIN; +#endif + nswbuf_kva = lmax(lmin(nbuf / 4, 256), 16); +#ifdef NSWBUF_MIN + if (nswbuf_kva < NSWBUF_MIN) + nswbuf_kva = NSWBUF_MIN; #endif /* * Allocate memory for the buffer cache */ 
buf = (void *)kmem_alloc(&kernel_map, nbuf * sizeof(struct buf)); - swbuf = (void *)kmem_alloc(&kernel_map, nswbuf * sizeof(struct buf)); - + swbuf_mem = (void *)kmem_alloc(&kernel_map, nswbuf_mem * sizeof(struct buf)); + swbuf_kva = (void *)kmem_alloc(&kernel_map, nswbuf_kva * sizeof(struct buf)); #ifdef DIRECTIO ffs_rawread_setup(); #endif kmem_suballoc(&kernel_map, &clean_map, &clean_sva, &clean_eva, - (nbuf*BKVASIZE*2) + (nswbuf*MAXPHYS) + pager_map_size); + (nbuf*BKVASIZE*2) + + (nswbuf_mem + nswbuf_kva) *MAXPHYS + + pager_map_size); kmem_suballoc(&clean_map, &buffer_map, &buffer_sva, &buffer_eva, (nbuf*BKVASIZE*2)); buffer_map.system_map = 1; kmem_suballoc(&clean_map, &pager_map, &pager_sva, &pager_eva, - (nswbuf*MAXPHYS) + pager_map_size); + (nswbuf_mem + nswbuf_kva) *MAXPHYS + + pager_map_size); pager_map.system_map = 1; kprintf("avail memory = %lu (%luK bytes)\n", ptoa(vmstats.v_free_count), ptoa(vmstats.v_free_count) / 1024); diff --git a/sys/sys/buf.h b/sys/sys/buf.h index e7f1c1cca6..070b3b6481 100644 --- a/sys/sys/buf.h +++ b/sys/sys/buf.h @@ -160,7 +160,7 @@ struct buf { unsigned int b_qindex; /* buffer queue index */ unsigned int b_qcpu; /* buffer queue cpu */ unsigned char b_act_count; /* similar to vm_page act_count */ - unsigned char b_unused01; + unsigned char b_swindex; struct lock b_lock; /* Buffer lock */ buf_cmd_t b_cmd; /* I/O command */ int b_bufsize; /* Allocated buffer size. */ @@ -380,8 +380,12 @@ extern int buf_maxio; /* nominal maximum I/O for buffer */ extern struct buf *buf; /* The buffer headers. */ extern char *buffers; /* The buffer contents. */ extern int bufpages; /* Number of memory pages in the buffer pool. */ -extern struct buf *swbuf; /* Swap I/O buffer headers. */ -extern long nswbuf; /* Number of swap I/O buffer headers. */ +extern struct buf *swbuf_mem; /* Swap I/O buffer headers. */ +extern struct buf *swbuf_kva; /* Swap I/O buffer headers. */ +extern struct buf *swbuf_raw; /* Swap I/O buffer headers. */ +extern long nswbuf_mem; /* Number of swap I/O buffer headers. */ +extern long nswbuf_kva; /* Number of swap I/O buffer headers. */ +extern long nswbuf_raw; /* Number of swap I/O buffer headers. 
*/ extern int bioq_reorder_burst_interval; extern int bioq_reorder_burst_bytes; extern int bioq_reorder_minor_interval; @@ -417,6 +421,7 @@ void brelse (struct buf *); void bqrelse (struct buf *); int cluster_awrite (struct buf *); struct buf *getpbuf (int *); +struct buf *getpbuf_mem (int *); struct buf *getpbuf_kva (int *); int inmem (struct vnode *, off_t); struct buf *findblk (struct vnode *, off_t, int); diff --git a/sys/vfs/nfs/nfs_subs.c b/sys/vfs/nfs/nfs_subs.c index 06e0b031cc..7106c7db62 100644 --- a/sys/vfs/nfs/nfs_subs.c +++ b/sys/vfs/nfs/nfs_subs.c @@ -611,7 +611,7 @@ nfs_init(struct vfsconf *vfsp) nfs_prev_nfssvc_sy_call = sysent[SYS_nfssvc].sy_call; sysent[SYS_nfssvc].sy_call = (sy_call_t *)sys_nfssvc; - nfs_pbuf_freecnt = nswbuf / 2 + 1; + nfs_pbuf_freecnt = nswbuf_kva / 2 + 1; return (0); } diff --git a/sys/vfs/smbfs/smbfs_vfsops.c b/sys/vfs/smbfs/smbfs_vfsops.c index 50bbc089bc..22883cd408 100644 --- a/sys/vfs/smbfs/smbfs_vfsops.c +++ b/sys/vfs/smbfs/smbfs_vfsops.c @@ -290,7 +290,7 @@ smbfs_root(struct mount *mp, struct vnode **vpp) int smbfs_init(struct vfsconf *vfsp) { - smbfs_pbuf_freecnt = nswbuf / 2 + 1; + smbfs_pbuf_freecnt = nswbuf_kva / 2 + 1; SMBVDEBUG("done.\n"); return 0; } diff --git a/sys/vfs/ufs/ffs_rawread.c b/sys/vfs/ufs/ffs_rawread.c index e5fcdfa6ce..20b639bb27 100644 --- a/sys/vfs/ufs/ffs_rawread.c +++ b/sys/vfs/ufs/ffs_rawread.c @@ -80,7 +80,8 @@ SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0, void ffs_rawread_setup(void) { - ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8; + ffsrawbufcnt = (nswbuf_kva > 100 ) ? + (nswbuf_kva - (nswbuf_kva >> 4)) : nswbuf_kva - 8; } diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c index 4bfcd753f5..32e6019054 100644 --- a/sys/vm/swap_pager.c +++ b/sys/vm/swap_pager.c @@ -373,8 +373,8 @@ swap_pager_swap_init(void) nsw_cluster_max = min((MAXPHYS/PAGE_SIZE), MAX_PAGEOUT_CLUSTER); - nsw_rcount = (nswbuf + 1) / 2; - nsw_wcount_sync = (nswbuf + 3) / 4; + nsw_rcount = (nswbuf_kva + 1) / 2; + nsw_wcount_sync = (nswbuf_kva + 3) / 4; nsw_wcount_async = 4; nsw_wcount_async_max = nsw_wcount_async; @@ -1524,8 +1524,8 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count, /* * limit range */ - if ((n = swap_async_max) > nswbuf / 2) - n = nswbuf / 2; + if ((n = swap_async_max) > nswbuf_kva / 2) + n = nswbuf_kva / 2; if (n < 1) n = 1; swap_async_max = n; diff --git a/sys/vm/vm_pager.c b/sys/vm/vm_pager.c index b2799acfa2..e2565dc129 100644 --- a/sys/vm/vm_pager.c +++ b/sys/vm/vm_pager.c @@ -87,6 +87,7 @@ #include #include +#include extern struct pagerops defaultpagerops; extern struct pagerops swappagerops; @@ -172,26 +173,32 @@ int npagers = NELEM(pagertab); */ #define PAGER_MAP_SIZE (8 * 1024 * 1024) +#define BSWHSIZE 16 +#define BSWHMASK (BSWHSIZE - 1) + TAILQ_HEAD(swqueue, buf); int pager_map_size = PAGER_MAP_SIZE; struct vm_map pager_map; -static int bswneeded_raw; -static int bswneeded_kva; -static long nswbuf_raw; -static struct buf *swbuf_raw; -static vm_offset_t swapbkva; /* swap buffers kva */ -static struct swqueue bswlist_raw; /* without kva */ -static struct swqueue bswlist_kva; /* with kva */ -static struct spinlock bswspin = SPINLOCK_INITIALIZER(&bswspin, "bswspin"); +static vm_offset_t swapbkva_mem; /* swap buffers kva */ +static vm_offset_t swapbkva_kva; /* swap buffers kva */ +static struct swqueue bswlist_mem[BSWHSIZE]; /* with preallocated memory */ +static struct swqueue bswlist_kva[BSWHSIZE]; /* with kva */ +static struct swqueue 
bswlist_raw[BSWHSIZE]; /* without kva */ +static struct spinlock bswspin_mem[BSWHSIZE]; +static struct spinlock bswspin_kva[BSWHSIZE]; +static struct spinlock bswspin_raw[BSWHSIZE]; static int pbuf_raw_count; static int pbuf_kva_count; +static int pbuf_mem_count; SYSCTL_INT(_vfs, OID_AUTO, pbuf_raw_count, CTLFLAG_RD, &pbuf_raw_count, 0, - "Kernel virtual address space reservations"); + "Kernel pbuf raw reservations"); SYSCTL_INT(_vfs, OID_AUTO, pbuf_kva_count, CTLFLAG_RD, &pbuf_kva_count, 0, - "Kernel raw address space reservations"); + "Kernel pbuf kva reservations"); +SYSCTL_INT(_vfs, OID_AUTO, pbuf_mem_count, CTLFLAG_RD, &pbuf_mem_count, 0, + "Kernel pbuf mem reservations"); /* * Initialize the swap buffer list. @@ -201,8 +208,16 @@ SYSCTL_INT(_vfs, OID_AUTO, pbuf_kva_count, CTLFLAG_RD, &pbuf_kva_count, 0, static void vm_pager_init(void *arg __unused) { - TAILQ_INIT(&bswlist_raw); - TAILQ_INIT(&bswlist_kva); + int i; + + for (i = 0; i < BSWHSIZE; ++i) { + TAILQ_INIT(&bswlist_mem[i]); + TAILQ_INIT(&bswlist_kva[i]); + TAILQ_INIT(&bswlist_raw[i]); + spin_init(&bswspin_mem[i], "bswmem"); + spin_init(&bswspin_kva[i], "bswkva"); + spin_init(&bswspin_raw[i], "bswraw"); + } } SYSINIT(vm_mem, SI_BOOT1_VM, SI_ORDER_SECOND, vm_pager_init, NULL); @@ -219,27 +234,82 @@ vm_pager_bufferinit(void *dummy __unused) /* * Reserve KVM space for pbuf data. */ - swapbkva = kmem_alloc_pageable(&pager_map, nswbuf * MAXPHYS); - if (!swapbkva) + swapbkva_mem = kmem_alloc_pageable(&pager_map, nswbuf_mem * MAXPHYS); + if (!swapbkva_mem) + panic("Not enough pager_map VM space for physical buffers"); + swapbkva_kva = kmem_alloc_pageable(&pager_map, nswbuf_kva * MAXPHYS); + if (!swapbkva_kva) panic("Not enough pager_map VM space for physical buffers"); /* - * Initial pbuf setup. These pbufs have KVA reservations. + * Initial pbuf setup. + * + * mem - These pbufs have permanently allocated memory + * kva - These pbufs have unallocated kva reservations + * raw - These pbufs have no kva reservations + */ + + /* + * Buffers with pre-allocated kernel memory can be convenient for + * copyin/copyout because no SMP page invalidation or other pmap + * operations are needed. + */ +#if 1 + bp = swbuf_mem; + for (i = 0; i < nswbuf_mem; ++i, ++bp) { + vm_page_t m; + vm_pindex_t pg; + int j; + + bp->b_kvabase = (caddr_t)((intptr_t)i * MAXPHYS) + swapbkva_mem; + bp->b_kvasize = MAXPHYS; + bp->b_swindex = i & BSWHMASK; + BUF_LOCKINIT(bp); + buf_dep_init(bp); + TAILQ_INSERT_HEAD(&bswlist_mem[i & BSWHMASK], bp, b_freelist); + atomic_add_int(&pbuf_mem_count, 1); + bp->b_data = bp->b_kvabase; + bp->b_bcount = MAXPHYS; + bp->b_xio.xio_pages = bp->b_xio.xio_internal_pages; + + pg = (vm_offset_t)bp->b_kvabase >> PAGE_SHIFT; + vm_object_hold(&kernel_object); + for (j = 0; j < MAXPHYS / PAGE_SIZE; ++j) { + m = vm_page_alloc(&kernel_object, pg, VM_ALLOC_NORMAL | + VM_ALLOC_SYSTEM); + KKASSERT(m != NULL); + bp->b_xio.xio_internal_pages[j] = m; + vm_page_wire(m); + vm_page_flag_clear(m, PG_ZERO); + /* early boot, no other cpus running yet */ + pmap_kenter_noinval(pg * PAGE_SIZE, VM_PAGE_TO_PHYS(m)); + cpu_invlpg((void *)(pg * PAGE_SIZE)); + vm_page_wakeup(m); + ++pg; + } + vm_object_drop(&kernel_object); + bp->b_xio.xio_npages = j; + } +#endif + + /* + * Buffers with pre-assigned KVA bases. The KVA has no memory pages + * assigned to it. Saves the caller from having to reserve KVA for + * the page map. 
*/ - bp = swbuf; - for (i = 0; i < nswbuf; ++i, ++bp) { - bp->b_kvabase = (caddr_t)((intptr_t)i * MAXPHYS) + swapbkva; + bp = swbuf_kva; + for (i = 0; i < nswbuf_kva; ++i, ++bp) { + bp->b_kvabase = (caddr_t)((intptr_t)i * MAXPHYS) + swapbkva_kva; bp->b_kvasize = MAXPHYS; + bp->b_swindex = i & BSWHMASK; BUF_LOCKINIT(bp); buf_dep_init(bp); - TAILQ_INSERT_HEAD(&bswlist_kva, bp, b_freelist); - ++pbuf_kva_count; + TAILQ_INSERT_HEAD(&bswlist_kva[i & BSWHMASK], bp, b_freelist); + atomic_add_int(&pbuf_kva_count, 1); } /* - * Initial pbuf setup. These pbufs do not have KVA reservations, - * so we can have a lot more of them. These are typically used - * to massage low level buf/bio requests. + * RAW buffers with no KVA mappings. * * NOTE: We use KM_NOTLBSYNC here to reduce unnecessary IPIs * during startup, which can really slow down emulated @@ -252,16 +322,17 @@ vm_pager_bufferinit(void *dummy __unused) smp_invltlb(); bp = swbuf_raw; for (i = 0; i < nswbuf_raw; ++i, ++bp) { + bp->b_swindex = i & BSWHMASK; BUF_LOCKINIT(bp); buf_dep_init(bp); - TAILQ_INSERT_HEAD(&bswlist_raw, bp, b_freelist); - ++pbuf_raw_count; + TAILQ_INSERT_HEAD(&bswlist_raw[i & BSWHMASK], bp, b_freelist); + atomic_add_int(&pbuf_raw_count, 1); } /* * Allow the clustering code to use half of our pbufs. */ - cluster_pbuf_freecnt = nswbuf / 2; + cluster_pbuf_freecnt = nswbuf_kva / 2; } SYSINIT(do_vmpg, SI_BOOT2_MACHDEP, SI_ORDER_FIRST, vm_pager_bufferinit, NULL); @@ -332,8 +403,8 @@ initpbuf(struct buf *bp) /* * Allocate a physical buffer * - * There are a limited number (nswbuf) of physical buffers. We need - * to make sure that no single subsystem is able to hog all of them, + * There are a limited number of physical buffers. We need to make + * sure that no single subsystem is able to hog all of them, * so each subsystem implements a counter which is typically initialized * to 1/2 nswbuf. getpbuf() decrements this counter in allocation and * increments it on release, and blocks if the counter hits zero. A @@ -352,119 +423,214 @@ initpbuf(struct buf *bp) * No requirements. 
*/ struct buf * -getpbuf(int *pfreecnt) +getpbuf(int *pfreecnt) /* raw */ { struct buf *bp; - - spin_lock(&bswspin); + int iter; + int loops; for (;;) { - if (pfreecnt) { - while (*pfreecnt == 0) - ssleep(pfreecnt, &bswspin, 0, "wswbuf0", 0); + while (pfreecnt && *pfreecnt <= 0) { + tsleep_interlock(pfreecnt, 0); + if (atomic_fetchadd_int(pfreecnt, 0) <= 0) + tsleep(pfreecnt, PINTERLOCKED, "wswbuf0", 0); + } + if (pbuf_raw_count <= 0) { + tsleep_interlock(&pbuf_raw_count, 0); + if (atomic_fetchadd_int(&pbuf_raw_count, 0) <= 0) + tsleep(&pbuf_raw_count, PINTERLOCKED, + "wswbuf0", 0); + continue; + } + iter = mycpuid & BSWHMASK; + for (loops = BSWHSIZE; loops; --loops) { + if (TAILQ_FIRST(&bswlist_raw[iter]) == NULL) { + iter = (iter + 1) & BSWHMASK; + continue; + } + spin_lock(&bswspin_raw[iter]); + if ((bp = TAILQ_FIRST(&bswlist_raw[iter])) == NULL) { + spin_unlock(&bswspin_raw[iter]); + iter = (iter + 1) & BSWHMASK; + continue; + } + TAILQ_REMOVE(&bswlist_raw[iter], bp, b_freelist); + atomic_add_int(&pbuf_raw_count, -1); + if (pfreecnt) + atomic_add_int(pfreecnt, -1); + spin_unlock(&bswspin_raw[iter]); + initpbuf(bp); + + return bp; } - - /* get a bp from the swap buffer header pool */ - if ((bp = TAILQ_FIRST(&bswlist_raw)) != NULL) - break; - bswneeded_raw = 1; - ssleep(&bswneeded_raw, &bswspin, 0, "wswbuf1", 0); - /* loop in case someone else grabbed one */ } - TAILQ_REMOVE(&bswlist_raw, bp, b_freelist); - --pbuf_raw_count; - if (pfreecnt) - --*pfreecnt; - - spin_unlock(&bswspin); - - initpbuf(bp); - - return (bp); + /* not reached */ } struct buf * getpbuf_kva(int *pfreecnt) { struct buf *bp; - - spin_lock(&bswspin); + int iter; + int loops; for (;;) { - if (pfreecnt) { - while (*pfreecnt == 0) - ssleep(pfreecnt, &bswspin, 0, "wswbuf0", 0); + while (pfreecnt && *pfreecnt <= 0) { + tsleep_interlock(pfreecnt, 0); + if (atomic_fetchadd_int(pfreecnt, 0) <= 0) + tsleep(pfreecnt, PINTERLOCKED, "wswbuf0", 0); + } + if (pbuf_kva_count <= 0) { + tsleep_interlock(&pbuf_kva_count, 0); + if (atomic_fetchadd_int(&pbuf_kva_count, 0) <= 0) + tsleep(&pbuf_kva_count, PINTERLOCKED, + "wswbuf0", 0); + continue; + } + iter = mycpuid & BSWHMASK; + for (loops = BSWHSIZE; loops; --loops) { + if (TAILQ_FIRST(&bswlist_kva[iter]) == NULL) { + iter = (iter + 1) & BSWHMASK; + continue; + } + spin_lock(&bswspin_kva[iter]); + if ((bp = TAILQ_FIRST(&bswlist_kva[iter])) == NULL) { + spin_unlock(&bswspin_kva[iter]); + iter = (iter + 1) & BSWHMASK; + continue; + } + TAILQ_REMOVE(&bswlist_kva[iter], bp, b_freelist); + atomic_add_int(&pbuf_kva_count, -1); + if (pfreecnt) + atomic_add_int(pfreecnt, -1); + spin_unlock(&bswspin_kva[iter]); + initpbuf(bp); + + return bp; } - - /* get a bp from the swap buffer header pool */ - if ((bp = TAILQ_FIRST(&bswlist_kva)) != NULL) - break; - bswneeded_kva = 1; - ssleep(&bswneeded_kva, &bswspin, 0, "wswbuf1", 0); - /* loop in case someone else grabbed one */ } - TAILQ_REMOVE(&bswlist_kva, bp, b_freelist); - --pbuf_kva_count; - if (pfreecnt) - --*pfreecnt; - - spin_unlock(&bswspin); + /* not reached */ +} - initpbuf(bp); +/* + * Allocate a pbuf with kernel memory already preallocated. Caller must + * not change the mapping. 
+ */ +struct buf * +getpbuf_mem(int *pfreecnt) +{ + struct buf *bp; + int iter; + int loops; - return (bp); + for (;;) { + while (pfreecnt && *pfreecnt <= 0) { + tsleep_interlock(pfreecnt, 0); + if (atomic_fetchadd_int(pfreecnt, 0) <= 0) + tsleep(pfreecnt, PINTERLOCKED, "wswbuf0", 0); + } + if (pbuf_mem_count <= 0) { + tsleep_interlock(&pbuf_mem_count, 0); + if (atomic_fetchadd_int(&pbuf_mem_count, 0) <= 0) + tsleep(&pbuf_mem_count, PINTERLOCKED, + "wswbuf0", 0); + continue; + } + iter = mycpuid & BSWHMASK; + for (loops = BSWHSIZE; loops; --loops) { + if (TAILQ_FIRST(&bswlist_mem[iter]) == NULL) { + iter = (iter + 1) & BSWHMASK; + continue; + } + spin_lock(&bswspin_mem[iter]); + if ((bp = TAILQ_FIRST(&bswlist_mem[iter])) == NULL) { + spin_unlock(&bswspin_mem[iter]); + iter = (iter + 1) & BSWHMASK; + continue; + } + TAILQ_REMOVE(&bswlist_mem[iter], bp, b_freelist); + atomic_add_int(&pbuf_mem_count, -1); + if (pfreecnt) + atomic_add_int(pfreecnt, -1); + spin_unlock(&bswspin_mem[iter]); + initpbuf(bp); + + return bp; + } + } + /* not reached */ } /* * Allocate a physical buffer, if one is available. * - * Note that there is no NULL hack here - all subsystems using this - * call understand how to use pfreecnt. + * Note that there is no NULL hack here - all subsystems using this + * call understand how to use pfreecnt. * * No requirements. */ struct buf * -trypbuf(int *pfreecnt) +trypbuf(int *pfreecnt) /* raw */ { struct buf *bp; + int iter = mycpuid & BSWHMASK; + int loops; - spin_lock(&bswspin); - - if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist_raw)) == NULL) { - spin_unlock(&bswspin); - return NULL; - } - TAILQ_REMOVE(&bswlist_raw, bp, b_freelist); - --pbuf_raw_count; - --*pfreecnt; + for (loops = BSWHSIZE; loops; --loops) { + if (*pfreecnt <= 0 || TAILQ_FIRST(&bswlist_raw[iter]) == NULL) { + iter = (iter + 1) & BSWHMASK; + continue; + } + spin_lock(&bswspin_raw[iter]); + if (*pfreecnt <= 0 || + (bp = TAILQ_FIRST(&bswlist_raw[iter])) == NULL) { + spin_unlock(&bswspin_raw[iter]); + iter = (iter + 1) & BSWHMASK; + continue; + } + TAILQ_REMOVE(&bswlist_raw[iter], bp, b_freelist); + atomic_add_int(&pbuf_raw_count, -1); + atomic_add_int(pfreecnt, -1); - spin_unlock(&bswspin); + spin_unlock(&bswspin_raw[iter]); - initpbuf(bp); + initpbuf(bp); - return bp; + return bp; + } + return NULL; } struct buf * trypbuf_kva(int *pfreecnt) { struct buf *bp; + int iter = mycpuid & BSWHMASK; + int loops; - spin_lock(&bswspin); - - if (*pfreecnt == 0 || (bp = TAILQ_FIRST(&bswlist_kva)) == NULL) { - spin_unlock(&bswspin); - return NULL; - } - TAILQ_REMOVE(&bswlist_kva, bp, b_freelist); - --pbuf_kva_count; - --*pfreecnt; + for (loops = BSWHSIZE; loops; --loops) { + if (*pfreecnt <= 0 || TAILQ_FIRST(&bswlist_kva[iter]) == NULL) { + iter = (iter + 1) & BSWHMASK; + continue; + } + spin_lock(&bswspin_kva[iter]); + if (*pfreecnt <= 0 || + (bp = TAILQ_FIRST(&bswlist_kva[iter])) == NULL) { + spin_unlock(&bswspin_kva[iter]); + iter = (iter + 1) & BSWHMASK; + continue; + } + TAILQ_REMOVE(&bswlist_kva[iter], bp, b_freelist); + atomic_add_int(&pbuf_kva_count, -1); + atomic_add_int(pfreecnt, -1); - spin_unlock(&bswspin); + spin_unlock(&bswspin_kva[iter]); - initpbuf(bp); + initpbuf(bp); - return bp; + return bp; + } + return NULL; } /* @@ -478,42 +644,57 @@ trypbuf_kva(int *pfreecnt) void relpbuf(struct buf *bp, int *pfreecnt) { - int wake_bsw_kva = 0; - int wake_bsw_raw = 0; - int wake_freecnt = 0; + int wake = 0; + int wake_free = 0; + int iter = bp->b_swindex; KKASSERT(bp->b_flags & B_PAGING); dsched_buf_exit(bp); 
BUF_UNLOCK(bp); - spin_lock(&bswspin); - if (bp->b_kvabase) { - TAILQ_INSERT_HEAD(&bswlist_kva, bp, b_freelist); - ++pbuf_kva_count; + if (bp >= swbuf_mem && bp < &swbuf_mem[nswbuf_mem]) { + KKASSERT(bp->b_kvabase); + spin_lock(&bswspin_mem[iter]); + TAILQ_INSERT_HEAD(&bswlist_mem[iter], bp, b_freelist); + if (atomic_fetchadd_int(&pbuf_mem_count, 1) == 0) + wake = 1; + if (pfreecnt) { + if (atomic_fetchadd_int(pfreecnt, 1) == 0) + wake_free = 1; + } + spin_unlock(&bswspin_mem[iter]); + if (wake) + wakeup(&pbuf_mem_count); + } else if (swbuf_kva && bp < &swbuf_kva[nswbuf_kva]) { + KKASSERT(bp->b_kvabase); + spin_lock(&bswspin_kva[iter]); + TAILQ_INSERT_HEAD(&bswlist_kva[iter], bp, b_freelist); + if (atomic_fetchadd_int(&pbuf_kva_count, 1) == 0) + wake = 1; + if (pfreecnt) { + if (atomic_fetchadd_int(pfreecnt, 1) == 0) + wake_free = 1; + } + spin_unlock(&bswspin_kva[iter]); + if (wake) + wakeup(&pbuf_kva_count); } else { - TAILQ_INSERT_HEAD(&bswlist_raw, bp, b_freelist); - ++pbuf_raw_count; - } - if (bswneeded_kva) { - bswneeded_kva = 0; - wake_bsw_kva = 1; - } - if (bswneeded_raw) { - bswneeded_raw = 0; - wake_bsw_raw = 1; - } - if (pfreecnt) { - if (++*pfreecnt == 1) - wake_freecnt = 1; + KKASSERT(bp->b_kvabase == NULL); + KKASSERT(bp >= swbuf_raw && bp < &swbuf_raw[nswbuf_raw]); + spin_lock(&bswspin_raw[iter]); + TAILQ_INSERT_HEAD(&bswlist_raw[iter], bp, b_freelist); + if (atomic_fetchadd_int(&pbuf_raw_count, 1) == 0) + wake = 1; + if (pfreecnt) { + if (atomic_fetchadd_int(pfreecnt, 1) == 0) + wake_free = 1; + } + spin_unlock(&bswspin_raw[iter]); + if (wake) + wakeup(&pbuf_raw_count); } - spin_unlock(&bswspin); - - if (wake_bsw_kva) - wakeup(&bswneeded_kva); - if (wake_bsw_raw) - wakeup(&bswneeded_raw); - if (wake_freecnt) + if (wake_free) wakeup(pfreecnt); } @@ -521,9 +702,7 @@ void pbuf_adjcount(int *pfreecnt, int n) { if (n) { - spin_lock(&bswspin); - *pfreecnt += n; - spin_unlock(&bswspin); + atomic_add_int(pfreecnt, n); wakeup(pfreecnt); } } diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c index 7d4829281b..f0a13d8dcd 100644 --- a/sys/vm/vnode_pager.c +++ b/sys/vm/vnode_pager.c @@ -117,7 +117,7 @@ vnode_pager_alloc(void *handle, off_t length, vm_prot_t prot, off_t offset, * XXX hack - This initialization should be put somewhere else. */ if (vnode_pbuf_freecnt < 0) { - vnode_pbuf_freecnt = nswbuf / 2 + 1; + vnode_pbuf_freecnt = nswbuf_kva / 2 + 1; } /* -- 2.41.0
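
The hashing change above is the heart of the contention fix: the single
bswspin/bswlist pair becomes BSWHSIZE (16) independently spin-locked
freelists, indexed by cpu id on allocation and by the buffer's fixed
b_swindex on release.  Below is a minimal userland sketch of the same
structure, using C11 atomics and pthread spinlocks in place of the
kernel's spin_lock()/tsleep_interlock() machinery; the names
(pbuf_pool, pbuf_get, pbuf_put) are illustrative only, and the
blocking/retry paths of the real getpbuf*() functions are omitted.

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stddef.h>

    #define BSWHSIZE 16
    #define BSWHMASK (BSWHSIZE - 1)

    struct pbuf {
            struct pbuf *next;              /* freelist linkage */
            int swindex;                    /* home bucket, fixed at init */
    };

    struct pbuf_pool {
            struct pbuf *freelist[BSWHSIZE];   /* per-bucket LIFO lists */
            pthread_spinlock_t lock[BSWHSIZE]; /* one lock per bucket */
            atomic_int count;                  /* total free buffers */
    };

    /* Distribute the buffers round-robin across the buckets. */
    void
    pbuf_pool_init(struct pbuf_pool *p, struct pbuf *array, int nbufs)
    {
            int i;

            for (i = 0; i < BSWHSIZE; ++i) {
                    p->freelist[i] = NULL;
                    pthread_spin_init(&p->lock[i], PTHREAD_PROCESS_PRIVATE);
            }
            for (i = 0; i < nbufs; ++i) {
                    array[i].swindex = i & BSWHMASK;
                    array[i].next = p->freelist[i & BSWHMASK];
                    p->freelist[i & BSWHMASK] = &array[i];
            }
            atomic_store(&p->count, nbufs);
    }

    /*
     * Start at the caller's cpu-id hash bucket so concurrent callers
     * on different cpus usually take different locks, and scan the
     * remaining buckets before giving up (trypbuf-style; the kernel's
     * blocking variants tsleep and retry instead of returning NULL).
     */
    struct pbuf *
    pbuf_get(struct pbuf_pool *p, int cpuid)
    {
            struct pbuf *bp;
            int iter = cpuid & BSWHMASK;
            int loops;

            for (loops = BSWHSIZE; loops; --loops) {
                    /* unlocked peek: don't take an empty bucket's lock */
                    if (p->freelist[iter] == NULL) {
                            iter = (iter + 1) & BSWHMASK;
                            continue;
                    }
                    pthread_spin_lock(&p->lock[iter]);
                    if ((bp = p->freelist[iter]) == NULL) {
                            pthread_spin_unlock(&p->lock[iter]);
                            iter = (iter + 1) & BSWHMASK;
                            continue;
                    }
                    p->freelist[iter] = bp->next;
                    atomic_fetch_add(&p->count, -1);
                    pthread_spin_unlock(&p->lock[iter]);
                    return bp;
            }
            return NULL;
    }

    /* Release to the buffer's home bucket, as relpbuf() does. */
    void
    pbuf_put(struct pbuf_pool *p, struct pbuf *bp)
    {
            int iter = bp->swindex;

            pthread_spin_lock(&p->lock[iter]);
            bp->next = p->freelist[iter];
            p->freelist[iter] = bp;
            pthread_spin_unlock(&p->lock[iter]);
            atomic_fetch_add(&p->count, 1);
    }

Releasing to the pbuf's home bucket rather than the releasing cpu's
bucket keeps each buffer under a single lock for its lifetime and
preserves the initial round-robin balance across the buckets.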
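
The getpbuf_mem() path wins for a different reason: its MAXPHYS of
kernel memory is wired and entered into the pmap once at boot (with
pmap_kenter_noinval(), before the other cpus are running), so the
per-I/O copyin/copyout touches no mappings at all, whereas the old
vmapbuf()/vunmapbuf() route edits the page table on every transaction
and must broadcast invltlb/invlpg IPIs system-wide.  The runnable
userland sketch below only approximates this trade-off -- map-per-
operation versus copy-into-a-fixed-mapping, single-threaded, so the
cross-cpu IPI cost is not captured -- but it shows the shape of it:

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <time.h>

    #define BUFSZ   (128 * 1024)    /* stand-in for MAXPHYS */
    #define ITERS   20000

    static double
    now(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_MONOTONIC, &ts);
            return (ts.tv_sec + ts.tv_nsec * 1e-9);
    }

    int
    main(void)
    {
            static char src[BUFSZ];  /* stands in for the user buffer */
            char *fixed;
            double t;
            int i;

            /* getpbuf_mem() style: map the buffer once, reuse it */
            fixed = mmap(NULL, BUFSZ, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANON, -1, 0);
            if (fixed == MAP_FAILED)
                    return (1);
            t = now();
            for (i = 0; i < ITERS; ++i)
                    memcpy(fixed, src, BUFSZ);  /* copy, no pmap work */
            printf("reuse one mapping: %.3fs\n", now() - t);

            /* vmapbuf() style: build/tear down a mapping per op */
            t = now();
            for (i = 0; i < ITERS; ++i) {
                    char *p = mmap(NULL, BUFSZ, PROT_READ | PROT_WRITE,
                                   MAP_PRIVATE | MAP_ANON, -1, 0);
                    if (p == MAP_FAILED)
                            return (1);
                    memcpy(p, src, BUFSZ);  /* faults + pmap updates */
                    munmap(p, BUFSZ);       /* TLB invalidations */
            }
            printf("map per op:        %.3fs\n", now() - t);
            return (0);
    }

In the kernel the per-I/O mapping traffic additionally interrupts
every cpu for the shootdowns, which is why, per the commit message,
the copy comes out far ahead at >150K IOPS even though it touches
every byte twice.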
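
Finally, the pfreecnt convention documented in the getpbuf() comment
-- each consumer caps itself at roughly half of nswbuf_kva, blocks
when its quota is exhausted, and is woken by relpbuf() -- is what the
vinum, nfs, smbfs, and vnode_pager hunks are initializing.  A
hypothetical consumer looks like this (the mysubsys_* names are
invented for illustration; this is a kernel-context fragment, not
standalone code):

    static int mysubsys_pbuf_freecnt;

    void
    mysubsys_init(void)
    {
            /* claim at most about half the kva pbufs, like the others */
            mysubsys_pbuf_freecnt = nswbuf_kva / 2 + 1;
    }

    void
    mysubsys_strategy(void)
    {
            struct buf *bp;

            /*
             * Decrements the quota on success; sleeps in "wswbuf0"
             * if this subsystem already holds all of its pbufs.
             */
            bp = getpbuf_kva(&mysubsys_pbuf_freecnt);

            /* ... fill bp->b_data, set b_bcount, issue the I/O ... */

            /*
             * Returns the pbuf to its b_swindex bucket, increments
             * the quota, and wakes any thread blocked on it.
             */
            relpbuf(bp, &mysubsys_pbuf_freecnt);
    }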