kernel - refactor vm_page busy
author Matthew Dillon <dillon@apollo.backplane.com>
Wed, 18 Oct 2017 06:25:24 +0000 (23:25 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Wed, 18 Oct 2017 06:25:24 +0000 (23:25 -0700)
* Move PG_BUSY, PG_WANTED, PG_SBUSY, and PG_SWAPINPROG out of m->flags.

* Add m->busy_count with PBUSY_LOCKED, PBUSY_WANTED, PBUSY_SWAPINPROG,
  and PBUSY_MASK (for the soft-busy count).

* Add support for acquiring a soft-busy count without a hard-busy.
  This requires that the page not already be hard-busied.  The purpose
  is to allow a vm_page to be 'locked' in a shared manner via the
  soft-busy count for situations where we only intend to read from it
  (a minimal sketch of the mechanism is included below).
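
* Illustration (not part of the patch): the new busy_count field packs
  the lock/status bits into the top of a 32-bit word and keeps the low
  29 bits as a shared soft-busy counter, per the PBUSY_* definitions
  added to vm_page.h below.  The stand-alone sketch that follows mirrors
  the lockless soft-busy acquire added as vm_page_sbusy_try(); the names
  fake_page/fake_sbusy_try are placeholders, C11 atomics stand in for
  the kernel's atomic_fetchadd_int()/atomic_cmpset_int(), and the error
  path is simplified (the real code calls vm_page_sbusy_drop(), which
  also handles PBUSY_WANTED wakeups).

    #include <stdatomic.h>
    #include <stdint.h>

    #define PBUSY_LOCKED        0x80000000U  /* hard-busy (exclusive) */
    #define PBUSY_WANTED        0x40000000U  /* a waiter is sleeping */
    #define PBUSY_SWAPINPROG    0x20000000U  /* swap I/O in progress */
    #define PBUSY_MASK          0x1FFFFFFFU  /* soft-busy (shared) count */

    struct fake_page {
            _Atomic uint32_t busy_count;
    };

    /* Returns 0 on success, non-zero if the page is hard-busied. */
    static int
    fake_sbusy_try(struct fake_page *m)
    {
            uint32_t ocount;

            /* Cheap pre-check: give up if someone holds the hard-busy. */
            if (atomic_load(&m->busy_count) & PBUSY_LOCKED)
                    return 1;

            /* Optimistically bump the shared soft-busy count. */
            ocount = atomic_fetch_add(&m->busy_count, 1);

            /* Back out if a hard-busy raced in before the increment. */
            if (ocount & PBUSY_LOCKED) {
                    atomic_fetch_sub(&m->busy_count, 1);
                    return 1;
            }
            return 0;
    }

  The optimistic increment is harmless if it loses the race; it is
  simply backed out, leaving the hard-busy holder undisturbed.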

18 files changed:
sys/dev/drm/drm_vm.c
sys/dev/drm/i915/i915_gem.c
sys/dev/drm/ttm/ttm_bo_vm.c
sys/kern/vfs_bio.c
sys/kern/vfs_cluster.c
sys/vm/device_pager.c
sys/vm/swap_pager.c
sys/vm/vm_contig.c
sys/vm/vm_fault.c
sys/vm/vm_map.c
sys/vm/vm_mmap.c
sys/vm/vm_page.c
sys/vm/vm_page.h
sys/vm/vm_page2.h
sys/vm/vm_pageout.c
sys/vm/vm_swap.c
sys/vm/vm_swapcache.c
sys/vm/vnode_pager.c

diff --git a/sys/dev/drm/drm_vm.c b/sys/dev/drm/drm_vm.c
index 7550c54..d98cd2b 100644
@@ -189,7 +189,8 @@ vm_phys_fictitious_reg_range(vm_paddr_t start, vm_paddr_t end,
 
         for (i = 0; i < page_count; i++) {
                vm_page_initfake(&fp[i], start + PAGE_SIZE * i, memattr);
-               fp[i].flags &= ~(PG_BUSY | PG_UNMANAGED);
+               fp[i].flags &= ~PG_UNMANAGED;
+               atomic_clear_int(&fp[i].busy_count, PBUSY_LOCKED);
         }
         mtx_lock(&vm_phys_fictitious_reg_mtx);
         for (segind = 0; segind < VM_PHYS_FICTITIOUS_NSEGS; segind++) {
diff --git a/sys/dev/drm/i915/i915_gem.c b/sys/dev/drm/i915/i915_gem.c
index 356a836..8a0c34c 100644
@@ -1814,7 +1814,7 @@ int i915_gem_fault(vm_object_t vm_obj, vm_ooffset_t offset, int prot, vm_page_t
        if (*mres != NULL) {
                m = *mres;
                *mres = NULL;
-               if ((m->flags & PG_BUSY) == 0)
+               if ((m->busy_count & PBUSY_LOCKED) == 0)
                        kprintf("i915_gem_fault: Page was not busy\n");
                else
                        vm_page_remove(m);
@@ -1897,7 +1897,7 @@ retry:
                 * Try to busy the page, retry on failure (non-zero ret).
                 */
                if (vm_page_busy_try(m, false)) {
-                       kprintf("i915_gem_fault: PG_BUSY\n");
+                       kprintf("i915_gem_fault: BUSY\n");
                        ret = -EINTR;
                        goto unlock;
                }
@@ -1923,7 +1923,7 @@ retry:
         * Try to busy the page.  Fails on non-zero return.
         */
        if (vm_page_busy_try(m, false)) {
-               kprintf("i915_gem_fault: PG_BUSY(2)\n");
+               kprintf("i915_gem_fault: BUSY(2)\n");
                ret = -EINTR;
                goto unpin;
        }
diff --git a/sys/dev/drm/ttm/ttm_bo_vm.c b/sys/dev/drm/ttm/ttm_bo_vm.c
index b0dcbef..685c09a 100644
@@ -230,7 +230,7 @@ reserve:
        }
 
        VM_OBJECT_LOCK(vm_obj);
-       if ((m->flags & PG_BUSY) != 0) {
+       if ((m->busy_count & PBUSY_LOCKED) != 0) {
 #if 0
                vm_page_sleep(m, "ttmpbs");
 #endif
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index c73fbe2..d807c63 100644
@@ -3148,7 +3148,7 @@ allocbuf(struct buf *bp, int size)
                             bp->b_xio.xio_npages;
 
                        /*
-                        * Blocking on m->busy might lead to a
+                        * Blocking on m->busy_count might lead to a
                         * deadlock:
                         *
                         *  vm_fault->getpages->cluster_read->allocbuf
@@ -3687,7 +3687,7 @@ bpdone(struct buf *bp, int elseit)
                         * up.  if you see this, you have not set the page
                         * busy flag correctly!!!
                         */
-                       if (m->busy == 0) {
+                       if ((m->busy_count & PBUSY_MASK) == 0) {
                                kprintf("bpdone: page busy < 0, "
                                    "pindex: %d, foff: 0x(%x,%x), "
                                    "resid: %d, index: %d\n",
@@ -3872,7 +3872,7 @@ vfs_unbusy_pages(struct buf *bp)
  *     This routine is called before a device strategy routine.
  *     It is used to tell the VM system that paging I/O is in
  *     progress, and treat the pages associated with the buffer
- *     almost as being PG_BUSY.  Also the object 'paging_in_progress'
+ *     almost as being PBUSY_LOCKED.  Also the object 'paging_in_progress'
  *     flag is handled to make sure that the object doesn't become
  *     inconsistant.
  *
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index b1353e6..5a6f1e2 100644
@@ -1629,7 +1629,8 @@ cluster_wbuild(struct vnode *vp, struct buf **bpp,
                                             j < tbp->b_xio.xio_npages;
                                             ++j) {
                                                m = tbp->b_xio.xio_pages[j];
-                                               if (m->flags & PG_BUSY) {
+                                               if (m->busy_count &
+                                                   PBUSY_LOCKED) {
                                                        bqrelse(tbp);
                                                        goto finishcluster;
                                                }
diff --git a/sys/vm/device_pager.c b/sys/vm/device_pager.c
index fccd324..5414ac3 100644
@@ -281,13 +281,13 @@ dev_pager_getfake(vm_paddr_t paddr, int pat_mode)
 
        pmap_page_init(m);
 
-       m->flags = PG_BUSY | PG_FICTITIOUS;
+       m->flags = PG_FICTITIOUS;
        m->valid = VM_PAGE_BITS_ALL;
        m->dirty = 0;
-       m->busy = 0;
        m->queue = PQ_NONE;
        m->object = NULL;
 
+       m->busy_count = PBUSY_LOCKED;
        m->wire_count = 1;
        m->hold_count = 0;
        m->phys_addr = paddr;
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index 23cd6d4..740ea8b 100644
@@ -1267,7 +1267,7 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        int j;
        int raonly;
        int error;
-       u_int32_t flags;
+       u_int32_t busy_count;
        vm_page_t marray[XIO_INTERNAL_PAGES];
 
        mreq = *mpp;
@@ -1424,8 +1424,10 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        else
                bio->bio_driver_info = (void *)(intptr_t)0;
 
-       for (j = 0; j < i; ++j)
-               vm_page_flag_set(bp->b_xio.xio_pages[j], PG_SWAPINPROG);
+       for (j = 0; j < i; ++j) {
+               atomic_set_int(&bp->b_xio.xio_pages[j]->busy_count,
+                              PBUSY_SWAPINPROG);
+       }
 
        mycpu->gd_cnt.v_swapin++;
        mycpu->gd_cnt.v_swappgsin += bp->b_xio.xio_npages;
@@ -1450,7 +1452,7 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        vn_strategy(swapdev_vp, bio);
 
        /*
-        * Wait for the page we want to complete.  PG_SWAPINPROG is always
+        * Wait for the page we want to complete.  PBUSY_SWAPINPROG is always
         * cleared on completion.  If an I/O error occurs, SWAPBLK_NONE
         * is set in the meta-data.
         *
@@ -1466,15 +1468,17 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
         * Read-ahead includes originally requested page case.
         */
        for (;;) {
-               flags = mreq->flags;
+               busy_count = mreq->busy_count;
                cpu_ccfence();
-               if ((flags & PG_SWAPINPROG) == 0)
+               if ((busy_count & PBUSY_SWAPINPROG) == 0)
                        break;
                tsleep_interlock(mreq, 0);
-               if (!atomic_cmpset_int(&mreq->flags, flags,
-                                      flags | PG_WANTED | PG_REFERENCED)) {
+               if (!atomic_cmpset_int(&mreq->busy_count, busy_count,
+                                      busy_count |
+                                       PBUSY_SWAPINPROG | PBUSY_WANTED)) {
                        continue;
                }
+               atomic_set_int(&mreq->flags, PG_REFERENCED);
                mycpu->gd_cnt.v_intrans++;
                if (tsleep(mreq, PINTERLOCKED, "swread", hz*20)) {
                        kprintf(
@@ -1488,7 +1492,7 @@ swap_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        }
 
        /*
-        * Disallow speculative reads prior to the PG_SWAPINPROG test.
+        * Disallow speculative reads prior to the SWAPINPROG test.
         */
        cpu_lfence();
 
@@ -1696,7 +1700,7 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
                                vm_page_dirty(mreq);
                        rtvals[i+j] = VM_PAGER_OK;
 
-                       vm_page_flag_set(mreq, PG_SWAPINPROG);
+                       atomic_set_int(&mreq->busy_count, PBUSY_SWAPINPROG);
                        bp->b_xio.xio_pages[j] = mreq;
                }
                bp->b_xio.xio_npages = n;
@@ -1782,8 +1786,8 @@ swap_pager_newswap(void)
  *     Completion routine for asynchronous reads and writes from/to swap.
  *     Also called manually by synchronous code to finish up a bp.
  *
- *     For READ operations, the pages are PG_BUSY'd.  For WRITE operations, 
- *     the pages are vm_page_t->busy'd.  For READ operations, we PG_BUSY 
+ *     For READ operations, the pages are BUSY'd.  For WRITE operations,
+ *     the pages are vm_page_t->busy'd.  For READ operations, we BUSY
  *     unbusy all pages except the 'main' request page.  For WRITE 
  *     operations, we vm_page_t->busy'd unbusy all pages ( we can do this 
  *     because we marked them all VM_PAGER_PEND on return from putpages ).
@@ -1873,7 +1877,7 @@ swp_pager_async_iodone(struct bio *bio)
                                 * not match anything ).
                                 *
                                 * We have to wake specifically requested pages
-                                * up too because we cleared PG_SWAPINPROG and
+                                * up too because we cleared SWAPINPROG and
                                 * someone may be waiting for that.
                                 *
                                 * NOTE: For reads, m->dirty will probably
@@ -1887,14 +1891,15 @@ swp_pager_async_iodone(struct bio *bio)
                                 *       object->memq from an interrupt.
                                 *       Deactivate the page instead.
                                 *
-                                * WARNING! The instant PG_SWAPINPROG is
+                                * WARNING! The instant SWAPINPROG is
                                 *          cleared another cpu may start
                                 *          using the mreq page (it will
                                 *          check m->valid immediately).
                                 */
 
                                m->valid = 0;
-                               vm_page_flag_clear(m, PG_SWAPINPROG);
+                               atomic_clear_int(&m->busy_count,
+                                                PBUSY_SWAPINPROG);
 
                                /*
                                 * bio_driver_info holds the requested page
@@ -1936,7 +1941,8 @@ swp_pager_async_iodone(struct bio *bio)
                                        vm_page_activate(m);
                                }
                                vm_page_io_finish(m);
-                               vm_page_flag_clear(m, PG_SWAPINPROG);
+                               atomic_clear_int(&m->busy_count,
+                                                PBUSY_SWAPINPROG);
                                vm_page_wakeup(m);
                        }
                } else if (bio->bio_caller_info1.index & SWBIO_READ) {
@@ -1964,7 +1970,7 @@ swp_pager_async_iodone(struct bio *bio)
                         *       map non-kernel pmaps and currently asserts
                         *       the case.
                         *
-                        * WARNING! The instant PG_SWAPINPROG is
+                        * WARNING! The instant SWAPINPROG is
                         *          cleared another cpu may start
                         *          using the mreq page (it will
                         *          check m->valid immediately).
@@ -1973,11 +1979,11 @@ swp_pager_async_iodone(struct bio *bio)
                        m->valid = VM_PAGE_BITS_ALL;
                        vm_page_undirty(m);
                        vm_page_flag_set(m, PG_SWAPPED);
-                       vm_page_flag_clear(m, PG_SWAPINPROG);
+                       atomic_clear_int(&m->busy_count, PBUSY_SWAPINPROG);
 
                        /*
                         * We have to wake specifically requested pages
-                        * up too because we cleared PG_SWAPINPROG and
+                        * up too because we cleared SWAPINPROG and
                         * could be waiting for it in getpages.  However,
                         * be sure to not unbusy getpages specifically
                         * requested page - getpages expects it to be 
@@ -2015,7 +2021,7 @@ swp_pager_async_iodone(struct bio *bio)
                        if (m->object->type == OBJT_SWAP)
                                vm_page_undirty(m);
                        vm_page_flag_set(m, PG_SWAPPED);
-                       vm_page_flag_clear(m, PG_SWAPINPROG);
+                       atomic_clear_int(&m->busy_count, PBUSY_SWAPINPROG);
                        if (vm_page_count_severe())
                                vm_page_deactivate(m);
                        vm_page_io_finish(m);
diff --git a/sys/vm/vm_contig.c b/sys/vm/vm_contig.c
index bd1dd73..3cdd2d1 100644
@@ -151,7 +151,8 @@ vm_contig_pg_clean(int queue, int count)
         * Setup a local marker
         */
        bzero(&marker, sizeof(marker));
-       marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+       marker.flags = PG_FICTITIOUS | PG_MARKER;
+       marker.busy_count = PBUSY_LOCKED;
        marker.queue = queue;
        marker.wire_count = 1;
 
@@ -293,9 +294,10 @@ again:
                            (phys >= low) && (phys < high) &&
                            ((phys & (alignment - 1)) == 0) &&
                            (((phys ^ (phys + size - 1)) & ~(boundary - 1)) == 0) &&
-                           m->busy == 0 && m->wire_count == 0 &&
-                           m->hold_count == 0 &&
-                           (m->flags & (PG_BUSY | PG_NEED_COMMIT)) == 0)
+                           m->wire_count == 0 && m->hold_count == 0 &&
+                           (m->busy_count &
+                            (PBUSY_LOCKED | PBUSY_MASK)) == 0 &&
+                           (m->flags & PG_NEED_COMMIT) == 0)
                        {
                                break;
                        }
@@ -359,9 +361,10 @@ again:
                        if ((VM_PAGE_TO_PHYS(&m[0]) !=
                            (VM_PAGE_TO_PHYS(&m[-1]) + PAGE_SIZE)) ||
                            ((pqtype != PQ_FREE) && (pqtype != PQ_CACHE)) ||
-                           m->busy || m->wire_count ||
+                           m->wire_count ||
                            m->hold_count ||
-                           (m->flags & (PG_BUSY | PG_NEED_COMMIT)))
+                           (m->busy_count & (PBUSY_LOCKED | PBUSY_MASK)) ||
+                           (m->flags & PG_NEED_COMMIT))
                        {
                                start++;
                                goto again;
@@ -409,11 +412,11 @@ again:
                        KASSERT(m->dirty == 0,
                                ("vm_contig_pg_alloc: page %p was dirty", m));
                        KKASSERT(m->wire_count == 0);
-                       KKASSERT(m->busy == 0);
+                       KKASSERT((m->busy_count & PBUSY_MASK) == 0);
 
                        /*
-                        * Clear all flags except PG_[S]BUSY and PG_WANTED,
-                        * then unbusy the now allocated page.
+                        * Clear all flags.  Then unbusy the now allocated
+                        * page.
                         */
                        vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
                        vm_page_wire(m);
diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c
index fb0d3d0..462f4cd 100644
@@ -438,7 +438,8 @@ RetryFault:
 
                bzero(&fakem, sizeof(fakem));
                fakem.pindex = first_pindex;
-               fakem.flags = PG_BUSY | PG_FICTITIOUS | PG_UNMANAGED;
+               fakem.flags = PG_FICTITIOUS | PG_UNMANAGED;
+               fakem.busy_count = PBUSY_LOCKED;
                fakem.valid = VM_PAGE_BITS_ALL;
                fakem.pat_mode = VM_MEMATTR_DEFAULT;
                if (fs.entry->object.uksmap(fs.entry->aux.dev, &fakem)) {
@@ -613,7 +614,7 @@ RetryFault:
                vm_map_deinterlock(fs.map, &ilock);
 
        /*KKASSERT(fs.m->queue == PQ_NONE); page-in op may deactivate page */
-       KKASSERT(fs.m->flags & PG_BUSY);
+       KKASSERT(fs.m->busy_count & PBUSY_LOCKED);
 
        /*
         * If the page is not wired down, then put it where the pageout daemon
@@ -910,7 +911,8 @@ RetryFault:
 
                bzero(&fakem, sizeof(fakem));
                fakem.pindex = first_pindex;
-               fakem.flags = PG_BUSY | PG_FICTITIOUS | PG_UNMANAGED;
+               fakem.flags = PG_FICTITIOUS | PG_UNMANAGED;
+               fakem.busy_count = PBUSY_LOCKED;
                fakem.valid = VM_PAGE_BITS_ALL;
                fakem.pat_mode = VM_MEMATTR_DEFAULT;
                if (fs.entry->object.uksmap(fs.entry->aux.dev, &fakem)) {
@@ -1753,7 +1755,7 @@ readrest:
 
                        /*
                         * Avoid deadlocking against the map when doing I/O.
-                        * fs.object and the page is PG_BUSY'd.
+                        * fs.object and the page is BUSY'd.
                         *
                         * NOTE: Once unlocked, fs->entry can become stale
                         *       so this will NULL it out.
@@ -1766,13 +1768,13 @@ readrest:
 
                        /*
                         * Acquire the page data.  We still hold a ref on
-                        * fs.object and the page has been PG_BUSY's.
+                        * fs.object and the page has been BUSY's.
                         *
                         * The pager may replace the page (for example, in
                         * order to enter a fictitious page into the
                         * object).  If it does so it is responsible for
                         * cleaning up the passed page and properly setting
-                        * the new page PG_BUSY.
+                        * the new page BUSY.
                         *
                         * If we got here through a PG_RAM read-ahead
                         * mark the page may be partially dirty and thus
@@ -1980,7 +1982,7 @@ readrest:
         * top-level object, we have to copy it into a new page owned by the
         * top-level object.
         */
-       KASSERT((fs->m->flags & PG_BUSY) != 0,
+       KASSERT((fs->m->busy_count & PBUSY_LOCKED) != 0,
                ("vm_fault: not busy after main loop"));
 
        if (fs->object != fs->first_object) {
@@ -2075,11 +2077,20 @@ readrest:
                                 *
                                 * So we have to remove the page from at
                                 * least the current pmap if it is in it.
-                                * Just remove it from all pmaps.
+                                *
+                                * We used to just remove it from all pmaps
+                                * but that creates inefficiencies on SMP,
+                                * particularly for COW program & library
+                                * mappings that are concurrently exec'd.
+                                * Only remove the page from the current
+                                * pmap.
                                 */
                                KKASSERT(fs->first_shared == 0);
                                vm_page_copy(fs->m, fs->first_m);
-                               vm_page_protect(fs->m, VM_PROT_NONE);
+                               /*vm_page_protect(fs->m, VM_PROT_NONE);*/
+                               pmap_remove_specific(
+                                   &curthread->td_lwp->lwp_vmspace->vm_pmap,
+                                   fs->m);
                        }
 
                        /*
@@ -2213,7 +2224,7 @@ readrest:
         * fs->object will have another PIP reference if it is not equal
         * to fs->first_object.
         */
-       KASSERT(fs->m->flags & PG_BUSY,
+       KASSERT(fs->m->busy_count & PBUSY_LOCKED,
                ("vm_fault: page %p not busy!", fs->m));
 
        /*
@@ -3026,17 +3037,12 @@ vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
                 */
                pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
 
-               m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
-               if (m == NULL || error)
-                       break;
-
                /*
                 * Skip pages already mapped, and stop scanning in that
                 * direction.  When the scan terminates in both directions
                 * we are done.
                 */
                if (pmap_prefault_ok(pmap, addr) == 0) {
-                       vm_page_wakeup(m);
                        if (i & 1)
                                noneg = 1;
                        else
@@ -3046,6 +3052,38 @@ vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
                        continue;
                }
 
+               /*
+                * Shortcut the read-only mapping case using the far more
+                * efficient vm_page_lookup_sbusy_try() function.  This
+                * allows us to acquire the page soft-busied only which
+                * is especially nice for concurrent execs of the same
+                * program.
+                *
+                * The lookup function also validates page suitability
+                * (all valid bits set, and not fictitious).
+                */
+               if ((prot & (VM_PROT_WRITE|VM_PROT_OVERRIDE_WRITE)) == 0) {
+                       m = vm_page_lookup_sbusy_try(object, pindex);
+                       if (m == NULL)
+                               break;
+                       pmap_enter(pmap, addr, m, prot, 0, entry);
+                       mycpu->gd_cnt.v_vm_faults++;
+                       if (curthread->td_lwp)
+                               ++curthread->td_lwp->lwp_ru.ru_minflt;
+                       vm_page_sbusy_drop(m);
+                       continue;
+               }
+
+               /*
+                * Fallback to normal vm_page lookup code.  This code
+                * hard-busies the page.  Not only that, but the page
+                * can remain in that state for a significant period
+                * time due to pmap_enter()'s overhead.
+                */
+               m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
+               if (m == NULL || error)
+                       break;
+
                /*
                 * Stop if the page cannot be trivially entered into the
                 * pmap.
diff --git a/sys/vm/vm_map.c b/sys/vm/vm_map.c
index ad508ac..8228779 100644
@@ -3374,7 +3374,7 @@ vm_map_split(vm_map_entry_t entry)
        for (idx = 0; idx < size; idx++) {
                m = vm_page_lookup(nobject, idx);
                if (m) {
-                       KKASSERT(m->flags & PG_BUSY);
+                       KKASSERT(m->busy_count & PBUSY_LOCKED);
                        vm_page_wakeup(m);
                }
        }
diff --git a/sys/vm/vm_mmap.c b/sys/vm/vm_mmap.c
index 60134ae..7de4ccc 100644
@@ -875,8 +875,7 @@ RestartScan:
                                                    pindex);
                                if (m && m->valid) {
                                        mincoreinfo = MINCORE_INCORE;
-                                       if (m->dirty ||
-                                               pmap_is_modified(m))
+                                       if (m->dirty || pmap_is_modified(m))
                                                mincoreinfo |= MINCORE_MODIFIED_OTHER;
                                        if ((m->flags & PG_REFERENCED) ||
                                                pmap_ts_referenced(m)) {
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index af55397..995c16b 100644
@@ -859,9 +859,11 @@ _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
 }
 
 /*
- * Wait until page is no longer PG_BUSY or (if also_m_busy is TRUE)
- * m->busy is zero.  Returns TRUE if it had to sleep, FALSE if we
- * did not.  Only one sleep call will be made before returning.
+ * Wait until page is no longer BUSY.  If also_m_busy is TRUE we wait
+ * until the page is no longer BUSY or SBUSY (busy_count field is 0).
+ *
+ * Returns TRUE if it had to sleep, FALSE if we did not.  Only one sleep
+ * call will be made before returning.
  *
  * This function does NOT busy the page and on return the page is not
  * guaranteed to be available.
@@ -869,19 +871,20 @@ _vm_page_add_queue_spinlocked(vm_page_t m, u_short queue, int athead)
 void
 vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg)
 {
-       u_int32_t flags;
+       u_int32_t busy_count;
 
        for (;;) {
-               flags = m->flags;
+               busy_count = m->busy_count;
                cpu_ccfence();
 
-               if ((flags & PG_BUSY) == 0 &&
-                   (also_m_busy == 0 || (flags & PG_SBUSY) == 0)) {
+               if ((busy_count & PBUSY_LOCKED) == 0 &&
+                   (also_m_busy == 0 || (busy_count & PBUSY_MASK) == 0)) {
                        break;
                }
                tsleep_interlock(m, 0);
-               if (atomic_cmpset_int(&m->flags, flags,
-                                     flags | PG_WANTED | PG_REFERENCED)) {
+               if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                     busy_count | PBUSY_WANTED)) {
+                       atomic_set_int(&m->flags, PG_REFERENCED);
                        tsleep(m, PINTERLOCKED, msg, 0);
                        break;
                }
@@ -953,34 +956,36 @@ vm_get_pg_color(int cpuid, vm_object_t object, vm_pindex_t pindex)
 }
 
 /*
- * Wait until PG_BUSY can be set, then set it.  If also_m_busy is TRUE we
- * also wait for m->busy to become 0 before setting PG_BUSY.
+ * Wait until BUSY can be set, then set it.  If also_m_busy is TRUE we
+ * also wait for m->busy_count to become 0 before setting PBUSY_LOCKED.
  */
 void
 VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
                                     int also_m_busy, const char *msg
                                     VM_PAGE_DEBUG_ARGS)
 {
-       u_int32_t flags;
+       u_int32_t busy_count;
 
        for (;;) {
-               flags = m->flags;
+               busy_count = m->busy_count;
                cpu_ccfence();
-               if (flags & PG_BUSY) {
+               if (busy_count & PBUSY_LOCKED) {
                        tsleep_interlock(m, 0);
-                       if (atomic_cmpset_int(&m->flags, flags,
-                                         flags | PG_WANTED | PG_REFERENCED)) {
+                       if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                         busy_count | PBUSY_WANTED)) {
+                               atomic_set_int(&m->flags, PG_REFERENCED);
                                tsleep(m, PINTERLOCKED, msg, 0);
                        }
-               } else if (also_m_busy && (flags & PG_SBUSY)) {
+               } else if (also_m_busy && busy_count) {
                        tsleep_interlock(m, 0);
-                       if (atomic_cmpset_int(&m->flags, flags,
-                                         flags | PG_WANTED | PG_REFERENCED)) {
+                       if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                         busy_count | PBUSY_WANTED)) {
+                               atomic_set_int(&m->flags, PG_REFERENCED);
                                tsleep(m, PINTERLOCKED, msg, 0);
                        }
                } else {
-                       if (atomic_cmpset_int(&m->flags, flags,
-                                             flags | PG_BUSY)) {
+                       if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                             busy_count | PBUSY_LOCKED)) {
 #ifdef VM_PAGE_DEBUG
                                m->busy_func = func;
                                m->busy_line = lineno;
@@ -992,8 +997,8 @@ VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
 }
 
 /*
- * Attempt to set PG_BUSY.  If also_m_busy is TRUE we only succeed if m->busy
- * is also 0.
+ * Attempt to set BUSY.  If also_m_busy is TRUE we only succeed if
+ * m->busy_count is also 0.
  *
  * Returns non-zero on failure.
  */
@@ -1001,16 +1006,17 @@ int
 VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
                                    VM_PAGE_DEBUG_ARGS)
 {
-       u_int32_t flags;
+       u_int32_t busy_count;
 
        for (;;) {
-               flags = m->flags;
+               busy_count = m->busy_count;
                cpu_ccfence();
-               if (flags & PG_BUSY)
+               if (busy_count & PBUSY_LOCKED)
                        return TRUE;
-               if (also_m_busy && (flags & PG_SBUSY))
+               if (also_m_busy && (busy_count & PBUSY_MASK) != 0)
                        return TRUE;
-               if (atomic_cmpset_int(&m->flags, flags, flags | PG_BUSY)) {
+               if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                     busy_count | PBUSY_LOCKED)) {
 #ifdef VM_PAGE_DEBUG
                                m->busy_func = func;
                                m->busy_line = lineno;
@@ -1021,7 +1027,7 @@ VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m, int also_m_busy
 }
 
 /*
- * Clear the PG_BUSY flag and return non-zero to indicate to the caller
+ * Clear the BUSY flag and return non-zero to indicate to the caller
  * that a wakeup() should be performed.
  *
  * The vm_page must be spinlocked and will remain spinlocked on return.
@@ -1033,28 +1039,30 @@ static __inline
 int
 _vm_page_wakeup(vm_page_t m)
 {
-       u_int32_t flags;
+       u_int32_t busy_count;
 
        for (;;) {
-               flags = m->flags;
+               busy_count = m->busy_count;
                cpu_ccfence();
-               if (atomic_cmpset_int(&m->flags, flags,
-                                     flags & ~(PG_BUSY | PG_WANTED))) {
+               if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                     busy_count &
+                                     ~(PBUSY_LOCKED | PBUSY_WANTED))) {
                        break;
                }
        }
-       return(flags & PG_WANTED);
+       return((int)(busy_count & PBUSY_WANTED));
 }
 
 /*
- * Clear the PG_BUSY flag and wakeup anyone waiting for the page.  This
+ * Clear the BUSY flag and wakeup anyone waiting for the page.  This
  * is typically the last call you make on a page before moving onto
  * other things.
  */
 void
 vm_page_wakeup(vm_page_t m)
 {
-        KASSERT(m->flags & PG_BUSY, ("vm_page_wakeup: page not busy!!!"));
+        KASSERT(m->busy_count & PBUSY_LOCKED,
+               ("vm_page_wakeup: page not busy!!!"));
        vm_page_spin_lock(m);
        if (_vm_page_wakeup(m)) {
                vm_page_spin_unlock(m);
@@ -1138,7 +1146,8 @@ vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr)
        m->queue = PQ_NONE;
        /* Fictitious pages don't use "segind". */
        /* Fictitious pages don't use "order" or "pool". */
-       m->flags = PG_FICTITIOUS | PG_UNMANAGED | PG_BUSY;
+       m->flags = PG_FICTITIOUS | PG_UNMANAGED;
+       m->busy_count = PBUSY_LOCKED;
        m->wire_count = 1;
        spin_init(&m->spin, "fake_page");
        pmap_page_init(m);
@@ -1225,7 +1234,7 @@ vm_page_remove(vm_page_t m)
                return;
        }
 
-       if ((m->flags & PG_BUSY) == 0)
+       if ((m->busy_count & PBUSY_LOCKED) == 0)
                panic("vm_page_remove: page not busy");
 
        object = m->object;
@@ -1274,33 +1283,35 @@ VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(struct vm_object *object,
                                            int also_m_busy, const char *msg
                                            VM_PAGE_DEBUG_ARGS)
 {
-       u_int32_t flags;
+       u_int32_t busy_count;
        vm_page_t m;
 
        ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
        m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
        while (m) {
                KKASSERT(m->object == object && m->pindex == pindex);
-               flags = m->flags;
+               busy_count = m->busy_count;
                cpu_ccfence();
-               if (flags & PG_BUSY) {
+               if (busy_count & PBUSY_LOCKED) {
                        tsleep_interlock(m, 0);
-                       if (atomic_cmpset_int(&m->flags, flags,
-                                         flags | PG_WANTED | PG_REFERENCED)) {
+                       if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                         busy_count | PBUSY_WANTED)) {
+                               atomic_set_int(&m->flags, PG_REFERENCED);
                                tsleep(m, PINTERLOCKED, msg, 0);
                                m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
                                                              pindex);
                        }
-               } else if (also_m_busy && (flags & PG_SBUSY)) {
+               } else if (also_m_busy && busy_count) {
                        tsleep_interlock(m, 0);
-                       if (atomic_cmpset_int(&m->flags, flags,
-                                         flags | PG_WANTED | PG_REFERENCED)) {
+                       if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                         busy_count | PBUSY_WANTED)) {
+                               atomic_set_int(&m->flags, PG_REFERENCED);
                                tsleep(m, PINTERLOCKED, msg, 0);
                                m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq,
                                                              pindex);
                        }
-               } else if (atomic_cmpset_int(&m->flags, flags,
-                                            flags | PG_BUSY)) {
+               } else if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                            busy_count | PBUSY_LOCKED)) {
 #ifdef VM_PAGE_DEBUG
                        m->busy_func = func;
                        m->busy_line = lineno;
@@ -1327,7 +1338,7 @@ VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
                                           int also_m_busy, int *errorp
                                           VM_PAGE_DEBUG_ARGS)
 {
-       u_int32_t flags;
+       u_int32_t busy_count;
        vm_page_t m;
 
        ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
@@ -1335,17 +1346,18 @@ VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
        *errorp = FALSE;
        while (m) {
                KKASSERT(m->object == object && m->pindex == pindex);
-               flags = m->flags;
+               busy_count = m->busy_count;
                cpu_ccfence();
-               if (flags & PG_BUSY) {
+               if (busy_count & PBUSY_LOCKED) {
                        *errorp = TRUE;
                        break;
                }
-               if (also_m_busy && (flags & PG_SBUSY)) {
+               if (also_m_busy && busy_count) {
                        *errorp = TRUE;
                        break;
                }
-               if (atomic_cmpset_int(&m->flags, flags, flags | PG_BUSY)) {
+               if (atomic_cmpset_int(&m->busy_count, busy_count,
+                                     busy_count | PBUSY_LOCKED)) {
 #ifdef VM_PAGE_DEBUG
                        m->busy_func = func;
                        m->busy_line = lineno;
@@ -1356,6 +1368,33 @@ VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_try)(struct vm_object *object,
        return m;
 }
 
+/*
+ * Returns a page that is only soft-busied for use by the caller in
+ * a read-only fashion.  Returns NULL if the page could not be found,
+ * the soft busy could not be obtained, or the page data is invalid.
+ */
+vm_page_t
+vm_page_lookup_sbusy_try(struct vm_object *object, vm_pindex_t pindex)
+{
+       vm_page_t m;
+
+       ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
+       m = vm_page_rb_tree_RB_LOOKUP(&object->rb_memq, pindex);
+       if (m) {
+               if (m->valid != VM_PAGE_BITS_ALL ||
+                   (m->flags & PG_FICTITIOUS)) {
+                       m = NULL;
+               } else if (vm_page_sbusy_try(m)) {
+                       m = NULL;
+               } else if (m->valid != VM_PAGE_BITS_ALL ||
+                          (m->flags & PG_FICTITIOUS)) {
+                       vm_page_sbusy_drop(m);
+                       m = NULL;
+               }
+       }
+       return m;
+}
+
 /*
  * Caller must hold the related vm_object
  */
@@ -1395,7 +1434,7 @@ vm_page_next(vm_page_t m)
 void
 vm_page_rename(vm_page_t m, vm_object_t new_object, vm_pindex_t new_pindex)
 {
-       KKASSERT(m->flags & PG_BUSY);
+       KKASSERT(m->busy_count & PBUSY_LOCKED);
        ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(new_object));
        if (m->object) {
                ASSERT_LWKT_TOKEN_HELD_EXCL(vm_object_token(m->object));
@@ -1463,7 +1502,7 @@ vm_page_unqueue(vm_page_t m)
  * This is done by 'twisting' the colors.
  *
  * The page is returned spinlocked and removed from its queue (it will
- * be on PQ_NONE), or NULL. The page is not PG_BUSY'd.  The caller
+ * be on PQ_NONE), or NULL. The page is not BUSY'd.  The caller
  * is responsible for dealing with the busy-page case (usually by
  * deactivating the page and looping).
  *
@@ -1857,7 +1896,7 @@ done:
        vm_page_flag_clear(m, ~PG_KEEP_NEWPAGE_MASK);
 
        KKASSERT(m->wire_count == 0);
-       KKASSERT(m->busy == 0);
+       KKASSERT((m->busy_count & PBUSY_MASK) == 0);
        m->act_count = 0;
        m->valid = 0;
 
@@ -1890,7 +1929,7 @@ done:
        pagedaemon_wakeup();
 
        /*
-        * A PG_BUSY page is returned.
+        * A BUSY page is returned.
         */
        return (m);
 }
@@ -2221,7 +2260,7 @@ vm_page_free_wakeup(void)
  * Returns the given page to the PQ_FREE or PQ_HOLD list and disassociates
  * it from its VM object.
  *
- * The vm_page must be PG_BUSY on entry.  PG_BUSY will be released on
+ * The vm_page must be BUSY on entry.  BUSY will be released on
  * return (the page will have been freed).
  */
 void
@@ -2229,13 +2268,12 @@ vm_page_free_toq(vm_page_t m)
 {
        mycpu->gd_cnt.v_tfree++;
        KKASSERT((m->flags & PG_MAPPED) == 0);
-       KKASSERT(m->flags & PG_BUSY);
+       KKASSERT(m->busy_count & PBUSY_LOCKED);
 
-       if (m->busy || ((m->queue - m->pc) == PQ_FREE)) {
-               kprintf("vm_page_free: pindex(%lu), busy(%d), "
-                       "PG_BUSY(%d), hold(%d)\n",
-                       (u_long)m->pindex, m->busy,
-                       ((m->flags & PG_BUSY) ? 1 : 0), m->hold_count);
+       if ((m->busy_count & PBUSY_MASK) || ((m->queue - m->pc) == PQ_FREE)) {
+               kprintf("vm_page_free: pindex(%lu), busy %08x, "
+                       "hold(%d)\n",
+                       (u_long)m->pindex, m->busy_count, m->hold_count);
                if ((m->queue - m->pc) == PQ_FREE)
                        panic("vm_page_free: freeing free page");
                else
@@ -2290,7 +2328,7 @@ vm_page_free_toq(vm_page_t m)
        }
 
        /*
-        * This sequence allows us to clear PG_BUSY while still holding
+        * This sequence allows us to clear BUSY while still holding
         * its spin lock, which reduces contention vs allocators.  We
         * must not leave the queue locked or _vm_page_wakeup() may
         * deadlock.
@@ -2328,7 +2366,7 @@ vm_page_free_toq(vm_page_t m)
 void
 vm_page_unmanage(vm_page_t m)
 {
-       KKASSERT(m->flags & PG_BUSY);
+       KKASSERT(m->busy_count & PBUSY_LOCKED);
        if ((m->flags & PG_UNMANAGED) == 0) {
                if (m->wire_count == 0)
                        vm_page_unqueue(m);
@@ -2351,7 +2389,7 @@ vm_page_wire(vm_page_t m)
         * it is already off the queues).  Don't do anything with fictitious
         * pages because they are always wired.
         */
-       KKASSERT(m->flags & PG_BUSY);
+       KKASSERT(m->busy_count & PBUSY_LOCKED);
        if ((m->flags & PG_FICTITIOUS) == 0) {
                if (atomic_fetchadd_int(&m->wire_count, 1) == 0) {
                        if ((m->flags & PG_UNMANAGED) == 0)
@@ -2394,7 +2432,7 @@ vm_page_wire(vm_page_t m)
 void
 vm_page_unwire(vm_page_t m, int activate)
 {
-       KKASSERT(m->flags & PG_BUSY);
+       KKASSERT(m->busy_count & PBUSY_LOCKED);
        if (m->flags & PG_FICTITIOUS) {
                /* do nothing */
        } else if (m->wire_count <= 0) {
@@ -2586,7 +2624,8 @@ vm_page_cache(vm_page_t m)
         * Not suitable for the cache
         */
        if ((m->flags & (PG_UNMANAGED | PG_NEED_COMMIT)) ||
-           m->busy || m->wire_count || m->hold_count) {
+           (m->busy_count & PBUSY_MASK) ||
+           m->wire_count || m->hold_count) {
                vm_page_wakeup(m);
                return;
        }
@@ -2618,7 +2657,8 @@ vm_page_cache(vm_page_t m)
         */
        vm_page_protect(m, VM_PROT_NONE);
        if ((m->flags & (PG_UNMANAGED | PG_MAPPED)) ||
-           m->busy || m->wire_count || m->hold_count) {
+           (m->busy_count & PBUSY_MASK) ||
+           m->wire_count || m->hold_count) {
                vm_page_wakeup(m);
        } else if (m->dirty || (m->flags & PG_NEED_COMMIT)) {
                vm_page_deactivate(m);
@@ -2714,31 +2754,59 @@ vm_page_dontneed(vm_page_t m)
 
 /*
  * These routines manipulate the 'soft busy' count for a page.  A soft busy
- * is almost like PG_BUSY except that it allows certain compatible operations
- * to occur on the page while it is busy.  For example, a page undergoing a
- * write can still be mapped read-only.
+ * is almost like a hard BUSY except that it allows certain compatible
+ * operations to occur on the page while it is busy.  For example, a page
+ * undergoing a write can still be mapped read-only.
  *
- * Because vm_pages can overlap buffers m->busy can be > 1.  m->busy is only
- * adjusted while the vm_page is PG_BUSY so the flash will occur when the
- * busy bit is cleared.
+ * We also use soft-busy to quickly pmap_enter shared read-only pages
+ * without having to hold the page locked.
+ *
+ * The soft-busy count can be > 1 in situations where multiple threads
+ * are pmap_enter()ing the same page simultaneously, or when two buffer
+ * cache buffers overlap the same page.
  *
  * The caller must hold the page BUSY when making these two calls.
  */
 void
 vm_page_io_start(vm_page_t m)
 {
-        KASSERT(m->flags & PG_BUSY, ("vm_page_io_start: page not busy!!!"));
-        atomic_add_char(&m->busy, 1);
-       vm_page_flag_set(m, PG_SBUSY);
+       uint32_t ocount;
+
+       ocount = atomic_fetchadd_int(&m->busy_count, 1);
+       KKASSERT(ocount & PBUSY_LOCKED);
 }
 
 void
 vm_page_io_finish(vm_page_t m)
 {
-        KASSERT(m->flags & PG_BUSY, ("vm_page_io_finish: page not busy!!!"));
-        atomic_subtract_char(&m->busy, 1);
-       if (m->busy == 0)
-               vm_page_flag_clear(m, PG_SBUSY);
+       uint32_t ocount;
+
+       ocount = atomic_fetchadd_int(&m->busy_count, -1);
+       KKASSERT(ocount & PBUSY_MASK);
+#if 0
+       if (((ocount - 1) & (PBUSY_LOCKED | PBUSY_MASK)) == 0)
+               wakeup(m);
+#endif
+}
+
+/*
+ * Attempt to soft-busy a page.  The page must not be PBUSY_LOCKED.
+ *
+ * Returns 0 on success, non-zero on failure.
+ */
+int
+vm_page_sbusy_try(vm_page_t m)
+{
+       uint32_t ocount;
+
+       if (m->busy_count & PBUSY_LOCKED)
+               return 1;
+       ocount = atomic_fetchadd_int(&m->busy_count, 1);
+       if (ocount & PBUSY_LOCKED) {
+               vm_page_sbusy_drop(m);
+               return 1;
+       }
+       return 0;
 }
 
 /*
diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h
index 6b55d0b..f6ed1ff 100644
@@ -1,10 +1,14 @@
 /*
  * Copyright (c) 1991, 1993
  *     The Regents of the University of California.  All rights reserved.
+ * Copyright (c) 2003-2017 The DragonFly Project.  All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * The Mach Operating System project at Carnegie-Mellon University.
  *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@backplane.com>
+ *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * any improvements or extensions that they make and grant Carnegie the
  * rights to redistribute these changes.
- *
- * $FreeBSD: src/sys/vm/vm_page.h,v 1.75.2.8 2002/03/06 01:07:09 dillon Exp $
  */
 
 /*
- *     Resident memory system definitions.
+ * Resident memory system definitions.
  */
 
 #ifndef        _VM_VM_PAGE_H_
 #endif
 
 /*
- *     Management of resident (logical) pages.
- *
- *     A small structure is kept for each resident
- *     page, indexed by page number.  Each structure
- *     is an element of several lists:
- *
- *             A hash table bucket used to quickly
- *             perform object/offset lookups
- *
- *             A list of all pages for a given object,
- *             so they can be quickly deactivated at
- *             time of deallocation.
- *
- *             An ordered list of pages due for pageout.
- *
- *     In addition, the structure contains the object
- *     and offset to which this page belongs (for pageout),
- *     and sundry status bits.
- *
- *     Fields in this structure are locked either by the lock on the
- *     object that the page belongs to (O) or by the lock on the page
- *     queues (P).
- *
- *     The 'valid' and 'dirty' fields are distinct.  A page may have dirty
- *     bits set without having associated valid bits set.  This is used by
- *     NFS to implement piecemeal writes.
+ * vm_page structure
  */
-
 TAILQ_HEAD(pglist, vm_page);
 
 struct vm_object;
@@ -135,34 +111,27 @@ struct vm_object;
 int rb_vm_page_compare(struct vm_page *, struct vm_page *);
 
 struct vm_page_rb_tree;
-RB_PROTOTYPE2(vm_page_rb_tree, vm_page, rb_entry, rb_vm_page_compare, vm_pindex_t);
+RB_PROTOTYPE2(vm_page_rb_tree, vm_page, rb_entry,
+             rb_vm_page_compare, vm_pindex_t);
 
 struct vm_page {
        TAILQ_ENTRY(vm_page) pageq;     /* vm_page_queues[] list (P)    */
        RB_ENTRY(vm_page) rb_entry;     /* Red-Black tree based at object */
        struct spinlock spin;
-
        struct vm_object *object;       /* which object am I in (O,P)*/
        vm_pindex_t pindex;             /* offset into object (O,P) */
        vm_paddr_t phys_addr;           /* physical address of page */
        struct md_page md;              /* machine dependant stuff */
-       u_short queue;                  /* page queue index */
-       u_short pc;                     /* page color */
-       u_char  act_count;              /* page usage count */
-       u_char  busy;                   /* page busy count */
-       u_char  pat_mode;               /* hardware page attribute */
-       u_char  unused02;
-       u_int32_t flags;                /* see below */
-       u_int   wire_count;             /* wired down maps refs (P) */
+       uint16_t queue;                 /* page queue index */
+       uint16_t pc;                    /* page color */
+       uint8_t act_count;              /* page usage count */
+       uint8_t pat_mode;               /* hardware page attribute */
+       uint8_t valid;                  /* map of valid DEV_BSIZE chunks */
+       uint8_t dirty;                  /* map of dirty DEV_BSIZE chunks */
+       uint32_t flags;                 /* see below */
+       uint32_t wire_count;            /* wired down maps refs (P) */
+       uint32_t busy_count;            /* soft-busy and hard-busy */
        int     hold_count;             /* page hold count */
-
-       /*
-        * NOTE that these must support one bit per DEV_BSIZE in a page!!!
-        * so, on normal X86 kernels, they must be at least 8 bits wide.
-        */
-       u_char  valid;                  /* map of valid DEV_BSIZE chunks */
-       u_char  dirty;                  /* map of dirty DEV_BSIZE chunks */
-
        int     ku_pagecnt;             /* kmalloc helper */
 #ifdef VM_PAGE_DEBUG
        const char *busy_func;
@@ -170,13 +139,10 @@ struct vm_page {
 #endif
 };
 
-#ifdef VM_PAGE_DEBUG
-#define VM_PAGE_DEBUG_EXT(name)        name ## _debug
-#define VM_PAGE_DEBUG_ARGS     , const char *func, int lineno
-#else
-#define VM_PAGE_DEBUG_EXT(name)        name
-#define VM_PAGE_DEBUG_ARGS
-#endif
+#define PBUSY_LOCKED           0x80000000U
+#define PBUSY_WANTED           0x40000000U
+#define PBUSY_SWAPINPROG       0x20000000U
+#define PBUSY_MASK             0x1FFFFFFFU
 
 #ifndef __VM_PAGE_T_DEFINED__
 #define __VM_PAGE_T_DEFINED__
@@ -192,7 +158,6 @@ typedef struct vm_page *vm_page_t;
  *
  * Page coloring cannot be disabled.
  */
-
 #define PQ_PRIME1 31   /* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_PRIME2 23   /* Prime number somewhat less than PQ_HASH_SIZE */
 #define PQ_L2_SIZE 512 /* A number of colors opt for 1M cache */
@@ -268,12 +233,9 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
  *
  *  PG_SWAPPED indicates that the page is backed by a swap block.  Any
  *  VM object type other than OBJT_DEFAULT can have swap-backed pages now.
- *
- *  PG_SBUSY is set when m->busy != 0.  PG_SBUSY and m->busy are only modified
- *  when the page is PG_BUSY.
  */
-#define        PG_BUSY         0x00000001      /* page is in transit (O) */
-#define        PG_WANTED       0x00000002      /* someone is waiting for page (O) */
+#define        PG_UNUSED0001   0x00000001
+#define        PG_UNUSED0002   0x00000002
 #define PG_WINATCFLS   0x00000004      /* flush dirty page on inactive q */
 #define        PG_FICTITIOUS   0x00000008      /* physical page doesn't exist (O) */
 #define        PG_WRITEABLE    0x00000010      /* page is writeable */
@@ -281,7 +243,7 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
 #define        PG_UNUSED0040   0x00000040
 #define PG_REFERENCED  0x00000080      /* page has been referenced */
 #define PG_CLEANCHK    0x00000100      /* page will be checked for cleaning */
-#define PG_SWAPINPROG  0x00000200      /* swap I/O in progress on page      */
+#define PG_UNUSED0200  0x00000200
 #define PG_NOSYNC      0x00000400      /* do not collect for syncer */
 #define PG_UNMANAGED   0x00000800      /* No PV management for page */
 #define PG_MARKER      0x00001000      /* special queue marker page */
@@ -289,11 +251,10 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
 #define PG_SWAPPED     0x00004000      /* backed by swap */
 #define PG_NOTMETA     0x00008000      /* do not back with swap */
 #define PG_UNUSED10000 0x00010000
-#define PG_SBUSY       0x00020000      /* soft-busy also set */
+#define PG_UNUSED20000 0x00020000
 #define PG_NEED_COMMIT 0x00040000      /* clean page requires commit */
 
-#define PG_KEEP_NEWPAGE_MASK   (PG_BUSY | PG_SBUSY | PG_WANTED)
-
+#define PG_KEEP_NEWPAGE_MASK   (0)
 
 /*
  * Misc constants.
@@ -304,6 +265,14 @@ extern struct vpgqueues vm_page_queues[PQ_COUNT];
 #define ACT_INIT               5
 #define ACT_MAX                        64
 
+#ifdef VM_PAGE_DEBUG
+#define VM_PAGE_DEBUG_EXT(name)        name ## _debug
+#define VM_PAGE_DEBUG_ARGS     , const char *func, int lineno
+#else
+#define VM_PAGE_DEBUG_EXT(name)        name
+#define VM_PAGE_DEBUG_ARGS
+#endif
+
 #ifdef _KERNEL
 /*
  * Each pageable resident page falls into one of four lists:
@@ -405,6 +374,8 @@ void vm_page_deactivate_locked (vm_page_t);
 void vm_page_initfake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr);
 int vm_page_insert (vm_page_t, struct vm_object *, vm_pindex_t);
 vm_page_t vm_page_lookup (struct vm_object *, vm_pindex_t);
+vm_page_t vm_page_lookup_sbusy_try(struct vm_object *object,
+               vm_pindex_t pindex);
 vm_page_t VM_PAGE_DEBUG_EXT(vm_page_lookup_busy_wait)(
                struct vm_object *, vm_pindex_t, int, const char *
                VM_PAGE_DEBUG_ARGS);
@@ -437,6 +408,7 @@ void vm_page_free_contig(vm_page_t m, unsigned long size);
 vm_page_t vm_page_free_fromq_fast(void);
 void vm_page_dirty(vm_page_t m);
 void vm_page_sleep_busy(vm_page_t m, int also_m_busy, const char *msg);
+int vm_page_sbusy_try(vm_page_t m);
 void VM_PAGE_DEBUG_EXT(vm_page_busy_wait)(vm_page_t m,
                        int also_m_busy, const char *wmsg VM_PAGE_DEBUG_ARGS);
 int VM_PAGE_DEBUG_EXT(vm_page_busy_try)(vm_page_t m,
diff --git a/sys/vm/vm_page2.h b/sys/vm/vm_page2.h
index c7b1930..557d8a7 100644
@@ -273,16 +273,38 @@ vm_page_flag_clear(vm_page_t m, unsigned int bits)
 /*
  * Wakeup anyone waiting for the page after potentially unbusying
  * (hard or soft) or doing other work on a page that might make a
- * waiter ready.  The setting of PG_WANTED is integrated into the
+ * waiter ready.  The setting of PBUSY_WANTED is integrated into the
  * related flags and it can't be set once the flags are already
  * clear, so there should be no races here.
  */
-
 static __inline void
 vm_page_flash(vm_page_t m)
 {
-       if (m->flags & PG_WANTED) {
-               vm_page_flag_clear(m, PG_WANTED);
+       if (m->busy_count & PBUSY_WANTED) {
+               atomic_clear_int(&m->busy_count, PBUSY_WANTED);
+               wakeup(m);
+       }
+}
+
+/*
+ * Adjust the soft-busy count on a page.  The drop code will issue an
+ * integrated wakeup if busy_count becomes 0.
+ */
+static __inline void
+vm_page_sbusy_hold(vm_page_t m)
+{
+       atomic_add_int(&m->busy_count, 1);
+}
+
+static __inline void
+vm_page_sbusy_drop(vm_page_t m)
+{
+       uint32_t ocount;
+
+       ocount = atomic_fetchadd_int(&m->busy_count, -1);
+       if (ocount - 1 == PBUSY_WANTED) {
+               /* WANTED and no longer BUSY or SBUSY */
+               atomic_clear_int(&m->busy_count, PBUSY_WANTED);
                wakeup(m);
        }
 }
@@ -308,7 +330,7 @@ vm_page_flash(vm_page_t m)
 static __inline void
 vm_page_protect(vm_page_t m, int prot)
 {
-       KKASSERT(m->flags & PG_BUSY);
+       KKASSERT(m->busy_count & PBUSY_LOCKED);
        if (prot == VM_PROT_NONE) {
                if (m->flags & (PG_WRITEABLE|PG_MAPPED)) {
                        pmap_page_protect(m, VM_PROT_NONE);
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 7676c66..3a8a6fe 100644
@@ -778,7 +778,8 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
         * Initialize our marker
         */
        bzero(&marker, sizeof(marker));
-       marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+       marker.flags = PG_FICTITIOUS | PG_MARKER;
+       marker.busy_count = PBUSY_LOCKED;
        marker.queue = PQ_INACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;
@@ -1301,7 +1302,8 @@ vm_pageout_scan_active(int pass, int q,
         */
 
        bzero(&marker, sizeof(marker));
-       marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+       marker.flags = PG_FICTITIOUS | PG_MARKER;
+       marker.busy_count = PBUSY_LOCKED;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;
@@ -1755,7 +1757,8 @@ vm_pageout_page_stats(int q)
        }
 
        bzero(&marker, sizeof(marker));
-       marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+       marker.flags = PG_FICTITIOUS | PG_MARKER;
+       marker.busy_count = PBUSY_LOCKED;
        marker.queue = PQ_ACTIVE + q;
        marker.pc = q;
        marker.wire_count = 1;
diff --git a/sys/vm/vm_swap.c b/sys/vm/vm_swap.c
index dd19df9..19f8294 100644
@@ -463,7 +463,8 @@ swapoff_one(int index)
         */
        for (q = 0; q < PQ_L2_SIZE; ++q) {
                bzero(&marker, sizeof(marker));
-               marker.flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+               marker.flags = PG_FICTITIOUS | PG_MARKER;
+               marker.busy_count = PBUSY_LOCKED;
                marker.queue = PQ_ACTIVE + q;
                marker.pc = q;
                marker.wire_count = 1;
diff --git a/sys/vm/vm_swapcache.c b/sys/vm/vm_swapcache.c
index 8fd3e6e..3d2b1d4 100644
@@ -191,7 +191,8 @@ vm_swapcached_thread(void)
         */
        bzero(&page_marker, sizeof(page_marker));
        for (q = 0; q < PQ_L2_SIZE; ++q) {
-               page_marker[q].flags = PG_BUSY | PG_FICTITIOUS | PG_MARKER;
+               page_marker[q].flags = PG_FICTITIOUS | PG_MARKER;
+               page_marker[q].busy_count = PBUSY_LOCKED;
                page_marker[q].queue = PQ_INACTIVE + q;
                page_marker[q].pc = q;
                page_marker[q].wire_count = 1;
diff --git a/sys/vm/vnode_pager.c b/sys/vm/vnode_pager.c
index f0a13d8..9d57b08 100644
@@ -417,7 +417,9 @@ vnode_pager_setsize(struct vnode *vp, vm_ooffset_t nsize)
 void
 vnode_pager_freepage(vm_page_t m)
 {
-       if (m->busy || m->wire_count || (m->flags & PG_NEED_COMMIT)) {
+       if ((m->busy_count & PBUSY_MASK) ||
+           m->wire_count ||
+           (m->flags & PG_NEED_COMMIT)) {
                vm_page_activate(m);
                vm_page_wakeup(m);
        } else {