kernel - Greatly improve shared memory fault rate concurrency / shared tokens
author     Matthew Dillon <dillon@apollo.backplane.com>
           Tue, 15 Nov 2011 09:02:24 +0000 (01:02 -0800)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Tue, 15 Nov 2011 09:04:56 +0000 (01:04 -0800)
This commit rolls up a lot of work to improve postgres database operations
and the system in general.  With these changes we can run pgbench -j 8 -c 40
on our 48-core Opteron monster at 140000+ tps, and the shm vm_fault rate
hits 3.1M pps.

* Implement shared tokens.  They work as advertised, with some caveats.

  It is acceptable to acquire a shared token while you already hold the same
  token exclusively, but you will deadlock if you acquire an exclusive token
  while you hold the same token shared.

  Currently exclusive tokens are not given priority over shared tokens, so
  starvation is possible under certain circumstances.
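
  A minimal usage sketch of the new API (demo_token, demo_value and the
  demo_*() functions are hypothetical; lwkt_token_init(), lwkt_gettoken(),
  lwkt_gettoken_shared() and lwkt_reltoken() are the real entry points
  touched by this commit):

    static struct lwkt_token demo_token;
    static int demo_value;

    static void
    demo_init(void)
    {
            lwkt_token_init(&demo_token, "demo");
    }

    /*
     * Readers acquire the token shared; any number of them can run
     * concurrently on different cpus.
     */
    static int
    demo_read(void)
    {
            int v;

            lwkt_gettoken_shared(&demo_token);
            v = demo_value;
            lwkt_reltoken(&demo_token);
            return (v);
    }

    /*
     * Writers acquire the token exclusively.  Taking the token shared
     * while already holding it exclusively is fine; the reverse
     * (exclusive while already holding it shared) deadlocks.
     */
    static void
    demo_write(int v)
    {
            lwkt_gettoken(&demo_token);
            demo_value = v;
            lwkt_reltoken(&demo_token);
    }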

* Create a critical code path in vm_fault() using the new shared token
  feature to quickly fault-in pages which already exist in the VM cache.
  pmap_object_init_pt() also uses the new feature.

  This increases fault-in concurrency by a ridiculously huge amount,
  particularly on SHM segments (say when you have a large number of postgres
  clients).  Scaling for large numbers of clients on large numbers of
  cores is significantly improved.

  This also increases fault-in concurrency for MAP_SHARED file maps.
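
  A heavily simplified conceptual sketch of the quick path (the real logic
  lives in vm_fault.c, which is not part of this excerpt, and performs many
  more checks; quick_fault_lookup() is a hypothetical name):

    /*
     * Conceptual sketch only -- not the actual vm_fault.c code.  Look for
     * an already-resident, fully valid page while holding the object with
     * a *shared* token so many faults on the same object can proceed
     * concurrently.
     */
    static vm_page_t
    quick_fault_lookup(vm_object_t object, vm_pindex_t pindex)
    {
            vm_page_t m;

            vm_object_hold_shared(object);
            m = vm_page_lookup(object, pindex);
            if (m && m->valid == VM_PAGE_BITS_ALL &&
                (m->flags & PG_BUSY) == 0) {
                    vm_page_hold(m);        /* keep the page for the caller */
                    vm_object_drop(object);
                    return (m);             /* caller enters the pmap mapping */
            }
            vm_object_drop(object);
            return (NULL);                  /* fall back to the exclusive path */
    }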

* Expand the breadn() and cluster_read() APIs.  Implement breadnx() and
  cluster_readx(), which allow a getblk()'d bp to be passed in.  If *bpp is
  not NULL a bp is being passed in, otherwise the routines call getblk().
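
  A sketch of the two calling conventions (vp, loffset and size are assumed
  to be in scope; error checking elided):

    struct buf *bp;
    int error;

    /* Old style: let breadnx() perform the getblk() itself. */
    bp = NULL;
    error = breadnx(vp, loffset, size, NULL, NULL, 0, &bp);

    /* New style: hand breadnx() a buffer we already obtained. */
    bp = getblk(vp, loffset, size, 0, 0);
    error = breadnx(vp, loffset, size, NULL, NULL, 0, &bp);

  The old names survive as trivial inlines in sys/buf2.h which pass a NULL
  *bpp through to the *x() versions (see the buf2.h hunk below).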

* Modify the HAMMER read path to use the new API.  Instead of calling
  getcacheblk() HAMMER now calls getblk() and checks the B_CACHE flag.
  This gives getblk() a chance to regenerate a fully cached buffer from
  VM backing store without having to acquire any hammer-related locks,
  resulting in even faster operation.
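
  A simplified sketch of the new read-side pattern (vp, loffset, blksize,
  filesize, minreq and maxreq are placeholders, not the actual HAMMER
  variable names; the real code is in the HAMMER files listed in the file
  list below):

    bp = getblk(vp, loffset, blksize, 0, 0);
    if (bp->b_flags & B_CACHE) {
            /*
             * getblk() regenerated a fully valid buffer from the VM
             * backing store; no HAMMER locks were needed at all.
             */
            error = 0;
    } else {
            /*
             * Not fully cached: fall into the normal clustered read,
             * passing the getblk()'d bp via the new *x() API.
             */
            error = cluster_readx(vp, filesize, loffset, blksize,
                                  minreq, maxreq, &bp);
    }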

* If kern.ipc.shm_use_phys is set to 2 or higher the VM pages backing the
  segment are pre-allocated when the segment is created.  This can take
  quite a while for a large map and can also lock the machine up for a few
  seconds.  Defaults to off.
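
  To turn the pre-allocation on at run time (assuming the stock sysctl(8)
  utility; the tunable itself is the one tested in the sysv_shm.c hunk
  below):

    sysctl kern.ipc.shm_use_phys=2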

* Reorder the smp_invltlb()/cpu_invltlb() combos in a few places, running
  cpu_invltlb() last.
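
  The resulting pattern (as in the pmap hunks below) is:

    smp_invltlb();          /* ask the other cpus to invalidate first */
    cpu_invltlb();          /* invalidate the local TLB last */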

* An invalidation interlock might be needed in pmap_enter() under certain
  circumstances, so enable that code for now.

* vm_object_backing_scan_callback() was failing to properly check the
  validity of a vm_object after acquiring its token.  Add the required
  check + some debugging.

* Make vm_object_set_writeable_dirty() a bit more cache friendly.

* The vmstats sysctl was scanning every process's vm_map (requiring a
  vm_map read lock to do so), which can stall for long periods of time
  when the system is paging heavily.  Change the mechanic to an LWP flag
  which can be tested with minimal locking.

* Have the phys_pager mark the page as dirty too, to make sure nothing
  tries to free it.

* Remove the spinlock in pmap_prefault_ok(); since we do not delete page
  table pages it should not be needed.

* Add a required cpu_ccfence() in pmap_inval.c.  The code generated prior
  to this fix was still correct, and this makes sure it stays that way.

* Replace several manual wiring cases with calls to vm_page_wire().
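
  The replacement pattern, taken from the vkernel pmap hunks below:

    /* before: open-coded wiring */
    if (m->wire_count == 0)
            atomic_add_int(&vmstats.v_wire_count, 1);
    m->wire_count++;

    /* after */
    vm_page_wire(m);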

59 files changed:
sys/gnu/vfs/ext2fs/ext2_alloc.c
sys/gnu/vfs/ext2fs/ext2_balloc.c
sys/gnu/vfs/ext2fs/ext2_inode.c
sys/gnu/vfs/ext2fs/ext2_linux_balloc.c
sys/gnu/vfs/ext2fs/ext2_linux_ialloc.c
sys/gnu/vfs/ext2fs/ext2_subr.c
sys/kern/lwkt_thread.c
sys/kern/lwkt_token.c
sys/kern/sysv_shm.c
sys/kern/usched_bsd4.c
sys/kern/vfs_bio.c
sys/kern/vfs_cluster.c
sys/platform/pc32/i386/pmap.c
sys/platform/pc64/x86_64/pmap.c
sys/platform/pc64/x86_64/pmap_inval.c
sys/platform/vkernel/platform/pmap.c
sys/platform/vkernel64/platform/pmap.c
sys/sys/buf.h
sys/sys/buf2.h
sys/sys/globaldata.h
sys/sys/proc.h
sys/sys/thread.h
sys/sys/thread2.h
sys/vfs/hammer/hammer_io.c
sys/vfs/hammer/hammer_ondisk.c
sys/vfs/hammer/hammer_vnops.c
sys/vfs/hammer/hammer_volume.c
sys/vfs/hpfs/hpfs_alsubr.c
sys/vfs/hpfs/hpfs_subr.c
sys/vfs/hpfs/hpfs_vfsops.c
sys/vfs/hpfs/hpfs_vnops.c
sys/vfs/isofs/cd9660/cd9660_lookup.c
sys/vfs/isofs/cd9660/cd9660_rrip.c
sys/vfs/isofs/cd9660/cd9660_vfsops.c
sys/vfs/isofs/cd9660/cd9660_vnops.c
sys/vfs/msdosfs/msdosfs_denode.c
sys/vfs/msdosfs/msdosfs_fat.c
sys/vfs/msdosfs/msdosfs_lookup.c
sys/vfs/msdosfs/msdosfs_vfsops.c
sys/vfs/ntfs/ntfs_subr.c
sys/vfs/ntfs/ntfs_vfsops.c
sys/vfs/ntfs/ntfs_vnops.c
sys/vfs/tmpfs/tmpfs_vnops.c
sys/vfs/udf/udf_vfsops.c
sys/vfs/ufs/ffs_alloc.c
sys/vfs/ufs/ffs_balloc.c
sys/vfs/ufs/ffs_inode.c
sys/vfs/ufs/ffs_subr.c
sys/vfs/ufs/ffs_vfsops.c
sys/vfs/userfs/userfs_vnops.c
sys/vm/phys_pager.c
sys/vm/vm_fault.c
sys/vm/vm_kern.c
sys/vm/vm_map.h
sys/vm/vm_meter.c
sys/vm/vm_object.c
sys/vm/vm_object.h
sys/vm/vm_page.c
sys/vm/vm_page.h

diff --git a/sys/gnu/vfs/ext2fs/ext2_alloc.c b/sys/gnu/vfs/ext2fs/ext2_alloc.c
index ce53409..fafb991 100644
@@ -52,6 +52,8 @@
 
 #include <machine/inttypes.h>
 
+#include <sys/buf2.h>
+
 #include "quota.h"
 #include "inode.h"
 #include "ext2mount.h"
diff --git a/sys/gnu/vfs/ext2fs/ext2_balloc.c b/sys/gnu/vfs/ext2fs/ext2_balloc.c
index e1547a3..81b1d4b 100644
@@ -47,6 +47,8 @@
 #include <sys/ucred.h>
 #include <sys/vnode.h>
 
+#include <sys/buf2.h>
+
 #include "quota.h"
 #include "inode.h"
 #include "ext2_fs.h"
diff --git a/sys/gnu/vfs/ext2fs/ext2_inode.c b/sys/gnu/vfs/ext2fs/ext2_inode.c
index 41e6ceb..1316c1f 100644
@@ -52,6 +52,8 @@
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
+#include <sys/buf2.h>
+
 #include "quota.h"
 #include "inode.h"
 #include "ext2mount.h"
diff --git a/sys/gnu/vfs/ext2fs/ext2_linux_balloc.c b/sys/gnu/vfs/ext2fs/ext2_linux_balloc.c
index c28f336..480565d 100644
 #include <sys/proc.h>
 #include <sys/mount.h>
 #include <sys/vnode.h>
-#include <sys/buf2.h>
 #include <sys/thread2.h>
 
+#include <sys/buf2.h>
+
 #include "quota.h"
 #include "dinode.h"
 #include "inode.h"
diff --git a/sys/gnu/vfs/ext2fs/ext2_linux_ialloc.c b/sys/gnu/vfs/ext2fs/ext2_linux_ialloc.c
index 50a32d1..0aa8472 100644
@@ -43,6 +43,7 @@
 #include "ext2_fs.h"
 #include "ext2_fs_sb.h"
 #include "fs.h"
+
 #include <sys/stat.h>
 #include <sys/buf2.h>
 #include <sys/thread2.h>
diff --git a/sys/gnu/vfs/ext2fs/ext2_subr.c b/sys/gnu/vfs/ext2fs/ext2_subr.c
index 7729fe4..bcd9dea 100644
@@ -48,6 +48,8 @@
 #include <sys/vnode.h>
 #include <sys/buf.h>
 
+#include <sys/buf2.h>
+
 #include "quota.h"
 #include "inode.h"
 #include "ext2_extern.h"
diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c
index 309457a..419a162 100644
@@ -473,6 +473,7 @@ lwkt_init_thread(thread_t td, void *stack, int stksize, int flags,
     td->td_gd = gd;
     td->td_pri = TDPRI_KERN_DAEMON;
     td->td_critcount = 1;
+    td->td_toks_have = NULL;
     td->td_toks_stop = &td->td_toks_base;
     if (lwkt_use_spin_port || (flags & TDF_FORCE_SPINPORT))
        lwkt_initport_spin(&td->td_msgport);
@@ -892,22 +893,11 @@ skip:
 
 havethread:
     /*
-     * If the thread we came up with is a higher or equal priority verses
-     * the thread at the head of the queue we move our thread to the
-     * front.  This way we can always check the front of the queue.
-     *
      * Clear gd_idle_repeat when doing a normal switch to a non-idle
      * thread.
      */
     ntd->td_wmesg = NULL;
     ++gd->gd_cnt.v_swtch;
-#if 0
-    xtd = TAILQ_FIRST(&gd->gd_tdrunq);
-    if (ntd != xtd && ntd->td_pri >= xtd->td_pri) {
-       TAILQ_REMOVE(&gd->gd_tdrunq, ntd, td_threadq);
-       TAILQ_INSERT_HEAD(&gd->gd_tdrunq, ntd, td_threadq);
-    }
-#endif
     gd->gd_idle_repeat = 0;
 
 havethread_preempted:
@@ -937,12 +927,11 @@ haveidle:
        /* ntd invalid, td_switch() can return a different thread_t */
     }
 
-#if 1
     /*
-     * catch-all
+     * catch-all.  XXX is this strictly needed?
      */
     splz_check();
-#endif
+
     /* NOTE: current cpu may have changed after switch */
     crit_exit_quick(td);
 }
@@ -986,15 +975,10 @@ lwkt_switch_return(thread_t otd)
 
 /*
  * Request that the target thread preempt the current thread.  Preemption
- * only works under a specific set of conditions:
- *
- *     - We are not preempting ourselves
- *     - The target thread is owned by the current cpu
- *     - We are not currently being preempted
- *     - The target is not currently being preempted
- *     - We are not holding any spin locks
- *     - The target thread is not holding any tokens
- *     - We are able to satisfy the target's MP lock requirements (if any).
+ * can only occur if our only critical section is the one that we were called
+ * with, the relative priority of the target thread is higher, and the target
+ * thread holds no tokens.  This also only works if we are not holding any
+ * spinlocks (obviously).
  *
  * THE CALLER OF LWKT_PREEMPT() MUST BE IN A CRITICAL SECTION.  Typically
  * this is called via lwkt_schedule() through the td_preemptable callback.
@@ -1002,13 +986,15 @@ lwkt_switch_return(thread_t otd)
  * to determine whether preemption is possible (aka usually just the crit
  * priority of lwkt_schedule() itself).
  *
- * XXX at the moment we run the target thread in a critical section during
- * the preemption in order to prevent the target from taking interrupts
- * that *WE* can't.  Preemption is strictly limited to interrupt threads
- * and interrupt-like threads, outside of a critical section, and the
- * preempted source thread will be resumed the instant the target blocks
- * whether or not the source is scheduled (i.e. preemption is supposed to
- * be as transparent as possible).
+ * Preemption is typically limited to interrupt threads.
+ *
+ * Operation works in a fairly straight-forward manner.  The normal
+ * scheduling code is bypassed and we switch directly to the target
+ * thread.  When the target thread attempts to block or switch away
+ * code at the base of lwkt_switch() will switch directly back to our
+ * thread.  Our thread is able to retain whatever tokens it holds and
+ * if the target needs one of them the target will switch back to us
+ * and reschedule itself normally.
  */
 void
 lwkt_preempt(thread_t ntd, int critcount)
@@ -1032,10 +1018,6 @@ lwkt_preempt(thread_t ntd, int critcount)
 
     td = gd->gd_curthread;
     if (preempt_enable == 0) {
-#if 0
-       if (ntd->td_pri > td->td_pri)
-           need_lwkt_resched();
-#endif
        ++preempt_miss;
        return;
     }
@@ -1045,17 +1027,11 @@ lwkt_preempt(thread_t ntd, int critcount)
     }
     if (td->td_critcount > critcount) {
        ++preempt_miss;
-#if 0
-       need_lwkt_resched();
-#endif
        return;
     }
 #ifdef SMP
     if (ntd->td_gd != gd) {
        ++preempt_miss;
-#if 0
-       need_lwkt_resched();
-#endif
        return;
     }
 #endif
@@ -1071,23 +1047,14 @@ lwkt_preempt(thread_t ntd, int critcount)
 
     if (TD_TOKS_HELD(ntd)) {
        ++preempt_miss;
-#if 0
-       need_lwkt_resched();
-#endif
        return;
     }
     if (td == ntd || ((td->td_flags | ntd->td_flags) & TDF_PREEMPT_LOCK)) {
        ++preempt_weird;
-#if 0
-       need_lwkt_resched();
-#endif
        return;
     }
     if (ntd->td_preempted) {
        ++preempt_hit;
-#if 0
-       need_lwkt_resched();
-#endif
        return;
     }
     KKASSERT(gd->gd_processing_ipiq == 0);
diff --git a/sys/kern/lwkt_token.c b/sys/kern/lwkt_token.c
index 4d83a9d..a0e788b 100644
@@ -200,171 +200,214 @@ _lwkt_token_pool_lookup(void *ptr)
  */
 static __inline
 void
-_lwkt_tokref_init(lwkt_tokref_t ref, lwkt_token_t tok, thread_t td)
+_lwkt_tokref_init(lwkt_tokref_t ref, lwkt_token_t tok, thread_t td, long excl)
 {
        ref->tr_tok = tok;
+       ref->tr_count = excl;
        ref->tr_owner = td;
 }
 
 /*
- * See kern/kern_spinlock.c for the discussion on cache-friendly contention
- * resolution.  We currently do not use cpu_lfence() (expensive!!) and, more
- * importantly, we do a read-test of t_ref before attempting an atomic op,
- * which greatly reduces hw cache bus contention.
+ * Attempt to acquire a shared or exclusive token.  Returns TRUE on success,
+ * FALSE on failure.
+ *
+ * If TOK_EXCLUSIVE is set in mode we are attempting to get an exclusive
+ * token, otherwise are attempting to get a shared token.
+ *
+ * If TOK_EXCLREQ is set in mode this is a blocking operation, otherwise
+ * it is a non-blocking operation (for both exclusive or shared acquisions).
  */
-static
+static __inline
 int
-_lwkt_trytoken_spin(lwkt_token_t tok, lwkt_tokref_t ref)
+_lwkt_trytokref(lwkt_tokref_t ref, thread_t td, long mode)
 {
-       int n;
+       lwkt_token_t tok;
+       lwkt_tokref_t oref;
+       long count;
 
-       for (n = 0; n < lwkt_token_spin; ++n) {
-               if (tok->t_ref == NULL &&
-                   atomic_cmpset_ptr(&tok->t_ref, NULL, ref)) {
-                       return TRUE;
+       KASSERT(((mode & TOK_EXCLREQ) == 0 ||   /* non blocking */
+               td->td_gd->gd_intr_nesting_level == 0 ||
+               panic_cpu_gd == mycpu),
+               ("Attempt to acquire token %p not already "
+               "held in hard code section", tok));
+
+       tok = ref->tr_tok;
+       if (mode & TOK_EXCLUSIVE) {
+               /*
+                * Attempt to get an exclusive token
+                */
+               for (;;) {
+                       count = tok->t_count;
+                       oref = tok->t_ref;      /* can be NULL */
+                       cpu_ccfence();
+                       if ((count & ~TOK_EXCLREQ) == 0) {
+                               /*
+                                * It is possible to get the exclusive bit.
+                                * We must clear TOK_EXCLREQ on successful
+                                * acquisition.
+                                */
+                               if (atomic_cmpset_long(&tok->t_count, count,
+                                                      (count & ~TOK_EXCLREQ) |
+                                                      TOK_EXCLUSIVE)) {
+                                       KKASSERT(tok->t_ref == NULL);
+                                       tok->t_ref = ref;
+                                       return TRUE;
+                               }
+                               /* retry */
+                       } else if ((count & TOK_EXCLUSIVE) &&
+                                  oref >= &td->td_toks_base &&
+                                  oref < td->td_toks_stop) {
+                               /*
+                                * Our thread already holds the exclusive
+                                * bit, we treat this tokref as a shared
+                                * token (sorta) to make the token release
+                                * code easier.
+                                *
+                                * NOTE: oref cannot race above if it
+                                *       happens to be ours, so we're good.
+                                *       But we must still have a stable
+                                *       variable for both parts of the
+                                *       comparison.
+                                *
+                                * NOTE: Since we already have an exclusive
+                                *       lock and don't need to check EXCLREQ
+                                *       we can just use an atomic_add here
+                                */
+                               atomic_add_long(&tok->t_count, TOK_INCR);
+                               ref->tr_count &= ~TOK_EXCLUSIVE;
+                               return TRUE;
+                       } else if ((mode & TOK_EXCLREQ) &&
+                                  (count & TOK_EXCLREQ) == 0) {
+                               /*
+                                * Unable to get the exclusive bit but being
+                                * asked to set the exclusive-request bit.
+                                * Since we are going to retry anyway just
+                                * set the bit unconditionally.
+                                */
+                               atomic_set_long(&tok->t_count, TOK_EXCLREQ);
+                               return FALSE;
+                       } else {
+                               /*
+                                * Unable to get the exclusive bit and not
+                                * being asked to set the exclusive-request
+                                * (aka lwkt_trytoken()), or EXCLREQ was
+                                * already set.
+                                */
+                               cpu_pause();
+                               return FALSE;
+                       }
+                       /* retry */
                }
-               if (lwkt_token_delay) {
-                       tsc_delay(lwkt_token_delay);
-               } else {
-                       cpu_pause();
+       } else {
+               /*
+                * Attempt to get a shared token.  Note that TOK_EXCLREQ
+                * for shared tokens simply means the caller intends to
+                * block.  We never actually set the bit in tok->t_count.
+                */
+               for (;;) {
+                       count = tok->t_count;
+                       oref = tok->t_ref;      /* can be NULL */
+                       cpu_ccfence();
+                       if ((count & (TOK_EXCLUSIVE/*|TOK_EXCLREQ*/)) == 0) {
+                               /* XXX EXCLREQ should work */
+                               /*
+                                * It is possible to get the token shared.
+                                */
+                               if (atomic_cmpset_long(&tok->t_count, count,
+                                                      count + TOK_INCR)) {
+                                       return TRUE;
+                               }
+                               /* retry */
+                       } else if ((count & TOK_EXCLUSIVE) &&
+                                  oref >= &td->td_toks_base &&
+                                  oref < td->td_toks_stop) {
+                               /*
+                                * We own the exclusive bit on the token so
+                                * we can in fact also get it shared.
+                                */
+                               atomic_add_long(&tok->t_count, TOK_INCR);
+                               return TRUE;
+                       } else {
+                               /*
+                                * We failed to get the token shared
+                                */
+                               return FALSE;
+                       }
+                       /* retry */
                }
        }
-       return FALSE;
 }
 
 static __inline
-void
-_lwkt_reltoken_spin(lwkt_token_t tok)
-{
-       tok->t_ref = NULL;
-}
-
-#if 0
-/*
- * Helper function used by lwkt_getalltokens[_sorted]().
- *
- * Our attempt to acquire the token has failed.  To reduce cache coherency
- * bandwidth we set our cpu bit in t_collmask then wait for a reasonable
- * period of time for a hand-off from the current token owner.
- */
-static
 int
-_lwkt_trytoken_spin(lwkt_token_t tok, lwkt_tokref_t ref)
+_lwkt_trytokref_spin(lwkt_tokref_t ref, thread_t td, long mode)
 {
-       globaldata_t gd = mycpu;
-       cpumask_t mask;
-       int n;
+       int spin;
 
-       /*
-        * Add our cpu to the collision mask and wait for the token to be
-        * handed off to us.
-        */
-       crit_enter();
-       atomic_set_cpumask(&tok->t_collmask, gd->gd_cpumask);
-       for (n = 0; n < lwkt_token_spin; ++n) {
-               /*
-                * Token was released before we set our collision bit.
-                */
-               if (tok->t_ref == NULL &&
-                   atomic_cmpset_ptr(&tok->t_ref, NULL, ref)) {
-                       KKASSERT((tok->t_collmask & gd->gd_cpumask) != 0);
-                       atomic_clear_cpumask(&tok->t_collmask, gd->gd_cpumask);
-                       crit_exit();
-                       return TRUE;
-               }
-
-               /*
-                * Token was handed-off to us.
-                */
-               if (tok->t_ref == &gd->gd_handoff) {
-                       KKASSERT((tok->t_collmask & gd->gd_cpumask) == 0);
-                       tok->t_ref = ref;
-                       crit_exit();
-                       return TRUE;
-               }
+       if (_lwkt_trytokref(ref, td, mode))
+               return TRUE;
+       for (spin = lwkt_token_spin; spin > 0; --spin) {
                if (lwkt_token_delay)
                        tsc_delay(lwkt_token_delay);
                else
                        cpu_pause();
+               if (_lwkt_trytokref(ref, td, mode))
+                       return TRUE;
        }
-
-       /*
-        * We failed, attempt to clear our bit in the cpumask.  We may race
-        * someone handing-off to us.  If someone other than us cleared our
-        * cpu bit a handoff is incoming and we must wait for it.
-        */
-       for (;;) {
-               mask = tok->t_collmask;
-               cpu_ccfence();
-               if (mask & gd->gd_cpumask) {
-                       if (atomic_cmpset_cpumask(&tok->t_collmask,
-                                                 mask,
-                                                 mask & ~gd->gd_cpumask)) {
-                               crit_exit();
-                               return FALSE;
-                       }
-                       continue;
-               }
-               if (tok->t_ref != &gd->gd_handoff) {
-                       cpu_pause();
-                       continue;
-               }
-               tok->t_ref = ref;
-               crit_exit();
-               return TRUE;
-       }
+       return FALSE;
 }
 
 /*
- * Release token with hand-off
+ * Release a token that we hold.
  */
 static __inline
 void
-_lwkt_reltoken_spin(lwkt_token_t tok)
+_lwkt_reltokref(lwkt_tokref_t ref, thread_t td)
 {
-       globaldata_t xgd;
-       cpumask_t sidemask;
-       cpumask_t mask;
-       int cpuid;
-
-       if (tok->t_collmask == 0) {
-               tok->t_ref = NULL;
-               return;
-       }
+       lwkt_token_t tok;
+       long count;
 
-       crit_enter();
-       sidemask = ~(mycpu->gd_cpumask - 1);    /* high bits >= xcpu */
+       tok = ref->tr_tok;
        for (;;) {
-               mask = tok->t_collmask;
+               count = tok->t_count;
                cpu_ccfence();
-               if (mask == 0) {
+               if (tok->t_ref == ref) {
+                       /*
+                        * We are an exclusive holder.  We must clear tr_ref
+                        * before we clear the TOK_EXCLUSIVE bit.  If we are
+                        * unable to clear the bit we must restore
+                        * tok->t_ref.
+                        */
+                       KKASSERT(count & TOK_EXCLUSIVE);
                        tok->t_ref = NULL;
-                       break;
-               }
-               if (mask & sidemask)
-                       cpuid = BSFCPUMASK(mask & sidemask);
-               else
-                       cpuid = BSFCPUMASK(mask);
-               xgd = globaldata_find(cpuid);
-               if (atomic_cmpset_cpumask(&tok->t_collmask, mask,
-                                         mask & ~CPUMASK(cpuid))) {
-                       tok->t_ref = &xgd->gd_handoff;
-                       break;
+                       if (atomic_cmpset_long(&tok->t_count, count,
+                                              count & ~TOK_EXCLUSIVE)) {
+                               return;
+                       }
+                       tok->t_ref = ref;
+                       /* retry */
+               } else {
+                       /*
+                        * We are a shared holder
+                        */
+                       KKASSERT(count & TOK_COUNTMASK);
+                       if (atomic_cmpset_long(&tok->t_count, count,
+                                              count - TOK_INCR)) {
+                               return;
+                       }
+                       /* retry */
                }
+               /* retry */
        }
-       crit_exit();
 }
 
-#endif
-
-
 /*
  * Obtain all the tokens required by the specified thread on the current
  * cpu, return 0 on failure and non-zero on success.  If a failure occurs
  * any partially acquired tokens will be released prior to return.
  *
- * lwkt_getalltokens is called by the LWKT scheduler to acquire all
- * tokens that the thread had acquired prior to going to sleep.
+ * lwkt_getalltokens is called by the LWKT scheduler to re-acquire all
+ * tokens that the thread had to release when it switched away.
  *
  * If spinning is non-zero this function acquires the tokens in a particular
  * order to deal with potential deadlocks.  We simply use address order for
@@ -376,7 +419,6 @@ int
 lwkt_getalltokens(thread_t td, int spinning)
 {
        lwkt_tokref_t scan;
-       lwkt_tokref_t ref;
        lwkt_token_t tok;
 
        if (spinning)
@@ -389,55 +431,31 @@ lwkt_getalltokens(thread_t td, int spinning)
                tok = scan->tr_tok;
                for (;;) {
                        /*
-                        * Try to acquire the token if we do not already have
-                        * it.
-                        *
-                        * NOTE: If atomic_cmpset_ptr() fails we have to
-                        *       loop and try again.  It just means we
-                        *       lost a cpu race.
+                        * Only try really hard on the last token
                         */
-                       ref = tok->t_ref;
-                       if (ref == NULL) {
-                               if (atomic_cmpset_ptr(&tok->t_ref, NULL,scan))
-                                       break;
-                               continue;
+                       if (scan == td->td_toks_stop - 1) {
+                           if (_lwkt_trytokref_spin(scan, td, scan->tr_count))
+                                   break;
+                       } else {
+                           if (_lwkt_trytokref(scan, td, scan->tr_count))
+                                   break;
                        }
 
                        /*
-                        * Someone holds the token.
-                        *
-                        * Test if ref is already recursively held by this
-                        * thread.  We cannot safely dereference tok->t_ref
-                        * (it might belong to another thread and is thus
-                        * unstable), but we don't have to. We can simply
-                        * range-check it.
-                        */
-                       if (ref >= &td->td_toks_base && ref < td->td_toks_stop)
-                               break;
-
-                       /*
-                        * Try hard to acquire this token before giving up
-                        * and releasing the whole lot.
+                        * Otherwise we failed to acquire all the tokens.
+                        * Release whatever we did get.
                         */
-                       if (_lwkt_trytoken_spin(tok, scan))
-                               break;
                        if (lwkt_sched_debug > 0) {
                                --lwkt_sched_debug;
                                kprintf("toka %p %s %s\n",
                                        tok, tok->t_desc, td->td_comm);
                        }
-
-                       /*
-                        * Otherwise we failed to acquire all the tokens.
-                        * Release whatever we did get.
-                        */
                        td->td_wmesg = tok->t_desc;
-                       atomic_add_long(&tok->t_collisions, 1);
-                       lwkt_relalltokens(td);
-
+                       ++tok->t_collisions;
+                       while (--scan >= &td->td_toks_base)
+                               _lwkt_reltokref(scan, td);
                        return(FALSE);
                }
-
        }
        return (TRUE);
 }
@@ -458,14 +476,18 @@ void
 lwkt_relalltokens(thread_t td)
 {
        lwkt_tokref_t scan;
-       lwkt_token_t tok;
 
-       for (scan = td->td_toks_stop - 1; scan >= &td->td_toks_base; --scan) {
-       /*for (scan = &td->td_toks_base; scan < td->td_toks_stop; ++scan) {*/
-               tok = scan->tr_tok;
-               if (tok->t_ref == scan)
-                       _lwkt_reltoken_spin(tok);
+       /*
+        * Weird order is to try to avoid a panic loop
+        */
+       if (td->td_toks_have) {
+               scan = td->td_toks_have;
+               td->td_toks_have = NULL;
+       } else {
+               scan = td->td_toks_stop;
        }
+       while (--scan >= &td->td_toks_base)
+               _lwkt_reltokref(scan, td);
 }
 
 /*
@@ -473,26 +495,14 @@ lwkt_relalltokens(thread_t td)
  * acquired in address-sorted order to deal with any deadlocks.  Ultimately
  * token failures will spin into the scheduler and get here.
  *
- * In addition, to reduce hardware cache coherency contention monitor/mwait
- * is interlocked with gd->gd_reqflags and RQF_SPINNING.  Other cores which
- * release a contended token will clear RQF_SPINNING and cause the mwait
- * to resume.  Any interrupt will also generally set RQF_* flags and cause
- * mwait to resume (or be a NOP in the first place).
- *
- * This code is required to set up RQF_SPINNING in case of failure.  The
- * caller may call monitor/mwait on gd->gd_reqflags on failure.  We do NOT
- * want to call mwait here, and doubly so while we are holding tokens.
- *
  * Called from critical section
  */
 static
 int
 _lwkt_getalltokens_sorted(thread_t td)
 {
-       /*globaldata_t gd = td->td_gd;*/
        lwkt_tokref_t sort_array[LWKT_MAXTOKENS];
        lwkt_tokref_t scan;
-       lwkt_tokref_t ref;
        lwkt_token_t tok;
        int i;
        int j;
@@ -529,59 +539,32 @@ _lwkt_getalltokens_sorted(thread_t td)
                tok = scan->tr_tok;
                for (;;) {
                        /*
-                        * Try to acquire the token if we do not already have
-                        * it.
-                        *
-                        * NOTE: If atomic_cmpset_ptr() fails we have to
-                        *       loop and try again.  It just means we
-                        *       lost a cpu race.
+                        * Only try really hard on the last token
                         */
-                       ref = tok->t_ref;
-                       if (ref == NULL) {
-                               if (atomic_cmpset_ptr(&tok->t_ref, NULL, scan))
-                                       break;
-                               continue;
+                       if (scan == td->td_toks_stop - 1) {
+                           if (_lwkt_trytokref_spin(scan, td, scan->tr_count))
+                                   break;
+                       } else {
+                           if (_lwkt_trytokref(scan, td, scan->tr_count))
+                                   break;
                        }
 
                        /*
-                        * Someone holds the token.
-                        *
-                        * Test if ref is already recursively held by this
-                        * thread.  We cannot safely dereference tok->t_ref
-                        * (it might belong to another thread and is thus
-                        * unstable), but we don't have to. We can simply
-                        * range-check it.
-                        */
-                       if (ref >= &td->td_toks_base && ref < td->td_toks_stop)
-                               break;
-
-                       /*
-                        * Try hard to acquire this token before giving up
-                        * and releasing the whole lot.
+                        * Otherwise we failed to acquire all the tokens.
+                        * Release whatever we did get.
                         */
-                       if (_lwkt_trytoken_spin(tok, scan))
-                               break;
                        if (lwkt_sched_debug > 0) {
                                --lwkt_sched_debug;
                                kprintf("tokb %p %s %s\n",
                                        tok, tok->t_desc, td->td_comm);
                        }
-
-                       /*
-                        * Tokens are released in reverse order to reduce
-                        * chasing race failures.
-                        */
                        td->td_wmesg = tok->t_desc;
-                       atomic_add_long(&tok->t_collisions, 1);
-
-                       for (j = i - 1; j >= 0; --j) {
-                       /*for (j = 0; j < i; ++j) {*/
-                               scan = sort_array[j];
-                               tok = scan->tr_tok;
-                               if (tok->t_ref == scan)
-                                       _lwkt_reltoken_spin(tok);
+                       ++tok->t_collisions;
+                       while (--i >= 0) {
+                               scan = sort_array[i];
+                               _lwkt_reltokref(scan, td);
                        }
-                       return (FALSE);
+                       return(FALSE);
                }
        }
 
@@ -589,95 +572,9 @@ _lwkt_getalltokens_sorted(thread_t td)
         * We were successful, there is no need for another core to signal
         * us.
         */
-#if 0
-       atomic_clear_int(&gd->gd_reqflags, RQF_SPINNING);
-#endif
        return (TRUE);
 }
 
-/*
- * Token acquisition helper function.  The caller must have already
- * made nref visible by adjusting td_toks_stop and will be responsible
- * for the disposition of nref on either success or failure.
- *
- * When acquiring tokens recursively we want tok->t_ref to point to
- * the outer (first) acquisition so it gets cleared only on the last
- * release.
- */
-static __inline
-int
-_lwkt_trytokref2(lwkt_tokref_t nref, thread_t td, int blocking)
-{
-       lwkt_token_t tok;
-       lwkt_tokref_t ref;
-
-       /*
-        * Make sure the compiler does not reorder prior instructions
-        * beyond this demark.
-        */
-       cpu_ccfence();
-
-       /*
-        * Attempt to gain ownership
-        */
-       tok = nref->tr_tok;
-       for (;;) {
-               /*
-                * Try to acquire the token if we do not already have
-                * it.  This is not allowed if we are in a hard code
-                * section (because it 'might' have blocked).
-                */
-               ref = tok->t_ref;
-               if (ref == NULL) {
-                       KASSERT((blocking == 0 ||
-                               td->td_gd->gd_intr_nesting_level == 0 ||
-                               panic_cpu_gd == mycpu),
-                               ("Attempt to acquire token %p not already "
-                                "held in hard code section", tok));
-
-                       /*
-                        * NOTE: If atomic_cmpset_ptr() fails we have to
-                        *       loop and try again.  It just means we
-                        *       lost a cpu race.
-                        */
-                       if (atomic_cmpset_ptr(&tok->t_ref, NULL, nref))
-                               return (TRUE);
-                       continue;
-               }
-
-               /*
-                * Test if ref is already recursively held by this
-                * thread.  We cannot safely dereference tok->t_ref
-                * (it might belong to another thread and is thus
-                * unstable), but we don't have to. We can simply
-                * range-check it.
-                *
-                * It is ok to acquire a token that is already held
-                * by the current thread when in a hard code section.
-                */
-               if (ref >= &td->td_toks_base && ref < td->td_toks_stop)
-                       return(TRUE);
-
-               /*
-                * Spin generously.  This is preferable to just switching
-                * away unconditionally.
-                */
-               if (_lwkt_trytoken_spin(tok, nref))
-                       return(TRUE);
-
-               /*
-                * Otherwise we failed, and it is not ok to attempt to
-                * acquire a token in a hard code section.
-                */
-               KASSERT((blocking == 0 ||
-                       td->td_gd->gd_intr_nesting_level == 0),
-                       ("Attempt to acquire token %p not already "
-                        "held in hard code section", tok));
-
-               return(FALSE);
-       }
-}
-
 /*
  * Get a serializing token.  This routine can block.
  */
@@ -691,31 +588,37 @@ lwkt_gettoken(lwkt_token_t tok)
        KKASSERT(ref < &td->td_toks_end);
        ++td->td_toks_stop;
        cpu_ccfence();
-       _lwkt_tokref_init(ref, tok, td);
+       _lwkt_tokref_init(ref, tok, td, TOK_EXCLUSIVE|TOK_EXCLREQ);
 
-       if (_lwkt_trytokref2(ref, td, 1) == FALSE) {
-               /*
-                * Give up running if we can't acquire the token right now.
-                *
-                * Since the tokref is already active the scheduler now
-                * takes care of acquisition, so we need only call
-                * lwkt_switch().
-                *
-                * Since we failed this was not a recursive token so upon
-                * return tr_tok->t_ref should be assigned to this specific
-                * ref.
-                */
-               td->td_wmesg = tok->t_desc;
-               atomic_add_long(&tok->t_collisions, 1);
-               logtoken(fail, ref);
-               lwkt_switch();
-               logtoken(succ, ref);
-               KKASSERT(tok->t_ref == ref);
-       }
+       if (_lwkt_trytokref_spin(ref, td, TOK_EXCLUSIVE|TOK_EXCLREQ))
+               return;
+
+       /*
+        * Give up running if we can't acquire the token right now.
+        *
+        * Since the tokref is already active the scheduler now
+        * takes care of acquisition, so we need only call
+        * lwkt_switch().
+        *
+        * Since we failed this was not a recursive token so upon
+        * return tr_tok->t_ref should be assigned to this specific
+        * ref.
+        */
+       td->td_wmesg = tok->t_desc;
+       ++tok->t_collisions;
+       logtoken(fail, ref);
+       td->td_toks_have = td->td_toks_stop - 1;
+       lwkt_switch();
+       logtoken(succ, ref);
+       KKASSERT(tok->t_ref == ref);
 }
 
+/*
+ * Similar to gettoken but we acquire a shared token instead of an exclusive
+ * token.
+ */
 void
-lwkt_gettoken_hard(lwkt_token_t tok)
+lwkt_gettoken_shared(lwkt_token_t tok)
 {
        thread_t td = curthread;
        lwkt_tokref_t ref;
@@ -724,68 +627,36 @@ lwkt_gettoken_hard(lwkt_token_t tok)
        KKASSERT(ref < &td->td_toks_end);
        ++td->td_toks_stop;
        cpu_ccfence();
-       _lwkt_tokref_init(ref, tok, td);
+       _lwkt_tokref_init(ref, tok, td, TOK_EXCLREQ);
 
-       if (_lwkt_trytokref2(ref, td, 1) == FALSE) {
-               /*
-                * Give up running if we can't acquire the token right now.
-                *
-                * Since the tokref is already active the scheduler now
-                * takes care of acquisition, so we need only call
-                * lwkt_switch().
-                *
-                * Since we failed this was not a recursive token so upon
-                * return tr_tok->t_ref should be assigned to this specific
-                * ref.
-                */
-               td->td_wmesg = tok->t_desc;
-               atomic_add_long(&tok->t_collisions, 1);
-               logtoken(fail, ref);
-               lwkt_switch();
-               logtoken(succ, ref);
-               KKASSERT(tok->t_ref == ref);
-       }
-       crit_enter_hard_gd(td->td_gd);
-}
-
-lwkt_token_t
-lwkt_getpooltoken(void *ptr)
-{
-       thread_t td = curthread;
-       lwkt_token_t tok;
-       lwkt_tokref_t ref;
-
-       tok = _lwkt_token_pool_lookup(ptr);
-       ref = td->td_toks_stop;
-       KKASSERT(ref < &td->td_toks_end);
-       ++td->td_toks_stop;
-       cpu_ccfence();
-       _lwkt_tokref_init(ref, tok, td);
+       if (_lwkt_trytokref_spin(ref, td, TOK_EXCLREQ))
+               return;
 
-       if (_lwkt_trytokref2(ref, td, 1) == FALSE) {
-               /*
-                * Give up running if we can't acquire the token right now.
-                *
-                * Since the tokref is already active the scheduler now
-                * takes care of acquisition, so we need only call
-                * lwkt_switch().
-                *
-                * Since we failed this was not a recursive token so upon
-                * return tr_tok->t_ref should be assigned to this specific
-                * ref.
-                */
-               td->td_wmesg = tok->t_desc;
-               atomic_add_long(&tok->t_collisions, 1);
-               logtoken(fail, ref);
-               lwkt_switch();
-               logtoken(succ, ref);
-               KKASSERT(tok->t_ref == ref);
-       }
-       return(tok);
+       /*
+        * Give up running if we can't acquire the token right now.
+        *
+        * Since the tokref is already active the scheduler now
+        * takes care of acquisition, so we need only call
+        * lwkt_switch().
+        *
+        * Since we failed this was not a recursive token so upon
+        * return tr_tok->t_ref should be assigned to this specific
+        * ref.
+        */
+       td->td_wmesg = tok->t_desc;
+       ++tok->t_collisions;
+       logtoken(fail, ref);
+       td->td_toks_have = td->td_toks_stop - 1;
+       lwkt_switch();
+       logtoken(succ, ref);
 }
 
 /*
  * Attempt to acquire a token, return TRUE on success, FALSE on failure.
+ *
+ * We setup the tokref in case we actually get the token (if we switch later
+ * it becomes mandatory so we set TOK_EXCLREQ), but we call trytokref without
+ * TOK_EXCLREQ in case we fail.
  */
 int
 lwkt_trytoken(lwkt_token_t tok)
@@ -797,17 +668,36 @@ lwkt_trytoken(lwkt_token_t tok)
        KKASSERT(ref < &td->td_toks_end);
        ++td->td_toks_stop;
        cpu_ccfence();
-       _lwkt_tokref_init(ref, tok, td);
+       _lwkt_tokref_init(ref, tok, td, TOK_EXCLUSIVE|TOK_EXCLREQ);
 
-       if (_lwkt_trytokref2(ref, td, 0) == FALSE) {
-               /*
-                * Cleanup, deactivate the failed token.
-                */
-               cpu_ccfence();
-               --td->td_toks_stop;
-               return (FALSE);
-       }
-       return (TRUE);
+       if (_lwkt_trytokref(ref, td, TOK_EXCLUSIVE))
+               return TRUE;
+
+       /*
+        * Failed, unpend the request
+        */
+       cpu_ccfence();
+       --td->td_toks_stop;
+       ++tok->t_collisions;
+       return FALSE;
+}
+
+
+void
+lwkt_gettoken_hard(lwkt_token_t tok)
+{
+       lwkt_gettoken(tok);
+       crit_enter_hard();
+}
+
+lwkt_token_t
+lwkt_getpooltoken(void *ptr)
+{
+       lwkt_token_t tok;
+
+       tok = _lwkt_token_pool_lookup(ptr);
+       lwkt_gettoken(tok);
+       return (tok);
 }
 
 /*
@@ -828,28 +718,9 @@ lwkt_reltoken(lwkt_token_t tok)
         */
        ref = td->td_toks_stop - 1;
        KKASSERT(ref >= &td->td_toks_base && ref->tr_tok == tok);
-
-       /*
-        * Only clear the token if it matches ref.  If ref was a recursively
-        * acquired token it may not match.  Then adjust td_toks_stop.
-        *
-        * Some comparisons must be run prior to adjusting td_toks_stop
-        * to avoid racing against a fast interrupt/ ipi which tries to
-        * acquire a token.
-        *
-        * We must also be absolutely sure that the compiler does not
-        * reorder the clearing of t_ref and the adjustment of td_toks_stop,
-        * or reorder the adjustment of td_toks_stop against the conditional.
-        *
-        * NOTE: The mplock is a token also so sequencing is a bit complex.
-        */
-       if (tok->t_ref == ref)
-               _lwkt_reltoken_spin(tok);
+       _lwkt_reltokref(ref, td);
        cpu_sfence();
-       cpu_ccfence();
        td->td_toks_stop = ref;
-       cpu_ccfence();
-       KKASSERT(tok->t_ref != ref);
 }
 
 void
@@ -888,7 +759,6 @@ lwkt_cnttoken(lwkt_token_t tok, thread_t td)
        return(count);
 }
 
-
 /*
  * Pool tokens are used to provide a type-stable serializing token
  * pointer that does not race against disappearing data structures.
@@ -917,9 +787,9 @@ lwkt_token_pool_lookup(void *ptr)
 void
 lwkt_token_init(lwkt_token_t tok, const char *desc)
 {
+       tok->t_count = 0;
        tok->t_ref = NULL;
        tok->t_collisions = 0;
-       tok->t_collmask = 0;
        tok->t_desc = desc;
 }
 
@@ -938,12 +808,16 @@ lwkt_token_uninit(lwkt_token_t tok)
  * ref and must remain pointing to the deeper ref.  If we were to swap
  * it the first release would clear the token even though a second
  * ref is still present.
+ *
+ * Only exclusively held tokens contain a reference to the tokref which
+ * has to be flipped along with the swap.
  */
 void
 lwkt_token_swap(void)
 {
        lwkt_tokref_t ref1, ref2;
        lwkt_token_t tok1, tok2;
+       long count1, count2;
        thread_t td = curthread;
 
        crit_enter();
@@ -955,9 +829,14 @@ lwkt_token_swap(void)
 
        tok1 = ref1->tr_tok;
        tok2 = ref2->tr_tok;
+       count1 = ref1->tr_count;
+       count2 = ref2->tr_count;
+
        if (tok1 != tok2) {
                ref1->tr_tok = tok2;
+               ref1->tr_count = count2;
                ref2->tr_tok = tok1;
+               ref2->tr_count = count1;
                if (tok1->t_ref == ref1)
                        tok1->t_ref = ref2;
                if (tok2->t_ref == ref2)
@@ -966,26 +845,3 @@ lwkt_token_swap(void)
 
        crit_exit();
 }
-
-#if 0
-int
-lwkt_token_is_stale(lwkt_tokref_t ref)
-{
-       lwkt_token_t tok = ref->tr_tok;
-
-       KKASSERT(tok->t_owner == curthread && ref->tr_state == 1 &&
-                tok->t_count > 0);
-
-       /* Token is not stale */
-       if (tok->t_lastowner == tok->t_owner)
-               return (FALSE);
-
-       /*
-        * The token is stale. Reset to not stale so that the next call to
-        * lwkt_token_is_stale will return "not stale" unless the token
-        * was acquired in-between by another thread.
-        */
-       tok->t_lastowner = tok->t_owner;
-       return (TRUE);
-}
-#endif
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
index 0801d1c..01dc8bc 100644
@@ -1,6 +1,3 @@
-/* $FreeBSD: src/sys/kern/sysv_shm.c,v 1.45.2.6 2002/10/22 20:45:03 fjoe Exp $ */
-/*     $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $      */
-
 /*
  * Copyright (c) 1994 Adam Glass and Charles Hannum.  All rights reserved.
  *
@@ -588,6 +585,39 @@ shmget_allocate_segment(struct proc *p, struct shmget_args *uap, int mode)
        shmseg->shm_ctime = time_second;
        shm_committed += btoc(size);
        shm_nused++;
+
+       /*
+        * If a physical mapping is desired and we have a ton of free pages
+        * we pre-allocate the pages here in order to avoid on-the-fly
+        * allocation later.  This has a big effect on database warm-up
+        * times since DFly supports concurrent page faults coming from the
+        * same VM object for pages which already exist.
+        *
+        * This can hang the kernel for a while so only do it if shm_use_phys
+        * is set to 2 or higher.
+        */
+       if (shm_use_phys > 1) {
+               vm_pindex_t pi, pmax;
+               vm_page_t m;
+
+               pmax = round_page(shmseg->shm_segsz) >> PAGE_SHIFT;
+               vm_object_hold(shm_handle->shm_object);
+               if (pmax > vmstats.v_free_count)
+                       pmax = vmstats.v_free_count;
+               for (pi = 0; pi < pmax; ++pi) {
+                       m = vm_page_grab(shm_handle->shm_object, pi,
+                                        VM_ALLOC_SYSTEM | VM_ALLOC_NULL_OK |
+                                        VM_ALLOC_ZERO);
+                       if (m == NULL)
+                               break;
+                       vm_pager_get_page(shm_handle->shm_object, &m, 1);
+                       vm_page_activate(m);
+                       vm_page_wakeup(m);
+                       lwkt_yield();
+               }
+               vm_object_drop(shm_handle->shm_object);
+       }
+
        if (shmseg->shm_perm.mode & SHMSEG_WANTED) {
                /*
                 * Somebody else wanted this key while we were asleep.  Wake
diff --git a/sys/kern/usched_bsd4.c b/sys/kern/usched_bsd4.c
index 13a77ed..36bf827 100644
@@ -22,8 +22,6 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * $DragonFly: src/sys/kern/usched_bsd4.c,v 1.26 2008/11/01 23:31:19 dillon Exp $
  */
 
 #include <sys/param.h>
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index 35b6a80..582ec92 100644
@@ -860,21 +860,6 @@ bremfree_locked(struct buf *bp)
        _bremfree(bp);
 }
 
-/*
- * bread:
- *
- *     Get a buffer with the specified data.  Look in the cache first.  We
- *     must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE
- *     is set, the buffer is valid and we do not have to do anything ( see
- *     getblk() ).
- *
- */
-int
-bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp)
-{
-       return (breadn(vp, loffset, size, NULL, NULL, 0, bpp));
-}
-
 /*
  * This version of bread issues any required I/O asyncnronously and
  * makes a callback on completion.
@@ -915,23 +900,26 @@ breadcb(struct vnode *vp, off_t loffset, int size,
 }
 
 /*
- * breadn:
+ * breadnx() - Terminal function for bread() and breadn().
  *
- *     Operates like bread, but also starts asynchronous I/O on
- *     read-ahead blocks.  We must clear B_ERROR and B_INVAL prior
- *     to initiating I/O . If B_CACHE is set, the buffer is valid 
- *     and we do not have to do anything.
+ * This function will start asynchronous I/O on read-ahead blocks as well
+ * as satisfy the primary request.
  *
+ * We must clear B_ERROR and B_INVAL prior to initiating I/O.  If B_CACHE is
+ * set, the buffer is valid and we do not have to do anything.
  */
 int
-breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset,
+breadnx(struct vnode *vp, off_t loffset, int size, off_t *raoffset,
        int *rabsize, int cnt, struct buf **bpp)
 {
        struct buf *bp, *rabp;
        int i;
        int rv = 0, readwait = 0;
 
-       *bpp = bp = getblk(vp, loffset, size, 0, 0);
+       if (*bpp)
+               bp = *bpp;
+       else
+               *bpp = bp = getblk(vp, loffset, size, 0, 0);
 
        /* if not found in cache, do some I/O */
        if ((bp->b_flags & B_CACHE) == 0) {
@@ -2819,7 +2807,7 @@ findblk(struct vnode *vp, off_t loffset, int flags)
                 * Lookup.  Ref the buf while holding v_token to prevent
                 * reuse (but does not prevent diassociation).
                 */
-               lwkt_gettoken(&vp->v_token);
+               lwkt_gettoken_shared(&vp->v_token);
                bp = buf_rb_hash_RB_LOOKUP(&vp->v_rbhash_tree, loffset);
                if (bp == NULL) {
                        lwkt_reltoken(&vp->v_token);
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index 3b8d575..cfdfc10 100644
@@ -53,6 +53,7 @@
 #include <vm/vm_object.h>
 #include <vm/vm_page.h>
 #include <sys/sysctl.h>
+
 #include <sys/buf2.h>
 #include <vm/vm_page2.h>
 
@@ -99,7 +100,7 @@ extern int cluster_pbuf_freecnt;
  * bpp         - return buffer (*bpp) for (loffset,blksize)
  */
 int
-cluster_read(struct vnode *vp, off_t filesize, off_t loffset, 
+cluster_readx(struct vnode *vp, off_t filesize, off_t loffset,
             int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
 {
        struct buf *bp, *rbp, *reqbp;
@@ -153,7 +154,10 @@ cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
        /*
         * Get the requested block.
         */
-       *bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
+       if (*bpp)
+               reqbp = bp = *bpp;
+       else
+               *bpp = reqbp = bp = getblk(vp, loffset, blksize, 0, 0);
        origoffset = loffset;
 
        /*
diff --git a/sys/platform/pc32/i386/pmap.c b/sys/platform/pc32/i386/pmap.c
index 5566a1b..87d890d 100644
@@ -1421,8 +1421,8 @@ pmap_allocpte(pmap_t pmap, vm_offset_t va)
        if (ptepa & PG_PS) {
                pmap->pm_pdir[ptepindex] = 0;
                ptepa = 0;
-               cpu_invltlb();
                smp_invltlb();
+               cpu_invltlb();
        }
 
        /*
@@ -3283,8 +3283,8 @@ pmap_mapdev(vm_paddr_t pa, vm_size_t size)
                tmpva += PAGE_SIZE;
                pa += PAGE_SIZE;
        }
-       cpu_invltlb();
        smp_invltlb();
+       cpu_invltlb();
 
        return ((void *)(va + offset));
 }
@@ -3312,8 +3312,8 @@ pmap_mapdev_uncacheable(vm_paddr_t pa, vm_size_t size)
                tmpva += PAGE_SIZE;
                pa += PAGE_SIZE;
        }
-       cpu_invltlb();
        smp_invltlb();
+       cpu_invltlb();
 
        return ((void *)(va + offset));
 }
diff --git a/sys/platform/pc64/x86_64/pmap.c b/sys/platform/pc64/x86_64/pmap.c
index ad80e4e..5a31e7c 100644
@@ -3174,18 +3174,14 @@ validate:
        /*
         * If the mapping or permission bits are different, we need
         * to update the pte.
-        *
-        * We do not have to interlock pte insertions as no other
-        * cpu will have a TLB entry.
         */
        if ((origpte & ~(PG_M|PG_A)) != newpte) {
-#if 0
+#if 1
                if ((prot & VM_PROT_NOSYNC) == 0)
                        pmap_inval_interlock(&info, pmap, va);
 #endif
                *ptep = newpte | PG_A;
-               cpu_invlpg((void *)va);
-#if 0
+#if 1
                if (prot & VM_PROT_NOSYNC)
                        cpu_invlpg((void *)va);
                else
@@ -3299,7 +3295,7 @@ pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_prot_t prot,
        info.addr = addr;
        info.pmap = pmap;
 
-       vm_object_hold(object);
+       vm_object_hold_shared(object);
        vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
                                pmap_object_init_pt_callback, &info);
        vm_object_drop(object);
@@ -3341,20 +3337,22 @@ pmap_object_init_pt_callback(vm_page_t p, void *data)
  *
  * Returns FALSE if it would be non-trivial or if a pte is already loaded
  * into the slot.
+ *
+ * XXX This is safe only because page table pages are not freed.
  */
 int
 pmap_prefault_ok(pmap_t pmap, vm_offset_t addr)
 {
        pt_entry_t *pte;
 
-       spin_lock(&pmap->pm_spin);
+       /*spin_lock(&pmap->pm_spin);*/
        if ((pte = pmap_pte(pmap, addr)) != NULL) {
                if (*pte & PG_V) {
-                       spin_unlock(&pmap->pm_spin);
+                       /*spin_unlock(&pmap->pm_spin);*/
                        return FALSE;
                }
        }
-       spin_unlock(&pmap->pm_spin);
+       /*spin_unlock(&pmap->pm_spin);*/
        return TRUE;
 }
 
diff --git a/sys/platform/pc64/x86_64/pmap_inval.c b/sys/platform/pc64/x86_64/pmap_inval.c
index 5efbbd6..f93a713 100644
@@ -95,10 +95,13 @@ pmap_inval_interlock(pmap_inval_info_t info, pmap_t pmap, vm_offset_t va)
 
     DEBUG_PUSH_INFO("pmap_inval_interlock");
     for (;;) {
-       oactive = pmap->pm_active & ~CPUMASK_LOCK;
+       oactive = pmap->pm_active;
+       cpu_ccfence();
        nactive = oactive | CPUMASK_LOCK;
-       if (atomic_cmpset_cpumask(&pmap->pm_active, oactive, nactive))
+       if ((oactive & CPUMASK_LOCK) == 0 &&
+           atomic_cmpset_cpumask(&pmap->pm_active, oactive, nactive)) {
                break;
+       }
        lwkt_process_ipiq();
        cpu_pause();
     }
diff --git a/sys/platform/vkernel/platform/pmap.c b/sys/platform/vkernel/platform/pmap.c
index 52abbc6..44b6fe3 100644
@@ -221,9 +221,7 @@ pmap_pinit(struct pmap *pmap)
         */
        ptdpg = vm_page_grab(pmap->pm_pteobj, pmap->pm_pdindex,
                             VM_ALLOC_NORMAL | VM_ALLOC_RETRY | VM_ALLOC_ZERO);
-
-       ptdpg->wire_count = 1;
-       atomic_add_int(&vmstats.v_wire_count, 1);
+       vm_page_wire(ptdpg);
 
        /* not usually mapped */
        vm_page_flag_clear(ptdpg, PG_MAPPED);
@@ -1153,10 +1151,7 @@ _pmap_allocpte(pmap_t pmap, unsigned ptepindex)
                vm_page_wakeup(m);
                return(m);
        }
-
-       if (m->wire_count == 0)
-               atomic_add_int(&vmstats.v_wire_count, 1);
-       m->wire_count++;
+       vm_page_wire(m);
 
        /*
         * Map the pagetable page into the process address space, if
diff --git a/sys/platform/vkernel64/platform/pmap.c b/sys/platform/vkernel64/platform/pmap.c
index cce3de4..38938d6 100644
@@ -1080,9 +1080,7 @@ pmap_pinit(struct pmap *pmap)
                                     VM_ALLOC_ZERO);
                pmap->pm_pdirm = ptdpg;
                vm_page_flag_clear(ptdpg, PG_MAPPED);
-               if (ptdpg->wire_count == 0)
-                       atomic_add_int(&vmstats.v_wire_count, 1);
-               ptdpg->wire_count = 1;
+               vm_page_wire(ptdpg);
                vm_page_wakeup(ptdpg);
                pmap_kenter((vm_offset_t)pmap->pm_pml4, VM_PAGE_TO_PHYS(ptdpg));
        }
@@ -1267,10 +1265,7 @@ _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex)
         * the caller.
         */
        m->hold_count++;
-
-       if (m->wire_count == 0)
-               atomic_add_int(&vmstats.v_wire_count, 1);
-       m->wire_count++;
+       vm_page_wire(m);
 
        /*
         * Map the pagetable page into the process address space, if
index 0742325..cfb5678 100644 (file)
@@ -403,8 +403,8 @@ void        uninitbufbio(struct buf *);
 void   reinitbufbio(struct buf *);
 void   clearbiocache(struct bio *);
 void   bremfree (struct buf *);
-int    bread (struct vnode *, off_t, int, struct buf **);
-int    breadn (struct vnode *, off_t, int, off_t *, int *, int,
+int    breadx (struct vnode *, off_t, int, struct buf **);
+int    breadnx (struct vnode *, off_t, int, off_t *, int *, int,
                struct buf **);
 void   breadcb(struct vnode *, off_t, int,
                void (*)(struct bio *), void *);
@@ -439,7 +439,7 @@ void        biodone (struct bio *);
 void   biodone_sync (struct bio *);
 
 void   cluster_append(struct bio *, struct buf *);
-int    cluster_read (struct vnode *, off_t, off_t, int,
+int    cluster_readx (struct vnode *, off_t, off_t, int,
            size_t, size_t, struct buf **);
 int    cluster_wbuild (struct vnode *, int, off_t, int);
 void   cluster_write (struct buf *, off_t, int, int);
index ab6a502..973a360 100644 (file)
@@ -353,6 +353,32 @@ biodone_chain(struct bio *bio)
                bpdone(bio->bio_buf, 1);
 }
 
+static __inline int
+bread(struct vnode *vp, off_t loffset, int size, struct buf **bpp)
+{
+       *bpp = NULL;
+       return(breadnx(vp, loffset, size, NULL, NULL, 0, bpp));
+}
+
+
+static __inline int
+breadn(struct vnode *vp, off_t loffset, int size, off_t *raoffset,
+      int *rabsize, int cnt, struct buf **bpp)
+{
+       *bpp = NULL;
+       return(breadnx(vp, loffset, size, raoffset, rabsize, cnt, bpp));
+}
+
+static __inline int
+cluster_read(struct vnode *vp, off_t filesize, off_t loffset,
+             int blksize, size_t minreq, size_t maxreq, struct buf **bpp)
+{
+       *bpp = NULL;
+       return(cluster_readx(vp, filesize, loffset, blksize, minreq,
+                            maxreq, bpp));
+}
+
+
 #endif /* _KERNEL */
 
 #endif /* !_SYS_BUF2_H_ */
index 9eb9832..c02deae 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2003,2004 The DragonFly Project.  All rights reserved.
+ * Copyright (c) 2003-2011 The DragonFly Project.  All rights reserved.
  * 
  * This code is derived from software contributed to The DragonFly Project
  * by Matthew Dillon <dillon@backplane.com>
@@ -53,9 +53,6 @@
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
- *
- * $FreeBSD: src/sys/i386/include/globaldata.h,v 1.11.2.1 2000/05/16 06:58:10 dillon Exp $
- * $DragonFly: src/sys/sys/globaldata.h,v 1.49 2008/06/02 16:54:20 dillon Exp $
  */
 
 #ifndef _SYS_GLOBALDATA_H_
index 3417601..0d28f97 100644 (file)
@@ -392,6 +392,7 @@ struct      proc {
 #define        LWP_WEXIT       0x0000040 /* working on exiting */
 #define        LWP_WSTOP       0x0000080 /* working on stopping */
 #define LWP_PASSIVE_ACQ        0x0000100 /* Passive acquire cpu (see kern_switch) */
+#define LWP_PAGING     0x0000200 /* Currently in vm_fault */
 
 #define        FIRST_LWP_IN_PROC(p)            RB_FIRST(lwp_rb_tree, &(p)->p_lwp_tree)
 #define        FOREACH_LWP_IN_PROC(lp, p)      \
index d8555a6..55d379d 100644 (file)
@@ -97,23 +97,40 @@ struct intrframe;
  * Tokens are managed through a helper reference structure, lwkt_tokref.  Each
  * thread has a stack of tokref's to keep track of acquired tokens.  Multiple
  * tokref's may reference the same token.
+ *
+ * Tokens can be held shared or exclusive.  An exclusive holder is able
+ * to set the TOK_EXCLUSIVE bit in t_count as long as no bit in the count
+ * mask is set.  If unable to accomplish this, TOK_EXCLREQ can be set
+ * instead, which prevents any new shared acquisitions while the exclusive
+ * requestor spins in the scheduler.  A shared holder can bump t_count by
+ * the increment value as long as neither TOK_EXCLUSIVE nor TOK_EXCLREQ is
+ * set, else it spins in the scheduler.
+ *
+ * Multiple exclusive tokens are handled by treating the additional tokens
+ * as a special case of the shared token, incrementing the count value.  This
+ * reduces the complexity of the token release code.
  */
 
 typedef struct lwkt_token {
-    struct lwkt_tokref *t_ref;         /* Owning ref or NULL */
+    long               t_count;        /* Shared/exclreq/exclusive access */
+    struct lwkt_tokref *t_ref;         /* Exclusive ref */
     long               t_collisions;   /* Collision counter */
-    cpumask_t          t_collmask;     /* Collision resolve mask */
     const char         *t_desc;        /* Descriptive name */
 } lwkt_token;
 
+#define TOK_EXCLUSIVE  0x00000001      /* Exclusive lock held */
+#define TOK_EXCLREQ    0x00000002      /* Exclusive request pending */
+#define TOK_INCR       4               /* Shared count increment */
+#define TOK_COUNTMASK  (~(long)(TOK_EXCLUSIVE|TOK_EXCLREQ))
+
 /*
  * Static initialization for a lwkt_token.
  */
 #define LWKT_TOKEN_INITIALIZER(name)   \
 {                                      \
+       .t_count = 0,                   \
        .t_ref = NULL,                  \
        .t_collisions = 0,              \
-       .t_collmask = 0,                \
        .t_desc = #name                 \
 }
 
@@ -158,6 +175,7 @@ typedef struct lwkt_token {
 
 struct lwkt_tokref {
     lwkt_token_t       tr_tok;         /* token in question */
+    long               tr_count;       /* TOK_EXCLUSIVE|TOK_EXCLREQ or 0 */
     struct thread      *tr_owner;      /* me */
 };
 
@@ -270,12 +288,12 @@ struct thread {
     struct thread *td_preempted; /* we preempted this thread */
     struct ucred *td_ucred;            /* synchronized from p_ucred */
     struct caps_kinfo *td_caps;        /* list of client and server registrations */
-    lwkt_tokref_t td_toks_stop;
+    lwkt_tokref_t td_toks_have;                /* tokens we own */
+    lwkt_tokref_t td_toks_stop;                /* tokens we want */
     struct lwkt_tokref td_toks_array[LWKT_MAXTOKENS];
     int                td_fairq_load;          /* fairq */
     int                td_fairq_count;         /* fairq */
     struct globaldata *td_migrate_gd;  /* target gd for thread migration */
-    const void *unused01;
 #ifdef DEBUG_CRIT_SECTIONS
 #define CRIT_DEBUG_ARRAY_SIZE   32
 #define CRIT_DEBUG_ARRAY_MASK   (CRIT_DEBUG_ARRAY_SIZE - 1)
@@ -427,6 +445,7 @@ extern void lwkt_passive_release(thread_t);
 extern void lwkt_maybe_splz(thread_t);
 
 extern void lwkt_gettoken(lwkt_token_t);
+extern void lwkt_gettoken_shared(lwkt_token_t);
 extern void lwkt_gettoken_hard(lwkt_token_t);
 extern int  lwkt_trytoken(lwkt_token_t);
 extern void lwkt_reltoken(lwkt_token_t);
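
Given the t_count layout above (TOK_EXCLUSIVE, TOK_EXCLREQ, TOK_INCR), a single shared-acquire attempt can be sketched as follows.  This is illustrative only: the real lwkt_gettoken_shared() also records a tokref on the thread's token stack and falls back to the scheduler on contention, and the atomic_cmpset_long() call is an assumption about the available primitive.

    /*
     * Sketch: try once to acquire a token shared by bumping the shared
     * count while neither exclusive bit is set.
     */
    static __inline int
    token_try_shared_sketch(lwkt_token_t tok)
    {
            long count = tok->t_count;

            cpu_ccfence();
            if ((count & (TOK_EXCLUSIVE | TOK_EXCLREQ)) == 0 &&
                atomic_cmpset_long(&tok->t_count, count, count + TOK_INCR))
                    return (1);     /* now held shared */
            return (0);             /* exclusive holder or requestor present */
    }
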
index 250c4a5..acf848f 100644 (file)
@@ -35,8 +35,9 @@
 static __inline int
 _lwkt_token_held(lwkt_token_t tok, thread_t td)
 {
-       return (tok->t_ref >= &td->td_toks_base &&
-               tok->t_ref < td->td_toks_stop);
+       return ((tok->t_count & ~(TOK_EXCLUSIVE|TOK_EXCLREQ)) ||
+               (tok->t_ref >= &td->td_toks_base &&
+                tok->t_ref < td->td_toks_stop));
 }
 
 /*
index 8960485..2196292 100644 (file)
@@ -52,6 +52,7 @@
 #include <sys/fcntl.h>
 #include <sys/nlookup.h>
 #include <sys/buf.h>
+
 #include <sys/buf2.h>
 
 static void hammer_io_modify(hammer_io_t io, int count);
index 01d1fa1..87e62da 100644 (file)
@@ -43,6 +43,7 @@
 #include <sys/fcntl.h>
 #include <sys/nlookup.h>
 #include <sys/buf.h>
+
 #include <sys/buf2.h>
 
 static void hammer_free_volume(hammer_volume_t volume);
index e5101d7..eabe8a9 100644 (file)
@@ -371,13 +371,15 @@ hammer_vop_read(struct vop_read_args *ap)
                /*
                 * MPSAFE
                 */
-               bp = getcacheblk(ap->a_vp, base_offset, blksize);
-               if (bp) {
+               bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
+               if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
+                       bp->b_flags &= ~B_AGE;
                        error = 0;
                        goto skip;
-               } else {
-                       if (ap->a_ioflag & IO_NRDELAY)
-                               return (EWOULDBLOCK);
+               }
+               if (ap->a_ioflag & IO_NRDELAY) {
+                       bqrelse(bp);
+                       return (EWOULDBLOCK);
                }
 
                /*
@@ -389,6 +391,10 @@ hammer_vop_read(struct vop_read_args *ap)
                        hammer_start_transaction(&trans, ip->hmp);
                }
 
+               /*
+                * NOTE: A valid bp has already been acquired, but was not
+                *       B_CACHE.
+                */
                if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
@@ -400,12 +406,13 @@ hammer_vop_read(struct vop_read_args *ap)
                            file_limit > HAMMER_XDEMARC) {
                                file_limit = HAMMER_XDEMARC;
                        }
-                       error = cluster_read(ap->a_vp,
+                       error = cluster_readx(ap->a_vp,
                                             file_limit, base_offset,
                                             blksize, uio->uio_resid,
                                             seqcount * BKVASIZE, &bp);
                } else {
-                       error = bread(ap->a_vp, base_offset, blksize, &bp);
+                       error = breadnx(ap->a_vp, base_offset, blksize,
+                                       NULL, NULL, 0, &bp);
                }
                if (error) {
                        brelse(bp);
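
The hunk above shows the new calling convention: the buffer is obtained with getblk() first, a fully cached buffer (B_CACHE set, B_INVAL and B_RAM clear) is used directly, and otherwise the already-acquired bp is handed to cluster_readx()/breadnx() through *bpp.  A condensed sketch of that pattern for a generic vnode read follows; the helper name is illustrative and error handling is abbreviated.

    /*
     * Sketch of the getblk()/B_CACHE/breadnx() read pattern shown above.
     * Passing a non-NULL *bpp hands the pre-acquired buffer to breadnx()
     * instead of having it call getblk() itself.
     */
    static int
    read_block_sketch(struct vnode *vp, off_t loffset, int blksize,
                      struct buf **bpp)
    {
            struct buf *bp;
            int error;

            bp = getblk(vp, loffset, blksize, 0, 0);
            if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
                    bp->b_flags &= ~B_AGE;      /* fully cached, no I/O */
                    *bpp = bp;
                    return (0);
            }
            *bpp = bp;                          /* let breadnx issue the I/O */
            error = breadnx(vp, loffset, blksize, NULL, NULL, 0, bpp);
            if (error) {
                    brelse(*bpp);
                    *bpp = NULL;
            }
            return (error);
    }
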
index 97d32d1..9f3cfd0 100644 (file)
@@ -39,6 +39,8 @@
 #include <sys/nlookup.h>
 #include <sys/buf.h>
 
+#include <sys/buf2.h>
+
 static int
 hammer_setup_device(struct vnode **devvpp, const char *dev_path, int ronly);
 
index 628a886..6ff27d3 100644 (file)
@@ -40,6 +40,8 @@
 #include <sys/malloc.h>
 #include <sys/buf.h>
 
+#include <sys/buf2.h>
+
 #include "hpfs.h"
 #include "hpfs_subr.h"
 
index 2180703..184974f 100644 (file)
@@ -40,6 +40,8 @@
 #include <sys/malloc.h>
 #include <sys/buf.h>
 
+#include <sys/buf2.h>
+
 #include "hpfs.h"
 #include "hpfsmount.h"
 #include "hpfs_subr.h"
index b325050..5bcefb9 100644 (file)
@@ -50,6 +50,7 @@
 #include <vm/vm_page.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
+
 #include <sys/buf2.h>
 
 #if defined(__NetBSD__)
index 9dbcb3e..8a6227e 100644 (file)
@@ -57,6 +57,7 @@
 #include <vm/vnode_pager.h>
 #endif
 #include <vm/vm_extern.h>
+
 #include <sys/buf2.h>
 
 #if !defined(__DragonFly__)
index d758658..4f480a6 100644 (file)
@@ -54,6 +54,8 @@
 #include "cd9660_node.h"
 #include "iso_rrip.h"
 
+#include <sys/buf2.h>
+
 /*
  * Convert a component of a pathname into a pointer to a locked inode.
  * This is a very central and rather complicated routine.
index d3b9f13..4148b22 100644 (file)
@@ -52,6 +52,8 @@
 #include "cd9660_rrip.h"
 #include "iso_rrip.h"
 
+#include <sys/buf2.h>
+
 typedef int    rrt_func_t (void *, ISO_RRIP_ANALYZE *ana);
 
 typedef struct {
index fba3e1d..3aef16f 100644 (file)
@@ -59,6 +59,8 @@
 
 #include <vm/vm_zone.h>
 
+#include <sys/buf2.h>
+
 #include "iso.h"
 #include "iso_rrip.h"
 #include "cd9660_node.h"
index 38aabf0..0531260 100644 (file)
@@ -60,6 +60,8 @@
 #include <vm/vm.h>
 #include <vm/vnode_pager.h>
 
+#include <sys/buf2.h>
+
 #include "iso.h"
 #include "cd9660_node.h"
 #include "iso_rrip.h"
index 1c254f7..b8e0237 100644 (file)
@@ -61,6 +61,8 @@
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
+#include <sys/buf2.h>
+
 #include "bpb.h"
 #include "msdosfsmount.h"
 #include "direntry.h"
index 1a033cf..8ce257a 100644 (file)
@@ -58,6 +58,8 @@
 #include <sys/mount.h>         /* to define statfs structure */
 #include <sys/vnode.h>         /* to define vattr structure */
 
+#include <sys/buf2.h>
+
 /*
  * msdosfs include files.
  */
index 3659369..85720a9 100644 (file)
@@ -57,6 +57,8 @@
 #include <sys/namei.h>
 #include <sys/mount.h>
 
+#include <sys/buf2.h>
+
 #include "bpb.h"
 #include "direntry.h"
 #include "denode.h"
index a7353d1..3ae8f5e 100644 (file)
@@ -64,6 +64,8 @@
 #include <sys/stat.h>                          /* defines ALLPERMS */
 #include <vm/vm_zone.h>
 
+#include <sys/buf2.h>
+
 #include "bpb.h"
 #include "bootsect.h"
 #include "direntry.h"
index e6873b7..a421d06 100644 (file)
 #include <sys/malloc.h>
 #include <sys/lock.h>
 #include <sys/spinlock.h>
-#include <sys/spinlock2.h>
 #include <sys/iconv.h>
 
 #include <machine/inttypes.h>
 
+#include <sys/buf2.h>
+#include <sys/spinlock2.h>
+
 #if defined(__NetBSD__)
 #include <miscfs/specfs/specdev.h>
 #endif
index 774f626..a6213aa 100644 (file)
@@ -62,6 +62,8 @@
 #include <miscfs/specfs/specdev.h>
 #endif
 
+#include <sys/buf2.h>
+
 /*#define NTFS_DEBUG 1*/
 #include "ntfs.h"
 #include "ntfs_inode.h"
index e865403..307e0cc 100644 (file)
@@ -70,6 +70,8 @@
 
 #include <sys/sysctl.h>
 
+#include <sys/buf2.h>
+
 /*#define NTFS_DEBUG 1*/
 #include "ntfs.h"
 #include "ntfs_inode.h"
index c740d46..d5a4fe4 100644 (file)
@@ -55,6 +55,8 @@
 #include <vm/vm_pager.h>
 #include <vm/swap_pager.h>
 
+#include <sys/buf2.h>
+
 #include <vfs/fifofs/fifo.h>
 #include <vfs/tmpfs/tmpfs_vnops.h>
 #include <vfs/tmpfs/tmpfs.h>
index 3fc68ac..e44c69a 100644 (file)
@@ -89,6 +89,8 @@
 #include <sys/queue.h>
 #include <sys/vnode.h>
 
+#include <sys/buf2.h>
+
 #include <vfs/udf/ecma167-udf.h>
 #include <vfs/udf/osta.h>
 #include <vfs/udf/udf.h>
index a3eb225..27938ff 100644 (file)
@@ -40,7 +40,6 @@
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
-#include <sys/buf2.h>
 #include <sys/conf.h>
 #include <sys/proc.h>
 #include <sys/vnode.h>
@@ -52,6 +51,8 @@
 #include <sys/taskqueue.h>
 #include <machine/inttypes.h>
 
+#include <sys/buf2.h>
+
 #include "quota.h"
 #include "inode.h"
 #include "ufs_extern.h"
index 85abfbc..e9512ed 100644 (file)
@@ -43,6 +43,8 @@
 #include <sys/mount.h>
 #include <sys/vnode.h>
 
+#include <sys/buf2.h>
+
 #include "quota.h"
 #include "inode.h"
 #include "ufs_extern.h"
index 41bb3dd..b0112d6 100644 (file)
@@ -60,6 +60,7 @@
 #include "ffs_extern.h"
 
 #include <vm/vm_page2.h>
+#include <sys/buf2.h>
 
 static int ffs_indirtrunc (struct inode *, ufs_daddr_t, ufs_daddr_t,
            ufs_daddr_t, int, long *);
index 42a840b..d957524 100644 (file)
@@ -50,6 +50,8 @@ extern void panic(const char *, ...);
 #include <sys/ucred.h>
 #include <sys/mount.h>
 
+#include <sys/buf2.h>
+
 #include "quota.h"
 #include "inode.h"
 #include "fs.h"
index 7cb230b..83ba1a6 100644 (file)
@@ -63,6 +63,8 @@
 #include <vm/vm_page.h>
 #include <vm/vm_zone.h>
 
+#include <sys/buf2.h>
+
 static MALLOC_DEFINE(M_FFSNODE, "FFS node", "FFS vnode private part");
 
 static int     ffs_sbupdate (struct ufsmount *, int);
index fcfe81b..a35d0cd 100644 (file)
@@ -47,6 +47,9 @@
 #include <sys/syslink_vfs.h>
 #include <sys/unistd.h>
 #include <vm/vnode_pager.h>
+
+#include <sys/buf2.h>
+
 #include "userfs.h"
 
 /*
index 918b8a0..3020124 100644 (file)
@@ -87,7 +87,7 @@ phys_pager_getpage(vm_object_t object, vm_page_t *mpp, int seqaccess)
        /* Switch off pv_entries */
        vm_page_unmanage(m);
        m->valid = VM_PAGE_BITS_ALL;
-       m->dirty = 0;
+       m->dirty = VM_PAGE_BITS_ALL;
 
        return (VM_PAGER_OK);
 }
index dc618a2..13ee444 100644 (file)
@@ -124,6 +124,15 @@ struct faultstate {
 
 static int debug_cluster = 0;
 SYSCTL_INT(_vm, OID_AUTO, debug_cluster, CTLFLAG_RW, &debug_cluster, 0, "");
+static int vm_shared_fault = 1;
+SYSCTL_INT(_vm, OID_AUTO, shared_fault, CTLFLAG_RW, &vm_shared_fault, 0,
+          "Allow shared token on vm_object");
+static long vm_shared_hit = 0;
+SYSCTL_LONG(_vm, OID_AUTO, shared_hit, CTLFLAG_RW, &vm_shared_hit, 0,
+          "Successful shared faults");
+static long vm_shared_miss = 0;
+SYSCTL_LONG(_vm, OID_AUTO, shared_miss, CTLFLAG_RW, &vm_shared_miss, 0,
+          "Unsuccessful shared faults");
 
 static int vm_fault_object(struct faultstate *, vm_pindex_t, vm_prot_t);
 static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int);
@@ -131,8 +140,10 @@ static int vm_fault_vpagetable(struct faultstate *, vm_pindex_t *, vpte_t, int);
 static int vm_fault_additional_pages (vm_page_t, int, int, vm_page_t *, int *);
 #endif
 static void vm_set_nosync(vm_page_t m, vm_map_entry_t entry);
-static void vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry,
-                       int prot);
+static void vm_prefault(pmap_t pmap, vm_offset_t addra,
+                       vm_map_entry_t entry, int prot, int fault_flags);
+static void vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
+                       vm_map_entry_t entry, int prot, int fault_flags);
 
 static __inline void
 release_page(struct faultstate *fs)
@@ -250,14 +261,18 @@ vm_fault(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type, int fault_flags)
        int result;
        vm_pindex_t first_pindex;
        struct faultstate fs;
+       struct lwp *lp;
        int growstack;
 
-       mycpu->gd_cnt.v_vm_faults++;
-
+       vm_page_pcpu_cache();
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
+       fs.vp = NULL;
        growstack = 1;
 
+       if ((lp = curthread->td_lwp) != NULL)
+               lp->lwp_flag |= LWP_PAGING;
+
        lwkt_gettoken(&map->token);
 
 RetryFault:
@@ -357,13 +372,71 @@ RetryFault:
                        (void *)vaddr, fs.entry);
        }
 
+       /*
+        * Attempt to shortcut the fault if the lookup returns a
+        * terminal object and the page is present.  This allows us
+        * to obtain a shared token on the object instead of an exclusive
+        * token, which theoretically should allow concurrent faults.
+        */
+       if (vm_shared_fault &&
+           fs.first_object->backing_object == NULL &&
+           fs.entry->maptype == VM_MAPTYPE_NORMAL) {
+               int error;
+               vm_object_hold_shared(fs.first_object);
+               /*fs.vp = vnode_pager_lock(fs.first_object);*/
+               fs.m = vm_page_lookup_busy_try(fs.first_object,
+                                               first_pindex,
+                                               TRUE, &error);
+               if (error == 0 && fs.m) {
+                       /*
+                        * Activate the page and figure out if we can
+                        * short-cut a quick mapping.
+                        *
+                        * WARNING!  We cannot call swap_pager_unswapped()
+                        *           with a shared token!
+                        */
+                       vm_page_activate(fs.m);
+                       if (fs.m->valid == VM_PAGE_BITS_ALL &&
+                           ((fs.m->flags & PG_SWAPPED) == 0 ||
+                            (fs.prot & VM_PROT_WRITE) == 0 ||
+                            (fs.fault_flags & VM_FAULT_DIRTY) == 0)) {
+                               fs.lookup_still_valid = TRUE;
+                               fs.first_m = NULL;
+                               fs.object = fs.first_object;
+                               fs.prot = fs.first_prot;
+                               if (fs.wired)
+                                       fault_type = fs.first_prot;
+                               if (fs.prot & VM_PROT_WRITE) {
+                                       vm_object_set_writeable_dirty(
+                                                       fs.m->object);
+                                       vm_set_nosync(fs.m, fs.entry);
+                                       if (fs.fault_flags & VM_FAULT_DIRTY) {
+                                               vm_page_dirty(fs.m);
+                                               /*XXX*/
+                                               swap_pager_unswapped(fs.m);
+                                       }
+                               }
+                               result = KERN_SUCCESS;
+                               fault_flags |= VM_FAULT_BURST_QUICK;
+                               fault_flags &= ~VM_FAULT_BURST;
+                               ++vm_shared_hit;
+                               goto quick;
+                       }
+                       vm_page_wakeup(fs.m);
+                       fs.m = NULL;
+               }
+               vm_object_drop(fs.first_object); /* XXX drop on shared tok?*/
+       }
+       ++vm_shared_miss;
+
        /*
         * Bump the paging-in-progress count to prevent size changes (e.g.
         * truncation operations) during I/O.  This must be done after
         * obtaining the vnode lock in order to avoid possible deadlocks.
         */
        vm_object_hold(fs.first_object);
-       fs.vp = vnode_pager_lock(fs.first_object);
+       if (fs.vp == NULL)
+               fs.vp = vnode_pager_lock(fs.first_object);
 
        fs.lookup_still_valid = TRUE;
        fs.first_m = NULL;
@@ -418,6 +491,7 @@ RetryFault:
        if (result != KERN_SUCCESS)
                goto done;
 
+quick:
        /*
         * On success vm_fault_object() does not unlock or deallocate, and fs.m
         * will contain a busied page.
@@ -426,6 +500,9 @@ RetryFault:
         */
        vm_page_flag_set(fs.m, PG_REFERENCED);
        pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired);
+       mycpu->gd_cnt.v_vm_faults++;
+       if (curthread->td_lwp)
+               ++curthread->td_lwp->lwp_ru.ru_minflt;
 
        /*KKASSERT(fs.m->queue == PQ_NONE); page-in op may deactivate page */
        KKASSERT(fs.m->flags & PG_BUSY);
@@ -451,9 +528,17 @@ RetryFault:
         * first.
         */
        if (fault_flags & VM_FAULT_BURST) {
-               if ((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0 &&
-                   fs.wired == 0) {
-                       vm_prefault(fs.map->pmap, vaddr, fs.entry, fs.prot);
+               if ((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0
+                   && fs.wired == 0) {
+                       vm_prefault(fs.map->pmap, vaddr,
+                                   fs.entry, fs.prot, fault_flags);
+               }
+       }
+       if (fault_flags & VM_FAULT_BURST_QUICK) {
+               if ((fs.fault_flags & VM_FAULT_WIRE_MASK) == 0
+                   && fs.wired == 0) {
+                       vm_prefault_quick(fs.map->pmap, vaddr,
+                                         fs.entry, fs.prot, fault_flags);
                }
        }
 
@@ -479,6 +564,8 @@ done:
        if (fs.first_object)
                vm_object_drop(fs.first_object);
        lwkt_reltoken(&map->token);
+       if (lp)
+               lp->lwp_flag &= ~LWP_PAGING;
        return (result);
 }
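
The conditions the shared-token fast path tests are spread across the hunk above; consolidated, they amount to roughly the predicate below.  The helper is illustrative only and mirrors the checks in the diff: terminal object, normal map entry, fully valid resident page, and no dirtying write fault on a swap-backed page (since swap_pager_unswapped() cannot be called under a shared token).

    /*
     * Sketch consolidating the fast-path checks above; field names follow
     * the diff, the helper itself is illustrative.
     */
    static __inline int
    shared_fault_ok_sketch(vm_object_t first_object, vm_map_entry_t entry,
                           vm_page_t m, vm_prot_t prot, int fault_flags)
    {
            if (first_object->backing_object != NULL)       /* terminal only */
                    return (0);
            if (entry->maptype != VM_MAPTYPE_NORMAL)
                    return (0);
            if (m == NULL || m->valid != VM_PAGE_BITS_ALL)  /* fully resident */
                    return (0);
            if ((m->flags & PG_SWAPPED) &&                  /* would need */
                (prot & VM_PROT_WRITE) &&                   /* unswap under */
                (fault_flags & VM_FAULT_DIRTY))             /* shared token */
                    return (0);
            return (1);
    }
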
 
@@ -523,8 +610,6 @@ vm_fault_page(vm_map_t map, vm_offset_t vaddr, vm_prot_t fault_type,
        int result;
        vm_prot_t orig_fault_type = fault_type;
 
-       mycpu->gd_cnt.v_vm_faults++;
-
        fs.hardfault = 0;
        fs.fault_flags = fault_flags;
        KKASSERT((fault_flags & VM_FAULT_WIRE_MASK) == 0);
@@ -661,6 +746,9 @@ RetryFault:
         */
        pmap_enter(fs.map->pmap, vaddr, fs.m, fs.prot, fs.wired);
        vm_page_flag_set(fs.m, PG_REFERENCED);
+       mycpu->gd_cnt.v_vm_faults++;
+       if (curthread->td_lwp)
+               ++curthread->td_lwp->lwp_ru.ru_minflt;
 
        /*
         * On success vm_fault_object() does not unlock or deallocate, and fs.m
@@ -677,15 +765,10 @@ RetryFault:
         * if a write fault was specified).
         */
        vm_page_hold(fs.m);
+       vm_page_activate(fs.m);
        if (fault_type & VM_PROT_WRITE)
                vm_page_dirty(fs.m);
 
-       /*
-        * Unbusy the page by activating it.  It remains held and will not
-        * be reclaimed.
-        */
-       vm_page_activate(fs.m);
-
        if (curthread->td_lwp) {
                if (fs.hardfault) {
                        curthread->td_lwp->lwp_ru.ru_majflt++;
@@ -818,10 +901,8 @@ RetryFault:
         * if a write fault was specified).
         */
        vm_page_hold(fs.m);
-       if (fault_type & VM_PROT_WRITE)
-               vm_page_dirty(fs.m);
-
-       if (fault_flags & VM_FAULT_DIRTY)
+       vm_page_activate(fs.m);
+       if ((fault_type & VM_PROT_WRITE) || (fault_flags & VM_FAULT_DIRTY))
                vm_page_dirty(fs.m);
        if (fault_flags & VM_FAULT_UNSWAP)
                swap_pager_unswapped(fs.m);
@@ -831,15 +912,8 @@ RetryFault:
         */
        vm_page_flag_set(fs.m, PG_REFERENCED);
 
-       /*
-        * Unbusy the page by activating it.  It remains held and will not
-        * be reclaimed.
-        */
-       vm_page_activate(fs.m);
-
        if (curthread->td_lwp) {
                if (fs.hardfault) {
-                       mycpu->gd_cnt.v_vm_faults++;
                        curthread->td_lwp->lwp_ru.ru_majflt++;
                } else {
                        curthread->td_lwp->lwp_ru.ru_minflt++;
@@ -929,6 +1003,7 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
                 * It doesn't get set in the page directory if the page table
                 * is modified during a read access.
                 */
+               vm_page_activate(fs->m);
                if ((fault_type & VM_PROT_WRITE) && (vpte & VPTE_V) &&
                    (vpte & VPTE_W)) {
                        if ((vpte & (VPTE_M|VPTE_A)) != (VPTE_M|VPTE_A)) {
@@ -945,7 +1020,6 @@ vm_fault_vpagetable(struct faultstate *fs, vm_pindex_t *pindex,
                }
                lwbuf_free(lwb);
                vm_page_flag_set(fs->m, PG_REFERENCED);
-               vm_page_activate(fs->m);
                vm_page_wakeup(fs->m);
                fs->m = NULL;
                cleanup_successful_fault(fs);
@@ -1144,7 +1218,7 @@ vm_fault_object(struct faultstate *fs,
                                    ((fs->vp || fs->object->backing_object) ?
                                        VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL :
                                        VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL |
-                                       VM_ALLOC_ZERO));
+                                       VM_ALLOC_USE_GD | VM_ALLOC_ZERO));
                        }
                        if (fs->m == NULL) {
                                vm_object_pip_wakeup(fs->first_object);
@@ -1187,6 +1261,7 @@ readrest:
                        else
                                seqaccess = -1;
 
+#if 0
                        /*
                         * If sequential access is detected then attempt
                         * to deactivate/cache pages behind the scan to
@@ -1257,6 +1332,7 @@ skip:
 
                                seqaccess = 1;
                        }
+#endif
 
                        /*
                         * Avoid deadlocking against the map when doing I/O.
@@ -1625,6 +1701,7 @@ skip:
         * Also tell the backing pager, if any, that it should remove
         * any swap backing since the page is now dirty.
         */
+       vm_page_activate(fs->m);
        if (fs->prot & VM_PROT_WRITE) {
                vm_object_set_writeable_dirty(fs->m->object);
                vm_set_nosync(fs->m, fs->entry);
@@ -2062,7 +2139,8 @@ vm_set_nosync(vm_page_t m, vm_map_entry_t entry)
 }
 
 static void
-vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
+vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot,
+           int fault_flags)
 {
        struct lwp *lp;
        vm_page_t m;
@@ -2102,8 +2180,8 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
 
        object = entry->object.vm_object;
        KKASSERT(object != NULL);
-       vm_object_hold(object);
        KKASSERT(object == entry->object.vm_object);
+       vm_object_hold(object);
        vm_object_chain_acquire(object);
 
        noneg = 0;
@@ -2201,22 +2279,10 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
                                m = vm_page_alloc(object, index,
                                                  VM_ALLOC_NORMAL |
                                                  VM_ALLOC_ZERO |
+                                                 VM_ALLOC_USE_GD |
                                                  VM_ALLOC_NULL_OK);
                                if (m == NULL)
                                        break;
-
-                               if ((m->flags & PG_ZERO) == 0) {
-                                       vm_page_zero_fill(m);
-                               } else {
-#ifdef PMAP_DEBUG
-                                       pmap_page_assertzero(
-                                                       VM_PAGE_TO_PHYS(m));
-#endif
-                                       vm_page_flag_clear(m, PG_ZERO);
-                                       mycpu->gd_cnt.v_ozfod++;
-                               }
-                               mycpu->gd_cnt.v_zfod++;
-                               m->valid = VM_PAGE_BITS_ALL;
                                allocated = 1;
                                pprot = prot;
                                /* lobject = object .. not needed */
@@ -2257,6 +2323,15 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
                        break;
                }
 
+               /*
+                * The object must be marked dirty if we are mapping a
+                * writable page.  m->object is either lobject or object,
+                * both of which are still held.  Do this before we
+                * potentially drop the object.
+                */
+               if (pprot & VM_PROT_WRITE)
+                       vm_object_set_writeable_dirty(m->object);
+
                /*
                 * Do not conditionalize on PG_RAM.  If pages are present in
                 * the VM system we assume optimal caching.  If caching is
@@ -2266,14 +2341,7 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
                 * of the object has been cached.  The cost for restarting
                 * the gravy train should be low (since accesses will likely
                 * be I/O bound anyway).
-                *
-                * The object must be marked dirty if we are mapping a
-                * writable page.  m->object is either lobject or object,
-                * both of which are still held.
                 */
-               if (pprot & VM_PROT_WRITE)
-                       vm_object_set_writeable_dirty(m->object);
-
                if (lobject != object) {
                        if (object->backing_object != lobject)
                                vm_object_hold(object->backing_object);
@@ -2291,10 +2359,41 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
                 * (pages on the cache queue are not allowed to be mapped).
                 */
                if (allocated) {
+                       /*
+                        * Page must be zeroed.
+                        */
+                       if ((m->flags & PG_ZERO) == 0) {
+                               vm_page_zero_fill(m);
+                       } else {
+#ifdef PMAP_DEBUG
+                               pmap_page_assertzero(
+                                               VM_PAGE_TO_PHYS(m));
+#endif
+                               vm_page_flag_clear(m, PG_ZERO);
+                               mycpu->gd_cnt.v_ozfod++;
+                       }
+                       mycpu->gd_cnt.v_zfod++;
+                       m->valid = VM_PAGE_BITS_ALL;
+
+                       /*
+                        * Handle dirty page case
+                        */
                        if (pprot & VM_PROT_WRITE)
                                vm_set_nosync(m, entry);
                        pmap_enter(pmap, addr, m, pprot, 0);
+                       mycpu->gd_cnt.v_vm_faults++;
+                       if (curthread->td_lwp)
+                               ++curthread->td_lwp->lwp_ru.ru_minflt;
                        vm_page_deactivate(m);
+                       if (pprot & VM_PROT_WRITE) {
+                               /*vm_object_set_writeable_dirty(m->object);*/
+                               vm_set_nosync(m, entry);
+                               if (fault_flags & VM_FAULT_DIRTY) {
+                                       vm_page_dirty(m);
+                                       /*XXX*/
+                                       swap_pager_unswapped(m);
+                               }
+                       }
                        vm_page_wakeup(m);
                } else if (error) {
                        /* couldn't busy page, no wakeup */
@@ -2307,9 +2406,21 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
                         */
                        if ((m->queue - m->pc) == PQ_CACHE)
                                vm_page_deactivate(m);
+                       if (pprot & VM_PROT_WRITE) {
+                               /*vm_object_set_writeable_dirty(m->object);*/
+                               vm_set_nosync(m, entry);
+                               if (fault_flags & VM_FAULT_DIRTY) {
+                                       vm_page_dirty(m);
+                                       /*XXX*/
+                                       swap_pager_unswapped(m);
+                               }
+                       }
                        if (pprot & VM_PROT_WRITE)
                                vm_set_nosync(m, entry);
                        pmap_enter(pmap, addr, m, pprot, 0);
+                       mycpu->gd_cnt.v_vm_faults++;
+                       if (curthread->td_lwp)
+                               ++curthread->td_lwp->lwp_ru.ru_minflt;
                        vm_page_wakeup(m);
                } else {
                        vm_page_wakeup(m);
@@ -2318,3 +2429,134 @@ vm_prefault(pmap_t pmap, vm_offset_t addra, vm_map_entry_t entry, int prot)
        vm_object_chain_release(object);
        vm_object_drop(object);
 }
+
+static void
+vm_prefault_quick(pmap_t pmap, vm_offset_t addra,
+                 vm_map_entry_t entry, int prot, int fault_flags)
+{
+       struct lwp *lp;
+       vm_page_t m;
+       vm_offset_t addr;
+       vm_pindex_t pindex;
+       vm_object_t object;
+       int i;
+       int noneg;
+       int nopos;
+       int maxpages;
+
+       /*
+        * Get stable max count value, disabled if set to 0
+        */
+       maxpages = vm_prefault_pages;
+       cpu_ccfence();
+       if (maxpages <= 0)
+               return;
+
+       /*
+        * We do not currently prefault mappings that use virtual page
+        * tables.  We do not prefault foreign pmaps.
+        */
+       if (entry->maptype == VM_MAPTYPE_VPAGETABLE)
+               return;
+       lp = curthread->td_lwp;
+       if (lp == NULL || (pmap != vmspace_pmap(lp->lwp_vmspace)))
+               return;
+
+       /*
+        * Limit pre-fault count to 1024 pages.
+        */
+       if (maxpages > 1024)
+               maxpages = 1024;
+
+       object = entry->object.vm_object;
+       ASSERT_LWKT_TOKEN_HELD(vm_object_token(object));
+       KKASSERT(object->backing_object == NULL);
+
+       noneg = 0;
+       nopos = 0;
+       for (i = 0; i < maxpages; ++i) {
+               int error;
+
+               /*
+                * Calculate the page to pre-fault, stopping the scan in
+                * each direction separately if the limit is reached.
+                */
+               if (i & 1) {
+                       if (noneg)
+                               continue;
+                       addr = addra - ((i + 1) >> 1) * PAGE_SIZE;
+               } else {
+                       if (nopos)
+                               continue;
+                       addr = addra + ((i + 2) >> 1) * PAGE_SIZE;
+               }
+               if (addr < entry->start) {
+                       noneg = 1;
+                       if (noneg && nopos)
+                               break;
+                       continue;
+               }
+               if (addr >= entry->end) {
+                       nopos = 1;
+                       if (noneg && nopos)
+                               break;
+                       continue;
+               }
+
+               /*
+                * Skip pages already mapped, and stop scanning in that
+                * direction.  When the scan terminates in both directions
+                * we are done.
+                */
+               if (pmap_prefault_ok(pmap, addr) == 0) {
+                       if (i & 1)
+                               noneg = 1;
+                       else
+                               nopos = 1;
+                       if (noneg && nopos)
+                               break;
+                       continue;
+               }
+
+               /*
+                * Look up the page to be mapped into the pmap.  This
+                * version of the prefault code only works with terminal
+                * objects, so there is no backing object chain to follow.
+                *
+                * WARNING!  We cannot call swap_pager_unswapped() with a
+                *           shared token.
+                */
+               pindex = ((addr - entry->start) + entry->offset) >> PAGE_SHIFT;
+
+               m = vm_page_lookup_busy_try(object, pindex, TRUE, &error);
+               if (m == NULL || error)
+                       continue;
+
+               if (((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) &&
+                   (m->flags & PG_FICTITIOUS) == 0 &&
+                   ((m->flags & PG_SWAPPED) == 0 ||
+                    (prot & VM_PROT_WRITE) == 0 ||
+                    (fault_flags & VM_FAULT_DIRTY) == 0)) {
+                       /*
+                        * A fully valid page not undergoing soft I/O can
+                        * be immediately entered into the pmap.
+                        */
+                       if ((m->queue - m->pc) == PQ_CACHE)
+                               vm_page_deactivate(m);
+                       if (prot & VM_PROT_WRITE) {
+                               vm_object_set_writeable_dirty(m->object);
+                               vm_set_nosync(m, entry);
+                               if (fault_flags & VM_FAULT_DIRTY) {
+                                       vm_page_dirty(m);
+                                       /*XXX*/
+                                       swap_pager_unswapped(m);
+                               }
+                       }
+                       pmap_enter(pmap, addr, m, prot, 0);
+                       mycpu->gd_cnt.v_vm_faults++;
+                       if (curthread->td_lwp)
+                               ++curthread->td_lwp->lwp_ru.ru_minflt;
+               }
+               vm_page_wakeup(m);
+       }
+}
index 5384b85..3a0c8ce 100644 (file)
@@ -225,6 +225,7 @@ kmem_alloc3(vm_map_t map, vm_size_t size, int kmflags)
                mem = vm_page_grab(&kernel_object, OFF_TO_IDX(addr + i),
                                   VM_ALLOC_FORCE_ZERO | VM_ALLOC_NORMAL |
                                   VM_ALLOC_RETRY);
+               vm_page_unqueue_nowakeup(mem);
                vm_page_wakeup(mem);
        }
        vm_object_drop(&kernel_object);
index 5e54a0f..ee4c546 100644 (file)
@@ -243,7 +243,7 @@ struct vm_map {
 /*
  * vm_flags_t values
  */
-#define MAP_WIREFUTURE         0x01    /* wire all future pages */
+#define MAP_WIREFUTURE         0x0001  /* wire all future pages */
 
 /*
  * Registered upcall
@@ -527,6 +527,7 @@ vmspace_president_count(struct vmspace *vmspace)
 #define VM_FAULT_BURST         0x04    /* Burst fault can be done */
 #define VM_FAULT_DIRTY         0x08    /* Dirty the page */
 #define VM_FAULT_UNSWAP                0x10    /* Remove backing store from the page */
+#define VM_FAULT_BURST_QUICK   0x20    /* Special case shared vm_object */
 #define VM_FAULT_WIRE_MASK     (VM_FAULT_CHANGE_WIRING|VM_FAULT_USER_WIRE)
 
 #ifdef _KERNEL
index 4e87866..1f49f41 100644 (file)
@@ -175,9 +175,6 @@ do_vmtotal_callback(struct proc *p, void *data)
 {
        struct vmtotal *totalp = data;
        struct lwp *lp;
-       vm_map_entry_t entry;
-       vm_map_t map;
-       int paging;
 
        if (p->p_flag & P_SYSTEM)
                return(0);
@@ -210,32 +207,13 @@ do_vmtotal_callback(struct proc *p, void *data)
                default:
                        return (0);
                }
-       }
 
-       /*
-        * Note active objects.
-        */
-       paging = 0;
-       lwkt_gettoken(&p->p_token);
-       if (p->p_vmspace) {
-               map = &p->p_vmspace->vm_map;
-               vm_map_lock_read(map);
-               for (entry = map->header.next;
-                    entry != &map->header; entry = entry->next) {
-                       if (entry->maptype != VM_MAPTYPE_NORMAL &&
-                           entry->maptype != VM_MAPTYPE_VPAGETABLE) {
-                               continue;
-                       }
-                       if (entry->object.vm_object == NULL)
-                               continue;
-                       vm_object_set_flag(entry->object.vm_object, OBJ_ACTIVE);
-                       paging |= entry->object.vm_object->paging_in_progress;
-               }
-               vm_map_unlock_read(map);
+               /*
+                * Set while in vm_fault()
+                */
+               if (lp->lwp_flag & LWP_PAGING)
+                       totalp->t_pw++;
        }
-       lwkt_reltoken(&p->p_token);
-       if (paging)
-               totalp->t_pw++;
        return(0);
 }
 
index e29b1e2..46d4258 100644 (file)
@@ -175,6 +175,13 @@ vm_object_lock(vm_object_t obj)
        lwkt_getpooltoken(obj);
 }
 
+void
+vm_object_lock_shared(vm_object_t obj)
+{
+       lwkt_token_t tok = lwkt_token_pool_lookup(obj);
+       lwkt_gettoken_shared(tok);
+}
+
 void
 vm_object_unlock(vm_object_t obj)
 {
@@ -221,6 +228,40 @@ debugvm_object_hold(vm_object_t obj, char *file, int line)
 #endif
 }
 
+void
+#ifndef DEBUG_LOCKS
+vm_object_hold_shared(vm_object_t obj)
+#else
+debugvm_object_hold_shared(vm_object_t obj, char *file, int line)
+#endif
+{
+       KKASSERT(obj != NULL);
+
+       /*
+        * The object must be held (object allocation is stable due to the
+        * caller's context, typically already holding the token on a parent
+        * object) prior to potentially blocking on the lock, otherwise the
+        * object can get ripped away from us.
+        */
+       refcount_acquire(&obj->hold_count);
+       vm_object_lock_shared(obj);
+
+#if defined(DEBUG_LOCKS)
+       int i;
+
+       i = ffs(~obj->debug_hold_bitmap) - 1;
+       if (i == -1) {
+               kprintf("vm_object hold count > VMOBJ_DEBUG_ARRAY_SIZE");
+               obj->debug_hold_ovfl = 1;
+       }
+
+       obj->debug_hold_bitmap |= (1 << i);
+       obj->debug_hold_thrs[i] = curthread;
+       obj->debug_hold_file[i] = file;
+       obj->debug_hold_line[i] = line;
+#endif
+}
+
 /*
  * Drop the token and hold_count on the object.
  */
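
A short usage sketch of the new shared hold, mirroring the pmap_object_init_pt() change earlier in this diff: take the shared hold, walk the object's resident pages read-only, then drop.  The wrapper and callback names are illustrative.

    /*
     * Sketch: shared hold around a read-only scan of an object's resident
     * pages, as pmap_object_init_pt() now does.
     */
    static void
    scan_resident_pages_sketch(vm_object_t object, void *arg,
                               int (*scan_cb)(vm_page_t, void *))
    {
            vm_object_hold_shared(object);
            vm_page_rb_tree_RB_SCAN(&object->rb_memq, rb_vm_page_scancmp,
                                    scan_cb, arg);
            vm_object_drop(object);
    }
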
@@ -1609,11 +1650,13 @@ vm_object_backing_scan_callback(vm_page_t p, void *data)
        struct rb_vm_page_scan_info *info = data;
        vm_object_t backing_object;
        vm_object_t object;
+       vm_pindex_t pindex;
        vm_pindex_t new_pindex;
        vm_pindex_t backing_offset_index;
        int op;
 
-       new_pindex = p->pindex - info->backing_offset_index;
+       pindex = p->pindex;
+       new_pindex = pindex - info->backing_offset_index;
        op = info->limit;
        object = info->object;
        backing_object = info->backing_object;
@@ -1630,8 +1673,7 @@ vm_object_backing_scan_callback(vm_page_t p, void *data)
                 * note that we do not busy the backing object's
                 * page.
                 */
-               if (
-                   p->pindex < backing_offset_index ||
+               if (pindex < backing_offset_index ||
                    new_pindex >= object->size
                ) {
                        return(0);
@@ -1646,7 +1688,6 @@ vm_object_backing_scan_callback(vm_page_t p, void *data)
                 * If this fails, the parent does not completely shadow
                 * the object and we might as well give up now.
                 */
-
                pp = vm_page_lookup(object, new_pindex);
                if ((pp == NULL || pp->valid == 0) &&
                    !vm_pager_has_page(object, new_pindex)
@@ -1657,7 +1698,8 @@ vm_object_backing_scan_callback(vm_page_t p, void *data)
        }
 
        /*
-        * Check for busy page
+        * Check for busy page.  Note that we may have lost (p) when we
+        * possibly blocked above.
         */
        if (op & (OBSC_COLLAPSE_WAIT | OBSC_COLLAPSE_NOWAIT)) {
                vm_page_t pp;
@@ -1678,6 +1720,18 @@ vm_object_backing_scan_callback(vm_page_t p, void *data)
                                return(-1);
                        }
                }
+
+               /*
+                * If (p) is no longer valid restart the scan.
+                */
+               if (p->object != backing_object || p->pindex != pindex) {
+                       kprintf("vm_object_backing_scan: Warning: page "
+                               "%p ripped out from under us\n", p);
+                       vm_page_wakeup(p);
+                       info->error = -1;
+                       return(-1);
+               }
+
                if (op & OBSC_COLLAPSE_NOWAIT) {
                        if (p->valid == 0 /*|| p->hold_count*/ ||
                            p->wire_count) {
@@ -2354,7 +2408,14 @@ vm_object_set_writeable_dirty(vm_object_t object)
        struct vnode *vp;
 
        /*vm_object_assert_held(object);*/
-       vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+       /*
+        * Avoid contention in vm fault path by checking the state before
+        * issuing an atomic op on it.
+        */
+       if ((object->flags & (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) !=
+           (OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY)) {
+               vm_object_set_flag(object, OBJ_WRITEABLE|OBJ_MIGHTBEDIRTY);
+       }
        if (object->type == OBJT_VNODE &&
            (vp = (struct vnode *)object->handle) != NULL) {
                if ((vp->v_flag & VOBJDIRTY) == 0) {
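
The flags test above skips the atomic read-modify-write (and the cache-line ping-pong it causes on the hot fault path) whenever both bits are already set.  The idiom in isolation looks like the sketch below; the pointer and bit arguments are illustrative.

    /*
     * Sketch of the test-before-atomic-set idiom: a plain read filters out
     * the common already-set case so the atomic RMW only runs when a flag
     * actually needs to change.
     */
    static __inline void
    set_flags_if_needed(volatile u_int *flagsp, u_int bits)
    {
            if ((*flagsp & bits) != bits)
                    atomic_set_int(flagsp, bits);
    }
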
index 54b6bb2..58db06a 100644 (file)
@@ -304,14 +304,19 @@ void vm_object_dead_sleep(vm_object_t, const char *);
 void vm_object_dead_wakeup(vm_object_t);
 void vm_object_lock_swap(void);
 void vm_object_lock(vm_object_t);
+void vm_object_lock_shared(vm_object_t);
 void vm_object_unlock(vm_object_t);
 
 #ifndef DEBUG_LOCKS
 void vm_object_hold(vm_object_t);
+void vm_object_hold_shared(vm_object_t);
 #else
-#define vm_object_hold(obj)    \
+#define vm_object_hold(obj)            \
        debugvm_object_hold(obj, __FILE__, __LINE__)
 void debugvm_object_hold(vm_object_t, char *, int);
+#define vm_object_hold_shared(obj)     \
+       debugvm_object_hold_shared(obj, __FILE__, __LINE__)
+void debugvm_object_hold_shared(vm_object_t, char *, int);
 #endif
 
 void vm_object_drop(vm_object_t);
index 9eed65b..84902bd 100644 (file)
@@ -1276,6 +1276,45 @@ vm_page_select_free(u_short pg_color, boolean_t prefer_zero)
        return(m);
 }
 
+/*
+ * This implements a per-cpu cache of free, zero'd, ready-to-go pages.
+ * The idea is to populate this cache prior to acquiring any locks so
+ * we don't wind up potentially zeroing VM pages (under heavy loads) while
+ * holding potentially contended locks.
+ *
+ * Note that we allocate the page uninserted into anything and use a pindex
+ * of 0; vm_page_alloc() will effectively add gd_cpuid, so these
+ * allocations should wind up being uncontended.  However, we still want
+ * to rove across PQ_L2_SIZE.
+ */
+void
+vm_page_pcpu_cache(void)
+{
+#if 0
+       globaldata_t gd = mycpu;
+       vm_page_t m;
+
+       if (gd->gd_vmpg_count < GD_MINVMPG) {
+               crit_enter_gd(gd);
+               while (gd->gd_vmpg_count < GD_MAXVMPG) {
+                       m = vm_page_alloc(NULL, ticks & ~ncpus2_mask,
+                                         VM_ALLOC_NULL_OK | VM_ALLOC_NORMAL |
+                                         VM_ALLOC_ZERO);
+                       if (gd->gd_vmpg_count < GD_MAXVMPG) {
+                               if ((m->flags & PG_ZERO) == 0) {
+                                       pmap_zero_page(VM_PAGE_TO_PHYS(m));
+                                       vm_page_flag_set(m, PG_ZERO);
+                               }
+                               gd->gd_vmpg_array[gd->gd_vmpg_count++] = m;
+                       } else {
+                               vm_page_free(m);
+                       }
+               }
+               crit_exit_gd(gd);
+       }
+#endif
+}
+
 /*
  * vm_page_alloc()
  *
@@ -1294,6 +1333,8 @@ vm_page_select_free(u_short pg_color, boolean_t prefer_zero)
  *     VM_ALLOC_FORCE_ZERO     advisory request for pre-zero'd page only
  *     VM_ALLOC_NULL_OK        ok to return NULL on insertion collision
  *                             (see vm_page_grab())
+ *     VM_ALLOC_USE_GD         ok to use per-gd cache
+ *
  * The object must be held if not NULL
  * This routine may not block
  *
@@ -1304,18 +1345,38 @@ vm_page_select_free(u_short pg_color, boolean_t prefer_zero)
 vm_page_t
 vm_page_alloc(vm_object_t object, vm_pindex_t pindex, int page_req)
 {
-       vm_page_t m = NULL;
+#ifdef SMP
+       globaldata_t gd = mycpu;
+#endif
+       vm_page_t m;
        u_short pg_color;
 
+#if 0
+       /*
+        * Special per-cpu free VM page cache.  The pages are pre-busied
+        * and pre-zerod for us.
+        * and pre-zeroed for us.
+       if (gd->gd_vmpg_count && (page_req & VM_ALLOC_USE_GD)) {
+               crit_enter_gd(gd);
+               if (gd->gd_vmpg_count) {
+                       m = gd->gd_vmpg_array[--gd->gd_vmpg_count];
+                       crit_exit_gd(gd);
+                       goto done;
+                }
+               crit_exit_gd(gd);
+        }
+#endif
+       m = NULL;
+
 #ifdef SMP
        /*
         * Cpu twist - cpu localization algorithm
         */
        if (object) {
-               pg_color = mycpu->gd_cpuid + (pindex & ~ncpus_fit_mask) +
+               pg_color = gd->gd_cpuid + (pindex & ~ncpus_fit_mask) +
                           (object->pg_color & ~ncpus_fit_mask);
        } else {
-               pg_color = mycpu->gd_cpuid + (pindex & ~ncpus_fit_mask);
+               pg_color = gd->gd_cpuid + (pindex & ~ncpus_fit_mask);
        }
 #else
        /*
@@ -1413,6 +1474,9 @@ loop:
                ("vm_page_alloc: free/cache page %p was dirty", m));
        KKASSERT(m->queue == PQ_NONE);
 
+#if 0
+done:
+#endif
        /*
         * Initialize the structure, inheriting some flags but clearing
         * all the rest.  The page has already been busied for us.
index ebafa88..41d6c1f 100644 (file)
@@ -412,14 +412,15 @@ vm_page_flash(vm_page_t m)
  * must be specified.  VM_ALLOC_RETRY may only be specified if VM_ALLOC_NORMAL
  * is also specified.
  */
-#define VM_ALLOC_NORMAL                0x01    /* ok to use cache pages */
-#define VM_ALLOC_SYSTEM                0x02    /* ok to exhaust most of free list */
-#define VM_ALLOC_INTERRUPT     0x04    /* ok to exhaust entire free list */
-#define        VM_ALLOC_ZERO           0x08    /* req pre-zero'd memory if avail */
-#define        VM_ALLOC_QUICK          0x10    /* like NORMAL but do not use cache */
-#define VM_ALLOC_FORCE_ZERO    0x20    /* zero page even if already valid */
-#define VM_ALLOC_NULL_OK       0x40    /* ok to return NULL on collision */
-#define        VM_ALLOC_RETRY          0x80    /* indefinite block (vm_page_grab()) */
+#define VM_ALLOC_NORMAL                0x0001  /* ok to use cache pages */
+#define VM_ALLOC_SYSTEM                0x0002  /* ok to exhaust most of free list */
+#define VM_ALLOC_INTERRUPT     0x0004  /* ok to exhaust entire free list */
+#define        VM_ALLOC_ZERO           0x0008  /* req pre-zero'd memory if avail */
+#define        VM_ALLOC_QUICK          0x0010  /* like NORMAL but do not use cache */
+#define VM_ALLOC_FORCE_ZERO    0x0020  /* zero page even if already valid */
+#define VM_ALLOC_NULL_OK       0x0040  /* ok to return NULL on collision */
+#define        VM_ALLOC_RETRY          0x0080  /* indefinite block (vm_page_grab()) */
+#define VM_ALLOC_USE_GD                0x0100  /* use per-gd cache */
 
 void vm_page_queue_spin_lock(vm_page_t);
 void vm_page_queues_spin_lock(u_short);
@@ -435,6 +436,7 @@ void vm_page_wakeup(vm_page_t m);
 void vm_page_hold(vm_page_t);
 void vm_page_unhold(vm_page_t);
 void vm_page_activate (vm_page_t);
+void vm_page_pcpu_cache(void);
 vm_page_t vm_page_alloc (struct vm_object *, vm_pindex_t, int);
 vm_page_t vm_page_grab (struct vm_object *, vm_pindex_t, int);
 void vm_page_cache (vm_page_t);