kernel - Adjust emergency pager, add D_NOEMERGPGR
author     Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 8 Sep 2017 15:42:06 +0000 (08:42 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Fri, 8 Sep 2017 15:42:06 +0000 (08:42 -0700)
* Adjust emergency pager and pager thread tests a little.  Allow the
  emergency pager to also page to VCHR devices as long as D_NOEMERGPGR
  is not flagged.

* Add the D_NOEMERGPGR flag and apply to "vn" and "mfs" block devices.
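
As a point of reference, opting a driver out of the emergency pager is just a
matter of OR-ing the new bit into its dev_ops flag word, exactly as the vn(4)
and mfs hunks below do.  A minimal sketch, with an illustrative "xx" driver
name and entry points (D_NOEMERGPGR itself is defined in sys/sys/device.h by
this commit):

    /*
     * Sketch only: a pseudo-disk driver declaring itself too complex
     * for the emergency pager.  Entry-point names are illustrative.
     */
    static d_open_t     xxopen;
    static d_close_t    xxclose;
    static d_strategy_t xxstrategy;

    static struct dev_ops xx_ops = {
        { "xx", 0, D_DISK | D_CANFREE | D_NOEMERGPGR },
        .d_open     = xxopen,
        .d_close    = xxclose,
        .d_strategy = xxstrategy,
    };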

sys/dev/disk/vn/vn.c
sys/kern/subr_disk.c
sys/sys/device.h
sys/sys/proc.h
sys/vfs/mfs/mfs_vfsops.c
sys/vm/swap_pager.c
sys/vm/vm_page.c
sys/vm/vm_pageout.c

diff --git a/sys/dev/disk/vn/vn.c b/sys/dev/disk/vn/vn.c
index 0201800..f88d952 100644
--- a/sys/dev/disk/vn/vn.c
+++ b/sys/dev/disk/vn/vn.c
@@ -101,12 +101,13 @@ DEVFS_DEFINE_CLONE_BITMAP(vn);
 
 /*
  * dev_ops
- *     D_DISK          we want to look like a disk
+ *     D_DISK          We want to look like a disk
  *     D_CANFREE       We support BUF_CMD_FREEBLKS
+ *     D_NOEMERGPGR    Too complex for emergency pager
  */
 
 static struct dev_ops vn_ops = {
-       { "vn", 0, D_DISK | D_CANFREE },
+       { "vn", 0, D_DISK | D_CANFREE | D_NOEMERGPGR },
        .d_open =       vnopen,
        .d_close =      vnclose,
        .d_read =       physread,
diff --git a/sys/kern/subr_disk.c b/sys/kern/subr_disk.c
index 97399c5..68a473a 100644
--- a/sys/kern/subr_disk.c
+++ b/sys/kern/subr_disk.c
@@ -133,7 +133,7 @@ static LIST_HEAD(, disk) disklist = LIST_HEAD_INITIALIZER(&disklist);
 static struct lwkt_token disklist_token;
 static struct lwkt_token ds_token;
 
-static struct dev_ops disk_ops = {
+static struct dev_ops disk1_ops = {
        { "disk", 0, D_DISK | D_MPSAFE | D_TRACKCLOSE },
        .d_open = diskopen,
        .d_close = diskclose,
@@ -145,6 +145,18 @@ static struct dev_ops disk_ops = {
        .d_psize = diskpsize,
 };
 
+static struct dev_ops disk2_ops = {
+       { "disk", 0, D_DISK | D_MPSAFE | D_TRACKCLOSE | D_NOEMERGPGR },
+       .d_open = diskopen,
+       .d_close = diskclose,
+       .d_read = physread,
+       .d_write = physwrite,
+       .d_ioctl = diskioctl,
+       .d_strategy = diskstrategy,
+       .d_dump = diskdump,
+       .d_psize = diskpsize,
+};
+
 static struct objcache         *disk_msg_cache;
 
 struct objcache_malloc_args disk_msg_malloc_args = {
@@ -172,6 +184,7 @@ disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe)
        struct disk_info *info = &dp->d_info;
        struct diskslice *sp = &dp->d_slice->dss_slices[slice];
        disklabel_ops_t ops;
+       struct dev_ops *dops;
        struct partinfo part;
        const char *msg;
        char uuid_buf[128];
@@ -183,6 +196,8 @@ disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe)
                   dev->si_name, dp->d_cdev->si_name);
 
        sno = slice ? slice - 1 : 0;
+       dops = (dp->d_rawdev->si_ops->head.flags & D_NOEMERGPGR) ?
+               &disk2_ops : &disk1_ops;
 
        ops = &disklabel32_ops;
        msg = ops->op_readdisklabel(dev, sp, &sp->ds_label, info);
@@ -227,7 +242,8 @@ disk_probe_slice(struct disk *dp, cdev_t dev, int slice, int reprobe)
                                                udev_dict_set_cstr(ndev, "uuid", uuid_buf);
                                        }
                                } else {
-                                       ndev = make_dev_covering(&disk_ops, dp->d_rawdev->si_ops,
+                                       ndev = make_dev_covering(dops,
+                                               dp->d_rawdev->si_ops,
                                                dkmakeminor(dkunit(dp->d_cdev),
                                                            slice, i),
                                                UID_ROOT, GID_OPERATOR, 0640,
@@ -311,6 +327,7 @@ disk_probe(struct disk *dp, int reprobe)
        int error, i, sno;
        struct diskslices *osp;
        struct diskslice *sp;
+       struct dev_ops *dops;
        char uuid_buf[128];
 
        KKASSERT (info->d_media_blksize != 0);
@@ -325,6 +342,9 @@ disk_probe(struct disk *dp, int reprobe)
                return;
        }
 
+       dops = (dp->d_rawdev->si_ops->head.flags & D_NOEMERGPGR) ?
+               &disk2_ops : &disk1_ops;
+
        for (i = 0; i < dp->d_slice->dss_nslices; i++) {
                /*
                 * Ignore the whole-disk slice, it has already been created.
@@ -391,7 +411,7 @@ disk_probe(struct disk *dp, int reprobe)
                        /*
                         * Else create new device
                         */
-                       ndev = make_dev_covering(&disk_ops, dp->d_rawdev->si_ops,
+                       ndev = make_dev_covering(dops, dp->d_rawdev->si_ops,
                                        dkmakewholeslice(dkunit(dev), i),
                                        UID_ROOT, GID_OPERATOR, 0640,
                                        (info->d_dsflags & DSO_DEVICEMAPPER)?
@@ -641,6 +661,7 @@ _disk_create_named(const char *name, int unit, struct disk *dp,
                   struct dev_ops *raw_ops, int clone)
 {
        cdev_t rawdev;
+       struct dev_ops *dops;
 
        disk_debug(1, "disk_create (begin): %s%d\n", name, unit);
 
@@ -655,20 +676,22 @@ _disk_create_named(const char *name, int unit, struct disk *dp,
 
        bzero(dp, sizeof(*dp));
 
+       dops = (raw_ops->head.flags & D_NOEMERGPGR) ? &disk2_ops : &disk1_ops;
+
        dp->d_rawdev = rawdev;
        dp->d_raw_ops = raw_ops;
-       dp->d_dev_ops = &disk_ops;
+       dp->d_dev_ops = dops;
 
        if (name) {
                if (clone) {
                        dp->d_cdev = make_only_dev_covering(
-                                       &disk_ops, dp->d_rawdev->si_ops,
+                                       dops, dp->d_rawdev->si_ops,
                                        dkmakewholedisk(unit),
                                        UID_ROOT, GID_OPERATOR, 0640,
                                        "%s", name);
                } else {
                        dp->d_cdev = make_dev_covering(
-                                       &disk_ops, dp->d_rawdev->si_ops,
+                                       dops, dp->d_rawdev->si_ops,
                                        dkmakewholedisk(unit),
                                        UID_ROOT, GID_OPERATOR, 0640,
                                        "%s", name);
@@ -676,13 +699,13 @@ _disk_create_named(const char *name, int unit, struct disk *dp,
        } else {
                if (clone) {
                        dp->d_cdev = make_only_dev_covering(
-                                       &disk_ops, dp->d_rawdev->si_ops,
+                                       dops, dp->d_rawdev->si_ops,
                                        dkmakewholedisk(unit),
                                        UID_ROOT, GID_OPERATOR, 0640,
                                        "%s%d", raw_ops->head.name, unit);
                } else {
                        dp->d_cdev = make_dev_covering(
-                                       &disk_ops, dp->d_rawdev->si_ops,
+                                       dops, dp->d_rawdev->si_ops,
                                        dkmakewholedisk(unit),
                                        UID_ROOT, GID_OPERATOR, 0640,
                                        "%s%d", raw_ops->head.name, unit);
diff --git a/sys/sys/device.h b/sys/sys/device.h
index 72d91a5..346ee37 100644
--- a/sys/sys/device.h
+++ b/sys/sys/device.h
@@ -265,13 +265,16 @@ struct dev_ops {
 
 /*
  * Flags for d_flags.
+ *
+ * D_NOEMERGPGR                Indicates complex layering; the emergency pager
+ *                     should skip buffers related to such devices.
  */
 #define D_MEMDISK      0x00010000      /* memory type disk */
 #define D_NAGGED       0x00020000      /* nagged about missing make_dev() */
 #define D_CANFREE      0x00040000      /* can free blocks */
 #define D_TRACKCLOSE   0x00080000      /* track all closes */
 #define D_MASTER       0x00100000      /* used by pty/tty code */
-#define D_UNUSED200000 0x00200000
+#define D_NOEMERGPGR   0x00200000      /* too complex for emergency pager */
 #define D_MPSAFE       0x00400000      /* all dev_d*() calls are MPSAFE */
 
 /*
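
Consumers test the bit on the device's ops flag word.  A tiny hedged sketch
(the helper name is illustrative; the real checks are open-coded, e.g. in the
vm_pageout.c hunk below):

    /* Sketch: does this character device opt out of emergency paging? */
    static __inline int
    dev_no_emerg_paging(cdev_t dev)
    {
        return (dev != NULL && dev->si_ops != NULL &&
                (dev->si_ops->head.flags & D_NOEMERGPGR) != 0);
    }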
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 0dec494..7955dab 100644
--- a/sys/sys/proc.h
+++ b/sys/sys/proc.h
@@ -512,7 +512,9 @@ extern int nprocs, maxproc;         /* Current and max number of procs. */
 extern int maxprocperuid;              /* Max procs per uid. */
 
 extern struct proc *initproc;          /* Process slot for init */
-extern struct thread *pagethread, *updatethread;
+extern struct thread *pagethread;
+extern struct thread *emergpager;
+extern struct thread *updatethread;
 
 /*
  * Scheduler independant variables.  The primary scheduler polling frequency,
diff --git a/sys/vfs/mfs/mfs_vfsops.c b/sys/vfs/mfs/mfs_vfsops.c
index 1058c89..6a9048d 100644
--- a/sys/vfs/mfs/mfs_vfsops.c
+++ b/sys/vfs/mfs/mfs_vfsops.c
@@ -85,7 +85,7 @@ d_close_t     mfsclose;
 d_strategy_t   mfsstrategy;
 
 static struct dev_ops mfs_ops = {
-       { "MFS", -1, D_DISK },
+       { "MFS", -1, D_DISK | D_NOEMERGPGR },
        .d_open =       mfsopen,
        .d_close =      mfsclose,
        .d_read =       physread,
diff --git a/sys/vm/swap_pager.c b/sys/vm/swap_pager.c
index f8c9de0..afd73b2 100644
--- a/sys/vm/swap_pager.c
+++ b/sys/vm/swap_pager.c
@@ -1559,8 +1559,11 @@ swap_pager_putpages(vm_object_t object, vm_page_t *m, int count,
         * pageout daemon to prevent any single user process limited
         * via RLIMIT_RSS from hogging swap write bandwidth.
         */
-       if (curthread != pagethread && swap_user_async == 0)
+       if (curthread != pagethread &&
+           curthread != emergpager &&
+           swap_user_async == 0) {
                flags |= VM_PAGER_PUT_SYNC;
+       }
 
        /*
         * Step 2
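
The condition above now exempts both pager daemons from the forced-synchronous
path; only ordinary user threads get VM_PAGER_PUT_SYNC (unless swap_user_async
overrides it).  Expressed as a standalone predicate, with a hypothetical name:

    /*
     * Sketch: should this pageout be forced synchronous?  Mirrors the
     * swap_pager_putpages() test; pagethread and emergpager are the two
     * pager daemons, swap_user_async the existing flag tested here.
     */
    static __inline int
    swap_put_forced_sync(struct thread *td)
    {
        return (td != pagethread && td != emergpager &&
                swap_user_async == 0);
    }
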
diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c
index 595b931..08ce9dc 100644
--- a/sys/vm/vm_page.c
+++ b/sys/vm/vm_page.c
@@ -2171,7 +2171,8 @@ vm_wait(int timo)
                timo = hz;
        lwkt_gettoken(&vm_token);
 
-       if (curthread == pagethread) {
+       if (curthread == pagethread ||
+           curthread == emergpager) {
                /*
                 * The pageout daemon itself needs pages, this is bad.
                 */
diff --git a/sys/vm/vm_pageout.c b/sys/vm/vm_pageout.c
index 67b8575..caa4701 100644
--- a/sys/vm/vm_pageout.c
+++ b/sys/vm/vm_pageout.c
@@ -78,6 +78,7 @@
 #include <sys/signalvar.h>
 #include <sys/vnode.h>
 #include <sys/vmmeter.h>
+#include <sys/conf.h>
 #include <sys/sysctl.h>
 
 #include <vm/vm.h>
@@ -106,7 +107,7 @@ static int vm_pageout_page(vm_page_t m, int *max_launderp,
 static int vm_pageout_clean_helper (vm_page_t, int);
 static int vm_pageout_free_page_calc (vm_size_t count);
 static void vm_pageout_page_free(vm_page_t m) ;
-static struct thread *emergpager;
+struct thread *emergpager;
 struct thread *pagethread;
 static int sequence_emerg_pager;
 
@@ -835,10 +836,42 @@ vm_pageout_scan_inactive(int pass, int q, int avail_shortage,
                 * The emergency pager runs when the primary pager gets
                 * stuck, which typically means the primary pager deadlocked
                 * on a vnode-backed page.  Therefore, the emergency pager
-                * must skip vnode-backed pages.
+                * must skip any complex objects.
+                *
+                * We disallow VNODEs unless they are VCHR whose device ops
+                * do not flag D_NOEMERGPGR.
                 */
-               if (isep) {
-                       if (m->object && m->object->type == OBJT_VNODE) {
+               if (isep && m->object) {
+                       struct vnode *vp;
+
+                       switch(m->object->type) {
+                       case OBJT_DEFAULT:
+                       case OBJT_SWAP:
+                               /*
+                                * Allow anonymous memory and assume that
+                                * swap devices are not complex, since it's
+                                * kinda worthless if we can't swap out dirty
+                                * anonymous pages.
+                                */
+                               break;
+                       case OBJT_VNODE:
+                               /*
+                                * Allow VCHR device if the D_NOEMERGPGR
+                                * flag is not set, deny other vnode types
+                                * as being too complex.
+                                */
+                               vp = m->object->handle;
+                               if (vp && vp->v_type == VCHR &&
+                                   vp->v_rdev && vp->v_rdev->si_ops &&
+                                   (vp->v_rdev->si_ops->head.flags &
+                                    D_NOEMERGPGR) == 0) {
+                                       break;
+                               }
+                               /* Deny - fall through */
+                       default:
+                               /*
+                                * Deny
+                                */
                                vm_page_wakeup(m);
                                vm_page_queues_spin_lock(PQ_INACTIVE + q);
                                lwkt_yield();
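
The switch added above boils down to a per-page eligibility test for the
emergency pager: anonymous and swap-backed objects are always fair game,
vnode-backed pages only when they sit on a VCHR device whose ops do not set
D_NOEMERGPGR, and everything else is skipped.  A hedged sketch of that test as
a standalone predicate (the function name is hypothetical; the committed code
open-codes it inside vm_pageout_scan_inactive()):

    /*
     * Sketch: may the emergency pager touch this page?  Returns non-zero
     * if the backing object is simple enough.
     */
    static int
    emergpager_page_ok(vm_page_t m)
    {
        struct vnode *vp;

        if (m->object == NULL)          /* no object: not denied here */
            return (1);
        switch (m->object->type) {
        case OBJT_DEFAULT:
        case OBJT_SWAP:
            return (1);                 /* anonymous / swap-backed */
        case OBJT_VNODE:
            vp = m->object->handle;
            return (vp != NULL && vp->v_type == VCHR &&
                    vp->v_rdev != NULL && vp->v_rdev->si_ops != NULL &&
                    (vp->v_rdev->si_ops->head.flags & D_NOEMERGPGR) == 0);
        default:
            return (0);                 /* too complex */
        }
    }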