kernel - Remove SMP bottlenecks on uidinfo, descriptors, and lockf
author     Matthew Dillon <dillon@apollo.backplane.com>
           Sun, 22 Apr 2018 00:30:42 +0000 (17:30 -0700)
committer  Matthew Dillon <dillon@apollo.backplane.com>
           Sun, 22 Apr 2018 00:50:11 +0000 (17:50 -0700)
* Use an eventcounter and the per-thread fd cache to fix
  bottlenecks in checkfdclosed().  This will work well for
  the vast majority of applications and test benches.

* Batch holdfp*() operations on kqueue collections when implementing
  poll() and select().  This significantly improves performance.
  Full scaling has not yet been achieved, however.

* Increase copyin item batching from 8 to 32 for select() and poll().

* Give the uidinfo structure a pcpu array to hold the posixlocks
  and openfiles count fields, with a rollup contained in the uidinfo
  structure itself.

  This removes numerous global bottlenecks related to open(),
  close(), dup*(), and lockf operations (posixlocks count).

  ui_openfiles forces a rollup when the limit appears to have been
  reached, to verify that the limit was actually hit.  ui_posixlocks
  stays fairly loose.  Each cpu generally rolls up only when its pcpu
  count exceeds +32 or drops below -32 (see the sketch after this
  list).

* Give the proc structure a pcpu array for the same counts, in order
  to properly support seteuid() and such.

* Replace P_ADVLOCK with a char field proc->p_advlock_flag, and
  remove token operations around the field.
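
  Illustrative sketch of the pcpu rollup pattern referenced above
  (mirrors lf_count_change() and fsetcred() in the diff below; not
  part of the change itself):

        struct uidcount *pup = &uip->ui_pcpu[mycpuid];

        pup->pu_posixlocks += diff;             /* cpu-local, no atomics */
        if (pup->pu_posixlocks < -PUP_LIMIT ||
            pup->pu_posixlocks > PUP_LIMIT) {
                /* fold the local count into the shared rollup field */
                atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
                pup->pu_posixlocks = 0;
        }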

13 files changed:
sys/kern/kern_descrip.c
sys/kern/kern_event.c
sys/kern/kern_exit.c
sys/kern/kern_fork.c
sys/kern/kern_lockf.c
sys/kern/kern_resource.c
sys/kern/sys_generic.c
sys/sys/event.h
sys/sys/eventvar.h
sys/sys/file.h
sys/sys/filedesc.h
sys/sys/proc.h
sys/sys/resourcevar.h

diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 26a1ef7..c69696f 100644
@@ -300,6 +300,26 @@ holdfp_fdp(struct filedesc *fdp, int fd, int flag)
        return fp;
 }
 
+struct file *
+holdfp_fdp_locked(struct filedesc *fdp, int fd, int flag)
+{
+       struct file *fp;
+
+       if (((u_int)fd) < fdp->fd_nfiles) {
+               fp = fdp->fd_files[fd].fp;      /* can be NULL */
+               if (fp) {
+                       if ((fp->f_flag & flag) == 0 && flag != -1) {
+                               fp = NULL;
+                       } else {
+                               fhold(fp);
+                       }
+               }
+       } else {
+               fp = NULL;
+       }
+       return fp;
+}
+
 /*
  * Acquire the fp for the specified file descriptor, using the thread
  * cache if possible and caching it if possible.
@@ -656,6 +676,7 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred)
        u_int newmin;
        u_int oflags;
        u_int nflags;
+       int closedcounter;
        int tmp, error, flg = F_POSIX;
 
        KKASSERT(p);
@@ -702,6 +723,7 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred)
        /*
         * Operations on file pointers
         */
+       closedcounter = p->p_fd->fd_closedcounter;
        if ((fp = holdfp(td, fd, -1)) == NULL)
                return (EBADF);
 
@@ -771,30 +793,24 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred)
                                error = EBADF;
                                break;
                        }
-                       if ((p->p_leader->p_flags & P_ADVLOCK) == 0) {
-                               lwkt_gettoken(&p->p_leader->p_token);
-                               p->p_leader->p_flags |= P_ADVLOCK;
-                               lwkt_reltoken(&p->p_leader->p_token);
-                       }
+                       if (p->p_leader->p_advlock_flag == 0)
+                               p->p_leader->p_advlock_flag = 1;
                        error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
-                           &dat->fc_flock, flg);
+                                           &dat->fc_flock, flg);
                        break;
                case F_WRLCK:
                        if ((fp->f_flag & FWRITE) == 0) {
                                error = EBADF;
                                break;
                        }
-                       if ((p->p_leader->p_flags & P_ADVLOCK) == 0) {
-                               lwkt_gettoken(&p->p_leader->p_token);
-                               p->p_leader->p_flags |= P_ADVLOCK;
-                               lwkt_reltoken(&p->p_leader->p_token);
-                       }
+                       if (p->p_leader->p_advlock_flag == 0)
+                               p->p_leader->p_advlock_flag = 1;
                        error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
-                           &dat->fc_flock, flg);
+                                           &dat->fc_flock, flg);
                        break;
                case F_UNLCK:
                        error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
-                               &dat->fc_flock, F_POSIX);
+                                           &dat->fc_flock, F_POSIX);
                        break;
                default:
                        error = EINVAL;
@@ -806,13 +822,13 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred)
                 * we were blocked getting the lock.  If this occurs the
                 * close might not have caught the lock.
                 */
-               if (checkfdclosed(p->p_fd, fd, fp)) {
+               if (checkfdclosed(td, p->p_fd, fd, fp, closedcounter)) {
                        dat->fc_flock.l_whence = SEEK_SET;
                        dat->fc_flock.l_start = 0;
                        dat->fc_flock.l_len = 0;
                        dat->fc_flock.l_type = F_UNLCK;
-                       (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
-                                          F_UNLCK, &dat->fc_flock, F_POSIX);
+                       VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+                                   F_UNLCK, &dat->fc_flock, F_POSIX);
                }
                break;
 
@@ -834,7 +850,7 @@ kern_fcntl(int fd, int cmd, union fcntl_dat *dat, struct ucred *cred)
                if (dat->fc_flock.l_whence == SEEK_CUR)
                        dat->fc_flock.l_start += fp->f_offset;
                error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_GETLK,
-                           &dat->fc_flock, F_POSIX);
+                                   &dat->fc_flock, F_POSIX);
                break;
        default:
                error = EINVAL;
@@ -1041,7 +1057,9 @@ retry:
                 * old descriptor.  delfp inherits the ref from the 
                 * descriptor table.
                 */
+               ++fdp->fd_closedcounter;
                fclearcache(&fdp->fd_files[new], NULL, 0);
+               ++fdp->fd_closedcounter;
                delfp = fdp->fd_files[new].fp;
                fdp->fd_files[new].fp = NULL;
                fdp->fd_files[new].reserved = 1;
@@ -1622,17 +1640,32 @@ fdalloc(struct proc *p, int want, int *result)
         * Check that the user has not run out of descriptors (non-root only).
         * As a safety measure the dtable is allowed to have at least
         * minfilesperproc open fds regardless of the maxfilesperuser limit.
+        *
+        * This isn't as loose a spec as ui_posixlocks, so we use atomic
+        * ops to force synchronize and recheck if we would otherwise
+        * error.
         */
        if (p->p_ucred->cr_uid && fdp->fd_nfiles >= minfilesperproc) {
                uip = p->p_ucred->cr_uidinfo;
                if (uip->ui_openfiles > maxfilesperuser) {
-                       krateprintf(&krate_uidinfo,
-                                   "Warning: user %d pid %d (%s) ran out of "
-                                   "file descriptors (%d/%d)\n",
-                                   p->p_ucred->cr_uid, (int)p->p_pid,
-                                   p->p_comm,
-                                   uip->ui_openfiles, maxfilesperuser);
-                       return(ENFILE);
+                       int n;
+                       int count;
+
+                       for (n = 0; n < ncpus; ++n) {
+                               count = atomic_swap_int(
+                                           &uip->ui_pcpu[n].pu_openfiles, 0);
+                               atomic_add_int(&uip->ui_openfiles, count);
+                       }
+                       if (uip->ui_openfiles > maxfilesperuser) {
+                               krateprintf(&krate_uidinfo,
+                                           "Warning: user %d pid %d (%s) "
+                                           "ran out of file descriptors "
+                                           "(%d/%d)\n",
+                                           p->p_ucred->cr_uid, (int)p->p_pid,
+                                           p->p_comm,
+                                           uip->ui_openfiles, maxfilesperuser);
+                               return(ENFILE);
+                       }
                }
        }
 
@@ -1900,7 +1933,9 @@ fdrevoke_proc_callback(struct proc *p, void *vinfo)
                if ((fp = fdp->fd_files[n].fp) == NULL)
                        continue;
                if (fp->f_flag & FREVOKED) {
+                       ++fdp->fd_closedcounter;
                        fclearcache(&fdp->fd_files[n], NULL, 0);
+                       ++fdp->fd_closedcounter;
                        fhold(info->nfp);
                        fdp->fd_files[n].fp = info->nfp;
                        spin_unlock(&fdp->fd_spin);
@@ -1992,10 +2027,24 @@ done:
  * and a close is not currently in progress.
  */
 int
-checkfdclosed(struct filedesc *fdp, int fd, struct file *fp)
+checkfdclosed(thread_t td, struct filedesc *fdp, int fd, struct file *fp,
+             int closedcounter)
 {
+       struct fdcache *fdc;
        int error;
 
+       cpu_lfence();
+       if (fdp->fd_closedcounter == closedcounter)
+               return 0;
+
+       if (td->td_proc && td->td_proc->p_fd == fdp) {
+               for (fdc = &td->td_fdcache[0];
+                    fdc < &td->td_fdcache[NFDCACHE]; ++fdc) {
+                       if (fdc->fd == fd && fdc->fp == fp)
+                               return 0;
+               }
+       }
+
        spin_lock_shared(&fdp->fd_spin);
        if ((unsigned)fd >= fdp->fd_nfiles || fp != fdp->fd_files[fd].fp)
                error = EBADF;
@@ -2054,9 +2103,11 @@ funsetfd_locked(struct filedesc *fdp, int fd)
                return (NULL);
        if ((fp = fdp->fd_files[fd].fp) == NULL)
                return (NULL);
+       ++fdp->fd_closedcounter;
        fclearcache(&fdp->fd_files[fd], NULL, 0);
        fdp->fd_files[fd].fp = NULL;
        fdp->fd_files[fd].fileflags = 0;
+       ++fdp->fd_closedcounter;
 
        fdreserve_locked(fdp, fd, -1);
        fdfixup_locked(fdp, fd);
@@ -2135,16 +2186,31 @@ fsetcred(struct file *fp, struct ucred *ncr)
 {
        struct ucred *ocr;
        struct uidinfo *uip;
+       struct uidcount *pup;
+       int cpu = mycpuid;
+       int count;
 
        ocr = fp->f_cred;
        if (ocr == NULL || ncr == NULL || ocr->cr_uidinfo != ncr->cr_uidinfo) {
                if (ocr) {
                        uip = ocr->cr_uidinfo;
-                       atomic_add_int(&uip->ui_openfiles, -1);
+                       pup = &uip->ui_pcpu[cpu];
+                       atomic_add_int(&pup->pu_openfiles, -1);
+                       if (pup->pu_openfiles < -PUP_LIMIT ||
+                           pup->pu_openfiles > PUP_LIMIT) {
+                               count = atomic_swap_int(&pup->pu_openfiles, 0);
+                               atomic_add_int(&uip->ui_openfiles, count);
+                       }
                }
                if (ncr) {
                        uip = ncr->cr_uidinfo;
-                       atomic_add_int(&uip->ui_openfiles, 1);
+                       pup = &uip->ui_pcpu[cpu];
+                       atomic_add_int(&pup->pu_openfiles, 1);
+                       if (pup->pu_openfiles < -PUP_LIMIT ||
+                           pup->pu_openfiles > PUP_LIMIT) {
+                               count = atomic_swap_int(&pup->pu_openfiles, 0);
+                               atomic_add_int(&uip->ui_openfiles, count);
+                       }
                }
        }
        if (ncr)
@@ -2414,8 +2480,7 @@ fdfree(struct proc *p, struct filedesc *repl)
                KASSERT(fdtol->fdl_refcount > 0,
                        ("filedesc_to_refcount botch: fdl_refcount=%d",
                         fdtol->fdl_refcount));
-               if (fdtol->fdl_refcount == 1 &&
-                   (p->p_leader->p_flags & P_ADVLOCK) != 0) {
+               if (fdtol->fdl_refcount == 1 && p->p_leader->p_advlock_flag) {
                        for (i = 0; i <= fdp->fd_lastfile; ++i) {
                                fdnode = &fdp->fd_files[i];
                                if (fdnode->fp == NULL ||
@@ -2431,11 +2496,8 @@ fdfree(struct proc *p, struct filedesc *repl)
                                lf.l_len = 0;
                                lf.l_type = F_UNLCK;
                                vp = (struct vnode *)fp->f_data;
-                               (void) VOP_ADVLOCK(vp,
-                                                  (caddr_t)p->p_leader,
-                                                  F_UNLCK,
-                                                  &lf,
-                                                  F_POSIX);
+                               VOP_ADVLOCK(vp, (caddr_t)p->p_leader,
+                                           F_UNLCK, &lf, F_POSIX);
                                fdrop(fp);
                                spin_lock(&fdp->fd_spin);
                        }
@@ -2443,7 +2505,7 @@ fdfree(struct proc *p, struct filedesc *repl)
        retry:
                if (fdtol->fdl_refcount == 1) {
                        if (fdp->fd_holdleaderscount > 0 &&
-                           (p->p_leader->p_flags & P_ADVLOCK) != 0) {
+                           p->p_leader->p_advlock_flag) {
                                /*
                                 * close() or do_dup() has cleared a reference
                                 * in a shared file descriptor table.
@@ -2808,18 +2870,19 @@ closef(struct file *fp, struct proc *p)
        if (p != NULL && fp->f_type == DTYPE_VNODE &&
            (((struct vnode *)fp->f_data)->v_flag & VMAYHAVELOCKS)
        ) {
-               if ((p->p_leader->p_flags & P_ADVLOCK) != 0) {
+               if (p->p_leader->p_advlock_flag) {
                        lf.l_whence = SEEK_SET;
                        lf.l_start = 0;
                        lf.l_len = 0;
                        lf.l_type = F_UNLCK;
                        vp = (struct vnode *)fp->f_data;
-                       (void) VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
-                                          &lf, F_POSIX);
+                       VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
+                                   &lf, F_POSIX);
                }
                fdtol = p->p_fdtol;
                if (fdtol != NULL) {
                        lwkt_gettoken(&p->p_token);
+
                        /*
                         * Handle special case where file descriptor table
                         * is shared between multiple process leaders.
@@ -2827,8 +2890,7 @@ closef(struct file *fp, struct proc *p)
                        for (fdtol = fdtol->fdl_next;
                             fdtol != p->p_fdtol;
                             fdtol = fdtol->fdl_next) {
-                               if ((fdtol->fdl_leader->p_flags &
-                                    P_ADVLOCK) == 0)
+                               if (fdtol->fdl_leader->p_advlock_flag == 0)
                                        continue;
                                fdtol->fdl_holdcount++;
                                lf.l_whence = SEEK_SET;
@@ -2836,9 +2898,8 @@ closef(struct file *fp, struct proc *p)
                                lf.l_len = 0;
                                lf.l_type = F_UNLCK;
                                vp = (struct vnode *)fp->f_data;
-                               (void) VOP_ADVLOCK(vp,
-                                                  (caddr_t)fdtol->fdl_leader,
-                                                  F_UNLCK, &lf, F_POSIX);
+                               VOP_ADVLOCK(vp, (caddr_t)fdtol->fdl_leader,
+                                           F_UNLCK, &lf, F_POSIX);
                                fdtol->fdl_holdcount--;
                                if (fdtol->fdl_holdcount == 0 &&
                                    fdtol->fdl_wakeup != 0) {
@@ -2933,7 +2994,7 @@ fdrop(struct file *fp)
                lf.l_len = 0;
                lf.l_type = F_UNLCK;
                vp = (struct vnode *)fp->f_data;
-               (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0);
+               VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, 0);
        }
        if (fp->f_ops != &badfileops)
                error = fo_close(fp);
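
Illustrative calling pattern for the new closedcounter argument, as used
by kern_fcntl() above (condensed sketch, not part of the diff).  If the
counter is unchanged, checkfdclosed() now returns without taking fd_spin
at all:

        closedcounter = p->p_fd->fd_closedcounter;  /* before holdfp() */
        fp = holdfp(td, fd, -1);
        error = VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_SETLK,
                            &dat->fc_flock, flg);   /* may block */
        if (checkfdclosed(td, p->p_fd, fd, fp, closedcounter)) {
                /*
                 * The fd was closed or replaced while we were blocked,
                 * so close() may not have caught the lock; undo it.
                 */
                VOP_ADVLOCK(vp, (caddr_t)p->p_leader, F_UNLCK,
                            &dat->fc_flock, F_POSIX);
        }
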
diff --git a/sys/kern/kern_event.c b/sys/kern/kern_event.c
index a8752b9..1f02786 100644
 #include <sys/signalvar.h>
 #include <sys/filio.h>
 #include <sys/ktr.h>
+#include <sys/spinlock.h>
 
 #include <sys/thread2.h>
 #include <sys/file2.h>
 #include <sys/mplock2.h>
+#include <sys/spinlock2.h>
 
 #define EVENT_REGISTER 1
 #define EVENT_PROCESS  2
@@ -72,7 +74,7 @@ struct knote_cache_list {
 } __cachealign;
 
 static int     kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
-                   struct knote *marker);
+                   struct knote *marker, int closedcounter);
 static int     kqueue_read(struct file *fp, struct uio *uio,
                    struct ucred *cred, int flags);
 static int     kqueue_write(struct file *fp, struct uio *uio,
@@ -395,6 +397,7 @@ filt_proc(struct knote *kn, long hint)
        if ((event == NOTE_FORK) && (kn->kn_sfflags & NOTE_TRACK)) {
                struct kevent kev;
                int error;
+               int n;
 
                /*
                 * register knote with new process.
@@ -405,7 +408,8 @@ filt_proc(struct knote *kn, long hint)
                kev.fflags = kn->kn_sfflags;
                kev.data = kn->kn_id;                   /* parent */
                kev.udata = kn->kn_kevent.udata;        /* preserve udata */
-               error = kqueue_register(kn->kn_kq, &kev);
+               n = 1;
+               error = kqueue_register(kn->kn_kq, &kev, &n);
                if (error)
                        kn->kn_fflags |= NOTE_TRACKERR;
        }
@@ -797,8 +801,10 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
        struct kevent *kevp;
        struct timespec *tsp, ats;
        int i, n, total, error, nerrors = 0;
+       int gobbled;
        int lres;
        int limit = kq_checkloop;
+       int closedcounter;
        struct kevent kev[KQ_NEVENTS];
        struct knote marker;
        struct lwkt_token *tok;
@@ -809,6 +815,8 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
        tsp = tsp_in;
        *res = 0;
 
+       closedcounter = kq->kq_fdp->fd_closedcounter;
+
        for (;;) {
                n = 0;
                error = kevent_copyinfn(uap, kev, KQ_NEVENTS, &n);
@@ -816,10 +824,13 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
                        return error;
                if (n == 0)
                        break;
-               for (i = 0; i < n; i++) {
+               for (i = 0; i < n; ++i)
+                       kev[i].flags &= ~EV_SYSFLAGS;
+               for (i = 0; i < n; ++i) {
+                       gobbled = n - i;
+                       error = kqueue_register(kq, &kev[i], &gobbled);
+                       i += gobbled - 1;
                        kevp = &kev[i];
-                       kevp->flags &= ~EV_SYSFLAGS;
-                       error = kqueue_register(kq, kevp);
 
                        /*
                         * If a registration returns an error we
@@ -970,7 +981,7 @@ kern_kevent(struct kqueue *kq, int nevents, int *res, void *uap,
                 * Process all received events
                 * Account for all non-spurious events in our total
                 */
-               i = kqueue_scan(kq, kev, n, &marker);
+               i = kqueue_scan(kq, kev, n, &marker, closedcounter);
                if (i) {
                        lres = *res;
                        error = kevent_copyoutfn(uap, kev, i, res);
@@ -1063,21 +1074,105 @@ sys_kevent(struct kevent_args *uap)
        return (error);
 }
 
+/*
+ * Efficiently load multiple file pointers.  This significantly reduces
+ * threaded overhead.  When doing simple polling we can depend on the
+ * per-thread (fd,fp) cache.  With more descriptors, we batch.
+ */
+static
+void
+floadkevfps(thread_t td, struct filedesc *fdp, struct kevent *kev,
+           struct file **fp, int climit)
+{
+       struct filterops *fops;
+       int tdcache;
+
+       if (climit <= 2 && td->td_proc && td->td_proc->p_fd == fdp) {
+               tdcache = 1;
+       } else {
+               tdcache = 0;
+               spin_lock_shared(&fdp->fd_spin);
+       }
+
+       while (climit) {
+               *fp = NULL;
+               if (kev->filter < 0 &&
+                   kev->filter + EVFILT_SYSCOUNT >= 0) {
+                       fops = sysfilt_ops[~kev->filter];
+                       if (fops->f_flags & FILTEROP_ISFD) {
+                               if (tdcache) {
+                                       *fp = holdfp(td, kev->ident, -1);
+                               } else {
+                                       *fp = holdfp_fdp_locked(fdp,
+                                                               kev->ident, -1);
+                               }
+                       }
+               }
+               --climit;
+               ++fp;
+               ++kev;
+       }
+       if (tdcache == 0)
+               spin_unlock_shared(&fdp->fd_spin);
+}
+
+/*
+ * Register up to *countp kev's.  Always registers at least 1.
+ *
+ * The number registered is returned in *countp.
+ *
+ * If an error occurs or a kev is flagged EV_RECEIPT, it is
+ * processed and included in *countp, and processing then
+ * stops.
+ */
 int
-kqueue_register(struct kqueue *kq, struct kevent *kev)
+kqueue_register(struct kqueue *kq, struct kevent *kev, int *countp)
 {
        struct filedesc *fdp = kq->kq_fdp;
        struct klist *list = NULL;
        struct filterops *fops;
-       struct file *fp = NULL;
+       struct file *fp[KQ_NEVENTS];
        struct knote *kn = NULL;
        struct thread *td;
-       int error = 0;
+       int error;
+       int count;
+       int climit;
+       int closedcounter;
        struct knote_cache_list *cache_list;
 
+       td = curthread;
+       climit = *countp;
+       if (climit > KQ_NEVENTS)
+               climit = KQ_NEVENTS;
+       closedcounter = fdp->fd_closedcounter;
+       floadkevfps(td, fdp, kev, fp, climit);
+
+       lwkt_getpooltoken(kq);
+       count = 0;
+
+       /*
+        * To avoid races, only one thread can register events on this
+        * kqueue at a time.
+        */
+       while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) {
+               kq->kq_state |= KQ_REGWAIT;
+               tsleep(&kq->kq_regtd, 0, "kqreg", 0);
+       }
+       if (__predict_false(kq->kq_regtd != NULL)) {
+               /* Recursive calling of kqueue_register() */
+               td = NULL;
+       } else {
+               /* Owner of the kq_regtd, i.e. td != NULL */
+               kq->kq_regtd = td;
+       }
+
+loop:
        if (kev->filter < 0) {
-               if (kev->filter + EVFILT_SYSCOUNT < 0)
-                       return (EINVAL);
+               if (kev->filter + EVFILT_SYSCOUNT < 0) {
+                       error = EINVAL;
+                       ++count;
+                       goto done;
+               }
                fops = sysfilt_ops[~kev->filter];       /* to 0-base index */
        } else {
                /*
@@ -1085,14 +1180,18 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
                 * filter attach routine is responsible for insuring that
                 * the identifier can be attached to it.
                 */
-               return (EINVAL);
+               error = EINVAL;
+               ++count;
+               goto done;
        }
 
        if (fops->f_flags & FILTEROP_ISFD) {
                /* validate descriptor */
-               fp = holdfp_fdp(fdp, kev->ident, -1);
-               if (fp == NULL)
-                       return (EBADF);
+               if (fp[count] == NULL) {
+                       error = EBADF;
+                       ++count;
+                       goto done;
+               }
        }
 
        cache_list = &knote_cache_lists[mycpuid];
@@ -1106,32 +1205,11 @@ kqueue_register(struct kqueue *kq, struct kevent *kev)
                crit_exit();
        }
 
-       td = curthread;
-       lwkt_getpooltoken(kq);
-
-       /*
-        * Make sure that only one thread can register event on this kqueue,
-        * so that we would not suffer any race, even if the registration
-        * blocked, i.e. kq token was released, and the kqueue was shared
-        * between threads (this should be rare though).
-        */
-       while (__predict_false(kq->kq_regtd != NULL && kq->kq_regtd != td)) {
-               kq->kq_state |= KQ_REGWAIT;
-               tsleep(&kq->kq_regtd, 0, "kqreg", 0);
-       }
-       if (__predict_false(kq->kq_regtd != NULL)) {
-               /* Recursive calling of kqueue_register() */
-               td = NULL;
-       } else {
-               /* Owner of the kq_regtd, i.e. td != NULL */
-               kq->kq_regtd = td;
-       }
-
-       if (fp != NULL) {
-               list = &fp->f_klist;
+       if (fp[count] != NULL) {
+               list = &fp[count]->f_klist;
        } else if (kq->kq_knhashmask) {
                list = &kq->kq_knhash[
-                   KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
+                           KN_HASH((u_long)kev->ident, kq->kq_knhashmask)];
        }
        if (list != NULL) {
                lwkt_getpooltoken(list);
@@ -1154,6 +1232,7 @@ again:
         */
        if (kn == NULL && ((kev->flags & EV_ADD) == 0)) {
                error = ENOENT;
+               ++count;
                goto done;
        }
 
@@ -1173,7 +1252,7 @@ again:
                                cache_list->knote_cache_cnt--;
                                crit_exit();
                        }
-                       kn->kn_fp = fp;
+                       kn->kn_fp = fp[count];
                        kn->kn_kq = kq;
                        kn->kn_fop = fops;
 
@@ -1181,7 +1260,7 @@ again:
                         * apply reference count to knote structure, and
                         * do not release it at the end of this routine.
                         */
-                       fp = NULL;
+                       fp[count] = NULL;       /* safety */
 
                        kn->kn_sfflags = kev->fflags;
                        kn->kn_sdata = kev->data;
@@ -1199,6 +1278,7 @@ again:
                        if ((error = filter_attach(kn)) != 0) {
                                kn->kn_status |= KN_DELETING | KN_REPROCESS;
                                knote_drop(kn);
+                               ++count;
                                goto done;
                        }
 
@@ -1209,7 +1289,8 @@ again:
                         * want to end up with a knote on a closed descriptor.
                         */
                        if ((fops->f_flags & FILTEROP_ISFD) &&
-                           checkfdclosed(fdp, kev->ident, kn->kn_fp)) {
+                           checkfdclosed(curthread, fdp, kev->ident, kn->kn_fp,
+                                         closedcounter)) {
                                kn->kn_status |= KN_DELETING | KN_REPROCESS;
                        }
                } else {
@@ -1244,6 +1325,8 @@ again:
                 * Delete the existing knote
                 */
                knote_detach_and_drop(kn);
+               error = 0;
+               ++count;
                goto done;
        } else {
                /*
@@ -1300,6 +1383,22 @@ again:
        knote_release(kn);
        /* kn may be invalid now */
 
+       /*
+        * Loop control.  We stop on errors (above), and also stop after
+        * processing EV_RECEIPT, so the caller can process it.
+        */
+       ++count;
+       if (kev->flags & EV_RECEIPT) {
+               error = 0;
+               goto done;
+       }
+       ++kev;
+       if (count < climit)
+               goto loop;
+
+       /*
+        * Cleanup
+        */
 done:
        if (td != NULL) { /* Owner of the kq_regtd */
                kq->kq_regtd = NULL;
@@ -1309,8 +1408,13 @@ done:
                }
        }
        lwkt_relpooltoken(kq);
-       if (fp != NULL)
-               fdrop(fp);
+
+       *countp = count;
+       while (count < climit) {
+               if (fp[count])
+                       fdrop(fp[count]);
+               ++count;
+       }
        return (error);
 }
 
@@ -1323,9 +1427,10 @@ done:
  */
 static int
 kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
-            struct knote *marker)
+            struct knote *marker, int closedcounter)
 {
         struct knote *kn, local_marker;
+       thread_t td = curthread;
         int total;
 
        total = 0;
@@ -1382,7 +1487,8 @@ kqueue_scan(struct kqueue *kq, struct kevent *kevp, int count,
                 * to match up the event against a knote and will go haywire.
                 */
                if ((kn->kn_fop->f_flags & FILTEROP_ISFD) &&
-                   checkfdclosed(kq->kq_fdp, kn->kn_kevent.ident, kn->kn_fp)) {
+                   checkfdclosed(td, kq->kq_fdp, kn->kn_kevent.ident,
+                                 kn->kn_fp, closedcounter)) {
                        kn->kn_status |= KN_DELETING | KN_REPROCESS;
                }
 
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index c992824..fc2a314 100644
@@ -390,8 +390,6 @@ exit1(int rv)
         */
        semexit(p);
 
-       KKASSERT(p->p_numposixlocks == 0);
-
        /* The next two chunks should probably be moved to vmspace_exit. */
        vm = p->p_vmspace;
 
@@ -1140,6 +1138,7 @@ loop:
                         */
                        PHOLD(p);
                        PRELEZOMB(p);
+                       kfree(p->p_uidpcpu, M_SUBPROC);
                        kfree(p, M_PROC);
                        atomic_add_int(&nprocs, -1);
                        error = 0;
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index ad66d2e..6acb67b 100644
@@ -466,6 +466,8 @@ fork1(struct lwp *lp1, int flags, struct proc **procp)
        spin_init(&p2->p_spin, "procfork1");
        lwkt_token_init(&p2->p_token, "proc");
        lwkt_gettoken(&p2->p_token);
+       p2->p_uidpcpu = kmalloc(sizeof(*p2->p_uidpcpu) * ncpus,
+                               M_SUBPROC, M_WAITOK | M_ZERO);
 
        /*
         * Setup linkage for kernel based threading XXX lwp.  Also add the
diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c
index d7367dd..3bb8f4d 100644
@@ -1,6 +1,6 @@
 /*
  * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>.  All rights reserved.
- * Copyright (c) 2006 Matthew Dillon <dillon@backplane.com>.  All rights reserved.
+ * Copyright (c) 2006-2018 Matthew Dillon <dillon@backplane.com>.  All rights reserved.
  *
  * Copyright (c) 1982, 1986, 1989, 1993
  *     The Regents of the University of California.  All rights reserved.
@@ -127,18 +127,27 @@ void
 lf_count_adjust(struct proc *p, int increase)
 {
        struct uidinfo *uip;
+       struct uidcount *pup;
+       int n;
 
        KKASSERT(p != NULL);
 
        uip = p->p_ucred->cr_uidinfo;
-       if (increase)
-               atomic_add_int(&uip->ui_posixlocks, p->p_numposixlocks);
-       else
-               atomic_add_int(&uip->ui_posixlocks, -p->p_numposixlocks);
+       pup = &uip->ui_pcpu[mycpuid];
+
+       if (increase) {
+               for (n = 0; n < ncpus; ++n)
+                       pup->pu_posixlocks += p->p_uidpcpu[n].pu_posixlocks;
+       } else {
+               for (n = 0; n < ncpus; ++n)
+                       pup->pu_posixlocks -= p->p_uidpcpu[n].pu_posixlocks;
+       }
 
-       KASSERT(uip->ui_posixlocks >= 0,
-               ("Negative number of POSIX locks held by %s user: %d.",
-                increase ? "new" : "old", uip->ui_posixlocks));
+       if (pup->pu_posixlocks < -PUP_LIMIT ||
+           pup->pu_posixlocks > PUP_LIMIT) {
+               atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
+               pup->pu_posixlocks = 0;
+       }
 }
 
 static int
@@ -160,14 +169,17 @@ lf_count_change(struct proc *owner, int diff)
            uip->ui_posixlocks >= max ) {
                ret = 1;
        } else {
-               atomic_add_int(&uip->ui_posixlocks, diff);
-               atomic_add_int(&owner->p_numposixlocks, diff);
-               KASSERT(uip->ui_posixlocks >= 0,
-                       ("Negative number of POSIX locks held by user: %d.",
-                        uip->ui_posixlocks));
-               KASSERT(owner->p_numposixlocks >= 0,
-                       ("Negative number of POSIX locks held by proc: %d.",
-                        uip->ui_posixlocks));
+               struct uidcount *pup;
+               int cpu = mycpuid;
+
+               pup = &uip->ui_pcpu[cpu];
+               pup->pu_posixlocks += diff;
+               if (pup->pu_posixlocks < -PUP_LIMIT ||
+                   pup->pu_posixlocks > PUP_LIMIT) {
+                       atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
+                       pup->pu_posixlocks = 0;
+               }
+               owner->p_uidpcpu[cpu].pu_posixlocks += diff;
                ret = 0;
        }
        return ret;
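
Worked bound on the looseness: between rollups each cpu's pcpu count
stays within roughly +/-PUP_LIMIT, so on an N-cpu machine the rollup
fields can lag the true totals by about 32 * N either way (e.g. +/-2048
on 64 cpus), which is negligible relative to typical per-uid limits.
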
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
index ee67818..2262caf 100644
@@ -948,7 +948,7 @@ uicreate(uid_t uid)
        /*
         * Allocate space and check for a race
         */
-       uip = kmalloc(sizeof(*uip), M_UIDINFO, M_WAITOK|M_ZERO);
+       uip = kmalloc(sizeof(*uip), M_UIDINFO, M_WAITOK | M_ZERO);
 
        /*
         * Initialize structure and enter it into the hash table
@@ -957,6 +957,8 @@ uicreate(uid_t uid)
        uip->ui_uid = uid;
        uip->ui_ref = 1;        /* we're returning a ref */
        varsymset_init(&uip->ui_varsymset, NULL);
+       uip->ui_pcpu = kmalloc(sizeof(*uip->ui_pcpu) * ncpus,
+                              M_UIDINFO, M_WAITOK | M_ZERO);
 
        /*
         * Somebody may have already created the uidinfo for this
@@ -970,6 +972,7 @@ uicreate(uid_t uid)
 
                spin_uninit(&uip->ui_lock);
                varsymset_clean(&uip->ui_varsymset);
+               kfree(uip->ui_pcpu, M_UIDINFO);
                kfree(uip, M_UIDINFO);
                uip = tmp;
        } else {
@@ -1048,6 +1051,7 @@ uifree(uid_t uid)
                varsymset_clean(&uip->ui_varsymset);
                lockuninit(&uip->ui_varsymset.vx_lock);
                spin_uninit(&uip->ui_lock);
+               kfree(uip->ui_pcpu, M_UIDINFO);
                kfree(uip, M_UIDINFO);
        } else {
                spin_unlock(&uihash_lock);
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index b2f6258..ca5a556 100644
@@ -984,7 +984,8 @@ select_copyout(void *arg, struct kevent *kevp, int count, int *res)
 {
        struct select_kevent_copyin_args *skap;
        struct kevent kev;
-       int i = 0;
+       int i;
+       int n;
 
        skap = (struct select_kevent_copyin_args *)arg;
 
@@ -996,7 +997,8 @@ select_copyout(void *arg, struct kevent *kevp, int count, int *res)
                    skap->lwp->lwp_kqueue_serial) {
                        kev = kevp[i];
                        kev.flags = EV_DISABLE|EV_DELETE;
-                       kqueue_register(&skap->lwp->lwp_kqueue, &kev);
+                       n = 1;
+                       kqueue_register(&skap->lwp->lwp_kqueue, &kev, &n);
                        if (nseldebug) {
                                kprintf("select fd %ju mismatched serial %ju\n",
                                    (uintmax_t)kevp[i].ident,
@@ -1376,6 +1378,7 @@ poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
        struct kevent kev;
        int count_res;
        int i;
+       int n;
        uint64_t pi;
 
        pkap = (struct poll_kevent_copyin_args *)arg;
@@ -1392,7 +1395,8 @@ poll_copyout(void *arg, struct kevent *kevp, int count, int *res)
                if (pi >= pkap->nfds) {
                        kev = kevp[i];
                        kev.flags = EV_DISABLE|EV_DELETE;
-                       kqueue_register(&pkap->lwp->lwp_kqueue, &kev);
+                       n = 1;
+                       kqueue_register(&pkap->lwp->lwp_kqueue, &kev, &n);
                        if (nseldebug) {
                                kprintf("poll index %ju out of range against "
                                    "serial %ju\n", (uintmax_t)pi,
@@ -1602,6 +1606,7 @@ socket_wait(struct socket *so, struct timespec *ts, int *res)
        struct kqueue kq;
        struct kevent kev;
        int error, fd;
+       int n;
 
        if ((error = falloc(td->td_lwp, &fp, &fd)) != 0)
                return (error);
@@ -1616,7 +1621,8 @@ socket_wait(struct socket *so, struct timespec *ts, int *res)
        bzero(&kq, sizeof(kq));
        kqueue_init(&kq, td->td_lwp->lwp_proc->p_fd);
        EV_SET(&kev, fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, NULL);
-       if ((error = kqueue_register(&kq, &kev)) != 0) {
+       n = 1;
+       if ((error = kqueue_register(&kq, &kev, &n)) != 0) {
                fdrop(fp);
                return (error);
        }
@@ -1625,7 +1631,8 @@ socket_wait(struct socket *so, struct timespec *ts, int *res)
                            socket_wait_copyout, ts, 0);
 
        EV_SET(&kev, fd, EVFILT_READ, EV_DELETE|EV_DISABLE, 0, 0, NULL);
-       kqueue_register(&kq, &kev);
+       n = 1;
+       kqueue_register(&kq, &kev, &n);
        fp->f_ops = &badfileops;
        fdrop(fp);
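
Single-event callers throughout this file follow the same pattern
(illustrative):

        n = 1;                          /* register exactly one event */
        error = kqueue_register(kq, &kev, &n);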
 
diff --git a/sys/sys/event.h b/sys/sys/event.h
index 501eb6f..71aa7e9 100644
@@ -253,7 +253,8 @@ extern void knote_assume_knotes(struct kqinfo *, struct kqinfo *,
 extern void    knote_fdclose(struct file *fp, struct filedesc *fdp, int fd);
 extern void    kqueue_init(struct kqueue *kq, struct filedesc *fdp);
 extern void    kqueue_terminate(struct kqueue *kq);
-extern int     kqueue_register(struct kqueue *kq, struct kevent *kev);
+extern int     kqueue_register(struct kqueue *kq, struct kevent *kev,
+                   int *countp);
 
 extern struct klist fs_klist;  /* EVFILT_FS */
 
diff --git a/sys/sys/eventvar.h b/sys/sys/eventvar.h
index c74f31c..3c4d996 100644
@@ -50,7 +50,7 @@
 #endif
 
 
-#define KQ_NEVENTS     8               /* minimize copy{in,out} calls */
+#define KQ_NEVENTS     32              /* limit stack use */
 #define KQEXTENT       256             /* linear growth by this amount */
 
 TAILQ_HEAD(kqlist, knote);
diff --git a/sys/sys/file.h b/sys/sys/file.h
index 8df95a5..f3b3964 100644
@@ -141,7 +141,8 @@ MALLOC_DECLARE(M_FILE);
 
 extern void fhold(struct file *fp);
 extern int fdrop (struct file *fp);
-extern int checkfdclosed(struct filedesc *fdp, int fd, struct file *fp);
+extern int checkfdclosed(thread_t td, struct filedesc *fdp, int fd,
+                       struct file *fp, int closedcounter);
 extern int fp_open(const char *path, int flags, int mode, struct file **fpp);
 extern int fp_vpopen(struct vnode *vp, int flags, struct file **fpp);
 extern int fp_pread(struct file *fp, void *buf, size_t nbytes, off_t offset, ssize_t *res, enum uio_seg);
diff --git a/sys/sys/filedesc.h b/sys/sys/filedesc.h
index 65d044a..c2a685b 100644
@@ -93,6 +93,7 @@ struct filedesc {
        int     fd_softrefs;            /* softrefs to prevent destruction */
        int     fd_holdleaderscount;    /* block fdfree() for shared close() */
        int     fd_holdleaderswakeup;   /* fdfree() needs wakeup */
+       int     fd_closedcounter;       /* detect close() */
        struct spinlock fd_spin;
        struct  fdnode  fd_builtin_files[NDFILE];
 };
@@ -171,6 +172,7 @@ void        fdcloseexec (struct proc *p);
 int    fdcheckstd (struct lwp *lp);
 struct file *holdfp (struct thread *td, int fd, int flag);
 struct file *holdfp_fdp (struct filedesc *fdp, int fd, int flag);
+struct file *holdfp_fdp_locked (struct filedesc *fdp, int fd, int flag);
 int    holdsock (struct thread *td, int fdes, struct file **fpp);
 int    holdvnode (struct thread *td, int fd, struct file **fpp);
 void   dropfp(struct thread *td, int fd, struct file *fp);
diff --git a/sys/sys/proc.h b/sys/sys/proc.h
index 810a09a..695cf9d 100644
@@ -73,6 +73,7 @@ struct proc;
 struct pgrp;
 struct session;
 struct lwp;
+struct uidcount;
 
 LIST_HEAD(proclist, proc);
 LIST_HEAD(pgrplist, pgrp);
@@ -250,7 +251,8 @@ struct      proc {
 
        int             p_flags;        /* P_* flags. */
        enum procstat   p_stat;         /* S* process status. */
-       char            p_pad1[3];
+       char            p_advlock_flag; /* replaces P_ADVLOCK */
+       char            p_pad1[2];
 
        pid_t           p_pid;          /* Process identifier. */
        pid_t           p_ppid;         /* Current parent pid */
@@ -331,7 +333,7 @@ struct      proc {
        void            *p_emuldata;    /* process-specific emulator state */
        struct usched   *p_usched;      /* Userland scheduling control */
        struct vkernel_proc *p_vkernel; /* VKernel support, proc part */
-       int             p_numposixlocks; /* number of POSIX locks */
+       struct uidcount *p_uidpcpu;
        void            (*p_userret)(void);/* p: return-to-user hook */
 
        struct spinlock p_spin;         /* Spinlock for LWP access to proc */
@@ -353,7 +355,7 @@ struct      proc {
 #define        p_pgid          p_pgrp->pg_id
 
 /* These flags are kept in p_flags. */
-#define        P_ADVLOCK       0x00001 /* Process may hold a POSIX advisory lock */
+#define        P_UNUSED01      0x00001
 #define        P_CONTROLT      0x00002 /* Has a controlling terminal */
 #define        P_SWAPPEDOUT    0x00004 /* Swapped out of memory */
 #define P_SYSVSEM      0x00008 /* Might have SysV semaphores */
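
Before/after for the flag conversion, taken from the kern_fcntl() hunk
above:

        /* before: bit in p_flags, token-protected */
        lwkt_gettoken(&p->p_leader->p_token);
        p->p_leader->p_flags |= P_ADVLOCK;
        lwkt_reltoken(&p->p_leader->p_token);

        /* after: dedicated set-once byte, no token required */
        if (p->p_leader->p_advlock_flag == 0)
                p->p_leader->p_advlock_flag = 1;
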
diff --git a/sys/sys/resourcevar.h b/sys/sys/resourcevar.h
index d832936..a3957b6 100644
@@ -102,6 +102,20 @@ struct plimit {
 #define PLIMIT_TESTCPU_XCPU    1
 #define PLIMIT_TESTCPU_KILL    2
 
+/*
+ * Per-cpu tracking structure attached to uidinfo.  These counts are only
+ * synchronized with the uidinfo rollup fields at +/-32.  Resource limits
+ * only check against the ui_posixlocks and ui_openfiles so some slop
+ * is possible (checking against the pcpu structures would be cause cache
+ * line ping-ponging)
+ */
+struct uidcount {
+       int     pu_posixlocks;
+       int     pu_openfiles;
+} __cachealign;
+
+#define PUP_LIMIT      32      /* +/-32 rollup */
+
 /*
  * Per uid resource consumption
  */
@@ -115,9 +129,10 @@ struct uidinfo {
        long    ui_proccnt;             /* number of processes */
        uid_t   ui_uid;                 /* uid */
        int     ui_ref;                 /* reference count */
-       int     ui_posixlocks;          /* number of POSIX locks */
-       int     ui_openfiles;           /* number of open files */
+       int     ui_posixlocks;          /* (rollup) number of POSIX locks */
+       int     ui_openfiles;           /* (rollup) number of open files */
        struct varsymset ui_varsymset;  /* variant symlinks */
+       struct uidcount *ui_pcpu;
 };
 
 #endif