From e193508c7255b6262b69e52689a301312ad53178 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Tue, 29 Dec 2009 15:50:39 -0800 Subject: [PATCH] kernel - Fix improper vgone() in procfs and races in truss * procfs was trying to destroy the vnodes associated with exiting pids, ripping them out from under active users. This is no longer legal. Instead flag it for the exiting pid so further operations fail. * procfs's stopevents handling for process tracing was not MPSAFE and raced against MPSAFE system call entry points. This led to numerous situations where gdb or truss would get stuck, or where the process getting traced would get stuck. Make the whole mess MPSAFE by protecting the tests against proc->p_spin. * Note that the platform trap case is optimized to only acquire p_spin once it has been determined that a stopevent might be pending. Reported-by: Antonio Huete Jimenez --- sys/kern/sys_process.c | 37 ++++++++++++++++++++++++++------- sys/vfs/procfs/procfs.h | 2 ++ sys/vfs/procfs/procfs_subr.c | 34 +++++++++++++++--------------- sys/vfs/procfs/procfs_vnops.c | 39 ++++++++++++++++++++++++++++++----- 4 files changed, 82 insertions(+), 30 deletions(-) diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index 84cd8a3689..7f74d31fdf 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -52,6 +52,7 @@ #include #include +#include /* use the equivalent procfs code */ #if 0 @@ -652,15 +653,35 @@ trace_req(struct proc *p) void stopevent(struct proc *p, unsigned int event, unsigned int val) { + /* + * Set event info. Recheck p_stops in case we are + * racing a close() on procfs. + */ + spin_lock_wr(&p->p_spin); + if ((p->p_stops & event) == 0) { + spin_unlock_wr(&p->p_spin); + return; + } + p->p_xstat = val; + p->p_stype = event; p->p_step = 1; + tsleep_interlock(&p->p_step, 0); + spin_unlock_wr(&p->p_spin); - do { - crit_enter(); - wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ - p->p_xstat = val; - p->p_stype = event; /* Which event caused the stop? */ - tsleep(&p->p_step, 0, "stopevent", 0); - crit_exit(); - } while (p->p_step); + /* + * Wakeup any PIOCWAITing procs and wait for p_step to + * be cleared. + */ + for (;;) { + wakeup(&p->p_stype); + tsleep(&p->p_step, PINTERLOCKED, "stopevent", 0); + spin_lock_wr(&p->p_spin); + if (p->p_step == 0) { + spin_unlock_wr(&p->p_spin); + break; + } + tsleep_interlock(&p->p_step, 0); + spin_unlock_wr(&p->p_spin); + } } diff --git a/sys/vfs/procfs/procfs.h b/sys/vfs/procfs/procfs.h index e9de6d899a..1d4d804f46 100644 --- a/sys/vfs/procfs/procfs.h +++ b/sys/vfs/procfs/procfs.h @@ -158,6 +158,8 @@ int procfs_validtype (struct lwp *); #define PROCFS_LOCKED 0x01 #define PROCFS_WANT 0x02 +#define PFS_DEAD 0x80000000 /* or'd with pid */ + int procfs_root (struct mount *, struct vnode **); int procfs_rw (struct vop_read_args *); diff --git a/sys/vfs/procfs/procfs_subr.c b/sys/vfs/procfs/procfs_subr.c index 0759570dae..f9d8eb88e1 100644 --- a/sys/vfs/procfs/procfs_subr.c +++ b/sys/vfs/procfs/procfs_subr.c @@ -408,31 +408,31 @@ procfs_exit(struct thread *td) pid = td->td_proc->p_pid; /* - * The reason for this loop is not obvious -- basicly, - * procfs_freevp(), which is called via vgone() (eventually), - * removes the specified procfs node from the pfshead list. - * It does this by *pfsp = pfs->pfs_next, meaning that it - * overwrites the node. So when we do pfs = pfs->next, we - * end up skipping the node that replaces the one that was - * vgone'd. Since it may have been the last one on the list, - * it may also have been set to null -- but *our* pfs pointer, - * here, doesn't see this. So the loop starts from the beginning - * again. + * NOTE: We can't just vgone() the vnode any more, not while + * it may potentially still be active. This will clean + * the vp and clear the mount and cause the new VOP subsystem + * to assert or panic when someone tries to do an operation + * on an open (exited) procfs descriptor. * - * This is not a for() loop because the final event - * would be "pfs = pfs->pfs_next"; in the case where - * pfs is set to pfshead again, that would mean that - * pfshead is skipped over. + * Prevent further operations on this pid by setting pfs_pid to -1. + * Note that a pfs_pid of 0 is used for nodes which do not track + * any particular pid. * + * Use vx_get() to properly ref/lock a vp which may not have any + * refs and which may or may not already be reclaimed. vx_put() + * will then properly deactivate it and cause it to be recycled. + * + * The hash table can also get ripped out from under us when + * we block so take the easy way out and restart the scan. */ again: pfs = *PFSHASH(pid); while (pfs) { if (pfs->pfs_pid == pid) { vp = PFSTOV(pfs); - vx_lock(vp); - vgone_vxlocked(vp); - vx_unlock(vp); + vx_get(vp); + pfs->pfs_pid |= PFS_DEAD; /* does not effect hash */ + vx_put(vp); goto again; } pfs = pfs->pfs_next; diff --git a/sys/vfs/procfs/procfs_vnops.c b/sys/vfs/procfs/procfs_vnops.c index a29ce970f9..87de8843e1 100644 --- a/sys/vfs/procfs/procfs_vnops.c +++ b/sys/vfs/procfs/procfs_vnops.c @@ -64,6 +64,8 @@ #include #include +#include + #include static int procfs_access (struct vop_access_args *); @@ -237,8 +239,10 @@ procfs_close(struct vop_close_args *ap) if ((ap->a_vp->v_opencount < 2) && (p = pfind(pfs->pfs_pid)) && !(p->p_pfsflags & PF_LINGER)) { + spin_lock_wr(&p->p_spin); p->p_stops = 0; p->p_step = 0; + spin_unlock_wr(&p->p_spin); wakeup(&p->p_step); } break; @@ -299,24 +303,42 @@ procfs_ioctl(struct vop_ioctl_args *ap) *(unsigned int*)ap->a_data = (unsigned int)procp->p_pfsflags; break; case PIOCSTATUS: + /* + * NOTE: syscall entry deals with stopevents and may run without + * the MP lock. + */ psp = (struct procfs_status *)ap->a_data; - psp->state = (procp->p_step == 0); psp->flags = procp->p_pfsflags; psp->events = procp->p_stops; + spin_lock_wr(&procp->p_spin); if (procp->p_step) { + psp->state = 0; psp->why = procp->p_stype; psp->val = procp->p_xstat; + spin_unlock_wr(&procp->p_spin); } else { - psp->why = psp->val = 0; /* Not defined values */ + psp->state = 1; + spin_unlock_wr(&procp->p_spin); + psp->why = 0; /* Not defined values */ + psp->val = 0; /* Not defined values */ } break; case PIOCWAIT: + /* + * NOTE: syscall entry deals with stopevents and may run without + * the MP lock. + */ psp = (struct procfs_status *)ap->a_data; - if (procp->p_step == 0) { - error = tsleep(&procp->p_stype, PCATCH, "piocwait", 0); + spin_lock_wr(&procp->p_spin); + while (procp->p_step == 0) { + tsleep_interlock(&procp->p_stype, PCATCH); + spin_unlock_wr(&procp->p_spin); + error = tsleep(&procp->p_stype, PCATCH | PINTERLOCKED, "piocwait", 0); if (error) return error; + spin_lock_wr(&procp->p_spin); } + spin_unlock_wr(&procp->p_spin); psp->state = 1; /* It stopped */ psp->flags = procp->p_pfsflags; psp->events = procp->p_stops; @@ -324,6 +346,11 @@ procfs_ioctl(struct vop_ioctl_args *ap) psp->val = procp->p_xstat; /* any extra info */ break; case PIOCCONT: /* Restart a proc */ + /* + * NOTE: syscall entry deals with stopevents and may run without + * the MP lock. However, the caller is presumably interlocked + * by having waited. + */ if (procp->p_step == 0) return EINVAL; /* Can only start a stopped process */ if ((signo = *(int*)ap->a_data) != 0) { @@ -381,8 +408,10 @@ procfs_bmap(struct vop_bmap_args *ap) static int procfs_inactive(struct vop_inactive_args *ap) { - /*struct vnode *vp = ap->a_vp;*/ + struct pfsnode *pfs = VTOPFS(ap->a_vp); + if (pfs->pfs_pid & PFS_DEAD) + vrecycle(ap->a_vp); return (0); } -- 2.41.0