gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/kern/lwkt_ipiq.c,v 1.27 2008/05/18 20:57:56 nth Exp $
	35	*/
	36
	37	/*
	38	* This module implements IPI message queueing and the MI portion of IPI
	39	* message processing.
	40	*/
	41
	42	#include "opt_ddb.h"
	43
	44	#include <sys/param.h>
	45	#include <sys/systm.h>
	46	#include <sys/kernel.h>
	47	#include <sys/proc.h>
	48	#include <sys/rtprio.h>
	49	#include <sys/queue.h>
	50	#include <sys/thread2.h>
	51	#include <sys/sysctl.h>
	52	#include <sys/ktr.h>
	53	#include <sys/kthread.h>
	54	#include <machine/cpu.h>
	55	#include <sys/lock.h>
	56	#include <sys/caps.h>
	57
	58	#include <vm/vm.h>
	59	#include <vm/vm_param.h>
	60	#include <vm/vm_kern.h>
	61	#include <vm/vm_object.h>
	62	#include <vm/vm_page.h>
	63	#include <vm/vm_map.h>
	64	#include <vm/vm_pager.h>
	65	#include <vm/vm_extern.h>
	66	#include <vm/vm_zone.h>
	67
	68	#include <machine/stdarg.h>
	69	#include <machine/smp.h>
	70	#include <machine/atomic.h>
	71
	72	#ifdef SMP
	73	static __int64_t ipiq_count; /* total calls to lwkt_send_ipiq() /
	74	static __int64_t ipiq_fifofull; /* number of fifo full conditions detected */
	75	static __int64_t ipiq_avoided; /* interlock with target avoids cpu ipi */
	76	static __int64_t ipiq_passive; /* passive IPI messages */
	77	static __int64_t ipiq_cscount; /* number of cpu synchronizations */
	78	static int ipiq_optimized = 1; /* XXX temporary sysctl */
	79	#ifdef PANIC_DEBUG
	80	static int panic_ipiq_cpu = -1;
	81	static int panic_ipiq_count = 100;
	82	#endif
	83	#endif
	84
	85	#ifdef SMP
	86	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0,
	87	"Number of IPI's sent");
	88	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0,
	89	"Number of fifo full conditions detected");
	90	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_avoided, CTLFLAG_RW, &ipiq_avoided, 0,
	91	"Number of IPI's avoided by interlock with target cpu");
	92	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_passive, CTLFLAG_RW, &ipiq_passive, 0,
	93	"Number of passive IPI messages sent");
	94	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0,
	95	"Number of cpu synchronizations");
	96	SYSCTL_INT(_lwkt, OID_AUTO, ipiq_optimized, CTLFLAG_RW, &ipiq_optimized, 0,
	97	"");
	98	#ifdef PANIC_DEBUG
	99	SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, "");
	100	SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, "");
	101	#endif
	102
	103	#define IPIQ_STRING "func=%p arg1=%p arg2=%d scpu=%d dcpu=%d"
	104	#define IPIQ_ARG_SIZE (sizeof(void ) 2 + sizeof(int) * 3)
	105
	106	#if !defined(KTR_IPIQ)
	107	#define KTR_IPIQ KTR_ALL
	108	#endif
	109	KTR_INFO_MASTER(ipiq);
	110	KTR_INFO(KTR_IPIQ, ipiq, send_norm, 0, IPIQ_STRING, IPIQ_ARG_SIZE);
	111	KTR_INFO(KTR_IPIQ, ipiq, send_pasv, 1, IPIQ_STRING, IPIQ_ARG_SIZE);
	112	KTR_INFO(KTR_IPIQ, ipiq, send_nbio, 2, IPIQ_STRING, IPIQ_ARG_SIZE);
	113	KTR_INFO(KTR_IPIQ, ipiq, send_fail, 3, IPIQ_STRING, IPIQ_ARG_SIZE);
	114	KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARG_SIZE);
	115	KTR_INFO(KTR_IPIQ, ipiq, sync_start, 5, "cpumask=%08x", sizeof(cpumask_t));
	116	KTR_INFO(KTR_IPIQ, ipiq, sync_add, 6, "cpumask=%08x", sizeof(cpumask_t));
	117	KTR_INFO(KTR_IPIQ, ipiq, cpu_send, 7, IPIQ_STRING, IPIQ_ARG_SIZE);
	118	KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARG_SIZE);
	119
	120	#define logipiq(name, func, arg1, arg2, sgd, dgd) \
	121	KTR_LOG(ipiq_ ## name, func, arg1, arg2, sgd->gd_cpuid, dgd->gd_cpuid)
	122	#define logipiq2(name, arg) \
	123	KTR_LOG(ipiq_ ## name, arg)
	124
	125	#endif /* SMP */
	126
	127	#ifdef SMP
	128
	129	static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
	130	struct intrframe *frame);
	131	static void lwkt_cpusync_remote1(lwkt_cpusync_t poll);
	132	static void lwkt_cpusync_remote2(lwkt_cpusync_t poll);
	133
	134	/*
	135	* Send a function execution request to another cpu. The request is queued
	136	* on the cpu<->cpu ipiq matrix. Each cpu owns a unique ipiq FIFO for every
	137	* possible target cpu. The FIFO can be written.
	138	*
	139	* If the FIFO fills up we have to enable interrupts to avoid an APIC
	140	* deadlock and process pending IPIQs while waiting for it to empty.
	141	* Otherwise we may soft-deadlock with another cpu whos FIFO is also full.
	142	*
	143	* We can safely bump gd_intr_nesting_level because our crit_exit() at the
	144	* end will take care of any pending interrupts.
	145	*
	146	* The actual hardware IPI is avoided if the target cpu is already processing
	147	* the queue from a prior IPI. It is possible to pipeline IPI messages
	148	* very quickly between cpus due to the FIFO hysteresis.
	149	*
	150	* Need not be called from a critical section.
	151	*/
	152	int
	153	lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
	154	{
	155	lwkt_ipiq_t ip;
	156	int windex;
	157	struct globaldata *gd = mycpu;
	158
	159	logipiq(send_norm, func, arg1, arg2, gd, target);
	160
	161	if (target == gd) {
	162	func(arg1, arg2, NULL);
	163	logipiq(send_end, func, arg1, arg2, gd, target);
	164	return(0);
	165	}
	166	crit_enter();
	167	++gd->gd_intr_nesting_level;
	168	#ifdef INVARIANTS
	169	if (gd->gd_intr_nesting_level > 20)
	170	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
	171	#endif
	172	KKASSERT(curthread->td_critcount);
	173	++ipiq_count;
	174	ip = &gd->gd_ipiq[target->gd_cpuid];
	175
	176	/*
	177	* Do not allow the FIFO to become full. Interrupts must be physically
	178	* enabled while we liveloop to avoid deadlocking the APIC.
	179	*/
	180	if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
	181	#if defined(__i386__)
	182	unsigned int eflags = read_eflags();
	183	#elif defined(__x86_64__)
	184	unsigned long rflags = read_rflags();
	185	#endif
	186
	187	if (atomic_poll_acquire_int(&ip->ip_npoll) \|\| ipiq_optimized == 0) {
	188	logipiq(cpu_send, func, arg1, arg2, gd, target);
	189	cpu_send_ipiq(target->gd_cpuid);
	190	}
	191	cpu_enable_intr();
	192	++ipiq_fifofull;
	193	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
	194	KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
	195	lwkt_process_ipiq();
	196	}
	197	#if defined(__i386__)
	198	write_eflags(eflags);
	199	#elif defined(__x86_64__)
	200	write_rflags(rflags);
	201	#endif
	202	}
	203
	204	/*
	205	* Queue the new message
	206	*/
	207	windex = ip->ip_windex & MAXCPUFIFO_MASK;
	208	ip->ip_func[windex] = func;
	209	ip->ip_arg1[windex] = arg1;
	210	ip->ip_arg2[windex] = arg2;
	211	cpu_sfence();
	212	++ip->ip_windex;
	213	--gd->gd_intr_nesting_level;
	214
	215	/*
	216	* signal the target cpu that there is work pending.
	217	*/
	218	if (atomic_poll_acquire_int(&ip->ip_npoll)) {
	219	logipiq(cpu_send, func, arg1, arg2, gd, target);
	220	cpu_send_ipiq(target->gd_cpuid);
	221	} else {
	222	if (ipiq_optimized == 0) {
	223	logipiq(cpu_send, func, arg1, arg2, gd, target);
	224	cpu_send_ipiq(target->gd_cpuid);
	225	} else {
	226	++ipiq_avoided;
	227	}
	228	}
	229	crit_exit();
	230
	231	logipiq(send_end, func, arg1, arg2, gd, target);
	232	return(ip->ip_windex);
	233	}
	234
	235	/*
	236	* Similar to lwkt_send_ipiq() but this function does not actually initiate
	237	* the IPI to the target cpu unless the FIFO has become too full, so it is
	238	* very fast.
	239	*
	240	* This function is used for non-critical IPI messages, such as memory
	241	* deallocations. The queue will typically be flushed by the target cpu at
	242	* the next clock interrupt.
	243	*
	244	* Need not be called from a critical section.
	245	*/
	246	int
	247	lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
	248	void *arg1, int arg2)
	249	{
	250	lwkt_ipiq_t ip;
	251	int windex;
	252	struct globaldata *gd = mycpu;
	253
	254	KKASSERT(target != gd);
	255	crit_enter();
	256	logipiq(send_pasv, func, arg1, arg2, gd, target);
	257	++gd->gd_intr_nesting_level;
	258	#ifdef INVARIANTS
	259	if (gd->gd_intr_nesting_level > 20)
	260	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
	261	#endif
	262	KKASSERT(curthread->td_critcount);
	263	++ipiq_count;
	264	++ipiq_passive;
	265	ip = &gd->gd_ipiq[target->gd_cpuid];
	266
	267	/*
	268	* Do not allow the FIFO to become full. Interrupts must be physically
	269	* enabled while we liveloop to avoid deadlocking the APIC.
	270	*/
	271	if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
	272	#if defined(__i386__)
	273	unsigned int eflags = read_eflags();
	274	#elif defined(__x86_64__)
	275	unsigned long rflags = read_rflags();
	276	#endif
	277
	278	if (atomic_poll_acquire_int(&ip->ip_npoll) \|\| ipiq_optimized == 0) {
	279	logipiq(cpu_send, func, arg1, arg2, gd, target);
	280	cpu_send_ipiq(target->gd_cpuid);
	281	}
	282	cpu_enable_intr();
	283	++ipiq_fifofull;
	284	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
	285	KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
	286	lwkt_process_ipiq();
	287	}
	288	#if defined(__i386__)
	289	write_eflags(eflags);
	290	#elif defined(__x86_64__)
	291	write_rflags(rflags);
	292	#endif
	293	}
	294
	295	/*
	296	* Queue the new message
	297	*/
	298	windex = ip->ip_windex & MAXCPUFIFO_MASK;
	299	ip->ip_func[windex] = func;
	300	ip->ip_arg1[windex] = arg1;
	301	ip->ip_arg2[windex] = arg2;
	302	cpu_sfence();
	303	++ip->ip_windex;
	304	--gd->gd_intr_nesting_level;
	305
	306	/*
	307	* Do not signal the target cpu, it will pick up the IPI when it next
	308	* polls (typically on the next tick).
	309	*/
	310	crit_exit();
	311
	312	logipiq(send_end, func, arg1, arg2, gd, target);
	313	return(ip->ip_windex);
	314	}
	315
	316	/*
	317	* Send an IPI request without blocking, return 0 on success, ENOENT on
	318	* failure. The actual queueing of the hardware IPI may still force us
	319	* to spin and process incoming IPIs but that will eventually go away
	320	* when we've gotten rid of the other general IPIs.
	321	*/
	322	int
	323	lwkt_send_ipiq3_nowait(globaldata_t target, ipifunc3_t func,
	324	void *arg1, int arg2)
	325	{
	326	lwkt_ipiq_t ip;
	327	int windex;
	328	struct globaldata *gd = mycpu;
	329
	330	logipiq(send_nbio, func, arg1, arg2, gd, target);
	331	KKASSERT(curthread->td_critcount);
	332	if (target == gd) {
	333	func(arg1, arg2, NULL);
	334	logipiq(send_end, func, arg1, arg2, gd, target);
	335	return(0);
	336	}
	337	++ipiq_count;
	338	ip = &gd->gd_ipiq[target->gd_cpuid];
	339
	340	if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO * 2 / 3) {
	341	logipiq(send_fail, func, arg1, arg2, gd, target);
	342	return(ENOENT);
	343	}
	344	windex = ip->ip_windex & MAXCPUFIFO_MASK;
	345	ip->ip_func[windex] = func;
	346	ip->ip_arg1[windex] = arg1;
	347	ip->ip_arg2[windex] = arg2;
	348	cpu_sfence();
	349	++ip->ip_windex;
	350
	351	/*
	352	* This isn't a passive IPI, we still have to signal the target cpu.
	353	*/
	354	if (atomic_poll_acquire_int(&ip->ip_npoll)) {
	355	logipiq(cpu_send, func, arg1, arg2, gd, target);
	356	cpu_send_ipiq(target->gd_cpuid);
	357	} else {
	358	if (ipiq_optimized == 0) {
	359	logipiq(cpu_send, func, arg1, arg2, gd, target);
	360	cpu_send_ipiq(target->gd_cpuid);
	361	} else {
	362	++ipiq_avoided;
	363	}
	364	}
	365
	366	logipiq(send_end, func, arg1, arg2, gd, target);
	367	return(0);
	368	}
	369
	370	/*
	371	* deprecated, used only by fast int forwarding.
	372	*/
	373	int
	374	lwkt_send_ipiq3_bycpu(int dcpu, ipifunc3_t func, void *arg1, int arg2)
	375	{
	376	return(lwkt_send_ipiq3(globaldata_find(dcpu), func, arg1, arg2));
	377	}
	378
	379	/*
	380	* Send a message to several target cpus. Typically used for scheduling.
	381	* The message will not be sent to stopped cpus.
	382	*/
	383	int
	384	lwkt_send_ipiq3_mask(cpumask_t mask, ipifunc3_t func, void *arg1, int arg2)
	385	{
	386	int cpuid;
	387	int count = 0;
	388
	389	mask &= ~stopped_cpus;
	390	while (mask) {
	391	cpuid = BSFCPUMASK(mask);
	392	lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
	393	mask &= ~CPUMASK(cpuid);
	394	++count;
	395	}
	396	return(count);
	397	}
	398
	399	/*
	400	* Wait for the remote cpu to finish processing a function.
	401	*
	402	* YYY we have to enable interrupts and process the IPIQ while waiting
	403	* for it to empty or we may deadlock with another cpu. Create a CPU_*()
	404	* function to do this! YYY we really should 'block' here.
	405	*
	406	* MUST be called from a critical section. This routine may be called
	407	* from an interrupt (for example, if an interrupt wakes a foreign thread
	408	* up).
	409	*/
	410	void
	411	lwkt_wait_ipiq(globaldata_t target, int seq)
	412	{
	413	lwkt_ipiq_t ip;
	414	int maxc = 100000000;
	415
	416	if (target != mycpu) {
	417	ip = &mycpu->gd_ipiq[target->gd_cpuid];
	418	if ((int)(ip->ip_xindex - seq) < 0) {
	419	#if defined(__i386__)
	420	unsigned int eflags = read_eflags();
	421	#elif defined(__x86_64__)
	422	unsigned long rflags = read_rflags();
	423	#endif
	424	cpu_enable_intr();
	425	while ((int)(ip->ip_xindex - seq) < 0) {
	426	crit_enter();
	427	lwkt_process_ipiq();
	428	crit_exit();
	429	if (--maxc == 0)
	430	kprintf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n", mycpu->gd_cpuid, target->gd_cpuid, ip->ip_xindex - seq);
	431	if (maxc < -1000000)
	432	panic("LWKT_WAIT_IPIQ");
	433	/*
	434	* xindex may be modified by another cpu, use a load fence
	435	* to ensure that the loop does not use a speculative value
	436	* (which may improve performance).
	437	*/
	438	cpu_lfence();
	439	}
	440	#if defined(__i386__)
	441	write_eflags(eflags);
	442	#elif defined(__x86_64__)
	443	write_rflags(rflags);
	444	#endif
	445	}
	446	}
	447	}
	448
	449	int
	450	lwkt_seq_ipiq(globaldata_t target)
	451	{
	452	lwkt_ipiq_t ip;
	453
	454	ip = &mycpu->gd_ipiq[target->gd_cpuid];
	455	return(ip->ip_windex);
	456	}
	457
	458	/*
	459	* Called from IPI interrupt (like a fast interrupt), which has placed
	460	* us in a critical section. The MP lock may or may not be held.
	461	* May also be called from doreti or splz, or be reentrantly called
	462	* indirectly through the ip_func[] we run.
	463	*
	464	* There are two versions, one where no interrupt frame is available (when
	465	* called from the send code and from splz, and one where an interrupt
	466	* frame is available.
	467	*/
	468	void
	469	lwkt_process_ipiq(void)
	470	{
	471	globaldata_t gd = mycpu;
	472	globaldata_t sgd;
	473	lwkt_ipiq_t ip;
	474	int n;
	475
	476	again:
	477	for (n = 0; n < ncpus; ++n) {
	478	if (n != gd->gd_cpuid) {
	479	sgd = globaldata_find(n);
	480	ip = sgd->gd_ipiq;
	481	if (ip != NULL) {
	482	while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], NULL))
	483	;
	484	}
	485	}
	486	}
	487	if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	488	if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL)) {
	489	if (gd->gd_curthread->td_cscount == 0)
	490	goto again;
	491	need_ipiq();
	492	}
	493	}
	494	}
	495
	496	void
	497	lwkt_process_ipiq_frame(struct intrframe *frame)
	498	{
	499	globaldata_t gd = mycpu;
	500	globaldata_t sgd;
	501	lwkt_ipiq_t ip;
	502	int n;
	503
	504	again:
	505	for (n = 0; n < ncpus; ++n) {
	506	if (n != gd->gd_cpuid) {
	507	sgd = globaldata_find(n);
	508	ip = sgd->gd_ipiq;
	509	if (ip != NULL) {
	510	while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], frame))
	511	;
	512	}
	513	}
	514	}
	515	if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	516	if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame)) {
	517	if (gd->gd_curthread->td_cscount == 0)
	518	goto again;
	519	need_ipiq();
	520	}
	521	}
	522	}
	523
	524	static int
	525	lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
	526	struct intrframe *frame)
	527	{
	528	globaldata_t mygd = mycpu;
	529	int ri;
	530	int wi;
	531	ipifunc3_t copy_func;
	532	void *copy_arg1;
	533	int copy_arg2;
	534
	535	/*
	536	* Obtain the current write index, which is modified by a remote cpu.
	537	* Issue a load fence to prevent speculative reads of e.g. data written
	538	* by the other cpu prior to it updating the index.
	539	*/
	540	KKASSERT(curthread->td_critcount);
	541	wi = ip->ip_windex;
	542	cpu_lfence();
	543	++mygd->gd_intr_nesting_level;
	544
	545	/*
	546	* NOTE: xindex is only updated after we are sure the function has
	547	* finished execution. Beware lwkt_process_ipiq() reentrancy!
	548	* The function may send an IPI which may block/drain.
	549	*
	550	* NOTE: Due to additional IPI operations that the callback function
	551	* may make, it is possible for both rindex and windex to advance and
	552	* thus for rindex to advance passed our cached windex.
	553	*
	554	* NOTE: A memory fence is required to prevent speculative loads prior
	555	* to the loading of ip_rindex. Even though stores might be
	556	* ordered, loads are probably not.
	557	*/
	558	while (wi - (ri = ip->ip_rindex) > 0) {
	559	ri &= MAXCPUFIFO_MASK;
	560	cpu_mfence();
	561	copy_func = ip->ip_func[ri];
	562	copy_arg1 = ip->ip_arg1[ri];
	563	copy_arg2 = ip->ip_arg2[ri];
	564	++ip->ip_rindex;
	565	KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) ==
	566	((ri + 1) & MAXCPUFIFO_MASK));
	567	logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu);
	568	copy_func(copy_arg1, copy_arg2, frame);
	569	cpu_sfence();
	570	ip->ip_xindex = ip->ip_rindex;
	571
	572	#ifdef PANIC_DEBUG
	573	/*
	574	* Simulate panics during the processing of an IPI
	575	*/
	576	if (mycpu->gd_cpuid == panic_ipiq_cpu && panic_ipiq_count) {
	577	if (--panic_ipiq_count == 0) {
	578	#ifdef DDB
	579	Debugger("PANIC_DEBUG");
	580	#else
	581	panic("PANIC_DEBUG");
	582	#endif
	583	}
	584	}
	585	#endif
	586	}
	587	--mygd->gd_intr_nesting_level;
	588
	589	/*
	590	* Return non-zero if there are more IPI messages pending on this
	591	* ipiq. ip_npoll is left set as long as possible to reduce the
	592	* number of IPIs queued by the originating cpu, but must be cleared
	593	* BEFORE checking windex.
	594	*/
	595	atomic_poll_release_int(&ip->ip_npoll);
	596	return(wi != ip->ip_windex);
	597	}
	598
	599	static void
	600	lwkt_sync_ipiq(void *arg)
	601	{
	602	cpumask_t *cpumask = arg;
	603
	604	atomic_clear_cpumask(cpumask, mycpu->gd_cpumask);
	605	if (*cpumask == 0)
	606	wakeup(cpumask);
	607	}
	608
	609	void
	610	lwkt_synchronize_ipiqs(const char *wmesg)
	611	{
	612	cpumask_t other_cpumask;
	613
	614	other_cpumask = mycpu->gd_other_cpus & smp_active_mask;
	615	lwkt_send_ipiq_mask(other_cpumask, lwkt_sync_ipiq, &other_cpumask);
	616
	617	while (other_cpumask != 0) {
	618	tsleep_interlock(&other_cpumask, 0);
	619	if (other_cpumask != 0)
	620	tsleep(&other_cpumask, PINTERLOCKED, wmesg, 0);
	621	}
	622	}
	623
	624	#endif
	625
	626	/*
	627	* CPU Synchronization Support
	628	*
	629	* lwkt_cpusync_simple()
	630	*
	631	* The function is executed synchronously before return on remote cpus.
	632	* A lwkt_cpusync_t pointer is passed as an argument. The data can
	633	* be accessed via arg->cs_data.
	634	*
	635	* XXX should I just pass the data as an argument to be consistent?
	636	*/
	637
	638	void
	639	lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *data)
	640	{
	641	struct lwkt_cpusync cmd;
	642
	643	cmd.cs_run_func = NULL;
	644	cmd.cs_fin1_func = func;
	645	cmd.cs_fin2_func = NULL;
	646	cmd.cs_data = data;
	647	lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
	648	if (mask & CPUMASK(mycpu->gd_cpuid))
	649	func(&cmd);
	650	lwkt_cpusync_finish(&cmd);
	651	}
	652
	653	/*
	654	* lwkt_cpusync_fastdata()
	655	*
	656	* The function is executed in tandem with return on remote cpus.
	657	* The data is directly passed as an argument. Do not pass pointers to
	658	* temporary storage as the storage might have
	659	* gone poof by the time the target cpu executes
	660	* the function.
	661	*
	662	* At the moment lwkt_cpusync is declared on the stack and we must wait
	663	* for all remote cpus to ack in lwkt_cpusync_finish(), but as a future
	664	* optimization we should be able to put a counter in the globaldata
	665	* structure (if it is not otherwise being used) and just poke it and
	666	* return without waiting. XXX
	667	*/
	668	void
	669	lwkt_cpusync_fastdata(cpumask_t mask, cpusync_func2_t func, void *data)
	670	{
	671	struct lwkt_cpusync cmd;
	672
	673	cmd.cs_run_func = NULL;
	674	cmd.cs_fin1_func = NULL;
	675	cmd.cs_fin2_func = func;
	676	cmd.cs_data = NULL;
	677	lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
	678	if (mask & CPUMASK(mycpu->gd_cpuid))
	679	func(data);
	680	lwkt_cpusync_finish(&cmd);
	681	}
	682
	683	/*
	684	* lwkt_cpusync_start()
	685	*
	686	* Start synchronization with a set of target cpus, return once they are
	687	* known to be in a synchronization loop. The target cpus will execute
	688	* poll->cs_run_func() IN TANDEM WITH THE RETURN.
	689	*
	690	* XXX future: add lwkt_cpusync_start_quick() and require a call to
	691	* lwkt_cpusync_add() or lwkt_cpusync_wait(), allowing the caller to
	692	* potentially absorb the IPI latency doing something useful.
	693	*/
	694	void
	695	lwkt_cpusync_start(cpumask_t mask, lwkt_cpusync_t poll)
	696	{
	697	globaldata_t gd = mycpu;
	698
	699	poll->cs_count = 0;
	700	poll->cs_mask = mask;
	701	#ifdef SMP
	702	logipiq2(sync_start, mask & gd->gd_other_cpus);
	703	poll->cs_maxcount = lwkt_send_ipiq_mask(
	704	mask & gd->gd_other_cpus & smp_active_mask,
	705	(ipifunc1_t)lwkt_cpusync_remote1, poll);
	706	#endif
	707	if (mask & gd->gd_cpumask) {
	708	if (poll->cs_run_func)
	709	poll->cs_run_func(poll);
	710	}
	711	#ifdef SMP
	712	if (poll->cs_maxcount) {
	713	++ipiq_cscount;
	714	++gd->gd_curthread->td_cscount;
	715	while (poll->cs_count != poll->cs_maxcount) {
	716	crit_enter();
	717	lwkt_process_ipiq();
	718	crit_exit();
	719	}
	720	}
	721	#endif
	722	}
	723
	724	void
	725	lwkt_cpusync_add(cpumask_t mask, lwkt_cpusync_t poll)
	726	{
	727	globaldata_t gd = mycpu;
	728	#ifdef SMP
	729	int count;
	730	#endif
	731
	732	mask &= ~poll->cs_mask;
	733	poll->cs_mask \|= mask;
	734	#ifdef SMP
	735	logipiq2(sync_add, mask & gd->gd_other_cpus);
	736	count = lwkt_send_ipiq_mask(
	737	mask & gd->gd_other_cpus & smp_active_mask,
	738	(ipifunc1_t)lwkt_cpusync_remote1, poll);
	739	#endif
	740	if (mask & gd->gd_cpumask) {
	741	if (poll->cs_run_func)
	742	poll->cs_run_func(poll);
	743	}
	744	#ifdef SMP
	745	poll->cs_maxcount += count;
	746	if (poll->cs_maxcount) {
	747	if (poll->cs_maxcount == count)
	748	++gd->gd_curthread->td_cscount;
	749	while (poll->cs_count != poll->cs_maxcount) {
	750	crit_enter();
	751	lwkt_process_ipiq();
	752	crit_exit();
	753	}
	754	}
	755	#endif
	756	}
	757
	758	/*
	759	* Finish synchronization with a set of target cpus. The target cpus will
	760	* execute cs_fin1_func(poll) prior to this function returning, and will
	761	* execute cs_fin2_func(data) IN TANDEM WITH THIS FUNCTION'S RETURN.
	762	*
	763	* If cs_maxcount is non-zero then we are mastering a cpusync with one or
	764	* more remote cpus and must account for it in our thread structure.
	765	*/
	766	void
	767	lwkt_cpusync_finish(lwkt_cpusync_t poll)
	768	{
	769	globaldata_t gd = mycpu;
	770
	771	poll->cs_count = -1;
	772	if (poll->cs_mask & gd->gd_cpumask) {
	773	if (poll->cs_fin1_func)
	774	poll->cs_fin1_func(poll);
	775	if (poll->cs_fin2_func)
	776	poll->cs_fin2_func(poll->cs_data);
	777	}
	778	#ifdef SMP
	779	if (poll->cs_maxcount) {
	780	while (poll->cs_count != -(poll->cs_maxcount + 1)) {
	781	crit_enter();
	782	lwkt_process_ipiq();
	783	crit_exit();
	784	}
	785	--gd->gd_curthread->td_cscount;
	786	}
	787	#endif
	788	}
	789
	790	#ifdef SMP
	791
	792	/*
	793	* helper IPI remote messaging function.
	794	*
	795	* Called on remote cpu when a new cpu synchronization request has been
	796	* sent to us. Execute the run function and adjust cs_count, then requeue
	797	* the request so we spin on it.
	798	*/
	799	static void
	800	lwkt_cpusync_remote1(lwkt_cpusync_t poll)
	801	{
	802	atomic_add_int(&poll->cs_count, 1);
	803	if (poll->cs_run_func)
	804	poll->cs_run_func(poll);
	805	lwkt_cpusync_remote2(poll);
	806	}
	807
	808	/*
	809	* helper IPI remote messaging function.
	810	*
	811	* Poll for the originator telling us to finish. If it hasn't, requeue
	812	* our request so we spin on it. When the originator requests that we
	813	* finish we execute cs_fin1_func(poll) synchronously and cs_fin2_func(data)
	814	* in tandem with the release.
	815	*/
	816	static void
	817	lwkt_cpusync_remote2(lwkt_cpusync_t poll)
	818	{
	819	if (poll->cs_count < 0) {
	820	cpusync_func2_t savef;
	821	void *saved;
	822
	823	if (poll->cs_fin1_func)
	824	poll->cs_fin1_func(poll);
	825	if (poll->cs_fin2_func) {
	826	savef = poll->cs_fin2_func;
	827	saved = poll->cs_data;
	828	cpu_ccfence(); /* required ordering for MP operation */
	829	atomic_add_int(&poll->cs_count, -1);
	830	savef(saved);
	831	} else {
	832	atomic_add_int(&poll->cs_count, -1);
	833	}
	834	} else {
	835	globaldata_t gd = mycpu;
	836	lwkt_ipiq_t ip;
	837	int wi;
	838
	839	ip = &gd->gd_cpusyncq;
	840	wi = ip->ip_windex & MAXCPUFIFO_MASK;
	841	ip->ip_func[wi] = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
	842	ip->ip_arg1[wi] = poll;
	843	ip->ip_arg2[wi] = 0;
	844	cpu_sfence();
	845	++ip->ip_windex;
	846	}
	847	}
	848
	849	#endif