gitweb.dragonflybsd.org Git - dragonfly.git/blame_incremental

... / ...

Commit	Line	Data
	1	/*
	2	* Copyright (c) 2003,2004 The DragonFly Project. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
	5	* by Matthew Dillon <dillon@backplane.com>
	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	*
	11	* 1. Redistributions of source code must retain the above copyright
	12	* notice, this list of conditions and the following disclaimer.
	13	* 2. Redistributions in binary form must reproduce the above copyright
	14	* notice, this list of conditions and the following disclaimer in
	15	* the documentation and/or other materials provided with the
	16	* distribution.
	17	* 3. Neither the name of The DragonFly Project nor the names of its
	18	* contributors may be used to endorse or promote products derived
	19	* from this software without specific, prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	22	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	23	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	24	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	25	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	26	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	27	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	28	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	29	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	30	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	31	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	32	* SUCH DAMAGE.
	33	*
	34	* $DragonFly: src/sys/kern/lwkt_ipiq.c,v 1.27 2008/05/18 20:57:56 nth Exp $
	35	*/
	36
	37	/*
	38	* This module implements IPI message queueing and the MI portion of IPI
	39	* message processing.
	40	*/
	41
	42	#include "opt_ddb.h"
	43
	44	#include <sys/param.h>
	45	#include <sys/systm.h>
	46	#include <sys/kernel.h>
	47	#include <sys/proc.h>
	48	#include <sys/rtprio.h>
	49	#include <sys/queue.h>
	50	#include <sys/thread2.h>
	51	#include <sys/sysctl.h>
	52	#include <sys/ktr.h>
	53	#include <sys/kthread.h>
	54	#include <machine/cpu.h>
	55	#include <sys/lock.h>
	56	#include <sys/caps.h>
	57
	58	#include <vm/vm.h>
	59	#include <vm/vm_param.h>
	60	#include <vm/vm_kern.h>
	61	#include <vm/vm_object.h>
	62	#include <vm/vm_page.h>
	63	#include <vm/vm_map.h>
	64	#include <vm/vm_pager.h>
	65	#include <vm/vm_extern.h>
	66	#include <vm/vm_zone.h>
	67
	68	#include <machine/stdarg.h>
	69	#include <machine/smp.h>
	70	#include <machine/atomic.h>
	71
	#ifdef SMP
	/*
	 * IPI queue statistics and tunables.  These are exported through the
	 * lwkt sysctl tree.
	 */
	static __int64_t ipiq_count;	/* total calls to lwkt_send_ipiq*() */
	static __int64_t ipiq_fifofull;	/* number of fifo full conditions detected */
	static __int64_t ipiq_avoided;	/* interlock with target avoids cpu ipi */
	static __int64_t ipiq_passive;	/* passive IPI messages */
	static __int64_t ipiq_cscount;	/* number of cpu synchronizations */
	static int ipiq_optimized = 1;	/* XXX temporary sysctl */
	#ifdef PANIC_DEBUG
	/*
	 * Debugging aid: the cpu on which IPI processing simulates a panic, and
	 * the number of processed IPI messages remaining before it triggers.
	 */
	static int panic_ipiq_cpu = -1;
	static int panic_ipiq_count = 100;
	#endif
	#endif
	84
	#ifdef SMP
	/*
	 * Export the IPI queue counters and tunables under the lwkt sysctl node.
	 */
	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_count, CTLFLAG_RW, &ipiq_count, 0, "");
	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_fifofull, CTLFLAG_RW, &ipiq_fifofull, 0, "");
	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_avoided, CTLFLAG_RW, &ipiq_avoided, 0, "");
	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_passive, CTLFLAG_RW, &ipiq_passive, 0, "");
	SYSCTL_QUAD(_lwkt, OID_AUTO, ipiq_cscount, CTLFLAG_RW, &ipiq_cscount, 0, "");
	SYSCTL_INT(_lwkt, OID_AUTO, ipiq_optimized, CTLFLAG_RW, &ipiq_optimized, 0, "");
	#ifdef PANIC_DEBUG
	SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_cpu, CTLFLAG_RW, &panic_ipiq_cpu, 0, "");
	SYSCTL_INT(_lwkt, OID_AUTO, panic_ipiq_count, CTLFLAG_RW, &panic_ipiq_count, 0, "");
	#endif

	/*
	 * KTR trace record layout shared by most IPIQ trace points: two
	 * pointers (func, arg1) followed by three ints (arg2, scpu, dcpu).
	 * IPIQ_ARG_SIZE must match the arguments named in IPIQ_STRING.
	 */
	#define IPIQ_STRING	"func=%p arg1=%p arg2=%d scpu=%d dcpu=%d"
	#define IPIQ_ARG_SIZE	(sizeof(void *) * 2 + sizeof(int) * 3)

	#if !defined(KTR_IPIQ)
	#define KTR_IPIQ	KTR_ALL
	#endif
	KTR_INFO_MASTER(ipiq);
	KTR_INFO(KTR_IPIQ, ipiq, send_norm, 0, IPIQ_STRING, IPIQ_ARG_SIZE);
	KTR_INFO(KTR_IPIQ, ipiq, send_pasv, 1, IPIQ_STRING, IPIQ_ARG_SIZE);
	KTR_INFO(KTR_IPIQ, ipiq, send_nbio, 2, IPIQ_STRING, IPIQ_ARG_SIZE);
	KTR_INFO(KTR_IPIQ, ipiq, send_fail, 3, IPIQ_STRING, IPIQ_ARG_SIZE);
	KTR_INFO(KTR_IPIQ, ipiq, receive, 4, IPIQ_STRING, IPIQ_ARG_SIZE);
	KTR_INFO(KTR_IPIQ, ipiq, sync_start, 5, "cpumask=%08x", sizeof(cpumask_t));
	KTR_INFO(KTR_IPIQ, ipiq, sync_add, 6, "cpumask=%08x", sizeof(cpumask_t));
	KTR_INFO(KTR_IPIQ, ipiq, cpu_send, 7, IPIQ_STRING, IPIQ_ARG_SIZE);
	KTR_INFO(KTR_IPIQ, ipiq, send_end, 8, IPIQ_STRING, IPIQ_ARG_SIZE);

	/*
	 * logipiq()  - log an IPI message event; sgd/dgd are the source and
	 *		destination globaldata pointers, logged as cpu ids.
	 * logipiq2() - log a single-argument event (the cpusync cpumask).
	 */
	#define logipiq(name, func, arg1, arg2, sgd, dgd)	\
	    KTR_LOG(ipiq_ ## name, func, arg1, arg2, (sgd)->gd_cpuid, (dgd)->gd_cpuid)
	#define logipiq2(name, arg)	\
	    KTR_LOG(ipiq_ ## name, arg)

	#endif	/* SMP */
	120
	121	#ifdef SMP
	122
	123	static int lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
	124	struct intrframe *frame);
	125	static void lwkt_cpusync_remote1(lwkt_cpusync_t poll);
	126	static void lwkt_cpusync_remote2(lwkt_cpusync_t poll);
	127
	128	/*
	129	* Send a function execution request to another cpu. The request is queued
	130	* on the cpu<->cpu ipiq matrix. Each cpu owns a unique ipiq FIFO for every
	131	* possible target cpu. The FIFO can be written.
	132	*
	133	* If the FIFO fills up we have to enable interrupts to avoid an APIC
	134	* deadlock and process pending IPIQs while waiting for it to empty.
	135	* Otherwise we may soft-deadlock with another cpu whos FIFO is also full.
	136	*
	137	* We can safely bump gd_intr_nesting_level because our crit_exit() at the
	138	* end will take care of any pending interrupts.
	139	*
	140	* The actual hardware IPI is avoided if the target cpu is already processing
	141	* the queue from a prior IPI. It is possible to pipeline IPI messages
	142	* very quickly between cpus due to the FIFO hysteresis.
	143	*
	144	* Need not be called from a critical section.
	145	*/
	146	int
	147	lwkt_send_ipiq3(globaldata_t target, ipifunc3_t func, void *arg1, int arg2)
	148	{
	149	lwkt_ipiq_t ip;
	150	int windex;
	151	struct globaldata *gd = mycpu;
	152
	153	logipiq(send_norm, func, arg1, arg2, gd, target);
	154
	155	if (target == gd) {
	156	func(arg1, arg2, NULL);
	157	logipiq(send_end, func, arg1, arg2, gd, target);
	158	return(0);
	159	}
	160	crit_enter();
	161	++gd->gd_intr_nesting_level;
	162	#ifdef INVARIANTS
	163	if (gd->gd_intr_nesting_level > 20)
	164	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
	165	#endif
	166	KKASSERT(curthread->td_pri >= TDPRI_CRIT);
	167	++ipiq_count;
	168	ip = &gd->gd_ipiq[target->gd_cpuid];
	169
	170	/*
	171	* Do not allow the FIFO to become full. Interrupts must be physically
	172	* enabled while we liveloop to avoid deadlocking the APIC.
	173	*/
	174	if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
	175	unsigned int eflags = read_eflags();
	176
	177	if (atomic_poll_acquire_int(&ip->ip_npoll) \|\| ipiq_optimized == 0) {
	178	logipiq(cpu_send, func, arg1, arg2, gd, target);
	179	cpu_send_ipiq(target->gd_cpuid);
	180	}
	181	cpu_enable_intr();
	182	++ipiq_fifofull;
	183	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
	184	KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
	185	lwkt_process_ipiq();
	186	}
	187	write_eflags(eflags);
	188	}
	189
	190	/*
	191	* Queue the new message
	192	*/
	193	windex = ip->ip_windex & MAXCPUFIFO_MASK;
	194	ip->ip_func[windex] = func;
	195	ip->ip_arg1[windex] = arg1;
	196	ip->ip_arg2[windex] = arg2;
	197	cpu_sfence();
	198	++ip->ip_windex;
	199	--gd->gd_intr_nesting_level;
	200
	201	/*
	202	* signal the target cpu that there is work pending.
	203	*/
	204	if (atomic_poll_acquire_int(&ip->ip_npoll)) {
	205	logipiq(cpu_send, func, arg1, arg2, gd, target);
	206	cpu_send_ipiq(target->gd_cpuid);
	207	} else {
	208	if (ipiq_optimized == 0) {
	209	logipiq(cpu_send, func, arg1, arg2, gd, target);
	210	cpu_send_ipiq(target->gd_cpuid);
	211	} else {
	212	++ipiq_avoided;
	213	}
	214	}
	215	crit_exit();
	216
	217	logipiq(send_end, func, arg1, arg2, gd, target);
	218	return(ip->ip_windex);
	219	}
	220
	221	/*
	222	* Similar to lwkt_send_ipiq() but this function does not actually initiate
	223	* the IPI to the target cpu unless the FIFO has become too full, so it is
	224	* very fast.
	225	*
	226	* This function is used for non-critical IPI messages, such as memory
	227	* deallocations. The queue will typically be flushed by the target cpu at
	228	* the next clock interrupt.
	229	*
	230	* Need not be called from a critical section.
	231	*/
	232	int
	233	lwkt_send_ipiq3_passive(globaldata_t target, ipifunc3_t func,
	234	void *arg1, int arg2)
	235	{
	236	lwkt_ipiq_t ip;
	237	int windex;
	238	struct globaldata *gd = mycpu;
	239
	240	KKASSERT(target != gd);
	241	crit_enter();
	242	logipiq(send_pasv, func, arg1, arg2, gd, target);
	243	++gd->gd_intr_nesting_level;
	244	#ifdef INVARIANTS
	245	if (gd->gd_intr_nesting_level > 20)
	246	panic("lwkt_send_ipiq: TOO HEAVILY NESTED!");
	247	#endif
	248	KKASSERT(curthread->td_pri >= TDPRI_CRIT);
	249	++ipiq_count;
	250	++ipiq_passive;
	251	ip = &gd->gd_ipiq[target->gd_cpuid];
	252
	253	/*
	254	* Do not allow the FIFO to become full. Interrupts must be physically
	255	* enabled while we liveloop to avoid deadlocking the APIC.
	256	*/
	257	if (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 2) {
	258	unsigned int eflags = read_eflags();
	259
	260	if (atomic_poll_acquire_int(&ip->ip_npoll) \|\| ipiq_optimized == 0) {
	261	logipiq(cpu_send, func, arg1, arg2, gd, target);
	262	cpu_send_ipiq(target->gd_cpuid);
	263	}
	264	cpu_enable_intr();
	265	++ipiq_fifofull;
	266	while (ip->ip_windex - ip->ip_rindex > MAXCPUFIFO / 4) {
	267	KKASSERT(ip->ip_windex - ip->ip_rindex != MAXCPUFIFO - 1);
	268	lwkt_process_ipiq();
	269	}
	270	write_eflags(eflags);
	271	}
	272
	273	/*
	274	* Queue the new message
	275	*/
	276	windex = ip->ip_windex & MAXCPUFIFO_MASK;
	277	ip->ip_func[windex] = func;
	278	ip->ip_arg1[windex] = arg1;
	279	ip->ip_arg2[windex] = arg2;
	280	cpu_sfence();
	281	++ip->ip_windex;
	282	--gd->gd_intr_nesting_level;
	283
	284	/*
	285	* Do not signal the target cpu, it will pick up the IPI when it next
	286	* polls (typically on the next tick).
	287	*/
	288	crit_exit();
	289
	290	logipiq(send_end, func, arg1, arg2, gd, target);
	291	return(ip->ip_windex);
	292	}
	293
	294	/*
	295	* Send an IPI request without blocking, return 0 on success, ENOENT on
	296	* failure. The actual queueing of the hardware IPI may still force us
	297	* to spin and process incoming IPIs but that will eventually go away
	298	* when we've gotten rid of the other general IPIs.
	299	*/
	300	int
	301	lwkt_send_ipiq3_nowait(globaldata_t target, ipifunc3_t func,
	302	void *arg1, int arg2)
	303	{
	304	lwkt_ipiq_t ip;
	305	int windex;
	306	struct globaldata *gd = mycpu;
	307
	308	logipiq(send_nbio, func, arg1, arg2, gd, target);
	309	KKASSERT(curthread->td_pri >= TDPRI_CRIT);
	310	if (target == gd) {
	311	func(arg1, arg2, NULL);
	312	logipiq(send_end, func, arg1, arg2, gd, target);
	313	return(0);
	314	}
	315	++ipiq_count;
	316	ip = &gd->gd_ipiq[target->gd_cpuid];
	317
	318	if (ip->ip_windex - ip->ip_rindex >= MAXCPUFIFO * 2 / 3) {
	319	logipiq(send_fail, func, arg1, arg2, gd, target);
	320	return(ENOENT);
	321	}
	322	windex = ip->ip_windex & MAXCPUFIFO_MASK;
	323	ip->ip_func[windex] = func;
	324	ip->ip_arg1[windex] = arg1;
	325	ip->ip_arg2[windex] = arg2;
	326	cpu_sfence();
	327	++ip->ip_windex;
	328
	329	/*
	330	* This isn't a passive IPI, we still have to signal the target cpu.
	331	*/
	332	if (atomic_poll_acquire_int(&ip->ip_npoll)) {
	333	logipiq(cpu_send, func, arg1, arg2, gd, target);
	334	cpu_send_ipiq(target->gd_cpuid);
	335	} else {
	336	if (ipiq_optimized == 0) {
	337	logipiq(cpu_send, func, arg1, arg2, gd, target);
	338	cpu_send_ipiq(target->gd_cpuid);
	339	} else {
	340	++ipiq_avoided;
	341	}
	342	}
	343
	344	logipiq(send_end, func, arg1, arg2, gd, target);
	345	return(0);
	346	}
	347
	348	/*
	349	* deprecated, used only by fast int forwarding.
	350	*/
	351	int
	352	lwkt_send_ipiq3_bycpu(int dcpu, ipifunc3_t func, void *arg1, int arg2)
	353	{
	354	return(lwkt_send_ipiq3(globaldata_find(dcpu), func, arg1, arg2));
	355	}
	356
	357	/*
	358	* Send a message to several target cpus. Typically used for scheduling.
	359	* The message will not be sent to stopped cpus.
	360	*/
	361	int
	362	lwkt_send_ipiq3_mask(u_int32_t mask, ipifunc3_t func, void *arg1, int arg2)
	363	{
	364	int cpuid;
	365	int count = 0;
	366
	367	mask &= ~stopped_cpus;
	368	while (mask) {
	369	cpuid = bsfl(mask);
	370	lwkt_send_ipiq3(globaldata_find(cpuid), func, arg1, arg2);
	371	mask &= ~(1 << cpuid);
	372	++count;
	373	}
	374	return(count);
	375	}
	376
	377	/*
	378	* Wait for the remote cpu to finish processing a function.
	379	*
	380	* YYY we have to enable interrupts and process the IPIQ while waiting
	381	* for it to empty or we may deadlock with another cpu. Create a CPU_*()
	382	* function to do this! YYY we really should 'block' here.
	383	*
	384	* MUST be called from a critical section. This routine may be called
	385	* from an interrupt (for example, if an interrupt wakes a foreign thread
	386	* up).
	387	*/
	388	void
	389	lwkt_wait_ipiq(globaldata_t target, int seq)
	390	{
	391	lwkt_ipiq_t ip;
	392	int maxc = 100000000;
	393
	394	if (target != mycpu) {
	395	ip = &mycpu->gd_ipiq[target->gd_cpuid];
	396	if ((int)(ip->ip_xindex - seq) < 0) {
	397	unsigned int eflags = read_eflags();
	398	cpu_enable_intr();
	399	while ((int)(ip->ip_xindex - seq) < 0) {
	400	crit_enter();
	401	lwkt_process_ipiq();
	402	crit_exit();
	403	if (--maxc == 0)
	404	kprintf("LWKT_WAIT_IPIQ WARNING! %d wait %d (%d)\n", mycpu->gd_cpuid, target->gd_cpuid, ip->ip_xindex - seq);
	405	if (maxc < -1000000)
	406	panic("LWKT_WAIT_IPIQ");
	407	/*
	408	* xindex may be modified by another cpu, use a load fence
	409	* to ensure that the loop does not use a speculative value
	410	* (which may improve performance).
	411	*/
	412	cpu_lfence();
	413	}
	414	write_eflags(eflags);
	415	}
	416	}
	417	}
	418
	419	int
	420	lwkt_seq_ipiq(globaldata_t target)
	421	{
	422	lwkt_ipiq_t ip;
	423
	424	ip = &mycpu->gd_ipiq[target->gd_cpuid];
	425	return(ip->ip_windex);
	426	}
	427
	428	/*
	429	* Called from IPI interrupt (like a fast interrupt), which has placed
	430	* us in a critical section. The MP lock may or may not be held.
	431	* May also be called from doreti or splz, or be reentrantly called
	432	* indirectly through the ip_func[] we run.
	433	*
	434	* There are two versions, one where no interrupt frame is available (when
	435	* called from the send code and from splz, and one where an interrupt
	436	* frame is available.
	437	*/
	438	void
	439	lwkt_process_ipiq(void)
	440	{
	441	globaldata_t gd = mycpu;
	442	globaldata_t sgd;
	443	lwkt_ipiq_t ip;
	444	int n;
	445
	446	again:
	447	for (n = 0; n < ncpus; ++n) {
	448	if (n != gd->gd_cpuid) {
	449	sgd = globaldata_find(n);
	450	ip = sgd->gd_ipiq;
	451	if (ip != NULL) {
	452	while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], NULL))
	453	;
	454	}
	455	}
	456	}
	457	if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	458	if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, NULL)) {
	459	if (gd->gd_curthread->td_cscount == 0)
	460	goto again;
	461	need_ipiq();
	462	}
	463	}
	464	}
	465
	466	void
	467	lwkt_process_ipiq_frame(struct intrframe *frame)
	468	{
	469	globaldata_t gd = mycpu;
	470	globaldata_t sgd;
	471	lwkt_ipiq_t ip;
	472	int n;
	473
	474	again:
	475	for (n = 0; n < ncpus; ++n) {
	476	if (n != gd->gd_cpuid) {
	477	sgd = globaldata_find(n);
	478	ip = sgd->gd_ipiq;
	479	if (ip != NULL) {
	480	while (lwkt_process_ipiq_core(sgd, &ip[gd->gd_cpuid], frame))
	481	;
	482	}
	483	}
	484	}
	485	if (gd->gd_cpusyncq.ip_rindex != gd->gd_cpusyncq.ip_windex) {
	486	if (lwkt_process_ipiq_core(gd, &gd->gd_cpusyncq, frame)) {
	487	if (gd->gd_curthread->td_cscount == 0)
	488	goto again;
	489	need_ipiq();
	490	}
	491	}
	492	}
	493
	494	static int
	495	lwkt_process_ipiq_core(globaldata_t sgd, lwkt_ipiq_t ip,
	496	struct intrframe *frame)
	497	{
	498	int ri;
	499	int wi;
	500	ipifunc3_t copy_func;
	501	void *copy_arg1;
	502	int copy_arg2;
	503
	504	/*
	505	* Obtain the current write index, which is modified by a remote cpu.
	506	* Issue a load fence to prevent speculative reads of e.g. data written
	507	* by the other cpu prior to it updating the index.
	508	*/
	509	KKASSERT(curthread->td_pri >= TDPRI_CRIT);
	510	wi = ip->ip_windex;
	511	cpu_lfence();
	512
	513	/*
	514	* Note: xindex is only updated after we are sure the function has
	515	* finished execution. Beware lwkt_process_ipiq() reentrancy! The
	516	* function may send an IPI which may block/drain.
	517	*
	518	* Note: due to additional IPI operations that the callback function
	519	* may make, it is possible for both rindex and windex to advance and
	520	* thus for rindex to advance passed our cached windex.
	521	*/
	522	while (wi - (ri = ip->ip_rindex) > 0) {
	523	ri &= MAXCPUFIFO_MASK;
	524	copy_func = ip->ip_func[ri];
	525	copy_arg1 = ip->ip_arg1[ri];
	526	copy_arg2 = ip->ip_arg2[ri];
	527	cpu_mfence();
	528	++ip->ip_rindex;
	529	KKASSERT((ip->ip_rindex & MAXCPUFIFO_MASK) == ((ri + 1) & MAXCPUFIFO_MASK));
	530	logipiq(receive, copy_func, copy_arg1, copy_arg2, sgd, mycpu);
	531	copy_func(copy_arg1, copy_arg2, frame);
	532	cpu_sfence();
	533	ip->ip_xindex = ip->ip_rindex;
	534
	535	#ifdef PANIC_DEBUG
	536	/*
	537	* Simulate panics during the processing of an IPI
	538	*/
	539	if (mycpu->gd_cpuid == panic_ipiq_cpu && panic_ipiq_count) {
	540	if (--panic_ipiq_count == 0) {
	541	#ifdef DDB
	542	Debugger("PANIC_DEBUG");
	543	#else
	544	panic("PANIC_DEBUG");
	545	#endif
	546	}
	547	}
	548	#endif
	549	}
	550
	551	/*
	552	* Return non-zero if there are more IPI messages pending on this
	553	* ipiq. ip_npoll is left set as long as possible to reduce the
	554	* number of IPIs queued by the originating cpu, but must be cleared
	555	* BEFORE checking windex.
	556	*/
	557	atomic_poll_release_int(&ip->ip_npoll);
	558	return(wi != ip->ip_windex);
	559	}
	560
	561	static void
	562	lwkt_sync_ipiq(void *arg)
	563	{
	564	cpumask_t *cpumask = arg;
	565
	566	atomic_clear_int(cpumask, mycpu->gd_cpumask);
	567	if (*cpumask == 0)
	568	wakeup(cpumask);
	569	}
	570
	571	void
	572	lwkt_synchronize_ipiqs(const char *wmesg)
	573	{
	574	cpumask_t other_cpumask;
	575
	576	other_cpumask = mycpu->gd_other_cpus & smp_active_mask;
	577	lwkt_send_ipiq_mask(other_cpumask, lwkt_sync_ipiq, &other_cpumask);
	578
	579	crit_enter();
	580	while (other_cpumask != 0) {
	581	tsleep_interlock(&other_cpumask);
	582	if (other_cpumask != 0)
	583	tsleep(&other_cpumask, 0, wmesg, 0);
	584	}
	585	crit_exit();
	586	}
	587
	588	#endif
	589
	590	/*
	591	* CPU Synchronization Support
	592	*
	593	* lwkt_cpusync_simple()
	594	*
	595	* The function is executed synchronously before return on remote cpus.
	596	* A lwkt_cpusync_t pointer is passed as an argument. The data can
	597	* be accessed via arg->cs_data.
	598	*
	599	* XXX should I just pass the data as an argument to be consistent?
	600	*/
	601
	602	void
	603	lwkt_cpusync_simple(cpumask_t mask, cpusync_func_t func, void *data)
	604	{
	605	struct lwkt_cpusync cmd;
	606
	607	cmd.cs_run_func = NULL;
	608	cmd.cs_fin1_func = func;
	609	cmd.cs_fin2_func = NULL;
	610	cmd.cs_data = data;
	611	lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
	612	if (mask & (1 << mycpu->gd_cpuid))
	613	func(&cmd);
	614	lwkt_cpusync_finish(&cmd);
	615	}
	616
	617	/*
	618	* lwkt_cpusync_fastdata()
	619	*
	620	* The function is executed in tandem with return on remote cpus.
	621	* The data is directly passed as an argument. Do not pass pointers to
	622	* temporary storage as the storage might have
	623	* gone poof by the time the target cpu executes
	624	* the function.
	625	*
	626	* At the moment lwkt_cpusync is declared on the stack and we must wait
	627	* for all remote cpus to ack in lwkt_cpusync_finish(), but as a future
	628	* optimization we should be able to put a counter in the globaldata
	629	* structure (if it is not otherwise being used) and just poke it and
	630	* return without waiting. XXX
	631	*/
	632	void
	633	lwkt_cpusync_fastdata(cpumask_t mask, cpusync_func2_t func, void *data)
	634	{
	635	struct lwkt_cpusync cmd;
	636
	637	cmd.cs_run_func = NULL;
	638	cmd.cs_fin1_func = NULL;
	639	cmd.cs_fin2_func = func;
	640	cmd.cs_data = NULL;
	641	lwkt_cpusync_start(mask & mycpu->gd_other_cpus, &cmd);
	642	if (mask & (1 << mycpu->gd_cpuid))
	643	func(data);
	644	lwkt_cpusync_finish(&cmd);
	645	}
	646
	647	/*
	648	* lwkt_cpusync_start()
	649	*
	650	* Start synchronization with a set of target cpus, return once they are
	651	* known to be in a synchronization loop. The target cpus will execute
	652	* poll->cs_run_func() IN TANDEM WITH THE RETURN.
	653	*
	654	* XXX future: add lwkt_cpusync_start_quick() and require a call to
	655	* lwkt_cpusync_add() or lwkt_cpusync_wait(), allowing the caller to
	656	* potentially absorb the IPI latency doing something useful.
	657	*/
	658	void
	659	lwkt_cpusync_start(cpumask_t mask, lwkt_cpusync_t poll)
	660	{
	661	globaldata_t gd = mycpu;
	662
	663	poll->cs_count = 0;
	664	poll->cs_mask = mask;
	665	#ifdef SMP
	666	logipiq2(sync_start, mask & gd->gd_other_cpus);
	667	poll->cs_maxcount = lwkt_send_ipiq_mask(
	668	mask & gd->gd_other_cpus & smp_active_mask,
	669	(ipifunc1_t)lwkt_cpusync_remote1, poll);
	670	#endif
	671	if (mask & gd->gd_cpumask) {
	672	if (poll->cs_run_func)
	673	poll->cs_run_func(poll);
	674	}
	675	#ifdef SMP
	676	if (poll->cs_maxcount) {
	677	++ipiq_cscount;
	678	++gd->gd_curthread->td_cscount;
	679	while (poll->cs_count != poll->cs_maxcount) {
	680	crit_enter();
	681	lwkt_process_ipiq();
	682	crit_exit();
	683	}
	684	}
	685	#endif
	686	}
	687
	688	void
	689	lwkt_cpusync_add(cpumask_t mask, lwkt_cpusync_t poll)
	690	{
	691	globaldata_t gd = mycpu;
	692	#ifdef SMP
	693	int count;
	694	#endif
	695
	696	mask &= ~poll->cs_mask;
	697	poll->cs_mask \|= mask;
	698	#ifdef SMP
	699	logipiq2(sync_add, mask & gd->gd_other_cpus);
	700	count = lwkt_send_ipiq_mask(
	701	mask & gd->gd_other_cpus & smp_active_mask,
	702	(ipifunc1_t)lwkt_cpusync_remote1, poll);
	703	#endif
	704	if (mask & gd->gd_cpumask) {
	705	if (poll->cs_run_func)
	706	poll->cs_run_func(poll);
	707	}
	708	#ifdef SMP
	709	poll->cs_maxcount += count;
	710	if (poll->cs_maxcount) {
	711	if (poll->cs_maxcount == count)
	712	++gd->gd_curthread->td_cscount;
	713	while (poll->cs_count != poll->cs_maxcount) {
	714	crit_enter();
	715	lwkt_process_ipiq();
	716	crit_exit();
	717	}
	718	}
	719	#endif
	720	}
	721
	722	/*
	723	* Finish synchronization with a set of target cpus. The target cpus will
	724	* execute cs_fin1_func(poll) prior to this function returning, and will
	725	* execute cs_fin2_func(data) IN TANDEM WITH THIS FUNCTION'S RETURN.
	726	*
	727	* If cs_maxcount is non-zero then we are mastering a cpusync with one or
	728	* more remote cpus and must account for it in our thread structure.
	729	*/
	730	void
	731	lwkt_cpusync_finish(lwkt_cpusync_t poll)
	732	{
	733	globaldata_t gd = mycpu;
	734
	735	poll->cs_count = -1;
	736	if (poll->cs_mask & gd->gd_cpumask) {
	737	if (poll->cs_fin1_func)
	738	poll->cs_fin1_func(poll);
	739	if (poll->cs_fin2_func)
	740	poll->cs_fin2_func(poll->cs_data);
	741	}
	742	#ifdef SMP
	743	if (poll->cs_maxcount) {
	744	while (poll->cs_count != -(poll->cs_maxcount + 1)) {
	745	crit_enter();
	746	lwkt_process_ipiq();
	747	crit_exit();
	748	}
	749	--gd->gd_curthread->td_cscount;
	750	}
	751	#endif
	752	}
	753
	754	#ifdef SMP
	755
	756	/*
	757	* helper IPI remote messaging function.
	758	*
	759	* Called on remote cpu when a new cpu synchronization request has been
	760	* sent to us. Execute the run function and adjust cs_count, then requeue
	761	* the request so we spin on it.
	762	*/
	763	static void
	764	lwkt_cpusync_remote1(lwkt_cpusync_t poll)
	765	{
	766	atomic_add_int(&poll->cs_count, 1);
	767	if (poll->cs_run_func)
	768	poll->cs_run_func(poll);
	769	lwkt_cpusync_remote2(poll);
	770	}
	771
	772	/*
	773	* helper IPI remote messaging function.
	774	*
	775	* Poll for the originator telling us to finish. If it hasn't, requeue
	776	* our request so we spin on it. When the originator requests that we
	777	* finish we execute cs_fin1_func(poll) synchronously and cs_fin2_func(data)
	778	* in tandem with the release.
	779	*/
	780	static void
	781	lwkt_cpusync_remote2(lwkt_cpusync_t poll)
	782	{
	783	if (poll->cs_count < 0) {
	784	cpusync_func2_t savef;
	785	void *saved;
	786
	787	if (poll->cs_fin1_func)
	788	poll->cs_fin1_func(poll);
	789	if (poll->cs_fin2_func) {
	790	savef = poll->cs_fin2_func;
	791	saved = poll->cs_data;
	792	atomic_add_int(&poll->cs_count, -1);
	793	savef(saved);
	794	} else {
	795	atomic_add_int(&poll->cs_count, -1);
	796	}
	797	} else {
	798	globaldata_t gd = mycpu;
	799	lwkt_ipiq_t ip;
	800	int wi;
	801
	802	ip = &gd->gd_cpusyncq;
	803	wi = ip->ip_windex & MAXCPUFIFO_MASK;
	804	ip->ip_func[wi] = (ipifunc3_t)(ipifunc1_t)lwkt_cpusync_remote2;
	805	ip->ip_arg1[wi] = poll;
	806	ip->ip_arg2[wi] = 0;
	807	cpu_sfence();
	808	++ip->ip_windex;
	809	}
	810	}
	811
	812	#endif