gitweb.dragonflybsd.org Git - dragonfly.git/blame

Commit	Line	Data
b1af91cb JH	1	/*
	2	* Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved.
	3	*
	4	* This code is derived from software contributed to The DragonFly Project
d666840a	5	* by Jeffrey M. Hsu. and Matthew Dillon
b1af91cb JH	6	*
	7	* Redistribution and use in source and binary forms, with or without
	8	* modification, are permitted provided that the following conditions
	9	* are met:
	10	* 1. Redistributions of source code must retain the above copyright
	11	* notice, this list of conditions and the following disclaimer.
	12	* 2. Redistributions in binary form must reproduce the above copyright
	13	* notice, this list of conditions and the following disclaimer in the
	14	* documentation and/or other materials provided with the distribution.
	15	* 3. Neither the name of The DragonFly Project nor the names of its
	16	* contributors may be used to endorse or promote products derived
	17	* from this software without specific, prior written permission.
	18	*
	19	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
	20	* ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
	21	* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
	22	* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
	23	* COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
	24	* INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
	25	* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	26	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
	27	* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
	28	* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
	29	* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
	30	* SUCH DAMAGE.
b12defdc	31	*/
0846e4ce	32
b12defdc	33	/*
0846e4ce MD	34	* The implementation is designed to avoid looping when compatible operations
	35	* are executed.
	36	*
d033fb32	37	* To acquire a spinlock we first increment lock. Then we check if lock
0846e4ce MD	38	* meets our requirements. For an exclusive spinlock it must be 1, for a
	39	* shared spinlock it must either be 1 or the SPINLOCK_SHARED bit must be set.
	40	*
	41	* Shared spinlock failure case: Decrement the count, loop until we can
	42	* transition from 0 to SPINLOCK_SHARED|1, or until we find SPINLOCK_SHARED
	43	* is set and increment the count.
	44	*
	45	* Exclusive spinlock failure case: While maintaining the count, clear the
	46	* SPINLOCK_SHARED flag unconditionally. Then use an atomic add to transfer
d033fb32	47	* the count from the low bits to the high bits of lock. Then loop until
0846e4ce MD	48	* all low bits are 0. Once the low bits drop to 0 we can transfer the
0846e4ce MD	49	* count back with an atomic_cmpset_int(), atomically, and return.
b1af91cb	50	*/
b1af91cb	51	#include <sys/param.h>
b02926de	52	#include <sys/systm.h>
b1af91cb	53	#include <sys/types.h>
b02926de MD	54	#include <sys/kernel.h>
	55	#include <sys/sysctl.h>
	56	#ifdef INVARIANTS
	57	#include <sys/proc.h>
	58	#endif
2b3f93ea	59	#include <sys/caps.h>
b1af91cb	60	#include <machine/atomic.h>
b12defdc	61	#include <machine/cpu.h>
b02926de	62	#include <machine/cpufunc.h>
10c66d57	63	#include <machine/specialreg.h>
b02926de	64	#include <machine/clock.h>
5b49787b	65	#include <sys/indefinite2.h>
b1af91cb	66	#include <sys/spinlock.h>
35a832df	67	#include <sys/spinlock2.h>
57aa743c	68	#include <sys/ktr.h>
b1af91cb	69
ba87a4ab	70	struct spinlock pmap_spin = SPINLOCK_INITIALIZER(pmap_spin, "pmap_spin");
492d98e6	71
57aa743c MD	72	/*
	73	* Kernel Trace (KTR) support
		*
		* KTR_SPIN_CONTENTION defaults to KTR_ALL unless it was already defined.
		* SPIN_STRING / SPIN_ARG_SIZE describe the trace record layout: one
		* pointer plus one int.
	74	*/
	75	#if !defined(KTR_SPIN_CONTENTION)
	76	#define KTR_SPIN_CONTENTION KTR_ALL
	77	#endif
	78	#define SPIN_STRING "spin=%p type=%c"
	79	#define SPIN_ARG_SIZE (sizeof(void *) + sizeof(int))
	80
	81	KTR_INFO_MASTER(spin);
		/* The "beg" and "end" trace records are compiled out by the #if 0. */
43e72e79	82	#if 0
57aa743c MD	83	KTR_INFO(KTR_SPIN_CONTENTION, spin, beg, 0, SPIN_STRING, SPIN_ARG_SIZE);
57aa743c MD	84	KTR_INFO(KTR_SPIN_CONTENTION, spin, end, 1, SPIN_STRING, SPIN_ARG_SIZE);
43e72e79	85	#endif
57aa743c	86
		/*
		* logspin(name, spin, type) expands to KTR_LOG(spin_<name>, spin, type).
		* No use of logspin() is visible in this listing.
		*/
b12defdc MD	87	#define logspin(name, spin, type) \
b12defdc MD	88	KTR_LOG(spin_ ## name, spin, type)
10c66d57	89
b02926de MD	90	#ifdef INVARIANTS
		/*
		* Set to 1 by sysctl_spin_lock_test() around the second spin_lock()
		* of its value==1 test and reset to 0 afterwards. No reader of this
		* variable is visible in this listing.
		*/
	91	static int spin_lock_test_mode;
	92	#endif
	93
bb4ae18c MD	94	#ifdef DEBUG_LOCKS_LATENCY
bb4ae18c MD	95
		/*
		* debug.spinlocks_add_latency: number of cpu_ccfence() iterations that
		* _spin_lock_shared_contested() executes before it starts polling.
		*/
288f331f	96	__read_frequently static long spinlocks_add_latency;
bb4ae18c MD	97	SYSCTL_LONG(_debug, OID_AUTO, spinlocks_add_latency, CTLFLAG_RW,
	98	&spinlocks_add_latency, 0,
	99	"Add spinlock latency");
	100
	101	#endif
	102
		/*
		* debug.spin_backoff_max: _spin_lock_contested() clamps its backoff
		* counter to 6 + spin_backoff_max.
		*/
288f331f	103	__read_frequently static long spin_backoff_max = 4096;
cc705b82 MD	104	SYSCTL_LONG(_debug, OID_AUTO, spin_backoff_max, CTLFLAG_RW,
	105	&spin_backoff_max, 0,
	106	"Spinlock exponential backoff limit");
288f331f MD	107
	108	/* 1 << n clock cycles, approx */
		/*
		* debug.spin_window_shift: rdtsc() is shifted right by this amount and
		* taken modulo ncpus; the result is compared against mycpuid by both
		* contested lock paths.
		*/
	109	__read_frequently static long spin_window_shift = 8;
cc705b82 MD	110	SYSCTL_LONG(_debug, OID_AUTO, spin_window_shift, CTLFLAG_RW,
	111	&spin_window_shift, 0,
	112	"Spinlock TSC windowing");
	113
		/*
		* debug.indefinite_uses_rdtsc: when non-zero, _spin_lock_contested()
		* performs its TSC-windowed cpu_pause() backoff. Not static; cleared
		* by spinlock_sysinit() when vmm_guest is set.
		*/
4badc135 MD	114	__read_frequently int indefinite_uses_rdtsc = 1;
	115	SYSCTL_INT(_debug, OID_AUTO, indefinite_uses_rdtsc, CTLFLAG_RW,
	116	&indefinite_uses_rdtsc, 0,
	117	"Indefinite code uses RDTSC");
	118
d666840a	119	/*
8f165b8c	120	* We contested due to another exclusive lock holder. We lose.
b12defdc MD	121	*
b12defdc MD	122	* We have to unwind the attempt and may acquire the spinlock
01be7a8f	123	* anyway while doing so.
		*
		* Returns TRUE when the lock word could be switched atomically from
		* SPINLOCK_SHARED|0 (shared flag set, zero count) to 1. Otherwise the
		* per-cpu gd_spinlocks count is decremented, crit_exit_quick() is
		* called for the current thread, and FALSE is returned.
		*
		* NOTE(review): the decrement and crit_exit_quick() presumably undo
		* what the inline trylock did before calling here; that inline is not
		* part of this file -- confirm against sys/spinlock2.h.
d666840a MD	124	*/
d666840a MD	125	int
b12defdc	126	spin_trylock_contested(struct spinlock *spin)
d666840a	127	{
b12defdc MD	128	globaldata_t gd = mycpu;
b12defdc MD	129
01be7a8f MD	130	/*
	131	* Handle degenerate case, else fail.
	132	*/
d033fb32	133	if (atomic_cmpset_int(&spin->lock, SPINLOCK_SHARED|0, 1))
01be7a8f	134	return TRUE;
		/* Disabled: the lock word itself is not decremented on failure. */
d033fb32	135	/*atomic_add_int(&spin->lock, -1);*/
0846e4ce	136	--gd->gd_spinlocks;
e8b1691f	137	crit_exit_quick(gd->gd_curthread);
a4d95680	138
d666840a	139	return (FALSE);
b1af91cb JH	140	}
b1af91cb JH	141
d666840a	142	/*
01be7a8f	143	* The spin_lock() inline was unable to acquire the lock and calls this
d033fb32	144	* function with spin->lock already incremented, passing (spin->lock - 1)
e22f2acd	145	* to the function (the result of the inline's fetchadd).
d666840a	146	*
1b8fb8d2 MD	147	* Note that we implement both exclusive and shared spinlocks, so we cannot
	148	* use atomic_swap_int(). Instead, we try to use atomic_fetchadd_int()
	149	* to put most of the burden on the cpu. Atomic_cmpset_int() (cmpxchg)
	150	* can cause a lot of unnecessary looping in situations where it is just
	151	* trying to increment the count.
	152	*
	153	* Similarly, we leave the SHARED flag intact and incur slightly more
	154	* overhead when switching from shared to exclusive. This allows us to
	155	* use atomic_fetchadd_int() for both spinlock types in the critical
	156	* path.
	157	*
cc705b82 MD	158	* The exponential (n^1.5) backoff algorithm is designed to both reduce
	159	* cache bus contention between cpu cores and sockets, and to allow some
	160	* bursting of exclusive locks in heavily contended situations to improve
	161	* performance.
	162	*
	163	* The exclusive lock priority mechanism prevents even heavily contended
	164	* exclusive locks from being starved by shared locks
		*
		* Parameters:
		*   spin  - the contended spinlock
		*   ident - identifier string, passed through to indefinite_init()
		*   value - lock word as returned by the caller's fetch-add, i.e. the
		*           value before the caller's increment
		*
		* The wait loop ends either when the exclusive reservation has been
		* transferred to the low bits, or when indefinite_check() returns
		* non-zero.
d666840a MD	165	*/
d666840a MD	166	void
cff27bad	167	_spin_lock_contested(struct spinlock *spin, const char *ident, int value)
d666840a	168	{
b1793cc6	169	indefinite_info_t info;
ae4025a1	170	uint32_t ovalue;
cc705b82 MD	171	long expbackoff;
cc705b82 MD	172	long loop;
d666840a	173
01be7a8f	174	/*
e22f2acd MD	175	* WARNING! Caller has already incremented the lock. We must
	176	* increment the count value (from the inline's fetch-add)
	177	* to match.
	178	*
	179	* Handle the degenerate case where the spinlock is flagged SHARED
	180	* with only our reference. We can convert it to EXCLUSIVE.
01be7a8f	181	*/
cc705b82	182	if (value == (SPINLOCK_SHARED | 1) - 1) {
d033fb32	183	if (atomic_cmpset_int(&spin->lock, SPINLOCK_SHARED | 1, 1))
cff27bad MD	184	return;
cff27bad MD	185	}
cc705b82 MD	186	/* ++value; value not used after this */
	187	info.type = 0; /* avoid improper gcc warning */
	188	info.ident = NULL; /* avoid improper gcc warning */
4badc135 MD	189	info.secs = 0; /* avoid improper gcc warning */
4badc135 MD	190	info.base = 0; /* avoid improper gcc warning */
2a404fe0	191	info.count = 0; /* avoid improper gcc warning */
cc705b82	192	expbackoff = 0;
01be7a8f	193
0846e4ce	194	/*
e22f2acd MD	195	* Transfer our exclusive request to the high bits and clear the
	196	* SPINLOCK_SHARED bit if it was set. This makes the spinlock
	197	* appear exclusive, preventing any NEW shared or exclusive
	198	* spinlocks from being obtained while we wait for existing
	199	* shared or exclusive holders to unlock.
	200	*
	201	* Don't tread on earlier exclusive waiters by stealing the lock
	202	* away early if the low bits happen to now be 1.
79a7c522	203	*
e22f2acd	204	* The shared unlock understands that this may occur.
0846e4ce	205	*/
d033fb32	206	ovalue = atomic_fetchadd_int(&spin->lock, SPINLOCK_EXCLWAIT - 1);
ae4025a1	207	ovalue += SPINLOCK_EXCLWAIT - 1;
9abb66c5	208	if (ovalue & SPINLOCK_SHARED) {
d033fb32	209	atomic_clear_int(&spin->lock, SPINLOCK_SHARED);
ae4025a1 MD	210	ovalue &= ~SPINLOCK_SHARED;
ae4025a1 MD	211	}
0846e4ce	212
43e72e79	213	for (;;) {
cc705b82 MD	214	expbackoff = (expbackoff + 1) * 3 / 2;
cc705b82 MD	215	if (expbackoff == 6) /* 1, 3, 6, 10, ... */
6d0742ae	216	indefinite_init(&info, spin, ident, 0, 'S');
		/*
		* Pause for expbackoff iterations unless the TSC window
		* currently selects this cpu. With indefinite_uses_rdtsc
		* cleared no pause is executed here at all.
		*/
4badc135 MD	217	if (indefinite_uses_rdtsc) {
	218	if ((rdtsc() >> spin_window_shift) % ncpus != mycpuid) {
	219	for (loop = expbackoff; loop; --loop)
	220	cpu_pause();
	221	}
cc705b82 MD	222	}
	223	/*cpu_lfence();*/
	224
43e72e79	225	/*
0846e4ce	226	* If the low bits are zero, try to acquire the exclusive lock
e22f2acd	227	* by transferring our high bit reservation to the low bits.
0846e4ce	228	*
cc705b82 MD	229	* NOTE: Avoid unconditional atomic op by testing ovalue,
cc705b82 MD	230	* otherwise we get cache bus armageddon.
c5cfe2c8 MD	231	*
	232	* NOTE: We must also ensure that the SHARED bit is cleared.
	233	* It is possible for it to wind up being set on a
	234	* shared lock override of the EXCLWAIT bits.
43e72e79	235	*/
d033fb32	236	ovalue = spin->lock;
0846e4ce	237	cpu_ccfence();
ae4025a1	238	if ((ovalue & (SPINLOCK_EXCLWAIT - 1)) == 0) {
a18b747c MD	239	uint32_t nvalue;
	240
	241	nvalue = ((ovalue - SPINLOCK_EXCLWAIT) | 1) &
	242	~SPINLOCK_SHARED;
d033fb32	243	if (atomic_fcmpset_int(&spin->lock, &ovalue, nvalue))
ae4025a1	244	break;
ae4025a1	245	continue;
0846e4ce	246	}
		/* Clamp the backoff; indefinite_check() runs once info is set up. */
cc705b82 MD	247	if (expbackoff > 6 + spin_backoff_max)
	248	expbackoff = 6 + spin_backoff_max;
	249	if (expbackoff >= 6) {
	250	if (indefinite_check(&info))
	251	break;
1b8fb8d2	252	}
b12defdc	253	}
cc705b82 MD	254	if (expbackoff >= 6)
cc705b82 MD	255	indefinite_done(&info);
d666840a	256	}
b02926de	257
0846e4ce	258	/*
e22f2acd	259	* The spin_lock_shared() inline was unable to acquire the lock and calls
d033fb32	260	* this function with spin->lock already incremented.
79a7c522	261	*
e22f2acd MD	262	* This is not in the critical path unless there is contention between
e22f2acd MD	263	* shared and exclusive holders.
cc705b82 MD	264	*
	265	* Exclusive locks have priority over shared locks. However, this can
	266	* cause shared locks to be starved when large numbers of threads are
	267	* competing for exclusive locks so the shared lock code uses TSC-windowing
	268	* to selectively ignore the exclusive priority mechanism. This has the
	269	* effect of allowing a limited number of shared locks to compete against
	270	* exclusive waiters at any given moment.
	271	*
	272	* Note that shared locks do not implement exponential backoff. Instead,
	273	* the shared lock simply polls the lock value. One cpu_pause() is built
	274	* into indefinite_check().
		*
		* Parameters:
		*   spin  - the contended spinlock
		*   ident - identifier string, passed through to indefinite_init()
		*
		* The poll loop ends either when the shared reference has been
		* obtained, or when indefinite_check() returns non-zero.
0846e4ce MD	275	*/
0846e4ce MD	276	void
e22f2acd	277	_spin_lock_shared_contested(struct spinlock *spin, const char *ident)
0846e4ce	278	{
b1793cc6	279	indefinite_info_t info;
ae4025a1	280	uint32_t ovalue;
5b49787b	281
e22f2acd MD	282	/*
	283	* Undo the inline's increment.
	284	*/
d033fb32	285	ovalue = atomic_fetchadd_int(&spin->lock, -1) - 1;
e22f2acd	286
6d0742ae	287	indefinite_init(&info, spin, ident, 0, 's');
9abb66c5 MD	288	cpu_pause();
9abb66c5 MD	289
0846e4ce MD	290	#ifdef DEBUG_LOCKS_LATENCY
	291	long j;
	292	for (j = spinlocks_add_latency; j > 0; --j)
	293	cpu_ccfence();
6f208300	294	#endif
0846e4ce	295
0846e4ce MD	296	for (;;) {
0846e4ce MD	297	/*
79a7c522 MD	298	* Loop until we can acquire the shared spinlock. Note that
	299	* the low bits can be zero while the high EXCLWAIT bits are
	300	* non-zero. In this situation exclusive requesters have
	301	* priority (otherwise shared users on multiple cpus can hog
	302	* the spinlock).
	303	*
d033fb32	304	* NOTE: Reading spin->lock prior to the swap is extremely
0846e4ce MD	305	* important on multi-chip/many-core boxes. On 48-core
	306	* this one change improves fully concurrent all-cores
	307	* compiles by 100% or better.
	308	*
	309	* I can't emphasize enough how important the pre-read
	310	* is in preventing hw cache bus armageddon on
	311	* multi-chip systems. And on single-chip/multi-core
	312	* systems it just doesn't hurt.
	313	*/
0846e4ce	314	cpu_ccfence();
cc705b82 MD	315
	316	/*
	317	* Ignore the EXCLWAIT bits if we are inside our window.
aab1a048 MD	318	*
	319	* We must always use a windowing approach here or the
	320	* EXCLWAIT bits can prevent the shared lock from ever
	321	* resolving... permanent starvation.
	322	*
	323	* In addition, if we were to always ignore the EXCLWAIT
	324	* bits overlapping shared locks can prevent an exclusive
	325	* lock from ever resolving... permanent starvation again.
cc705b82	326	*/
aab1a048	327	if (/*indefinite_uses_rdtsc &&*/
4badc135	328	(ovalue & (SPINLOCK_EXCLWAIT - 1)) == 0 &&
cc705b82	329	(rdtsc() >> spin_window_shift) % ncpus == mycpuid) {
d033fb32	330	if (atomic_fcmpset_int(&spin->lock, &ovalue,
cc705b82	331	ovalue | SPINLOCK_SHARED | 1)) {
0846e4ce	332	break;
ae4025a1 MD	333	}
	334	continue;
	335	}
97cfa330 MD	336
97cfa330 MD	337	/*
cc705b82 MD	338	* Check ovalue tightly (no exponential backoff for shared
	339	* locks, that would result in horrible performance. Instead,
	340	* shared locks depend on the exclusive priority mechanism
	341	* to avoid starving exclusive locks).
97cfa330	342	*/
cc705b82	343	if (ovalue == 0) {
d033fb32	344	if (atomic_fcmpset_int(&spin->lock, &ovalue,
cc705b82	345	SPINLOCK_SHARED | 1)) {
97cfa330 MD	346	break;
	347	}
	348	continue;
	349	}
	350
	351	/*
	352	* If SHARED is already set, go for the increment, improving
	353	* the exclusive to multiple-readers transition.
		*
		* If SHARED turns out to have been cleared by the time of
		* the fetch-add, the increment is backed out and we retry.
	354	*/
ae4025a1	355	if (ovalue & SPINLOCK_SHARED) {
d033fb32	356	ovalue = atomic_fetchadd_int(&spin->lock, 1);
ae4025a1 MD	357	/* ovalue += 1; NOT NEEDED */
ae4025a1 MD	358	if (ovalue & SPINLOCK_SHARED)
0846e4ce	359	break;
d033fb32	360	ovalue = atomic_fetchadd_int(&spin->lock, -1);
ae4025a1 MD	361	ovalue += -1;
ae4025a1 MD	362	continue;
0846e4ce	363	}
b1793cc6	364	if (indefinite_check(&info))
5b49787b	365	break;
ae4025a1 MD	366	/*
	367	* ovalue was wrong anyway, just reload
	368	*/
d033fb32	369	ovalue = spin->lock;
b02926de	370	}
b1793cc6	371	indefinite_done(&info);
b02926de MD	372	}
b02926de MD	373
4badc135 MD	374	/*
	375	* Automatically avoid use of rdtsc when running in a VM
		*
		* Clears indefinite_uses_rdtsc when vmm_guest is non-zero; otherwise
		* the variable keeps its current value. The dummy argument is unused.
		* Registered via SYSINIT at SI_BOOT2_PROC0 / SI_ORDER_FIRST.
	376	*/
	377	static void
	378	spinlock_sysinit(void *dummy __unused)
	379	{
	380	if (vmm_guest)
	381	indefinite_uses_rdtsc = 0;
	382	}
	383	SYSINIT(spinsysinit, SI_BOOT2_PROC0, SI_ORDER_FIRST, spinlock_sysinit, NULL);
	384
	385
b02926de	386	/*
d666840a MD	387	* If INVARIANTS is enabled various spinlock timing tests can be run
	388	* by setting debug.spin_lock_test:
	389	*
	390	* 1 Test the indefinite wait code
	391	* 2 Time the best-case exclusive lock overhead (spin_test_count)
	392	* 3 Time the best-case shared lock overhead (spin_test_count)
		*
		* NOTE(review): sysctl_spin_lock_test() only has branches for 1 and 2;
		* writing 3 (or any other value) falls through and returns 0 without
		* running a test.
b02926de MD	393	*/
	394
	395	#ifdef INVARIANTS
	396
		/* debug.spin_test_count: iteration count for the value==2 timing loop. */
d666840a	397	static int spin_test_count = 10000000;
0c52fa62 SG	398	SYSCTL_INT(_debug, OID_AUTO, spin_test_count, CTLFLAG_RW, &spin_test_count, 0,
0c52fa62 SG	399	"Number of iterations to use for spinlock wait code test");
d666840a	400
		/*
		* Handler for debug.spin_lock_test.
		*
		* Returns the error from caps_priv_check_self(SYSCAP_RESTRICTEDROOT)
		* or from SYSCTL_IN() if either is non-zero. Otherwise runs the test
		* selected by the int copied in through SYSCTL_IN() and returns 0.
		*/
b02926de MD	401	static int
	402	sysctl_spin_lock_test(SYSCTL_HANDLER_ARGS)
	403	{
b12defdc	404	struct spinlock spin;
b02926de MD	405	int error;
b02926de MD	406	int value = 0;
d666840a	407	int i;
b02926de	408
2b3f93ea	409	if ((error = caps_priv_check_self(SYSCAP_RESTRICTEDROOT)) != 0)
b02926de MD	410	return (error);
	411	if ((error = SYSCTL_IN(req, &value, sizeof(value))) != 0)
	412	return (error);
	413
d666840a MD	414	/*
	415	* Indefinite wait test
		*
		* Locks a private spinlock twice from the same thread, with
		* spin_lock_test_mode set to 1 around the second acquisition,
		* then unlocks it twice.
	416	*/
b02926de	417	if (value == 1) {
ba87a4ab	418	spin_init(&spin, "sysctllock");
b12defdc	419	spin_lock(&spin); /* force an indefinite wait */
b02926de	420	spin_lock_test_mode = 1;
b12defdc MD	421	spin_lock(&spin);
	422	spin_unlock(&spin); /* Clean up the spinlock count */
	423	spin_unlock(&spin);
b02926de MD	424	spin_lock_test_mode = 0;
b02926de MD	425	}
d666840a MD	426
	427	/*
	428	* Time best-case exclusive spinlocks
		*
		* spin_test_count uncontended lock/unlock pairs on a private
		* spinlock. No timing is taken inside this function.
	429	*/
	430	if (value == 2) {
	431	globaldata_t gd = mycpu;
	432
ba87a4ab	433	spin_init(&spin, "sysctllocktest");
d666840a	434	for (i = spin_test_count; i > 0; --i) {
050032ec	435	_spin_lock_quick(gd, &spin, "test");
b12defdc	436	spin_unlock_quick(gd, &spin);
d666840a MD	437	}
	438	}
	439
b02926de MD	440	return (0);
	441	}
	442
		/*
		* NOTE(review): every other sysctl in this file passes OID_AUTO as the
		* OID number; KERN_PROC_ALL here looks unintentional -- confirm.
		*/
	443	SYSCTL_PROC(_debug, KERN_PROC_ALL, spin_lock_test, CTLFLAG_RW|CTLTYPE_INT,
	444	0, 0, sysctl_spin_lock_test, "I", "Test spinlock wait code");
	445
d666840a	446	#endif /* INVARIANTS */