/*
 * Copyright (c) 2005 Jeffrey M. Hsu. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Jeffrey M. Hsu and Matthew Dillon.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/kern/kern_spinlock.c,v 1.15 2008/06/04 04:34:54 nth Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>
#ifdef INVARIANTS
#include <sys/proc.h>
#endif
#include <ddb/ddb.h>
#include <machine/atomic.h>
#include <machine/cpufunc.h>
#include <machine/specialreg.h>
#include <machine/clock.h>
#include <sys/spinlock.h>
#include <sys/spinlock2.h>
#include <sys/ktr.h>

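/*
 * Contested spinlocks spin in a window of cpu_pause() iterations which
 * doubles on each failed pass, from BACKOFF_INITIAL up to BACKOFF_LIMIT
 * (1, 2, 4, ..., 256), after which exponential_backoff() resets the
 * window and falls into the indefinite-wait path below.  The limit is
 * tunable at run-time via the debug.spinlocks_bolim sysctl.
 */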
#define	BACKOFF_INITIAL	1
#define	BACKOFF_LIMIT	256

#ifdef SMP

/*
 * Kernel Trace
 */
#if !defined(KTR_SPIN_CONTENTION)
#define KTR_SPIN_CONTENTION	KTR_ALL
#endif
#define SPIN_STRING	"spin=%p type=%c"
#define SPIN_ARG_SIZE	(sizeof(void *) + sizeof(int))

KTR_INFO_MASTER(spin);
KTR_INFO(KTR_SPIN_CONTENTION, spin, beg, 0, SPIN_STRING, SPIN_ARG_SIZE);
KTR_INFO(KTR_SPIN_CONTENTION, spin, end, 1, SPIN_STRING, SPIN_ARG_SIZE);
KTR_INFO(KTR_SPIN_CONTENTION, spin, backoff, 2,
	 "spin=%p bo1=%d thr=%p bo=%d",
	 ((2 * sizeof(void *)) + (2 * sizeof(int))));
KTR_INFO(KTR_SPIN_CONTENTION, spin, bofail, 3, SPIN_STRING, SPIN_ARG_SIZE);

#define logspin(name, mtx, type)			\
	KTR_LOG(spin_ ## name, mtx, type)

#define logspin_backoff(mtx, bo1, thr, bo)		\
	KTR_LOG(spin_backoff, mtx, bo1, thr, bo)

#ifdef INVARIANTS
static int spin_lock_test_mode;
#endif

static int64_t spinlocks_contested1;
SYSCTL_QUAD(_debug, OID_AUTO, spinlocks_contested1, CTLFLAG_RD,
	    &spinlocks_contested1, 0, "");

static int64_t spinlocks_contested2;
SYSCTL_QUAD(_debug, OID_AUTO, spinlocks_contested2, CTLFLAG_RD,
	    &spinlocks_contested2, 0, "");

static int spinlocks_backoff_limit = BACKOFF_LIMIT;
SYSCTL_INT(_debug, OID_AUTO, spinlocks_bolim, CTLFLAG_RW,
	   &spinlocks_backoff_limit, 0, "");

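/*
 * Per-attempt backoff state.  'backoff' is the current spin window,
 * 'nsec' counts whole seconds spent in the indefinite-wait path
 * (despite the name), and 'base' is the sys_cputimer count taken at
 * the start of the current one-second interval.
 */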
struct exponential_backoff {
	int backoff;
	int nsec;
	struct spinlock *mtx;
	sysclock_t base;
};
static int exponential_backoff(struct exponential_backoff *bo);

static __inline
void
exponential_init(struct exponential_backoff *bo, struct spinlock *mtx)
{
	bo->backoff = BACKOFF_INITIAL;
	bo->nsec = 0;
	bo->mtx = mtx;
}

/*
 * We were contested either by another exclusive lock holder or by the
 * presence of shared locks.  We have to undo the mess we made by
 * restoring the shared locks we clobbered.
 *
 * If another exclusive lock holder beat us, the exclusive bit will be
 * the only bit set in value.  There is nothing to do in that case since
 * restoration involves no work.
 *
 * Otherwise we successfully obtained the exclusive bit.  Attempt to
 * clear the shared bits.  If we are able to clear the shared bits
 * we win.  Otherwise we lose and we have to restore the shared bits
 * we couldn't clear (and also clear our exclusive bit).
 */
int
spin_trylock_wr_contested(globaldata_t gd, struct spinlock *mtx, int value)
{
	int bit;

	++spinlocks_contested1;
	if ((value & SPINLOCK_EXCLUSIVE) == 0) {
		while (value) {
			bit = bsfl(value);
			if (globaldata_find(bit)->gd_spinlock_rd == mtx) {
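				/*
				 * A cpu still holds a real shared ref on
				 * this spinlock.  Restore the shared bits
				 * (which also clears our exclusive bit,
				 * since value does not have it set) and
				 * fail the trylock.
				 */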
				atomic_swap_int(&mtx->lock, value);
				--gd->gd_spinlocks_wr;
				return (FALSE);
			}
			value &= ~(1 << bit);
		}
		return (TRUE);
	}
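	/*
	 * Another cpu already held the lock exclusively; nothing to
	 * restore, just back out our accounting and fail.
	 */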
	--gd->gd_spinlocks_wr;
	return (FALSE);
}
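
/*
 * Illustrative fast path (a sketch only; the real inline lives in
 * sys/spinlock2.h): the trylock swaps in the exclusive bit and only
 * drops into the contested path above when the previous lock value
 * was non-zero:
 *
 *	++gd->gd_spinlocks_wr;
 *	value = atomic_swap_int(&mtx->lock, SPINLOCK_EXCLUSIVE);
 *	if (value)
 *		return (spin_trylock_wr_contested(gd, mtx, value));
 *	return (TRUE);
 */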

/*
 * We were either contested due to another exclusive lock holder,
 * or due to the presence of shared locks.
 *
 * NOTE: If value indicates an exclusively held mutex, no shared bits
 *	 would have been set and we can throw away value.
 */
void
spin_lock_wr_contested(struct spinlock *mtx, int value)
{
	struct exponential_backoff backoff;
	globaldata_t gd = mycpu;
	int bit;
	int mask;

	/*
	 * Wait until we can gain exclusive access vs another exclusive
	 * holder.
	 */
	exponential_init(&backoff, mtx);
	++spinlocks_contested1;
	logspin(beg, mtx, 'w');

	while (value & SPINLOCK_EXCLUSIVE) {
		value = atomic_swap_int(&mtx->lock, SPINLOCK_EXCLUSIVE);
		if (exponential_backoff(&backoff)) {
			value &= ~SPINLOCK_EXCLUSIVE;
			break;
		}
	}

	/*
	 * Kill the cached shared bit for our own cpu.  This is the most
	 * common case and there's no sense wasting cpu on it.  Since
	 * spinlocks aren't recursive, we can't own a shared ref on the
	 * spinlock while trying to get an exclusive one.
	 *
	 * If multiple bits are set do not stall on any single cpu.  Check
	 * all cpus that have the cache bit set, then loop and check again,
	 * until we've cleaned all the bits.
	 */
	value &= ~gd->gd_cpumask;

	while ((mask = value) != 0) {
		while (mask) {
			/*
			 * Scan mask rather than value so a cpu which is
			 * still holding its shared ref costs us only one
			 * backoff round before we move on to the next
			 * cpu, per the round-robin policy noted above.
			 */
			bit = bsfl(mask);
			if (globaldata_find(bit)->gd_spinlock_rd != mtx) {
				value &= ~(1 << bit);
			} else if (exponential_backoff(&backoff)) {
				value = 0;
				break;
			}
			mask &= ~(1 << bit);
		}
	}
	logspin(end, mtx, 'w');
}
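
/*
 * Illustrative use (a sketch, assuming the usual spinlock rules): an
 * exclusive spinlock protects a short, non-blocking critical section
 * and is taken via the spin_lock_wr() inline, which calls into
 * spin_lock_wr_contested() only when the atomic swap observes a
 * non-zero previous value:
 *
 *	spin_lock_wr(&mtx);
 *	... short critical section, no blocking or sleeping ...
 *	spin_unlock_wr(&mtx);
 */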

/*
 * The cache bit wasn't set for our cpu.  Loop until we can set the bit.
 * As with the spin_lock_rd() inline we need a memory fence after setting
 * gd_spinlock_rd to interlock against exclusive spinlocks waiting for
 * that field to clear.
 */
void
spin_lock_rd_contested(struct spinlock *mtx)
{
	struct exponential_backoff backoff;
	globaldata_t gd = mycpu;
	int value = mtx->lock;

	/*
	 * Shortcut the op if we can just set the cache bit.  This case
	 * occurs when the last lock was an exclusive lock.
	 */
	while ((value & SPINLOCK_EXCLUSIVE) == 0) {
		if (atomic_cmpset_int(&mtx->lock, value, value|gd->gd_cpumask))
			return;
		value = mtx->lock;
	}

	exponential_init(&backoff, mtx);
	++spinlocks_contested1;

	logspin(beg, mtx, 'r');

	while ((value & gd->gd_cpumask) == 0) {
		if (value & SPINLOCK_EXCLUSIVE) {
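			/*
			 * Clear our rd pointer while we back off so an
			 * exclusive waiter scanning gd_spinlock_rd does
			 * not spin on us, then restore it and re-fence
			 * before re-checking the lock.
			 */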
			gd->gd_spinlock_rd = NULL;
			if (exponential_backoff(&backoff)) {
				gd->gd_spinlock_rd = mtx;
				break;
			}
			gd->gd_spinlock_rd = mtx;
			cpu_mfence();
		} else {
			if (atomic_cmpset_int(&mtx->lock, value, value|gd->gd_cpumask))
				break;
		}
		value = mtx->lock;
	}
	logspin(end, mtx, 'r');
}
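
/*
 * Illustrative shared fast path (a sketch only; the real inline lives
 * in sys/spinlock2.h): the reader advertises itself by setting
 * gd_spinlock_rd and fencing, then enters the contested path above
 * only if an exclusive holder owns the lock:
 *
 *	gd->gd_spinlock_rd = mtx;
 *	cpu_mfence();
 *	if (mtx->lock & SPINLOCK_EXCLUSIVE)
 *		spin_lock_rd_contested(mtx);
 */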

/*
 * Handle exponential backoff and indefinite waits.
 *
 * If the system is handling a panic we hand the spinlock over to the caller
 * after 1 second.  After 10 seconds we attempt to print a debugger
 * backtrace, and after 60 seconds of waiting we panic outright.  We also
 * run pending interrupts in order to allow a console break into DDB.
 */
static
int
exponential_backoff(struct exponential_backoff *bo)
{
	sysclock_t count;
	int backoff;

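	/*
	 * Randomize the spin count within the current window using the
	 * TSC (when available) so contending cpus do not back off in
	 * lockstep.
	 */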
#ifdef _RDTSC_SUPPORTED_
	if (cpu_feature & CPUID_TSC) {
		backoff =
			(((u_long)rdtsc() ^ (((u_long)curthread) >> 5)) &
			 (bo->backoff - 1)) + BACKOFF_INITIAL;
	} else
#endif
	backoff = bo->backoff;
	logspin_backoff(bo->mtx, bo->backoff, curthread, backoff);

	/*
	 * Quick backoff
	 */
	for (; backoff; --backoff)
		cpu_pause();
	if (bo->backoff < spinlocks_backoff_limit) {
		bo->backoff <<= 1;
		return (FALSE);
	} else {
		bo->backoff = BACKOFF_INITIAL;
	}

	logspin(bofail, bo->mtx, 'u');

	/*
	 * Indefinite
	 */
	++spinlocks_contested2;
	cpu_spinlock_contested();
	if (bo->nsec == 0) {
		bo->base = sys_cputimer->count();
		bo->nsec = 1;
	}

	count = sys_cputimer->count();
	if (count - bo->base > sys_cputimer->freq) {
		kprintf("spin_lock: %p, indefinite wait!\n", bo->mtx);
		if (panicstr)
			return (TRUE);
#if defined(INVARIANTS) && defined(DDB)
		if (spin_lock_test_mode) {
			db_print_backtrace();
			return (TRUE);
		}
#endif
		++bo->nsec;
#if defined(INVARIANTS) && defined(DDB)
		if (bo->nsec == 11)
			db_print_backtrace();
#endif
		if (bo->nsec == 60)
			panic("spin_lock: %p, indefinite wait!\n", bo->mtx);
		splz();
		bo->base = count;
	}
	return (FALSE);
}

/*
 * If INVARIANTS is enabled various spinlock timing tests can be run
 * by setting debug.spin_lock_test:
 *
 *	1	Test the indefinite wait code
 *	2	Time the best-case exclusive lock overhead (spin_test_count)
 *	3	Time the best-case shared lock overhead (spin_test_count)
 */
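
/*
 * Example invocation (a sketch; the handler does not report timings
 * itself, so time the sysctl(8) call externally):
 *
 *	sysctl debug.spin_test_count=1000000
 *	time sysctl debug.spin_lock_test=2
 */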

#ifdef INVARIANTS

static int spin_test_count = 10000000;
SYSCTL_INT(_debug, OID_AUTO, spin_test_count, CTLFLAG_RW,
	   &spin_test_count, 0, "");

static int
sysctl_spin_lock_test(SYSCTL_HANDLER_ARGS)
{
	struct spinlock mtx;
	int error;
	int value = 0;
	int i;

	if ((error = suser(curthread)) != 0)
		return (error);
	if ((error = SYSCTL_IN(req, &value, sizeof(value))) != 0)
		return (error);

	/*
	 * Indefinite wait test
	 */
	if (value == 1) {
		spin_init(&mtx);
		spin_lock_wr(&mtx);	/* force an indefinite wait */
		spin_lock_test_mode = 1;
		spin_lock_wr(&mtx);
		spin_unlock_wr(&mtx);	/* Clean up the spinlock count */
		spin_unlock_wr(&mtx);
		spin_lock_test_mode = 0;
	}

	/*
	 * Time best-case exclusive spinlocks
	 */
	if (value == 2) {
		globaldata_t gd = mycpu;

		spin_init(&mtx);
		for (i = spin_test_count; i > 0; --i) {
			spin_lock_wr_quick(gd, &mtx);
			spin_unlock_wr_quick(gd, &mtx);
		}
	}

	/*
	 * Time best-case shared spinlocks
	 */
	if (value == 3) {
		globaldata_t gd = mycpu;

		spin_init(&mtx);
		for (i = spin_test_count; i > 0; --i) {
			spin_lock_rd_quick(gd, &mtx);
			spin_unlock_rd_quick(gd, &mtx);
		}
	}
	return (0);
}

SYSCTL_PROC(_debug, OID_AUTO, spin_lock_test, CTLFLAG_RW|CTLTYPE_INT,
	    0, 0, sysctl_spin_lock_test, "I", "Test spinlock wait code");

#endif	/* INVARIANTS */
#endif	/* SMP */