sys/kern/kern_mutex.c

   1 /*
   2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34 /*
  35  * Implement fast persistent locks based on atomic_cmpset_int() with
  36  * semantics similar to lockmgr locks but faster and taking up much less
  37  * space.  Taken from HAMMER's lock implementation.
  38  *
  39  * These are meant to complement our LWKT tokens.  Tokens are only held
  40  * while the thread is running.  Mutexes can be held across blocking
  41  * conditions.
  42  *
  43  * - Exclusive priority over shared to prevent SMP starvation.
  44  * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
  45  * - locks can be asynchronous.
  46  * - synchronous fast path if no blocking occurs (async callback is not
  47  *   made in this case).
  48  *
  49  * Generally speaking any caller-supplied link state must be properly
  50  * initialized before use.
  51  *
  52  * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
  53  * functions here.
  54  */
  55
  56 #include <sys/param.h>
  57 #include <sys/systm.h>
  58 #include <sys/kernel.h>
  59 #include <sys/sysctl.h>
  60 #include <sys/thread.h>
  61
  62 #include <machine/cpufunc.h>
  63
  64 #include <sys/thread2.h>
  65 #include <sys/mutex2.h>
  66
  67 static __int64_t mtx_contention_count;
  68 static __int64_t mtx_collision_count;
  69 static __int64_t mtx_wakeup_count;
  70
  71 SYSCTL_QUAD(_kern, OID_AUTO, mtx_contention_count, CTLFLAG_RW,
  72             &mtx_contention_count, 0, "");
  73 SYSCTL_QUAD(_kern, OID_AUTO, mtx_collision_count, CTLFLAG_RW,
  74             &mtx_collision_count, 0, "");
  75 SYSCTL_QUAD(_kern, OID_AUTO, mtx_wakeup_count, CTLFLAG_RW,
  76             &mtx_wakeup_count, 0, "");
  77
  78 static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
  79 static int mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount);
  80 static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
  81
  82 /*
  83  * Exclusive-lock a mutex, block until acquired unless link is async.
  84  * Recursion is allowed.
  85  *
  86  * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
  87  * if async.  If immediately successful an async exclusive lock will return 0
  88  * and not issue the async callback or link the link structure.  The caller
  89  * must handle this case (typically this is an optimal code path).
  90  *
  91  * A tsleep() error can only be returned if PCATCH is specified in the flags.
  92  */
  93 static __inline int
  94 __mtx_lock_ex(mtx_t *mtx, mtx_link_t *link,
  95               const char *ident, int flags, int to)
  96 {
  97         thread_t td;
  98         u_int   lock;
  99         u_int   nlock;
 100         int     error;
 101         int     isasync;
 102
 103         for (;;) {
 104                 lock = mtx->mtx_lock;
 105                 cpu_ccfence();
 106
 107                 if (lock == 0) {
 108                         nlock = MTX_EXCLUSIVE | 1;
 109                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
 110                                 mtx->mtx_owner = curthread;
 111                                 link->state = MTX_LINK_ACQUIRED;
 112                                 error = 0;
 113                                 break;
 114                         }
 115                         continue;
 116                 }
 117                 if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
 118                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 119                         nlock = lock + 1;
 120                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
 121                                 link->state = MTX_LINK_ACQUIRED;
 122                                 error = 0;
 123                                 break;
 124                         }
 125                         continue;
 126                 }
 127
 128                 /*
 129                  * We need MTX_LINKSPIN to manipulate exlink or
 130                  * shlink.
 131                  *
 132                  * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
 133                  * pending shared requests.  It cannot be set as a separate
 134                  * operation prior to acquiring MTX_LINKSPIN.
 135                  *
 136                  * To avoid unnecessary cpu cache traffic we poll
 137                  * for collisions.  It is also possible that EXWANTED
 138                  * state failing the above test was spurious, so all the
 139                  * tests must be repeated if we cannot obtain LINKSPIN
 140                  * with the prior state tests intact (i.e. don't reload
 141                  * the (lock) variable here, for heaven's sake!).
 142                  */
 143                 if (lock & MTX_LINKSPIN) {
 144                         cpu_pause();
 145                         ++mtx_collision_count;
 146                         continue;
 147                 }
 148                 td = curthread;
 149                 nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
 150                 ++td->td_critcount;
 151                 if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
 152                         --td->td_critcount;
 153                         continue;
 154                 }
 155
 156                 /*
 157                  * Check for early abort.
 158                  */
 159                 if (link->state == MTX_LINK_ABORTED) {
 160                         if (mtx->mtx_exlink == NULL) {
 161                                 atomic_clear_int(&mtx->mtx_lock,
 162                                                  MTX_LINKSPIN |
 163                                                  MTX_EXWANTED);
 164                         } else {
 165                                 atomic_clear_int(&mtx->mtx_lock,
 166                                                  MTX_LINKSPIN);
 167                         }
 168                         --td->td_critcount;
 169                         link->state = MTX_LINK_IDLE;
 170                         error = ENOLCK;
 171                         break;
 172                 }
 173
 174                 /*
 175                  * Add our link to the exlink list and release LINKSPIN.
 176                  */
 177                 link->owner = td;
 178                 link->state = MTX_LINK_LINKED_EX;
 179                 if (mtx->mtx_exlink) {
 180                         link->next = mtx->mtx_exlink;
 181                         link->prev = link->next->prev;
 182                         link->next->prev = link;
 183                         link->prev->next = link;
 184                 } else {
 185                         link->next = link;
 186                         link->prev = link;
 187                         mtx->mtx_exlink = link;
 188                 }
 189                 isasync = (link->callback != NULL);
 190                 atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
 191                 --td->td_critcount;
 192
 193                 /*
 194                  * If asynchronous lock request return without
 195                  * blocking, leave link structure linked.
 196                  */
 197                 if (isasync) {
 198                         error = EINPROGRESS;
 199                         break;
 200                 }
 201
 202                 /*
 203                  * Wait for lock
 204                  */
 205                 error = mtx_wait_link(mtx, link, ident, flags, to);
 206                 break;
 207         }
 208         return (error);
 209 }
 210
 211 int
 212 _mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link,
 213                   const char *ident, int flags, int to)
 214 {
 215         return(__mtx_lock_ex(mtx, link, ident, flags, to));
 216 }
 217
 218 int
 219 _mtx_lock_ex(mtx_t *mtx, const char *ident, int flags, int to)
 220 {
 221         mtx_link_t link;
 222
 223         mtx_link_init(&link);
 224         return(__mtx_lock_ex(mtx, &link, ident, flags, to));
 225 }
 226
 227 int
 228 _mtx_lock_ex_quick(mtx_t *mtx, const char *ident)
 229 {
 230         mtx_link_t link;
 231
 232         mtx_link_init(&link);
 233         return(__mtx_lock_ex(mtx, &link, ident, 0, 0));
 234 }
 235
 236 /*
 237  * Share-lock a mutex, block until acquired.  Recursion is allowed.
 238  *
 239  * Returns 0 on success, or the tsleep() return code on failure.
 240  * An error can only be returned if PCATCH is specified in the flags.
 241  *
 242  * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
 243  *       do not have to chain the wakeup().
 244  */
 245 static __inline int
 246 __mtx_lock_sh(mtx_t *mtx, mtx_link_t *link,
 247               const char *ident, int flags, int to)
 248 {
 249         thread_t td;
 250         u_int   lock;
 251         u_int   nlock;
 252         int     error;
 253         int     isasync;
 254
 255         for (;;) {
 256                 lock = mtx->mtx_lock;
 257                 cpu_ccfence();
 258
 259                 if (lock == 0) {
 260                         nlock = 1;
 261                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
 262                                 error = 0;
 263                                 link->state = MTX_LINK_ACQUIRED;
 264                                 break;
 265                         }
 266                         continue;
 267                 }
 268                 if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
 269                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 270                         nlock = lock + 1;
 271                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
 272                                 error = 0;
 273                                 link->state = MTX_LINK_ACQUIRED;
 274                                 break;
 275                         }
 276                         continue;
 277                 }
 278
 279                 /*
 280                  * We need MTX_LINKSPIN to manipulate exlink or
 281                  * shlink.
 282                  *
 283                  * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
 284                  * pending shared requests.  It cannot be set as a separate
 285                  * operation prior to acquiring MTX_LINKSPIN.
 286                  *
 287                  * To avoid unnecessary cpu cache traffic we poll
 288                  * for collisions.  It is also possible that EXWANTED
 289                  * state failing the above test was spurious, so all the
 290                  * tests must be repeated if we cannot obtain LINKSPIN
 291                  * with the prior state tests intact (i.e. don't reload
 292                  * the (lock) variable here, for heaven's sake!).
 293                  */
 294                 if (lock & MTX_LINKSPIN) {
 295                         cpu_pause();
 296                         ++mtx_collision_count;
 297                         continue;
 298                 }
 299                 td = curthread;
 300                 nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
 301                 ++td->td_critcount;
 302                 if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
 303                         --td->td_critcount;
 304                         continue;
 305                 }
 306
 307                 /*
 308                  * Check for early abort.
 309                  */
 310                 if (link->state == MTX_LINK_ABORTED) {
 311                         if (mtx->mtx_exlink == NULL) {
 312                                 atomic_clear_int(&mtx->mtx_lock,
 313                                                  MTX_LINKSPIN |
 314                                                  MTX_SHWANTED);
 315                         } else {
 316                                 atomic_clear_int(&mtx->mtx_lock,
 317                                                  MTX_LINKSPIN);
 318                         }
 319                         --td->td_critcount;
 320                         link->state = MTX_LINK_IDLE;
 321                         error = ENOLCK;
 322                         break;
 323                 }
 324
 325                 /*
 326                  * Add our link to the exlink list and release LINKSPIN.
 327                  */
 328                 link->owner = td;
 329                 link->state = MTX_LINK_LINKED_SH;
 330                 if (mtx->mtx_shlink) {
 331                         link->next = mtx->mtx_shlink;
 332                         link->prev = link->next->prev;
 333                         link->next->prev = link;
 334                         link->prev->next = link;
 335                 } else {
 336                         link->next = link;
 337                         link->prev = link;
 338                         mtx->mtx_shlink = link;
 339                 }
 340                 isasync = (link->callback != NULL);
 341                 atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
 342                 --td->td_critcount;
 343
 344                 /*
 345                  * If asynchronous lock request return without
 346                  * blocking, leave link structure linked.
 347                  */
 348                 if (isasync) {
 349                         error = EINPROGRESS;
 350                         break;
 351                 }
 352
 353                 /*
 354                  * Wait for lock
 355                  */
 356                 error = mtx_wait_link(mtx, link, ident, flags, to);
 357                 break;
 358         }
 359         return (error);
 360 }
 361
 362 int
 363 _mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link,
 364                   const char *ident, int flags, int to)
 365 {
 366         return(__mtx_lock_sh(mtx, link, ident, flags, to));
 367 }
 368
 369 int
 370 _mtx_lock_sh(mtx_t *mtx, const char *ident, int flags, int to)
 371 {
 372         mtx_link_t link;
 373
 374         mtx_link_init(&link);
 375         return(__mtx_lock_sh(mtx, &link, ident, flags, to));
 376 }
 377
 378 int
 379 _mtx_lock_sh_quick(mtx_t *mtx, const char *ident)
 380 {
 381         mtx_link_t link;
 382
 383         mtx_link_init(&link);
 384         return(__mtx_lock_sh(mtx, &link, ident, 0, 0));
 385 }
 386
 387 /*
 388  * Get an exclusive spinlock the hard way.
 389  */
 390 void
 391 _mtx_spinlock(mtx_t *mtx)
 392 {
 393         u_int   lock;
 394         u_int   nlock;
 395         int     bb = 1;
 396         int     bo;
 397
 398         for (;;) {
 399                 lock = mtx->mtx_lock;
 400                 if (lock == 0) {
 401                         nlock = MTX_EXCLUSIVE | 1;
 402                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
 403                                 mtx->mtx_owner = curthread;
 404                                 break;
 405                         }
 406                 } else if ((lock & MTX_EXCLUSIVE) &&
 407                            mtx->mtx_owner == curthread) {
 408                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 409                         nlock = lock + 1;
 410                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 411                                 break;
 412                 } else {
 413                         /* MWAIT here */
 414                         if (bb < 1000)
 415                                 ++bb;
 416                         cpu_pause();
 417                         for (bo = 0; bo < bb; ++bo)
 418                                 ;
 419                         ++mtx_contention_count;
 420                 }
 421                 cpu_pause();
 422                 ++mtx_collision_count;
 423         }
 424 }
 425
 426 /*
 427  * Attempt to acquire a spinlock, if we fail we must undo the
 428  * gd->gd_spinlocks/gd->gd_curthead->td_critcount predisposition.
 429  *
 430  * Returns 0 on success, EAGAIN on failure.
 431  */
 432 int
 433 _mtx_spinlock_try(mtx_t *mtx)
 434 {
 435         globaldata_t gd = mycpu;
 436         u_int   lock;
 437         u_int   nlock;
 438         int     res = 0;
 439
 440         for (;;) {
 441                 lock = mtx->mtx_lock;
 442                 if (lock == 0) {
 443                         nlock = MTX_EXCLUSIVE | 1;
 444                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
 445                                 mtx->mtx_owner = gd->gd_curthread;
 446                                 break;
 447                         }
 448                 } else if ((lock & MTX_EXCLUSIVE) &&
 449                            mtx->mtx_owner == gd->gd_curthread) {
 450                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 451                         nlock = lock + 1;
 452                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 453                                 break;
 454                 } else {
 455                         --gd->gd_spinlocks;
 456                         cpu_ccfence();
 457                         --gd->gd_curthread->td_critcount;
 458                         res = EAGAIN;
 459                         break;
 460                 }
 461                 cpu_pause();
 462                 ++mtx_collision_count;
 463         }
 464         return res;
 465 }
 466
 467 #if 0
 468
 469 void
 470 _mtx_spinlock_sh(mtx_t *mtx)
 471 {
 472         u_int   lock;
 473         u_int   nlock;
 474         int     bb = 1;
 475         int     bo;
 476
 477         for (;;) {
 478                 lock = mtx->mtx_lock;
 479                 if ((lock & MTX_EXCLUSIVE) == 0) {
 480                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 481                         nlock = lock + 1;
 482                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 483                                 break;
 484                 } else {
 485                         /* MWAIT here */
 486                         if (bb < 1000)
 487                                 ++bb;
 488                         cpu_pause();
 489                         for (bo = 0; bo < bb; ++bo)
 490                                 ;
 491                         ++mtx_contention_count;
 492                 }
 493                 cpu_pause();
 494                 ++mtx_collision_count;
 495         }
 496 }
 497
 498 #endif
 499
 500 int
 501 _mtx_lock_ex_try(mtx_t *mtx)
 502 {
 503         u_int   lock;
 504         u_int   nlock;
 505         int     error;
 506
 507         for (;;) {
 508                 lock = mtx->mtx_lock;
 509                 if (lock == 0) {
 510                         nlock = MTX_EXCLUSIVE | 1;
 511                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
 512                                 mtx->mtx_owner = curthread;
 513                                 error = 0;
 514                                 break;
 515                         }
 516                 } else if ((lock & MTX_EXCLUSIVE) &&
 517                            mtx->mtx_owner == curthread) {
 518                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 519                         nlock = lock + 1;
 520                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
 521                                 error = 0;
 522                                 break;
 523                         }
 524                 } else {
 525                         error = EAGAIN;
 526                         break;
 527                 }
 528                 cpu_pause();
 529                 ++mtx_collision_count;
 530         }
 531         return (error);
 532 }
 533
 534 int
 535 _mtx_lock_sh_try(mtx_t *mtx)
 536 {
 537         u_int   lock;
 538         u_int   nlock;
 539         int     error = 0;
 540
 541         for (;;) {
 542                 lock = mtx->mtx_lock;
 543                 if ((lock & MTX_EXCLUSIVE) == 0) {
 544                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 545                         nlock = lock + 1;
 546                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 547                                 break;
 548                 } else {
 549                         error = EAGAIN;
 550                         break;
 551                 }
 552                 cpu_pause();
 553                 ++mtx_collision_count;
 554         }
 555         return (error);
 556 }
 557
 558 /*
 559  * If the lock is held exclusively it must be owned by the caller.  If the
 560  * lock is already a shared lock this operation is a NOP.  A panic will
 561  * occur if the lock is not held either shared or exclusive.
 562  *
 563  * The exclusive count is converted to a shared count.
 564  */
 565 void
 566 _mtx_downgrade(mtx_t *mtx)
 567 {
 568         u_int   lock;
 569         u_int   nlock;
 570
 571         for (;;) {
 572                 lock = mtx->mtx_lock;
 573                 cpu_ccfence();
 574
 575                 /*
 576                  * NOP if already shared.
 577                  */
 578                 if ((lock & MTX_EXCLUSIVE) == 0) {
 579                         KKASSERT((lock & MTX_MASK) > 0);
 580                         break;
 581                 }
 582
 583                 /*
 584                  * Transfer count to shared.  Any additional pending shared
 585                  * waiters must be woken up.
 586                  */
 587                 if (lock & MTX_SHWANTED) {
 588                         if (mtx_chain_link_sh(mtx, lock, 1))
 589                                 break;
 590                         /* retry */
 591                 } else {
 592                         nlock = lock & ~MTX_EXCLUSIVE;
 593                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 594                                 break;
 595                         /* retry */
 596                 }
 597                 cpu_pause();
 598                 ++mtx_collision_count;
 599         }
 600 }
 601
 602 /*
 603  * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
 604  * the shared lock has a count other then 1.  Optimize the most likely case
 605  * but note that a single cmpset can fail due to WANTED races.
 606  *
 607  * If the lock is held exclusively it must be owned by the caller and
 608  * this function will simply return without doing anything.   A panic will
 609  * occur if the lock is held exclusively by someone other then the caller.
 610  *
 611  * Returns 0 on success, EDEADLK on failure.
 612  */
 613 int
 614 _mtx_upgrade_try(mtx_t *mtx)
 615 {
 616         u_int   lock;
 617         u_int   nlock;
 618         int     error = 0;
 619
 620         for (;;) {
 621                 lock = mtx->mtx_lock;
 622
 623                 if ((lock & ~MTX_EXWANTED) == 1) {
 624                         nlock = lock | MTX_EXCLUSIVE;
 625                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
 626                                 mtx->mtx_owner = curthread;
 627                                 break;
 628                         }
 629                 } else if (lock & MTX_EXCLUSIVE) {
 630                         KKASSERT(mtx->mtx_owner == curthread);
 631                         break;
 632                 } else {
 633                         error = EDEADLK;
 634                         break;
 635                 }
 636                 cpu_pause();
 637                 ++mtx_collision_count;
 638         }
 639         return (error);
 640 }
 641
 642 /*
 643  * Unlock a lock.  The caller must hold the lock either shared or exclusive.
 644  *
 645  * On the last release we handle any pending chains.
 646  */
 647 void
 648 _mtx_unlock(mtx_t *mtx)
 649 {
 650         u_int   lock;
 651         u_int   nlock;
 652
 653         for (;;) {
 654                 lock = mtx->mtx_lock;
 655                 cpu_ccfence();
 656
 657                 switch(lock) {
 658                 case MTX_EXCLUSIVE | 1:
 659                         /*
 660                          * Last release, exclusive lock.
 661                          * No exclusive or shared requests pending.
 662                          */
 663                         mtx->mtx_owner = NULL;
 664                         nlock = 0;
 665                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 666                                 goto done;
 667                         break;
 668                 case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
 669                 case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
 670                         /*
 671                          * Last release, exclusive lock.
 672                          * Exclusive requests pending.
 673                          * Exclusive requests have priority over shared reqs.
 674                          */
 675                         if (mtx_chain_link_ex(mtx, lock))
 676                                 goto done;
 677                         break;
 678                 case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
 679                         /*
 680                          * Last release, exclusive lock.
 681                          *
 682                          * Shared requests are pending.  Transfer our count (1)
 683                          * to the first shared request, wakeup all shared reqs.
 684                          */
 685                         if (mtx_chain_link_sh(mtx, lock, 0))
 686                                 goto done;
 687                         break;
 688                 case 1:
 689                         /*
 690                          * Last release, shared lock.
 691                          * No exclusive or shared requests pending.
 692                          */
 693                         nlock = 0;
 694                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 695                                 goto done;
 696                         break;
 697                 case MTX_EXWANTED | 1:
 698                 case MTX_EXWANTED | MTX_SHWANTED | 1:
 699                         /*
 700                          * Last release, shared lock.
 701                          *
 702                          * Exclusive requests are pending.  Transfer our
 703                          * count (1) to the next exclusive request.
 704                          *
 705                          * Exclusive requests have priority over shared reqs.
 706                          */
 707                         if (mtx_chain_link_ex(mtx, lock))
 708                                 goto done;
 709                         break;
 710                 case MTX_SHWANTED | 1:
 711                         /*
 712                          * Last release, shared lock.
 713                          * Shared requests pending.
 714                          */
 715                         if (mtx_chain_link_sh(mtx, lock, 0))
 716                                 goto done;
 717                         break;
 718                 default:
 719                         /*
 720                          * We have to loop if this is the last release but
 721                          * someone is fiddling with LINKSPIN.
 722                          */
 723                         if ((lock & MTX_MASK) == 1) {
 724                                 KKASSERT(lock & MTX_LINKSPIN);
 725                                 break;
 726                         }
 727
 728                         /*
 729                          * Not the last release (shared or exclusive)
 730                          */
 731                         nlock = lock - 1;
 732                         KKASSERT((nlock & MTX_MASK) != MTX_MASK);
 733                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 734                                 goto done;
 735                         break;
 736                 }
 737                 /* loop try again */
 738                 cpu_pause();
 739                 ++mtx_collision_count;
 740         }
 741 done:
 742         ;
 743 }
 744
 745 /*
 746  * Chain pending links.  Called on the last release of an exclusive or
 747  * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
 748  * is passed in with the count left at 1, which we can inherit, and other
 749  * bits which we must adjust in a single atomic operation.
 750  *
 751  * Return non-zero on success, 0 if caller needs to retry.
 752  *
 753  * NOTE: It's ok if MTX_EXWANTED is in an indeterminant state while we are
 754  *       acquiring LINKSPIN as all other cases will also need to acquire
 755  *       LINKSPIN when handling the EXWANTED case.
 756  */
 757 static int
 758 mtx_chain_link_ex(mtx_t *mtx, u_int olock)
 759 {
 760         thread_t td = curthread;
 761         mtx_link_t *link;
 762         u_int   nlock;
 763
 764         olock &= ~MTX_LINKSPIN;
 765         nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;
 766         ++td->td_critcount;
 767         if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
 768                 link = mtx->mtx_exlink;
 769                 KKASSERT(link != NULL);
 770                 if (link->next == link) {
 771                         mtx->mtx_exlink = NULL;
 772                         nlock = MTX_LINKSPIN | MTX_EXWANTED;    /* to clear */
 773                 } else {
 774                         mtx->mtx_exlink = link->next;
 775                         link->next->prev = link->prev;
 776                         link->prev->next = link->next;
 777                         nlock = MTX_LINKSPIN;                   /* to clear */
 778                 }
 779                 KKASSERT(link->state == MTX_LINK_LINKED_EX);
 780                 mtx->mtx_owner = link->owner;
 781                 cpu_sfence();
 782
 783                 /*
 784                  * WARNING! The callback can only be safely
 785                  *          made with LINKSPIN still held
 786                  *          and in a critical section.
 787                  *
 788                  * WARNING! The link can go away after the
 789                  *          state is set, or after the
 790                  *          callback.
 791                  */
 792                 if (link->callback) {
 793                         link->state = MTX_LINK_CALLEDBACK;
 794                         link->callback(link, link->arg, 0);
 795                 } else {
 796                         link->state = MTX_LINK_ACQUIRED;
 797                         wakeup(link);
 798                 }
 799                 atomic_clear_int(&mtx->mtx_lock, nlock);
 800                 --td->td_critcount;
 801                 ++mtx_wakeup_count;
 802                 return 1;
 803         }
 804         /* retry */
 805         --td->td_critcount;
 806         return 0;
 807 }
 808
 809 /*
 810  * Flush waiting shared locks.  The lock's prior state is passed in and must
 811  * be adjusted atomically only if it matches.
 812  *
 813  * If addcount is 0, the count for the first shared lock in the chain is
 814  * assumed to have already been accounted for.
 815  *
 816  * If addcount is 1, the count for the first shared lock in the chain has
 817  * not yet been accounted for.
 818  */
 819 static int
 820 mtx_chain_link_sh(mtx_t *mtx, u_int olock, int addcount)
 821 {
 822         thread_t td = curthread;
 823         mtx_link_t *link;
 824         u_int   nlock;
 825
 826         olock &= ~MTX_LINKSPIN;
 827         nlock = olock | MTX_LINKSPIN;
 828         nlock &= ~MTX_EXCLUSIVE;
 829         ++td->td_critcount;
 830         if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
 831                 KKASSERT(mtx->mtx_shlink != NULL);
 832                 for (;;) {
 833                         link = mtx->mtx_shlink;
 834                         atomic_add_int(&mtx->mtx_lock, addcount);
 835                         KKASSERT(link->state == MTX_LINK_LINKED_SH);
 836                         if (link->next == link) {
 837                                 mtx->mtx_shlink = NULL;
 838                                 cpu_sfence();
 839
 840                                 /*
 841                                  * WARNING! The callback can only be safely
 842                                  *          made with LINKSPIN still held
 843                                  *          and in a critical section.
 844                                  *
 845                                  * WARNING! The link can go away after the
 846                                  *          state is set, or after the
 847                                  *          callback.
 848                                  */
 849                                 if (link->callback) {
 850                                         link->state = MTX_LINK_CALLEDBACK;
 851                                         link->callback(link, link->arg, 0);
 852                                 } else {
 853                                         link->state = MTX_LINK_ACQUIRED;
 854                                         wakeup(link);
 855                                 }
 856                                 ++mtx_wakeup_count;
 857                                 break;
 858                         }
 859                         mtx->mtx_shlink = link->next;
 860                         link->next->prev = link->prev;
 861                         link->prev->next = link->next;
 862                         cpu_sfence();
 863                         link->state = MTX_LINK_ACQUIRED;
 864                         /* link can go away */
 865                         wakeup(link);
 866                         ++mtx_wakeup_count;
 867                         addcount = 1;
 868                 }
 869                 atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
 870                                                  MTX_SHWANTED);
 871                 --td->td_critcount;
 872                 return 1;
 873         }
 874         /* retry */
 875         --td->td_critcount;
 876         return 0;
 877 }
 878
 879 /*
 880  * Delete a link structure after tsleep has failed.  This code is not
 881  * in the critical path as most exclusive waits are chained.
 882  */
 883 static
 884 void
 885 mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
 886 {
 887         thread_t td = curthread;
 888         u_int   lock;
 889         u_int   nlock;
 890
 891         /*
 892          * Acquire MTX_LINKSPIN.
 893          *
 894          * Do not use cmpxchg to wait for LINKSPIN to clear as this might
 895          * result in too much cpu cache traffic.
 896          */
 897         ++td->td_critcount;
 898         for (;;) {
 899                 lock = mtx->mtx_lock;
 900                 if (lock & MTX_LINKSPIN) {
 901                         cpu_pause();
 902                         ++mtx_collision_count;
 903                         continue;
 904                 }
 905                 nlock = lock | MTX_LINKSPIN;
 906                 if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 907                         break;
 908                 cpu_pause();
 909                 ++mtx_collision_count;
 910         }
 911
 912         /*
 913          * Delete the link and release LINKSPIN.
 914          */
 915         nlock = MTX_LINKSPIN;   /* to clear */
 916
 917         switch(link->state) {
 918         case MTX_LINK_LINKED_EX:
 919                 if (link->next == link) {
 920                         mtx->mtx_exlink = NULL;
 921                         nlock |= MTX_EXWANTED;  /* to clear */
 922                 } else {
 923                         mtx->mtx_exlink = link->next;
 924                         link->next->prev = link->prev;
 925                         link->prev->next = link->next;
 926                 }
 927                 break;
 928         case MTX_LINK_LINKED_SH:
 929                 if (link->next == link) {
 930                         mtx->mtx_shlink = NULL;
 931                         nlock |= MTX_SHWANTED;  /* to clear */
 932                 } else {
 933                         mtx->mtx_shlink = link->next;
 934                         link->next->prev = link->prev;
 935                         link->prev->next = link->next;
 936                 }
 937                 break;
 938         default:
 939                 /* no change */
 940                 break;
 941         }
 942         atomic_clear_int(&mtx->mtx_lock, nlock);
 943         --td->td_critcount;
 944 }
 945
 946 /*
 947  * Wait for async lock completion or abort.  Returns ENOLCK if an abort
 948  * occurred.
 949  */
 950 int
 951 mtx_wait_link(mtx_t *mtx, mtx_link_t *link,
 952               const char *ident, int flags, int to)
 953 {
 954         int error;
 955
 956         /*
 957          * Sleep.  Handle false wakeups, interruptions, etc.
 958          * The link may also have been aborted.
 959          */
 960         error = 0;
 961         while (link->state & MTX_LINK_LINKED) {
 962                 tsleep_interlock(link, 0);
 963                 cpu_lfence();
 964                 if (link->state & MTX_LINK_LINKED) {
 965                         ++mtx_contention_count;
 966                         if (link->state & MTX_LINK_LINKED_SH)
 967                                 mycpu->gd_cnt.v_lock_name[0] = 'S';
 968                         else
 969                                 mycpu->gd_cnt.v_lock_name[0] = 'X';
 970                         strncpy(mycpu->gd_cnt.v_lock_name + 1,
 971                                 ident,
 972                                 sizeof(mycpu->gd_cnt.v_lock_name) - 2);
 973                         ++mycpu->gd_cnt.v_lock_colls;
 974
 975                         error = tsleep(link, flags | PINTERLOCKED,
 976                                        ident, to);
 977                         if (error)
 978                                 break;
 979                 }
 980         }
 981
 982         /*
 983          * We are done, make sure the link structure is unlinked.
 984          * It may still be on the list due to e.g. EINTR or
 985          * EWOULDBLOCK.
 986          *
 987          * It is possible for the tsleep to race an ABORT and cause
 988          * error to be 0.
 989          *
 990          * The tsleep() can be woken up for numerous reasons and error
 991          * might be zero in situations where we intend to return an error.
 992          *
 993          * (This is the synchronous case so state cannot be CALLEDBACK)
 994          */
 995         switch(link->state) {
 996         case MTX_LINK_ACQUIRED:
 997         case MTX_LINK_CALLEDBACK:
 998                 error = 0;
 999                 break;
1000         case MTX_LINK_ABORTED:
1001                 error = ENOLCK;
1002                 break;
1003         case MTX_LINK_LINKED_EX:
1004         case MTX_LINK_LINKED_SH:
1005                 mtx_delete_link(mtx, link);
1006                 /* fall through */
1007         default:
1008                 if (error == 0)
1009                         error = EWOULDBLOCK;
1010                 break;
1011         }
1012
1013         /*
1014          * Clear state on status returned.
1015          */
1016         link->state = MTX_LINK_IDLE;
1017
1018         return error;
1019 }
1020
1021 /*
1022  * Abort a mutex locking operation, causing mtx_lock_ex_link() to
1023  * return ENOLCK.  This may be called at any time after the mtx_link
1024  * is initialized or the status from a previous lock has been
1025  * returned.  If called prior to the next (non-try) lock attempt, the
1026  * next lock attempt using this link structure will abort instantly.
1027  *
1028  * Caller must still wait for the operation to complete, either from a
1029  * blocking call that is still in progress or by calling mtx_wait_link().
1030  *
1031  * If an asynchronous lock request is possibly in-progress, the caller
1032  * should call mtx_wait_link() synchronously.  Note that the asynchronous
1033  * lock callback will NOT be called if a successful abort occurred. XXX
1034  */
1035 void
1036 mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
1037 {
1038         thread_t td = curthread;
1039         u_int   lock;
1040         u_int   nlock;
1041
1042         /*
1043          * Acquire MTX_LINKSPIN
1044          */
1045         ++td->td_critcount;
1046         for (;;) {
1047                 lock = mtx->mtx_lock;
1048                 if (lock & MTX_LINKSPIN) {
1049                         cpu_pause();
1050                         ++mtx_collision_count;
1051                         continue;
1052                 }
1053                 nlock = lock | MTX_LINKSPIN;
1054                 if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
1055                         break;
1056                 cpu_pause();
1057                 ++mtx_collision_count;
1058         }
1059
1060         /*
1061          * Do the abort.
1062          *
1063          * WARNING! Link structure can disappear once link->state is set.
1064          */
1065         nlock = MTX_LINKSPIN;   /* to clear */
1066
1067         switch(link->state) {
1068         case MTX_LINK_IDLE:
1069                 /*
1070                  * Link not started yet
1071                  */
1072                 link->state = MTX_LINK_ABORTED;
1073                 break;
1074         case MTX_LINK_LINKED_EX:
1075                 /*
1076                  * de-link, mark aborted, and potentially wakeup the thread
1077                  * or issue the callback.
1078                  */
1079                 if (link->next == link) {
1080                         if (mtx->mtx_exlink == link) {
1081                                 mtx->mtx_exlink = NULL;
1082                                 nlock |= MTX_EXWANTED;  /* to clear */
1083                         }
1084                 } else {
1085                         if (mtx->mtx_exlink == link)
1086                                 mtx->mtx_exlink = link->next;
1087                         link->next->prev = link->prev;
1088                         link->prev->next = link->next;
1089                 }
1090
1091                 /*
1092                  * When aborting the async callback is still made.  We must
1093                  * not set the link status to ABORTED in the callback case
1094                  * since there is nothing else to clear its status if the
1095                  * link is reused.
1096                  */
1097                 if (link->callback) {
1098                         link->state = MTX_LINK_CALLEDBACK;
1099                         link->callback(link, link->arg, ENOLCK);
1100                 } else {
1101                         link->state = MTX_LINK_ABORTED;
1102                         wakeup(link);
1103                 }
1104                 ++mtx_wakeup_count;
1105                 break;
1106         case MTX_LINK_LINKED_SH:
1107                 /*
1108                  * de-link, mark aborted, and potentially wakeup the thread
1109                  * or issue the callback.
1110                  */
1111                 if (link->next == link) {
1112                         if (mtx->mtx_shlink == link) {
1113                                 mtx->mtx_shlink = NULL;
1114                                 nlock |= MTX_SHWANTED;  /* to clear */
1115                         }
1116                 } else {
1117                         if (mtx->mtx_shlink == link)
1118                                 mtx->mtx_shlink = link->next;
1119                         link->next->prev = link->prev;
1120                         link->prev->next = link->next;
1121                 }
1122
1123                 /*
1124                  * When aborting the async callback is still made.  We must
1125                  * not set the link status to ABORTED in the callback case
1126                  * since there is nothing else to clear its status if the
1127                  * link is reused.
1128                  */
1129                 if (link->callback) {
1130                         link->state = MTX_LINK_CALLEDBACK;
1131                         link->callback(link, link->arg, ENOLCK);
1132                 } else {
1133                         link->state = MTX_LINK_ABORTED;
1134                         wakeup(link);
1135                 }
1136                 ++mtx_wakeup_count;
1137                 break;
1138         case MTX_LINK_ACQUIRED:
1139         case MTX_LINK_CALLEDBACK:
1140                 /*
1141                  * Too late, the lock was acquired.  Let it complete.
1142                  */
1143                 break;
1144         default:
1145                 /*
1146                  * link already aborted, do nothing.
1147                  */
1148                 break;
1149         }
1150         atomic_clear_int(&mtx->mtx_lock, nlock);
1151         --td->td_critcount;
1152 }