sys/kern/kern_mutex.c

   1 /*
   2  * Copyright (c) 2009 The DragonFly Project.  All rights reserved.
   3  *
   4  * This code is derived from software contributed to The DragonFly Project
   5  * by Matthew Dillon <dillon@backplane.com>
   6  *
   7  * Redistribution and use in source and binary forms, with or without
   8  * modification, are permitted provided that the following conditions
   9  * are met:
  10  *
  11  * 1. Redistributions of source code must retain the above copyright
  12  *    notice, this list of conditions and the following disclaimer.
  13  * 2. Redistributions in binary form must reproduce the above copyright
  14  *    notice, this list of conditions and the following disclaimer in
  15  *    the documentation and/or other materials provided with the
  16  *    distribution.
  17  * 3. Neither the name of The DragonFly Project nor the names of its
  18  *    contributors may be used to endorse or promote products derived
  19  *    from this software without specific, prior written permission.
  20  *
  21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  32  * SUCH DAMAGE.
  33  */
  34 /*
  35  * Implement fast persistent locks based on atomic_cmpset_int() with
  36  * semantics similar to lockmgr locks but faster and taking up much less
  37  * space.  Taken from HAMMER's lock implementation.
  38  *
  39  * These are meant to complement our LWKT tokens.  Tokens are only held
  40  * while the thread is running.  Mutexes can be held across blocking
  41  * conditions.
  42  *
  43  * - Exclusive priority over shared to prevent SMP starvation.
  44  * - locks can be aborted (async callback, if any, will be made w/ENOLCK).
  45  * - locks can be asynchronous.
  46  * - synchronous fast path if no blocking occurs (async callback is not
  47  *   made in this case).
  48  *
  49  * Generally speaking any caller-supplied link state must be properly
  50  * initialized before use.
  51  *
  52  * Most of the support is in sys/mutex[2].h.  We mostly provide backoff
  53  * functions here.
  54  */
  55
  56 #include <sys/param.h>
  57 #include <sys/systm.h>
  58 #include <sys/kernel.h>
  59 #include <sys/sysctl.h>
  60 #include <sys/thread.h>
  61
  62 #include <machine/cpufunc.h>
  63
  64 #include <sys/thread2.h>
  65 #include <sys/mutex2.h>
  66
  67 static int mtx_chain_link_ex(mtx_t *mtx, u_int olock);
  68 static int mtx_chain_link_sh(mtx_t *mtx, u_int olock);
  69 static void mtx_delete_link(mtx_t *mtx, mtx_link_t *link);
  70
  71 /*
  72  * Exclusive-lock a mutex, block until acquired unless link is async.
  73  * Recursion is allowed.
  74  *
  75  * Returns 0 on success, the tsleep() return code on failure, EINPROGRESS
  76  * if async.  If immediately successful an async exclusive lock will return 0
  77  * and not issue the async callback or link the link structure.  The caller
  78  * must handle this case (typically this is an optimal code path).
  79  *
  80  * A tsleep() error can only be returned if PCATCH is specified in the flags.
  81  */
  82 static __inline int
  83 __mtx_lock_ex(mtx_t *mtx, mtx_link_t *link, int flags, int to)
  84 {
  85         thread_t td;
  86         u_int   lock;
  87         u_int   nlock;
  88         int     error;
  89         int     isasync;
  90
  91         for (;;) {
  92                 lock = mtx->mtx_lock;
  93                 cpu_ccfence();
  94
  95                 if (lock == 0) {
  96                         nlock = MTX_EXCLUSIVE | 1;
  97                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
  98                                 mtx->mtx_owner = curthread;
  99                                 cpu_sfence();
 100                                 link->state = MTX_LINK_ACQUIRED;
 101                                 error = 0;
 102                                 break;
 103                         }
 104                         continue;
 105                 }
 106                 if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) {
 107                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 108                         nlock = lock + 1;
 109                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
 110                                 cpu_sfence();
 111                                 link->state = MTX_LINK_ACQUIRED;
 112                                 error = 0;
 113                                 break;
 114                         }
 115                         continue;
 116                 }
 117
 118                 /*
 119                  * We need MTX_LINKSPIN to manipulate exlink or
 120                  * shlink.
 121                  *
 122                  * We must set MTX_EXWANTED with MTX_LINKSPIN to indicate
 123                  * pending exclusive requests.  It cannot be set as a separate
 124                  * operation prior to acquiring MTX_LINKSPIN.
 125                  *
 126                  * To avoid unnecessary cpu cache traffic we poll
 127                  * for collisions.  It is also possible that EXWANTED
 128                  * state failing the above test was spurious, so all the
 129                  * tests must be repeated if we cannot obtain LINKSPIN
 130                  * with the prior state tests intact (i.e. don't reload
 131                  * the (lock) variable here, for heaven's sake!).
 132                  */
 133                 if (lock & MTX_LINKSPIN) {
 134                         cpu_pause();
 135                         continue;
 136                 }
 137                 td = curthread;
 138                 nlock = lock | MTX_EXWANTED | MTX_LINKSPIN;
 139                 ++td->td_critcount;
 140                 if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
 141                         --td->td_critcount;
 142                         continue;
 143                 }
 144
 145                 /*
 146                  * Check for early abort.
 147                  */
 148                 if (link->state == MTX_LINK_ABORTED) {
 149                         if (mtx->mtx_exlink == NULL) {
 150                                 atomic_clear_int(&mtx->mtx_lock,
 151                                                  MTX_LINKSPIN |
 152                                                  MTX_EXWANTED);
 153                         } else {
 154                                 atomic_clear_int(&mtx->mtx_lock,
 155                                                  MTX_LINKSPIN);
 156                         }
 157                         --td->td_critcount;
 158                         link->state = MTX_LINK_IDLE;
 159                         error = ENOLCK;
 160                         break;
 161                 }
 162
 163                 /*
 164                  * Add our link to the exlink list and release LINKSPIN.
 165                  */
 166                 link->owner = td;
 167                 link->state = MTX_LINK_LINKED_EX;
 168                 if (mtx->mtx_exlink) {
 169                         link->next = mtx->mtx_exlink;
 170                         link->prev = link->next->prev;
 171                         link->next->prev = link;
 172                         link->prev->next = link;
 173                 } else {
 174                         link->next = link;
 175                         link->prev = link;
 176                         mtx->mtx_exlink = link;
 177                 }
 178                 isasync = (link->callback != NULL);
 179                 atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
 180                 --td->td_critcount;
 181
 182                 /*
 183                  * If asynchronous lock request return without
 184                  * blocking, leave link structure linked.
 185                  */
 186                 if (isasync) {
 187                         error = EINPROGRESS;
 188                         break;
 189                 }
 190
 191                 /*
 192                  * Wait for lock
 193                  */
 194                 error = mtx_wait_link(mtx, link, flags, to);
 195                 break;
 196         }
 197         return (error);
 198 }
 199
 200 int
 201 _mtx_lock_ex_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
 202 {
 203         return(__mtx_lock_ex(mtx, link, flags, to));
 204 }
 205
 206 int
 207 _mtx_lock_ex(mtx_t *mtx, int flags, int to)
 208 {
 209         mtx_link_t link;
 210
 211         mtx_link_init(&link);
 212         return(__mtx_lock_ex(mtx, &link, flags, to));
 213 }
 214
 215 int
 216 _mtx_lock_ex_quick(mtx_t *mtx)
 217 {
 218         mtx_link_t link;
 219
 220         mtx_link_init(&link);
 221         return(__mtx_lock_ex(mtx, &link, 0, 0));
 222 }
 223
 224 /*
 225  * Share-lock a mutex, block until acquired.  Recursion is allowed.
 226  *
 227  * Returns 0 on success, or the tsleep() return code on failure.
 228  * An error can only be returned if PCATCH is specified in the flags.
 229  *
 230  * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we
 231  *       do not have to chain the wakeup().
 232  */
 233 static __inline int
 234 __mtx_lock_sh(mtx_t *mtx, mtx_link_t *link, int flags, int to)
 235 {
 236         thread_t td;
 237         u_int   lock;
 238         u_int   nlock;
 239         int     error;
 240         int     isasync;
 241
 242         for (;;) {
 243                 lock = mtx->mtx_lock;
 244                 cpu_ccfence();
 245
 246                 if (lock == 0) {
 247                         nlock = 1;
 248                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
 249                                 error = 0;
 250                                 cpu_sfence();
 251                                 link->state = MTX_LINK_ACQUIRED;
 252                                 break;
 253                         }
 254                         continue;
 255                 }
 256                 if ((lock & (MTX_EXCLUSIVE | MTX_EXWANTED)) == 0) {
 257                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 258                         nlock = lock + 1;
 259                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
 260                                 error = 0;
 261                                 cpu_sfence();
 262                                 link->state = MTX_LINK_ACQUIRED;
 263                                 break;
 264                         }
 265                         continue;
 266                 }
 267
 268                 /*
 269                  * We need MTX_LINKSPIN to manipulate exlink or
 270                  * shlink.
 271                  *
 272                  * We must set MTX_SHWANTED with MTX_LINKSPIN to indicate
 273                  * pending shared requests.  It cannot be set as a separate
 274                  * operation prior to acquiring MTX_LINKSPIN.
 275                  *
 276                  * To avoid unnecessary cpu cache traffic we poll
 277                  * for collisions.  It is also possible that EXWANTED
 278                  * state failing the above test was spurious, so all the
 279                  * tests must be repeated if we cannot obtain LINKSPIN
 280                  * with the prior state tests intact (i.e. don't reload
 281                  * the (lock) variable here, for heaven's sake!).
 282                  */
 283                 if (lock & MTX_LINKSPIN) {
 284                         cpu_pause();
 285                         continue;
 286                 }
 287                 td = curthread;
 288                 nlock = lock | MTX_SHWANTED | MTX_LINKSPIN;
 289                 ++td->td_critcount;
 290                 if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock) == 0) {
 291                         --td->td_critcount;
 292                         continue;
 293                 }
 294
 295                 /*
 296                  * Check for early abort.  Other shared lock requestors
 297                  * could have sneaked in before we set LINKSPIN so make
 298                  * sure we undo the state properly.
 299                  */
 300                 if (link->state == MTX_LINK_ABORTED) {
 301                         if (mtx->mtx_shlink) {
 302                                 atomic_clear_int(&mtx->mtx_lock,
 303                                                  MTX_LINKSPIN);
 304                         } else {
 305                                 atomic_clear_int(&mtx->mtx_lock,
 306                                                  MTX_LINKSPIN |
 307                                                  MTX_SHWANTED);
 308                         }
 309                         --td->td_critcount;
 310                         link->state = MTX_LINK_IDLE;
 311                         error = ENOLCK;
 312                         break;
 313                 }
 314
 315                 /*
 316                  * Add our link to the shlink list and release LINKSPIN.
 317                  */
 318                 link->owner = td;
 319                 link->state = MTX_LINK_LINKED_SH;
 320                 if (mtx->mtx_shlink) {
 321                         link->next = mtx->mtx_shlink;
 322                         link->prev = link->next->prev;
 323                         link->next->prev = link;
 324                         link->prev->next = link;
 325                 } else {
 326                         link->next = link;
 327                         link->prev = link;
 328                         mtx->mtx_shlink = link;
 329                 }
 330                 isasync = (link->callback != NULL);
 331                 atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN);
 332                 --td->td_critcount;
 333
 334                 /*
 335                  * If asynchronous lock request return without
 336                  * blocking, leave link structure linked.
 337                  */
 338                 if (isasync) {
 339                         error = EINPROGRESS;
 340                         break;
 341                 }
 342
 343                 /*
 344                  * Wait for lock
 345                  */
 346                 error = mtx_wait_link(mtx, link, flags, to);
 347                 break;
 348         }
 349         return (error);
 350 }
 351
 352 int
 353 _mtx_lock_sh_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
 354 {
 355         return(__mtx_lock_sh(mtx, link, flags, to));
 356 }
 357
 358 int
 359 _mtx_lock_sh(mtx_t *mtx, int flags, int to)
 360 {
 361         mtx_link_t link;
 362
 363         mtx_link_init(&link);
 364         return(__mtx_lock_sh(mtx, &link, flags, to));
 365 }
 366
 367 int
 368 _mtx_lock_sh_quick(mtx_t *mtx)
 369 {
 370         mtx_link_t link;
 371
 372         mtx_link_init(&link);
 373         return(__mtx_lock_sh(mtx, &link, 0, 0));
 374 }
 375
 376 /*
 377  * Get an exclusive spinlock the hard way.
 378  */
 379 void
 380 _mtx_spinlock(mtx_t *mtx)
 381 {
 382         u_int   lock;
 383         u_int   nlock;
 384         int     bb = 1;
 385         int     bo;
 386
 387         for (;;) {
 388                 lock = mtx->mtx_lock;
 389                 if (lock == 0) {
 390                         nlock = MTX_EXCLUSIVE | 1;
 391                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
 392                                 mtx->mtx_owner = curthread;
 393                                 break;
 394                         }
 395                 } else if ((lock & MTX_EXCLUSIVE) &&
 396                            mtx->mtx_owner == curthread) {
 397                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 398                         nlock = lock + 1;
 399                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 400                                 break;
 401                 } else {
 402                         /* MWAIT here */
 403                         if (bb < 1000)
 404                                 ++bb;
 405                         cpu_pause();
 406                         for (bo = 0; bo < bb; ++bo)
 407                                 ;
 408                 }
 409                 cpu_pause();
 410         }
 411 }
 412
 413 /*
 414  * Attempt to acquire a spinlock, if we fail we must undo the
 415  * gd->gd_spinlocks/gd->gd_curthead->td_critcount predisposition.
 416  *
 417  * Returns 0 on success, EAGAIN on failure.
 418  */
 419 int
 420 _mtx_spinlock_try(mtx_t *mtx)
 421 {
 422         globaldata_t gd = mycpu;
 423         u_int   lock;
 424         u_int   nlock;
 425         int     res = 0;
 426
 427         for (;;) {
 428                 lock = mtx->mtx_lock;
 429                 if (lock == 0) {
 430                         nlock = MTX_EXCLUSIVE | 1;
 431                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
 432                                 mtx->mtx_owner = gd->gd_curthread;
 433                                 break;
 434                         }
 435                 } else if ((lock & MTX_EXCLUSIVE) &&
 436                            mtx->mtx_owner == gd->gd_curthread) {
 437                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 438                         nlock = lock + 1;
 439                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 440                                 break;
 441                 } else {
 442                         --gd->gd_spinlocks;
 443                         cpu_ccfence();
 444                         --gd->gd_curthread->td_critcount;
 445                         res = EAGAIN;
 446                         break;
 447                 }
 448                 cpu_pause();
 449         }
 450         return res;
 451 }
 452
 453 #if 0
 454
 455 void
 456 _mtx_spinlock_sh(mtx_t *mtx)
 457 {
 458         u_int   lock;
 459         u_int   nlock;
 460         int     bb = 1;
 461         int     bo;
 462
 463         for (;;) {
 464                 lock = mtx->mtx_lock;
 465                 if ((lock & MTX_EXCLUSIVE) == 0) {
 466                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 467                         nlock = lock + 1;
 468                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 469                                 break;
 470                 } else {
 471                         /* MWAIT here */
 472                         if (bb < 1000)
 473                                 ++bb;
 474                         cpu_pause();
 475                         for (bo = 0; bo < bb; ++bo)
 476                                 ;
 477                 }
 478                 cpu_pause();
 479         }
 480 }
 481
 482 #endif
 483
 484 int
 485 _mtx_lock_ex_try(mtx_t *mtx)
 486 {
 487         u_int   lock;
 488         u_int   nlock;
 489         int     error;
 490
 491         for (;;) {
 492                 lock = mtx->mtx_lock;
 493                 if (lock == 0) {
 494                         nlock = MTX_EXCLUSIVE | 1;
 495                         if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) {
 496                                 mtx->mtx_owner = curthread;
 497                                 error = 0;
 498                                 break;
 499                         }
 500                 } else if ((lock & MTX_EXCLUSIVE) &&
 501                            mtx->mtx_owner == curthread) {
 502                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 503                         nlock = lock + 1;
 504                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
 505                                 error = 0;
 506                                 break;
 507                         }
 508                 } else {
 509                         error = EAGAIN;
 510                         break;
 511                 }
 512                 cpu_pause();
 513         }
 514         return (error);
 515 }
 516
 517 int
 518 _mtx_lock_sh_try(mtx_t *mtx)
 519 {
 520         u_int   lock;
 521         u_int   nlock;
 522         int     error = 0;
 523
 524         for (;;) {
 525                 lock = mtx->mtx_lock;
 526                 if ((lock & MTX_EXCLUSIVE) == 0) {
 527                         KKASSERT((lock & MTX_MASK) != MTX_MASK);
 528                         nlock = lock + 1;
 529                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 530                                 break;
 531                 } else {
 532                         error = EAGAIN;
 533                         break;
 534                 }
 535                 cpu_pause();
 536         }
 537         return (error);
 538 }
 539
 540 /*
 541  * If the lock is held exclusively it must be owned by the caller.  If the
 542  * lock is already a shared lock this operation is a NOP.  A panic will
 543  * occur if the lock is not held either shared or exclusive.
 544  *
 545  * The exclusive count is converted to a shared count.
 546  */
 547 void
 548 _mtx_downgrade(mtx_t *mtx)
 549 {
 550         u_int   lock;
 551         u_int   nlock;
 552
 553         for (;;) {
 554                 lock = mtx->mtx_lock;
 555                 cpu_ccfence();
 556
 557                 /*
 558                  * NOP if already shared.
 559                  */
 560                 if ((lock & MTX_EXCLUSIVE) == 0) {
 561                         KKASSERT((lock & MTX_MASK) > 0);
 562                         break;
 563                 }
 564
 565                 /*
 566                  * Transfer count to shared.  Any additional pending shared
 567                  * waiters must be woken up.
 568                  */
 569                 if (lock & MTX_SHWANTED) {
 570                         if (mtx_chain_link_sh(mtx, lock))
 571                                 break;
 572                         /* retry */
 573                 } else {
 574                         nlock = lock & ~MTX_EXCLUSIVE;
 575                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 576                                 break;
 577                         /* retry */
 578                 }
 579                 cpu_pause();
 580         }
 581 }
 582
 583 /*
 584  * Upgrade a shared lock to an exclusive lock.  The upgrade will fail if
 585  * the shared lock has a count other then 1.  Optimize the most likely case
 586  * but note that a single cmpset can fail due to WANTED races.
 587  *
 588  * If the lock is held exclusively it must be owned by the caller and
 589  * this function will simply return without doing anything.   A panic will
 590  * occur if the lock is held exclusively by someone other then the caller.
 591  *
 592  * Returns 0 on success, EDEADLK on failure.
 593  */
 594 int
 595 _mtx_upgrade_try(mtx_t *mtx)
 596 {
 597         u_int   lock;
 598         u_int   nlock;
 599         int     error = 0;
 600
 601         for (;;) {
 602                 lock = mtx->mtx_lock;
 603                 cpu_ccfence();
 604
 605                 if ((lock & ~MTX_EXWANTED) == 1) {
 606                         nlock = lock | MTX_EXCLUSIVE;
 607                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) {
 608                                 mtx->mtx_owner = curthread;
 609                                 break;
 610                         }
 611                 } else if (lock & MTX_EXCLUSIVE) {
 612                         KKASSERT(mtx->mtx_owner == curthread);
 613                         break;
 614                 } else {
 615                         error = EDEADLK;
 616                         break;
 617                 }
 618                 cpu_pause();
 619         }
 620         return (error);
 621 }
 622
 623 /*
 624  * Unlock a lock.  The caller must hold the lock either shared or exclusive.
 625  *
 626  * On the last release we handle any pending chains.
 627  */
 628 void
 629 _mtx_unlock(mtx_t *mtx)
 630 {
 631         u_int   lock;
 632         u_int   nlock;
 633
 634         for (;;) {
 635                 lock = mtx->mtx_lock;
 636                 cpu_ccfence();
 637
 638                 switch(lock) {
 639                 case MTX_EXCLUSIVE | 1:
 640                         /*
 641                          * Last release, exclusive lock.
 642                          * No exclusive or shared requests pending.
 643                          */
 644                         mtx->mtx_owner = NULL;
 645                         nlock = 0;
 646                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 647                                 goto done;
 648                         break;
 649                 case MTX_EXCLUSIVE | MTX_EXWANTED | 1:
 650                 case MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1:
 651                         /*
 652                          * Last release, exclusive lock.
 653                          * Exclusive requests pending.
 654                          * Exclusive requests have priority over shared reqs.
 655                          */
 656                         if (mtx_chain_link_ex(mtx, lock))
 657                                 goto done;
 658                         break;
 659                 case MTX_EXCLUSIVE | MTX_SHWANTED | 1:
 660                         /*
 661                          * Last release, exclusive lock.
 662                          *
 663                          * Shared requests are pending.  Transfer our count (1)
 664                          * to the first shared request, wakeup all shared reqs.
 665                          */
 666                         if (mtx_chain_link_sh(mtx, lock))
 667                                 goto done;
 668                         break;
 669                 case 1:
 670                         /*
 671                          * Last release, shared lock.
 672                          * No exclusive or shared requests pending.
 673                          */
 674                         nlock = 0;
 675                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 676                                 goto done;
 677                         break;
 678                 case MTX_EXWANTED | 1:
 679                 case MTX_EXWANTED | MTX_SHWANTED | 1:
 680                         /*
 681                          * Last release, shared lock.
 682                          *
 683                          * Exclusive requests are pending.  Upgrade this
 684                          * final shared lock to exclusive and transfer our
 685                          * count (1) to the next exclusive request.
 686                          *
 687                          * Exclusive requests have priority over shared reqs.
 688                          */
 689                         if (mtx_chain_link_ex(mtx, lock))
 690                                 goto done;
 691                         break;
 692                 case MTX_SHWANTED | 1:
 693                         /*
 694                          * Last release, shared lock.
 695                          * Shared requests pending.
 696                          */
 697                         if (mtx_chain_link_sh(mtx, lock))
 698                                 goto done;
 699                         break;
 700                 default:
 701                         /*
 702                          * We have to loop if this is the last release but
 703                          * someone is fiddling with LINKSPIN.
 704                          */
 705                         if ((lock & MTX_MASK) == 1) {
 706                                 KKASSERT(lock & MTX_LINKSPIN);
 707                                 break;
 708                         }
 709
 710                         /*
 711                          * Not the last release (shared or exclusive)
 712                          */
 713                         nlock = lock - 1;
 714                         KKASSERT((nlock & MTX_MASK) != MTX_MASK);
 715                         if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 716                                 goto done;
 717                         break;
 718                 }
 719                 /* loop try again */
 720                 cpu_pause();
 721         }
 722 done:
 723         ;
 724 }
 725
 726 /*
 727  * Chain pending links.  Called on the last release of an exclusive or
 728  * shared lock when the appropriate WANTED bit is set.  mtx_lock old state
 729  * is passed in with the count left at 1, which we can inherit, and other
 730  * bits which we must adjust in a single atomic operation.
 731  *
 732  * Return non-zero on success, 0 if caller needs to retry.
 733  *
 734  * NOTE: It's ok if MTX_EXWANTED is in an indeterminant state while we are
 735  *       acquiring LINKSPIN as all other cases will also need to acquire
 736  *       LINKSPIN when handling the EXWANTED case.
 737  */
 738 static int
 739 mtx_chain_link_ex(mtx_t *mtx, u_int olock)
 740 {
 741         thread_t td = curthread;
 742         mtx_link_t *link;
 743         u_int   nlock;
 744
 745         olock &= ~MTX_LINKSPIN;
 746         nlock = olock | MTX_LINKSPIN | MTX_EXCLUSIVE;   /* upgrade if necc */
 747         ++td->td_critcount;
 748         if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
 749                 link = mtx->mtx_exlink;
 750                 KKASSERT(link != NULL);
 751                 if (link->next == link) {
 752                         mtx->mtx_exlink = NULL;
 753                         nlock = MTX_LINKSPIN | MTX_EXWANTED;    /* to clear */
 754                 } else {
 755                         mtx->mtx_exlink = link->next;
 756                         link->next->prev = link->prev;
 757                         link->prev->next = link->next;
 758                         nlock = MTX_LINKSPIN;                   /* to clear */
 759                 }
 760                 KKASSERT(link->state == MTX_LINK_LINKED_EX);
 761                 mtx->mtx_owner = link->owner;
 762                 cpu_sfence();
 763
 764                 /*
 765                  * WARNING! The callback can only be safely
 766                  *          made with LINKSPIN still held
 767                  *          and in a critical section.
 768                  *
 769                  * WARNING! The link can go away after the
 770                  *          state is set, or after the
 771                  *          callback.
 772                  */
 773                 if (link->callback) {
 774                         link->state = MTX_LINK_CALLEDBACK;
 775                         link->callback(link, link->arg, 0);
 776                 } else {
 777                         link->state = MTX_LINK_ACQUIRED;
 778                         wakeup(link);
 779                 }
 780                 atomic_clear_int(&mtx->mtx_lock, nlock);
 781                 --td->td_critcount;
 782                 return 1;
 783         }
 784         /* retry */
 785         --td->td_critcount;
 786         return 0;
 787 }
 788
 789 /*
 790  * Flush waiting shared locks.  The lock's prior state is passed in and must
 791  * be adjusted atomically only if it matches and LINKSPIN is not set.
 792  *
 793  * IMPORTANT! The caller has left one active count on the lock for us to
 794  *            consume.  We will apply this to the first link, but must add
 795  *            additional counts for any other links.
 796  */
 797 static int
 798 mtx_chain_link_sh(mtx_t *mtx, u_int olock)
 799 {
 800         thread_t td = curthread;
 801         mtx_link_t *link;
 802         u_int   addcount;
 803         u_int   nlock;
 804
 805         olock &= ~MTX_LINKSPIN;
 806         nlock = olock | MTX_LINKSPIN;
 807         nlock &= ~MTX_EXCLUSIVE;
 808         ++td->td_critcount;
 809         if (atomic_cmpset_int(&mtx->mtx_lock, olock, nlock)) {
 810                 /*
 811                  * It should not be possible for SHWANTED to be set without
 812                  * any links pending.
 813                  */
 814                 KKASSERT(mtx->mtx_shlink != NULL);
 815
 816                 /*
 817                  * We have to process the count for all shared locks before
 818                  * we process any of the links.  Count the additional shared
 819                  * locks beyond the first link (which is already accounted
 820                  * for) and associate the full count with the lock
 821                  * immediately.
 822                  */
 823                 addcount = 0;
 824                 for (link = mtx->mtx_shlink->next; link != mtx->mtx_shlink;
 825                      link = link->next) {
 826                         ++addcount;
 827                 }
 828                 if (addcount > 0)
 829                         atomic_add_int(&mtx->mtx_lock, addcount);
 830
 831                 /*
 832                  * We can wakeup all waiting shared locks.
 833                  */
 834                 while ((link = mtx->mtx_shlink) != NULL) {
 835                         KKASSERT(link->state == MTX_LINK_LINKED_SH);
 836                         if (link->next == link) {
 837                                 mtx->mtx_shlink = NULL;
 838                         } else {
 839                                 mtx->mtx_shlink = link->next;
 840                                 link->next->prev = link->prev;
 841                                 link->prev->next = link->next;
 842                         }
 843                         link->next = NULL;
 844                         link->prev = NULL;
 845                         cpu_sfence();
 846                         if (link->callback) {
 847                                 link->state = MTX_LINK_CALLEDBACK;
 848                                 link->callback(link, link->arg, 0);
 849                         } else {
 850                                 cpu_sfence();
 851                                 link->state = MTX_LINK_ACQUIRED;
 852                                 wakeup(link);
 853                         }
 854                 }
 855                 atomic_clear_int(&mtx->mtx_lock, MTX_LINKSPIN |
 856                                                  MTX_SHWANTED);
 857                 --td->td_critcount;
 858                 return 1;
 859         }
 860         /* retry */
 861         --td->td_critcount;
 862         return 0;
 863 }
 864
 865 /*
 866  * Delete a link structure after tsleep has failed.  This code is not
 867  * in the critical path as most exclusive waits are chained.
 868  */
 869 static
 870 void
 871 mtx_delete_link(mtx_t *mtx, mtx_link_t *link)
 872 {
 873         thread_t td = curthread;
 874         u_int   lock;
 875         u_int   nlock;
 876
 877         /*
 878          * Acquire MTX_LINKSPIN.
 879          *
 880          * Do not use cmpxchg to wait for LINKSPIN to clear as this might
 881          * result in too much cpu cache traffic.
 882          */
 883         ++td->td_critcount;
 884         for (;;) {
 885                 lock = mtx->mtx_lock;
 886                 if (lock & MTX_LINKSPIN) {
 887                         cpu_pause();
 888                         continue;
 889                 }
 890                 nlock = lock | MTX_LINKSPIN;
 891                 if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
 892                         break;
 893                 cpu_pause();
 894         }
 895
 896         /*
 897          * Delete the link and release LINKSPIN.
 898          */
 899         nlock = MTX_LINKSPIN;   /* to clear */
 900
 901         switch(link->state) {
 902         case MTX_LINK_LINKED_EX:
 903                 if (link->next == link) {
 904                         mtx->mtx_exlink = NULL;
 905                         nlock |= MTX_EXWANTED;  /* to clear */
 906                 } else {
 907                         mtx->mtx_exlink = link->next;
 908                         link->next->prev = link->prev;
 909                         link->prev->next = link->next;
 910                 }
 911                 break;
 912         case MTX_LINK_LINKED_SH:
 913                 if (link->next == link) {
 914                         mtx->mtx_shlink = NULL;
 915                         nlock |= MTX_SHWANTED;  /* to clear */
 916                 } else {
 917                         mtx->mtx_shlink = link->next;
 918                         link->next->prev = link->prev;
 919                         link->prev->next = link->next;
 920                 }
 921                 break;
 922         default:
 923                 /* no change */
 924                 break;
 925         }
 926         atomic_clear_int(&mtx->mtx_lock, nlock);
 927         --td->td_critcount;
 928 }
 929
 930 /*
 931  * Wait for async lock completion or abort.  Returns ENOLCK if an abort
 932  * occurred.
 933  */
 934 int
 935 mtx_wait_link(mtx_t *mtx, mtx_link_t *link, int flags, int to)
 936 {
 937         int error;
 938
 939         /*
 940          * Sleep.  Handle false wakeups, interruptions, etc.
 941          * The link may also have been aborted.  The LINKED
 942          * bit was set by this cpu so we can test it without
 943          * fences.
 944          */
 945         error = 0;
 946         while (link->state & MTX_LINK_LINKED) {
 947                 tsleep_interlock(link, 0);
 948                 cpu_lfence();
 949                 if (link->state & MTX_LINK_LINKED) {
 950                         if (link->state & MTX_LINK_LINKED_SH)
 951                                 mycpu->gd_cnt.v_lock_name[0] = 'S';
 952                         else
 953                                 mycpu->gd_cnt.v_lock_name[0] = 'X';
 954                         strncpy(mycpu->gd_cnt.v_lock_name + 1,
 955                                 mtx->mtx_ident,
 956                                 sizeof(mycpu->gd_cnt.v_lock_name) - 2);
 957                         ++mycpu->gd_cnt.v_lock_colls;
 958
 959                         error = tsleep(link, flags | PINTERLOCKED,
 960                                        mtx->mtx_ident, to);
 961                         if (error)
 962                                 break;
 963                 }
 964         }
 965
 966         /*
 967          * We need at least a lfence (load fence) to ensure our cpu does not
 968          * reorder loads (of data outside the lock structure) prior to the
 969          * remote cpu's release, since the above test may have run without
 970          * any atomic interactions.
 971          *
 972          * If we do not do this then state updated by the other cpu before
 973          * releasing its lock may not be read cleanly by our cpu when this
 974          * function returns.  Even though the other cpu ordered its stores,
 975          * our loads can still be out of order.
 976          */
 977         cpu_mfence();
 978
 979         /*
 980          * We are done, make sure the link structure is unlinked.
 981          * It may still be on the list due to e.g. EINTR or
 982          * EWOULDBLOCK.
 983          *
 984          * It is possible for the tsleep to race an ABORT and cause
 985          * error to be 0.
 986          *
 987          * The tsleep() can be woken up for numerous reasons and error
 988          * might be zero in situations where we intend to return an error.
 989          *
 990          * (This is the synchronous case so state cannot be CALLEDBACK)
 991          */
 992         switch(link->state) {
 993         case MTX_LINK_ACQUIRED:
 994         case MTX_LINK_CALLEDBACK:
 995                 error = 0;
 996                 break;
 997         case MTX_LINK_ABORTED:
 998                 error = ENOLCK;
 999                 break;
1000         case MTX_LINK_LINKED_EX:
1001         case MTX_LINK_LINKED_SH:
1002                 mtx_delete_link(mtx, link);
1003                 /* fall through */
1004         default:
1005                 if (error == 0)
1006                         error = EWOULDBLOCK;
1007                 break;
1008         }
1009
1010         /*
1011          * Clear state on status returned.
1012          */
1013         link->state = MTX_LINK_IDLE;
1014
1015         return error;
1016 }
1017
1018 /*
1019  * Abort a mutex locking operation, causing mtx_lock_ex_link() to
1020  * return ENOLCK.  This may be called at any time after the mtx_link
1021  * is initialized or the status from a previous lock has been
1022  * returned.  If called prior to the next (non-try) lock attempt, the
1023  * next lock attempt using this link structure will abort instantly.
1024  *
1025  * Caller must still wait for the operation to complete, either from a
1026  * blocking call that is still in progress or by calling mtx_wait_link().
1027  *
1028  * If an asynchronous lock request is possibly in-progress, the caller
1029  * should call mtx_wait_link() synchronously.  Note that the asynchronous
1030  * lock callback will NOT be called if a successful abort occurred. XXX
1031  */
1032 void
1033 mtx_abort_link(mtx_t *mtx, mtx_link_t *link)
1034 {
1035         thread_t td = curthread;
1036         u_int   lock;
1037         u_int   nlock;
1038
1039         /*
1040          * Acquire MTX_LINKSPIN
1041          */
1042         ++td->td_critcount;
1043         for (;;) {
1044                 lock = mtx->mtx_lock;
1045                 if (lock & MTX_LINKSPIN) {
1046                         cpu_pause();
1047                         continue;
1048                 }
1049                 nlock = lock | MTX_LINKSPIN;
1050                 if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock))
1051                         break;
1052                 cpu_pause();
1053         }
1054
1055         /*
1056          * Do the abort.
1057          *
1058          * WARNING! Link structure can disappear once link->state is set.
1059          */
1060         nlock = MTX_LINKSPIN;   /* to clear */
1061
1062         switch(link->state) {
1063         case MTX_LINK_IDLE:
1064                 /*
1065                  * Link not started yet
1066                  */
1067                 link->state = MTX_LINK_ABORTED;
1068                 break;
1069         case MTX_LINK_LINKED_EX:
1070                 /*
1071                  * de-link, mark aborted, and potentially wakeup the thread
1072                  * or issue the callback.
1073                  */
1074                 if (link->next == link) {
1075                         if (mtx->mtx_exlink == link) {
1076                                 mtx->mtx_exlink = NULL;
1077                                 nlock |= MTX_EXWANTED;  /* to clear */
1078                         }
1079                 } else {
1080                         if (mtx->mtx_exlink == link)
1081                                 mtx->mtx_exlink = link->next;
1082                         link->next->prev = link->prev;
1083                         link->prev->next = link->next;
1084                 }
1085
1086                 /*
1087                  * When aborting the async callback is still made.  We must
1088                  * not set the link status to ABORTED in the callback case
1089                  * since there is nothing else to clear its status if the
1090                  * link is reused.
1091                  */
1092                 if (link->callback) {
1093                         link->state = MTX_LINK_CALLEDBACK;
1094                         link->callback(link, link->arg, ENOLCK);
1095                 } else {
1096                         link->state = MTX_LINK_ABORTED;
1097                         wakeup(link);
1098                 }
1099                 break;
1100         case MTX_LINK_LINKED_SH:
1101                 /*
1102                  * de-link, mark aborted, and potentially wakeup the thread
1103                  * or issue the callback.
1104                  */
1105                 if (link->next == link) {
1106                         if (mtx->mtx_shlink == link) {
1107                                 mtx->mtx_shlink = NULL;
1108                                 nlock |= MTX_SHWANTED;  /* to clear */
1109                         }
1110                 } else {
1111                         if (mtx->mtx_shlink == link)
1112                                 mtx->mtx_shlink = link->next;
1113                         link->next->prev = link->prev;
1114                         link->prev->next = link->next;
1115                 }
1116
1117                 /*
1118                  * When aborting the async callback is still made.  We must
1119                  * not set the link status to ABORTED in the callback case
1120                  * since there is nothing else to clear its status if the
1121                  * link is reused.
1122                  */
1123                 if (link->callback) {
1124                         link->state = MTX_LINK_CALLEDBACK;
1125                         link->callback(link, link->arg, ENOLCK);
1126                 } else {
1127                         link->state = MTX_LINK_ABORTED;
1128                         wakeup(link);
1129                 }
1130                 break;
1131         case MTX_LINK_ACQUIRED:
1132         case MTX_LINK_CALLEDBACK:
1133                 /*
1134                  * Too late, the lock was acquired.  Let it complete.
1135                  */
1136                 break;
1137         default:
1138                 /*
1139                  * link already aborted, do nothing.
1140                  */
1141                 break;
1142         }
1143         atomic_clear_int(&mtx->mtx_lock, nlock);
1144         --td->td_critcount;
1145 }