serialize: Optimize atomic_intr_cond_{enter,try,exit}()
authorSepherosa Ziehau <sephe@dragonflybsd.org>
Fri, 3 Jan 2014 12:10:32 +0000 (20:10 +0800)
committerSepherosa Ziehau <sephe@dragonflybsd.org>
Tue, 7 Jan 2014 06:58:50 +0000 (14:58 +0800)
Use counter (30bits) of __atomic_intr_t as wait counter instead of request
counter:
- This avoids counter updates in atomic_intr_cond_try().
- Move counter decrement from atomic_intr_cond_exit() to
  atomic_intr_cond_enter().
- Try obtaining intr_cond first in atomic_intr_cond_enter().  If the try
  failed, counter would be incremented then.

This reduces the number of locked bus cycle instructions.
- For "try ok/exit" sequence: 4 -> 2.
- For "try fail": 3 -> 1.
- For uncontended "enter/exit" sequence: 3 -> 2.

For contended "enter/exit" sequence, this increases the number of locked
bus cycle instructions from 3 to 4.  Compared with the cost of the sleep,
this should be relatively cheap.

Tested on 8 HT (i7-3770) box, using kq_accept_server/kq_connect_client:
- 4/4 TX/RX rings device (BCM5719, using MSI-X), slight improvement.
- 8/8 TX/RX rings device (Intel 82580, using MSI-X), slight improvement.
- 1/2 TX/RX rings device (Intel 82599, using MSI), no observable
  improvement.

sys/cpu/i386/include/atomic.h
sys/cpu/x86_64/include/atomic.h

index 341d2de..b444d04 100644 (file)
@@ -319,12 +319,15 @@ static __inline
 void
 atomic_intr_cond_enter(__atomic_intr_t *p, void (*func)(void *), void *arg)
 {
-       __asm __volatile(MPLOCKED "incl %0; " \
+       __asm __volatile(MPLOCKED "btsl $31,%0; jnc 3f; " \
+                        MPLOCKED "incl %0; " \
                         "1: ;" \
                         MPLOCKED "btsl $31,%0; jnc 2f; " \
                         "pushl %2; call *%1; addl $4,%%esp; " \
                         "jmp 1b; " \
                         "2: ;" \
+                        MPLOCKED "decl %0; " \
+                        "3: ;" \
                         : "+m" (*p) \
                         : "r"(func), "m"(arg) \
                         : "ax", "cx", "dx");
@@ -340,11 +343,8 @@ atomic_intr_cond_try(__atomic_intr_t *p)
 {
        int ret;
 
-       __asm __volatile(MPLOCKED "incl %0; "                   \
-                        "1: ;"                                 \
-                        "subl %%eax,%%eax; "                   \
+       __asm __volatile("subl %%eax,%%eax; "                   \
                         MPLOCKED "btsl $31,%0; jnc 2f; "       \
-                        MPLOCKED "decl %0; "                   \
                         "movl $1,%%eax;"                       \
                         "2: ;"
                         : "+m" (*p), "=&a"(ret)
@@ -364,9 +364,8 @@ static __inline
 void
 atomic_intr_cond_exit(__atomic_intr_t *p, void (*func)(void *), void *arg)
 {
-       __asm __volatile(MPLOCKED "decl %0; " \
-                       MPLOCKED "btrl $31,%0; " \
-                       "testl $0x3FFFFFFF,%0; jz 1f; " \
+       __asm __volatile(MPLOCKED "btrl $31,%0; " \
+                        "testl $0x3FFFFFFF,%0; jz 1f; " \
                         "pushl %2; call *%1; addl $4,%%esp; " \
                         "1: ;" \
                         : "+m" (*p) \
index 316ee24..8f62d08 100644 (file)
@@ -342,12 +342,15 @@ static __inline
 void
 atomic_intr_cond_enter(__atomic_intr_t *p, void (*func)(void *), void *arg)
 {
-       __asm __volatile(MPLOCKED "incl %0; " \
+       __asm __volatile(MPLOCKED "btsl $31,%0; jnc 3f; " \
+                        MPLOCKED "incl %0; " \
                         "1: ;" \
                         MPLOCKED "btsl $31,%0; jnc 2f; " \
                         "movq %2,%%rdi; call *%1; " \
                         "jmp 1b; " \
                         "2: ;" \
+                        MPLOCKED "decl %0; " \
+                        "3: ;" \
                         : "+m" (*p) \
                         : "r"(func), "m"(arg) \
                         : "ax", "cx", "dx", "rsi", "rdi", "r8", "r9", "r10", "r11");
@@ -364,11 +367,8 @@ atomic_intr_cond_try(__atomic_intr_t *p)
 {
        int ret;
 
-       __asm __volatile(MPLOCKED "incl %0; "                   \
-                        "1: ;"                                 \
-                        "subl %%eax,%%eax; "                   \
+       __asm __volatile("subl %%eax,%%eax; "                   \
                         MPLOCKED "btsl $31,%0; jnc 2f; "       \
-                        MPLOCKED "decl %0; "                   \
                         "movl $1,%%eax;"                       \
                         "2: ;"
                         : "+m" (*p), "=&a"(ret)
@@ -388,9 +388,8 @@ static __inline
 void
 atomic_intr_cond_exit(__atomic_intr_t *p, void (*func)(void *), void *arg)
 {
-       __asm __volatile(MPLOCKED "decl %0; " \
-                       MPLOCKED "btrl $31,%0; " \
-                       "testl $0x3FFFFFFF,%0; jz 1f; " \
+       __asm __volatile(MPLOCKED "btrl $31,%0; " \
+                        "testl $0x3FFFFFFF,%0; jz 1f; " \
                         "movq %2,%%rdi; call *%1; " \
                         "1: ;" \
                         : "+m" (*p) \