From 24befe94b31f6491cdca91f4f4f87bbd74787db1 Mon Sep 17 00:00:00 2001
From: Sepherosa Ziehau
Date: Fri, 3 Jan 2014 20:10:32 +0800
Subject: [PATCH] serialize: Optimize atomic_intr_cond_{enter,try,exit}()

Use the counter (30 bits) of __atomic_intr_t as a wait counter instead
of a request counter:
- This avoids counter updates in atomic_intr_cond_try().
- Move the counter decrement from atomic_intr_cond_exit() to
  atomic_intr_cond_enter().
- Try obtaining the intr_cond first in atomic_intr_cond_enter(); the
  counter is incremented only if the try fails.

This reduces the number of locked bus cycle instructions:
- For the "try ok/exit" sequence: 4 -> 2.
- For the "try fail" case: 3 -> 1.
- For the uncontended "enter/exit" sequence: 3 -> 2.

For the contended "enter/exit" sequence, this increases the number of
locked bus cycle instructions from 3 to 4.  Compared with the cost of
the sleep that follows, this should be relatively cheap.

Tested on an 8 HT (i7-3770) box, using kq_accept_server/kq_connect_client:
- 4/4 TX/RX rings device (BCM5719, using MSI-X), slight improvement.
- 8/8 TX/RX rings device (Intel 82580, using MSI-X), slight improvement.
- 1/2 TX/RX rings device (Intel 82599, using MSI), no observable
  improvement.
---
 sys/cpu/i386/include/atomic.h   | 15 +++++++--------
 sys/cpu/x86_64/include/atomic.h | 15 +++++++--------
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/sys/cpu/i386/include/atomic.h b/sys/cpu/i386/include/atomic.h
index 341d2de89b..b444d045ae 100644
--- a/sys/cpu/i386/include/atomic.h
+++ b/sys/cpu/i386/include/atomic.h
@@ -319,12 +319,15 @@ static __inline
 void
 atomic_intr_cond_enter(__atomic_intr_t *p, void (*func)(void *), void *arg)
 {
-	__asm __volatile(MPLOCKED "incl %0; " \
+	__asm __volatile(MPLOCKED "btsl $31,%0; jnc 3f; " \
+			 MPLOCKED "incl %0; " \
 			 "1: ;" \
 			 MPLOCKED "btsl $31,%0; jnc 2f; " \
 			 "pushl %2; call *%1; addl $4,%%esp; " \
 			 "jmp 1b; " \
 			 "2: ;" \
+			 MPLOCKED "decl %0; " \
+			 "3: ;" \
 			 : "+m" (*p) \
 			 : "r"(func), "m"(arg) \
 			 : "ax", "cx", "dx");
@@ -340,11 +343,8 @@ atomic_intr_cond_try(__atomic_intr_t *p)
 {
 	int ret;
 
-	__asm __volatile(MPLOCKED "incl %0; " \
-			 "1: ;" \
-			 "subl %%eax,%%eax; " \
+	__asm __volatile("subl %%eax,%%eax; " \
 			 MPLOCKED "btsl $31,%0; jnc 2f; " \
-			 MPLOCKED "decl %0; " \
 			 "movl $1,%%eax;" \
 			 "2: ;"
 			 : "+m" (*p), "=&a"(ret)
@@ -364,9 +364,8 @@ static __inline
 void
 atomic_intr_cond_exit(__atomic_intr_t *p, void (*func)(void *), void *arg)
 {
-	__asm __volatile(MPLOCKED "decl %0; " \
-			 MPLOCKED "btrl $31,%0; " \
-			 "testl $0x3FFFFFFF,%0; jz 1f; " \
+	__asm __volatile(MPLOCKED "btrl $31,%0; " \
+			 "testl $0x3FFFFFFF,%0; jz 1f; " \
 			 "pushl %2; call *%1; addl $4,%%esp; " \
 			 "1: ;" \
 			 : "+m" (*p) \
diff --git a/sys/cpu/x86_64/include/atomic.h b/sys/cpu/x86_64/include/atomic.h
index 316ee24dff..8f62d081b3 100644
--- a/sys/cpu/x86_64/include/atomic.h
+++ b/sys/cpu/x86_64/include/atomic.h
@@ -342,12 +342,15 @@ static __inline
 void
 atomic_intr_cond_enter(__atomic_intr_t *p, void (*func)(void *), void *arg)
 {
-	__asm __volatile(MPLOCKED "incl %0; " \
+	__asm __volatile(MPLOCKED "btsl $31,%0; jnc 3f; " \
+			 MPLOCKED "incl %0; " \
 			 "1: ;" \
 			 MPLOCKED "btsl $31,%0; jnc 2f; " \
 			 "movq %2,%%rdi; call *%1; " \
 			 "jmp 1b; " \
 			 "2: ;" \
+			 MPLOCKED "decl %0; " \
+			 "3: ;" \
 			 : "+m" (*p) \
 			 : "r"(func), "m"(arg) \
 			 : "ax", "cx", "dx", "rsi", "rdi", "r8", "r9", "r10", "r11");
@@ -364,11 +367,8 @@ atomic_intr_cond_try(__atomic_intr_t *p)
 {
 	int ret;
 
-	__asm __volatile(MPLOCKED "incl %0; " \
-			 "1: ;" \
-			 "subl %%eax,%%eax; " \
+	__asm __volatile("subl %%eax,%%eax; " \
 			 MPLOCKED "btsl $31,%0; jnc 2f; " \
-			 MPLOCKED "decl %0; " \
 			 "movl $1,%%eax;" \
 			 "2: ;"
 			 : "+m" (*p), "=&a"(ret)
@@ -388,9 +388,8 @@ static __inline
 void
 atomic_intr_cond_exit(__atomic_intr_t *p, void (*func)(void *), void *arg)
 {
-	__asm __volatile(MPLOCKED "decl %0; " \
-			 MPLOCKED "btrl $31,%0; " \
-			 "testl $0x3FFFFFFF,%0; jz 1f; " \
+	__asm __volatile(MPLOCKED "btrl $31,%0; " \
+			 "testl $0x3FFFFFFF,%0; jz 1f; " \
 			 "movq %2,%%rdi; call *%1; " \
 			 "1: ;" \
 			 : "+m" (*p) \
-- 
2.41.0
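
For reference, below is a minimal C-level sketch of what the optimized
fast paths do, written with GCC's __atomic builtins instead of the
hand-coded locked instructions.  The sketch_* names and the standalone
typedef are illustrative only, not part of the patch; the real functions
are the inline-assembly versions above.  Bit 31 is the condition bit and
the low 30 bits (mask 0x3FFFFFFF) form the wait counter, as in the patch.

#include <stdint.h>

#define COND_BIT	(1U << 31)	/* "condition held" bit */
#define WAIT_MASK	0x3FFFFFFFU	/* 30-bit wait counter */

typedef volatile uint32_t sketch_intr_t;	/* stand-in for __atomic_intr_t */

/* enter: try the condition bit first; touch the wait counter only on
 * contention, mirroring the new btsl-before-incl ordering. */
static inline void
sketch_cond_enter(sketch_intr_t *p, void (*func)(void *), void *arg)
{
	if (!(__atomic_fetch_or(p, COND_BIT, __ATOMIC_SEQ_CST) & COND_BIT))
		return;				/* got it on the first try */
	__atomic_fetch_add(p, 1, __ATOMIC_SEQ_CST);	/* become a waiter */
	while (__atomic_fetch_or(p, COND_BIT, __ATOMIC_SEQ_CST) & COND_BIT)
		func(arg);			/* typically sleeps */
	__atomic_fetch_sub(p, 1, __ATOMIC_SEQ_CST);	/* no longer waiting */
}

/* try: a single locked op, no wait-counter traffic at all.
 * Returns 0 on success, non-zero if the condition is already held,
 * matching the asm's eax convention. */
static inline int
sketch_cond_try(sketch_intr_t *p)
{
	return ((__atomic_fetch_or(p, COND_BIT, __ATOMIC_SEQ_CST) &
	    COND_BIT) != 0);
}

/* exit: release the condition bit, then notify only if the wait
 * counter shows someone is still waiting. */
static inline void
sketch_cond_exit(sketch_intr_t *p, void (*func)(void *), void *arg)
{
	__atomic_fetch_and(p, ~COND_BIT, __ATOMIC_SEQ_CST);
	if (*p & WAIT_MASK)
		func(arg);			/* typically wakes waiters */
}

The asymmetry described in the commit message shows up directly here:
sketch_cond_try() never touches the wait counter, and the
increment/decrement pair is paid only on the contended enter path.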