From 69e6d11c18b7932f0e82b889cc25cdf746f5590a Mon Sep 17 00:00:00 2001 From: Michael Neumann Date: Wed, 15 Jul 2009 17:37:44 +0200 Subject: [PATCH 01/16] hammer expand: Layer 1 formatting (step 2/2) With this commit online HAMMER filesystem expansion is functional, but highly experimental and might destroy your filesystem. --- sys/vfs/hammer/hammer_expand.c | 73 ++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 7 deletions(-) diff --git a/sys/vfs/hammer/hammer_expand.c b/sys/vfs/hammer/hammer_expand.c index f9532c06ad..1359a54134 100644 --- a/sys/vfs/hammer/hammer_expand.c +++ b/sys/vfs/hammer/hammer_expand.c @@ -43,7 +43,7 @@ static int hammer_format_volume_header(struct hammer_mount *hmp, const char *dev_path, const char *vol_name, int vol_no, int vol_count, int64_t vol_size, int64_t boot_area_size, int64_t mem_area_size, - uint64_t *num_layer1_entries_p); + uint64_t *num_layer1_entries_p, uint64_t *layer1_free_blocks); int hammer_ioc_expand(hammer_transaction_t trans, hammer_inode_t ip, @@ -77,6 +77,9 @@ hammer_ioc_expand(hammer_transaction_t trans, hammer_inode_t ip, } uint64_t num_layer1_entries = 0; + uint64_t *layer1_free_blocks = + kmalloc(1024 * sizeof(uint64_t), M_TEMP, M_WAITOK|M_ZERO); + error = hammer_format_volume_header( hmp, expand->device_name, @@ -86,7 +89,9 @@ hammer_ioc_expand(hammer_transaction_t trans, hammer_inode_t ip, expand->vol_size, expand->boot_area_size, expand->mem_area_size, - &num_layer1_entries /* out param */); + &num_layer1_entries /* out param */, + layer1_free_blocks); + KKASSERT(num_layer1_entries < 1024); if (error) goto end; @@ -95,6 +100,7 @@ hammer_ioc_expand(hammer_transaction_t trans, hammer_inode_t ip, goto end; ++hmp->nvolumes; + hammer_sync_lock_sh(trans); hammer_lock_ex(&hmp->blkmap_lock); @@ -121,9 +127,50 @@ hammer_ioc_expand(hammer_transaction_t trans, hammer_inode_t ip, /* * Assign Layer1 entries */ + + hammer_volume_t root_volume = NULL; + hammer_blockmap_t freemap; + + freemap = &hmp->blockmap[HAMMER_ZONE_FREEMAP_INDEX]; + root_volume = hammer_get_root_volume(hmp, &error); + KKASSERT(root_volume && error == 0); + for (uint64_t i_layer1 = 0; i_layer1 < num_layer1_entries; i_layer1++) { - /* XXX */ - } + hammer_buffer_t buffer1 = NULL; + struct hammer_blockmap_layer1 *layer1; + hammer_off_t layer1_offset; + + layer1_offset = freemap->phys_offset + + (free_vol_no * 1024L) * + sizeof(struct hammer_blockmap_layer1) + i_layer1; + + layer1 = hammer_bread(hmp, layer1_offset, &error, &buffer1); + KKASSERT(layer1 != NULL && error == 0); + KKASSERT(layer1->phys_offset == HAMMER_BLOCKMAP_UNAVAIL); + + hammer_modify_buffer(trans, buffer1, layer1, sizeof(*layer1)); + bzero(layer1, sizeof(*layer1)); + layer1->phys_offset = HAMMER_ENCODE_RAW_BUFFER(free_vol_no, + i_layer1 * HAMMER_LARGEBLOCK_SIZE); + + layer1->blocks_free = layer1_free_blocks[i_layer1]; + layer1->layer1_crc = crc32(layer1, HAMMER_LAYER1_CRCSIZE); + + hammer_modify_buffer_done(buffer1); + if (buffer1) + hammer_rel_buffer(buffer1, 0); + + hammer_modify_volume_field(trans, root_volume, + vol0_stat_freebigblocks); + + root_volume->ondisk->vol0_stat_freebigblocks += + layer1_free_blocks[i_layer1]; + hmp->copy_stat_freebigblocks = + root_volume->ondisk->vol0_stat_freebigblocks; + hammer_modify_volume_done(root_volume); + } /* for */ + + hammer_rel_volume(root_volume, 0); hammer_unlock(&hmp->blkmap_lock); hammer_sync_unlock(trans); @@ -132,6 +179,8 @@ end: if (error) { kprintf("An error occured: %d\n", error); } + if (layer1_free_blocks) + kfree(layer1_free_blocks, 
M_TEMP); return (error); } @@ -139,7 +188,7 @@ static int hammer_format_volume_header(struct hammer_mount *hmp, const char *dev_path, const char *vol_name, int vol_no, int vol_count, int64_t vol_size, int64_t boot_area_size, int64_t mem_area_size, - uint64_t *num_layer1_entries_p) + uint64_t *num_layer1_entries_p, uint64_t *layer1_free_blocks) { struct vnode *devvp = NULL; struct buf *bp = NULL; @@ -264,8 +313,6 @@ hammer_format_volume_header(struct hammer_mount *hmp, const char *dev_path, ((off_end & HAMMER_BLOCKMAP_LAYER2_MASK) == 0 ? 0 : 1); *num_layer1_entries_p = num_layer1_entries; - kprintf("num_layer1_entries: %d\n", num_layer1_entries); - /* * We allocate all L2 big blocks sequentially from the start of * the volume. @@ -287,6 +334,17 @@ hammer_format_volume_header(struct hammer_mount *hmp, const char *dev_path, hammer_off_t bigblock_off = HAMMER_LARGEBLOCK_SIZE * (off / sizeof(*layer2)); + /* + * To which layer1 entry does the current layer2 + * big block belong? + * + * We need this to calculate the free bigblocks + * which is required for the layer1. + */ + uint64_t i_layer1 = HAMMER_BLOCKMAP_LAYER1_OFFSET(off) / + sizeof(struct hammer_blockmap_layer1); + KKASSERT(i_layer1 < 1024); + bzero(layer2, sizeof(*layer2)); if ((off & HAMMER_LARGEBLOCK_SIZE) == bigblock_off) { @@ -300,6 +358,7 @@ hammer_format_volume_header(struct hammer_mount *hmp, const char *dev_path, layer2->zone = 0; layer2->append_off = 0; layer2->bytes_free = HAMMER_LARGEBLOCK_SIZE; + ++layer1_free_blocks[i_layer1]; } else { layer2->zone = HAMMER_ZONE_UNAVAIL_INDEX; layer2->append_off = HAMMER_LARGEBLOCK_SIZE; -- 2.41.0 From 1f595ae445140feff81d6b3090f1492eeb0f9712 Mon Sep 17 00:00:00 2001 From: Michael Neumann Date: Wed, 15 Jul 2009 18:34:58 +0200 Subject: [PATCH 02/16] hammer expand: Also increase number of total bigblocks --- sys/vfs/hammer/hammer_expand.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/sys/vfs/hammer/hammer_expand.c b/sys/vfs/hammer/hammer_expand.c index 1359a54134..7e7f8ab929 100644 --- a/sys/vfs/hammer/hammer_expand.c +++ b/sys/vfs/hammer/hammer_expand.c @@ -160,9 +160,21 @@ hammer_ioc_expand(hammer_transaction_t trans, hammer_inode_t ip, if (buffer1) hammer_rel_buffer(buffer1, 0); + /* + * Increase the total number of bigblocks + */ hammer_modify_volume_field(trans, root_volume, - vol0_stat_freebigblocks); + vol0_stat_bigblocks); + root_volume->ondisk->vol0_stat_bigblocks += + layer1_free_blocks[i_layer1]; + hammer_modify_volume_done(root_volume); + /* + * Increase the number of free bigblocks + * (including the copy in hmp) + */ + hammer_modify_volume_field(trans, root_volume, + vol0_stat_freebigblocks); root_volume->ondisk->vol0_stat_freebigblocks += layer1_free_blocks[i_layer1]; hmp->copy_stat_freebigblocks = -- 2.41.0 From ab6109fbf5fccc452927e19ab4ed5ad0f62e4a38 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 09:44:22 -0700 Subject: [PATCH 03/16] Adjust atomic_cmpset_int/long - Faster version, fix amd64 issue. * Instead of using the condition code, just compare %eax (or %rax) against the old value. This is considerably faster than using sete/movzbl and GCC will also optimize the caller's test for both 0 and non-zero. * AMD64. long is 64 bits, use cmpxchgq (it was previously using cmpxchgl). 
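To make the calling convention concrete, here is a minimal caller-side sketch, not part of the patch: the helper name example_fetch_add_capped() is invented, and the snippet assumes the usual machine/atomic.h and machine/cpufunc.h environment. atomic_cmpset_int() returns non-zero when the swap succeeded, so the common pattern is a retry loop whose success test now compiles down to a plain comparison of the returned old value.

/*
 * Illustrative sketch only (not from the patch): a typical
 * compare-and-swap retry loop built on atomic_cmpset_int().
 */
static __inline void
example_fetch_add_capped(volatile u_int *counterp, u_int limit)
{
        u_int ov;
        u_int nv;

        for (;;) {
                ov = *counterp;
                if (ov >= limit)
                        break;                  /* already at the cap */
                nv = ov + 1;
                if (atomic_cmpset_int(counterp, ov, nv))
                        break;                  /* swap succeeded */
                cpu_pause();                    /* lost the race, retry */
        }
}
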
--- sys/cpu/amd64/include/atomic.h | 15 +++++++++------ sys/cpu/i386/include/atomic.h | 4 +--- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/sys/cpu/amd64/include/atomic.h b/sys/cpu/amd64/include/atomic.h index 25facb4812..2f8e7df095 100644 --- a/sys/cpu/amd64/include/atomic.h +++ b/sys/cpu/amd64/include/atomic.h @@ -386,19 +386,22 @@ atomic_cmpset_int(volatile u_int *_dst, u_int _old, u_int _new) int res = _old; __asm __volatile(MPLOCKED "cmpxchgl %2,%1; " \ - "setz %%al; " \ - "movzbl %%al,%0; " \ : "+a" (res), "=m" (*_dst) \ : "r" (_new), "m" (*_dst) \ : "memory"); - return res; + return (res == _old); } -static __inline int +static __inline long atomic_cmpset_long(volatile u_long *dst, u_long exp, u_long src) { - return (atomic_cmpset_int((volatile u_int *)dst, (u_int)exp, - (u_int)src)); + int res = _old; + + __asm __volatile(MPLOCKED "cmpxchgq %2,%1; " \ + : "+a" (res), "=m" (*_dst) \ + : "r" (_new), "m" (*_dst) \ + : "memory"); + return (res == _old); } /* diff --git a/sys/cpu/i386/include/atomic.h b/sys/cpu/i386/include/atomic.h index 19e6e1f969..90ee4a3630 100644 --- a/sys/cpu/i386/include/atomic.h +++ b/sys/cpu/i386/include/atomic.h @@ -363,12 +363,10 @@ atomic_cmpset_int(volatile u_int *_dst, u_int _old, u_int _new) int res = _old; __asm __volatile(MPLOCKED "cmpxchgl %2,%1; " \ - "setz %%al; " \ - "movzbl %%al,%0; " \ : "+a" (res), "=m" (*_dst) \ : "r" (_new), "m" (*_dst) \ : "memory"); - return res; + return (res == _old); } static __inline int -- 2.41.0 From 33b0b87c7f75846a43b41baa24ae0d170d110724 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 11:29:43 -0700 Subject: [PATCH 04/16] MPSAFE - Add a set of general blocking/spinnable mutex functions. These locks are intended to eventually replace lockmgr locks for most use cases. * Optimized based on in-line atomic_cmpset_int() calls with fallback call. * Recursive shared and exclusive locks. Downgrading, and non-blocking upgrading. * Interlocked wakeup flags. * Serial wakeup for exclusive waiters (i.e. optimal in the face of a large number of waiting threads). Mass-wakeup for shared waiters. * Additional entry points for spinning. * Ref-count support, separate from lock count. --- sys/conf/files | 1 + sys/kern/kern_mutex.c | 356 ++++++++++++++++++++++++++++++++++++++++++ sys/sys/mutex.h | 97 ++++++++++++ sys/sys/mutex2.h | 227 +++++++++++++++++++++++++++ 4 files changed, 681 insertions(+) create mode 100644 sys/kern/kern_mutex.c create mode 100644 sys/sys/mutex.h create mode 100644 sys/sys/mutex2.h diff --git a/sys/conf/files b/sys/conf/files index 459cd516d0..345c452fc6 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -677,6 +677,7 @@ kern/kern_usched.c standard kern/usched_bsd4.c standard kern/usched_dummy.c standard kern/kern_umtx.c standard +kern/kern_mutex.c standard kern/lwkt_thread.c standard kern/lwkt_ipiq.c standard kern/lwkt_token.c standard diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c new file mode 100644 index 0000000000..96a4f04a9c --- /dev/null +++ b/sys/kern/kern_mutex.c @@ -0,0 +1,356 @@ +/* + * Copyright (c) 2009 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Implement fast persistent locks based on atomic_cmpset_int() with + * semantics similar to lockmgr locks but faster and taking up much less + * space. Taken from HAMMER's lock implementation. + * + * These are meant to complement our LWKT tokens. Tokens are only held + * while the thread is running. Mutexes can be held across blocking + * conditions. + * + * Most of the support is in sys/mutex[2].h. We mostly provide backoff + * functions here. + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +static __int64_t mtx_contention_count; +static __int64_t mtx_collision_count; +static __int64_t mtx_wakeup_count; + +SYSCTL_QUAD(_kern, OID_AUTO, mtx_contention_count, CTLFLAG_RW, + &mtx_contention_count, 0, ""); +SYSCTL_QUAD(_kern, OID_AUTO, mtx_collision_count, CTLFLAG_RW, + &mtx_collision_count, 0, ""); +SYSCTL_QUAD(_kern, OID_AUTO, mtx_wakeup_count, CTLFLAG_RW, + &mtx_wakeup_count, 0, ""); + +/* + * Exclusive-lock a mutex, block until acquired. Recursion is allowed. + */ +void +_mtx_lock_ex(mtx_t mtx, const char *ident, int flags) +{ + u_int lock; + u_int nlock; + + for (;;) { + lock = mtx->mtx_lock; + if (lock == 0) { + nlock = MTX_EXCLUSIVE | 1; + if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { + /* mtx_owner set by caller */ + return; + } + } else if ((lock & MTX_EXCLUSIVE) && + mtx->mtx_owner == curthread) { + KKASSERT((lock & MTX_MASK) != MTX_MASK); + nlock = (lock + 1); + if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) + return; + } else { + nlock = lock | MTX_EXWANTED; + tsleep_interlock(mtx, 0); + if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { + ++mtx_contention_count; + tsleep(mtx, flags, ident, 0); + } + } + ++mtx_collision_count; + } +} + +/* + * Share-lock a mutex, block until acquired. Recursion is allowed. 
+ */ +void +_mtx_lock_sh(mtx_t mtx, const char *ident, int flags) +{ + u_int lock; + u_int nlock; + + for (;;) { + lock = mtx->mtx_lock; + if ((lock & MTX_EXCLUSIVE) == 0) { + KKASSERT((lock & MTX_MASK) != MTX_MASK); + nlock = lock + 1; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) + return; + } else { + nlock = lock | MTX_SHWANTED; + tsleep_interlock(mtx, 0); + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { + ++mtx_contention_count; + tsleep(mtx, flags, ident, 0); + } + } + ++mtx_collision_count; + } +} + +void +_mtx_spinlock_ex(mtx_t mtx) +{ + u_int lock; + u_int nlock; + int bb = 1; + int bo; + + for (;;) { + lock = mtx->mtx_lock; + if (lock == 0) { + nlock = MTX_EXCLUSIVE | 1; + if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { + /* mtx_owner set by caller */ + return; + } + } else if ((lock & MTX_EXCLUSIVE) && + mtx->mtx_owner == curthread) { + KKASSERT((lock & MTX_MASK) != MTX_MASK); + nlock = (lock + 1); + if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) + return; + } else { + /* MWAIT here */ + if (bb < 1000) + ++bb; + cpu_pause(); + for (bo = 0; bo < bb; ++bo) + ; + ++mtx_contention_count; + } + ++mtx_collision_count; + } +} + +void +_mtx_spinlock_sh(mtx_t mtx) +{ + u_int lock; + u_int nlock; + int bb = 1; + int bo; + + for (;;) { + lock = mtx->mtx_lock; + if ((lock & MTX_EXCLUSIVE) == 0) { + KKASSERT((lock & MTX_MASK) != MTX_MASK); + nlock = lock + 1; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) + return; + } else { + /* MWAIT here */ + if (bb < 1000) + ++bb; + cpu_pause(); + for (bo = 0; bo < bb; ++bo) + ; + ++mtx_contention_count; + } + ++mtx_collision_count; + } +} + +int +_mtx_lock_ex_try(mtx_t mtx) +{ + u_int lock; + u_int nlock; + int error = 0; + + for (;;) { + lock = mtx->mtx_lock; + if (lock == 0) { + nlock = MTX_EXCLUSIVE | 1; + if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { + /* mtx_owner set by caller */ + break; + } + } else if ((lock & MTX_EXCLUSIVE) && + mtx->mtx_owner == curthread) { + KKASSERT((lock & MTX_MASK) != MTX_MASK); + nlock = (lock + 1); + if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) + break; + } else { + error = EAGAIN; + break; + } + ++mtx_collision_count; + } + return (error); +} + +int +_mtx_lock_sh_try(mtx_t mtx) +{ + u_int lock; + u_int nlock; + int error = 0; + + for (;;) { + lock = mtx->mtx_lock; + if ((lock & MTX_EXCLUSIVE) == 0) { + KKASSERT((lock & MTX_MASK) != MTX_MASK); + nlock = lock + 1; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) + break; + } else { + error = EAGAIN; + break; + } + ++mtx_collision_count; + } + return (error); +} + +/* + * If the lock is held exclusively it must be owned by the caller. If the + * lock is already a shared lock this operation is a NOP. A panic will + * occur if the lock is not held either shared or exclusive. + * + * The exclusive count is converted to a shared count. + */ +void +_mtx_downgrade(mtx_t mtx) +{ + u_int lock; + u_int nlock; + + for (;;) { + lock = mtx->mtx_lock; + if ((lock & MTX_EXCLUSIVE) == 0) { + KKASSERT((lock & MTX_MASK) > 0); + break; + } + KKASSERT(mtx->mtx_owner == curthread); + nlock = lock & ~(MTX_EXCLUSIVE | MTX_SHWANTED); + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { + if (lock & MTX_SHWANTED) { + ++mtx_wakeup_count; + wakeup(mtx); + } + break; + } + ++mtx_collision_count; + } +} + +/* + * Upgrade a shared lock to an exclusive lock. The upgrade will fail if + * the shared lock has a count other then 1. Optimize the most likely case + * but note that a single cmpset can fail due to WANTED races. 
+ * + * If the lock is held exclusively it must be owned by the caller and + * this function will simply return without doing anything. A panic will + * occur if the lock is held exclusively by someone other then the caller. + * + * Returns 0 on success, EDEADLK on failure. + */ +int +_mtx_upgrade_try(mtx_t mtx) +{ + u_int lock; + u_int nlock; + int error = 0; + + for (;;) { + lock = mtx->mtx_lock; + + if ((lock & ~MTX_EXWANTED) == 1) { + nlock = lock | MTX_EXCLUSIVE; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { + mtx->mtx_owner = curthread; + break; + } + } else if (lock & MTX_EXCLUSIVE) { + KKASSERT(mtx->mtx_owner == curthread); + break; + } else { + error = EDEADLK; + break; + } + ++mtx_collision_count; + } + return (error); +} + +/* + * Unlock a lock. The caller must hold the lock either shared or exclusive. + */ +void +_mtx_unlock(mtx_t mtx) +{ + u_int lock; + u_int nlock; + + for (;;) { + lock = mtx->mtx_lock; + nlock = (lock & (MTX_EXCLUSIVE | MTX_MASK)) - 1; + if (nlock == 0) { + if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0)) { + if (lock & (MTX_SHWANTED | MTX_EXWANTED)) { + ++mtx_wakeup_count; + wakeup(mtx); + } + } + } else if (nlock == MTX_EXCLUSIVE) { + mtx->mtx_owner = NULL; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0)) { + if (lock & (MTX_SHWANTED | MTX_EXWANTED)) { + ++mtx_wakeup_count; + wakeup(mtx); + } + break; + } + } else { + nlock = lock - 1; + KKASSERT((nlock & MTX_MASK) != MTX_MASK); + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) + break; + } + ++mtx_collision_count; + } +} diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h new file mode 100644 index 0000000000..82a43c914b --- /dev/null +++ b/sys/sys/mutex.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2009 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#ifndef _SYS_MUTEX_H_ +#define _SYS_MUTEX_H_ + +#if defined(_KERNEL) || defined(_KERNEL_STRUCTURES) + +#ifndef _SYS_TYPES_H_ +#include +#endif +#ifndef _MACHINE_ATOMIC_H_ +#include +#endif +#ifndef _MACHINE_CPUFUNC_H_ +#include +#endif + +/* + * The general mutex structure provides recursive shared and exclusive + * locks, downgrade, a non-blocking upgrade, and various other functions. + * + * The structure is 16-byte aligned and either 16 or 32 bytes, designed + * for 32 or 64 bit cpus. + */ +struct thread; + +typedef struct mtx { + volatile u_int mtx_lock; + int mtx_refs; + struct thread *mtx_owner; +#if LONG_BIT == 32 + int mtx_unused; +#endif +} *mtx_t; + +#define MTX_EXCLUSIVE 0x80000000 +#define MTX_SHWANTED 0x40000000 +#define MTX_EXWANTED 0x20000000 +#define MTX_MASK 0x0FFFFFFF + +#define MTX_PCATCH 0x00000001 + +#define MTX_OWNER_NONE NULL +#define MTX_OWNER_ANON (struct thread *)-2) + +#endif + +/* + * See also sys/mutex2.h + */ +#ifdef _KERNEL + +void _mtx_lock_ex(mtx_t mtx, const char *ident, int flags); +void _mtx_lock_sh(mtx_t mtx, const char *ident, int flags); +void _mtx_spinlock_ex(mtx_t mtx); +void _mtx_spinlock_sh(mtx_t mtx); +int _mtx_lock_ex_try(mtx_t mtx); +int _mtx_lock_sh_try(mtx_t mtx); +void _mtx_downgrade(mtx_t mtx); +int _mtx_upgrade_try(mtx_t mtx); +void _mtx_unlock(mtx_t mtx); + +#endif + +#endif diff --git a/sys/sys/mutex2.h b/sys/sys/mutex2.h new file mode 100644 index 0000000000..24ee26f383 --- /dev/null +++ b/sys/sys/mutex2.h @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2009 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _SYS_MUTEX2_H_ +#define _SYS_MUTEX2_H_ + +#ifndef _SYS_MUTEX_H_ +#include +#endif +#ifndef _MACHINE_ATOMIC_H_ +#include +#endif + +/* + * Initialize a new mutex, placing it in an unlocked state with no refs. 
+ */ +static __inline void +mtx_init(mtx_t mtx) +{ + mtx->mtx_lock = 0; + mtx->mtx_refs = 0; + mtx->mtx_owner = NULL; +} + +/* + * Deinitialize a mutex + */ +static __inline void +mtx_uninit(mtx_t mtx) +{ + /* empty */ +} + +/* + * Exclusive-lock a mutex, block until acquired. Recursion is allowed. + */ +static __inline void +mtx_lock_ex(mtx_t mtx, const char *ident, int flags) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 0, MTX_EXCLUSIVE | 1) == 0) + _mtx_lock_ex(mtx, ident, flags); + mtx->mtx_owner = curthread; +} + +/* + * Share-lock a mutex, block until acquired. Recursion is allowed. + */ +static __inline void +mtx_lock_sh(mtx_t mtx, const char *ident, int flags) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 0, 1) == 0) + _mtx_lock_sh(mtx, ident, flags); +} + +/* + * Exclusive-lock a mutex, spin until acquired. Recursion is allowed. + */ +static __inline void +mtx_spinlock_ex(mtx_t mtx) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 0, MTX_EXCLUSIVE | 1) == 0) + _mtx_spinlock_ex(mtx); +} + +/* + * Share-lock a mutex, spin until acquired. Recursion is allowed. + */ +static __inline void +mtx_spinlock_sh(mtx_t mtx) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 0, 1) == 0) + _mtx_spinlock_sh(mtx); +} + +/* + * Attempt to exclusive-lock a mutex, return 0 on success and + * EAGAIN on failure. + */ +static __inline int +mtx_lock_ex_try(mtx_t mtx) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 0, MTX_EXCLUSIVE | 1) == 0) + return (_mtx_lock_ex_try(mtx)); + mtx->mtx_owner = curthread; + return (0); +} + +/* + * Attempt to share-lock a mutex, return 0 on success and + * EAGAIN on failure. + */ +static __inline int +mtx_lock_sh_try(mtx_t mtx) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 0, 1) == 0) + return (_mtx_lock_sh_try(mtx)); + return (0); +} + +/* + * If the lock is held exclusively it must be owned by the caller. If the + * lock is already a shared lock this operation is a NOP. A panic will + * occur if the lock is not held either shared or exclusive. + * + * The exclusive count is converted to a shared count. + */ +static __inline void +mtx_downgrade(mtx_t mtx) +{ + mtx->mtx_owner = NULL; + if (atomic_cmpset_int(&mtx->mtx_lock, MTX_EXCLUSIVE | 1, 0) == 0) + _mtx_downgrade(mtx); +} + +/* + * Upgrade a shared lock to an exclusive lock. The upgrade will fail if + * the shared lock has a count other then 1. Optimize the most likely case + * but note that a single cmpset can fail due to WANTED races. + * + * If the lock is held exclusively it must be owned by the caller and + * this function will simply return without doing anything. A panic will + * occur if the lock is held exclusively by someone other then the caller. + * + * Returns 0 on success, EDEADLK on failure. + */ +static __inline int +mtx_upgrade_try(mtx_t mtx) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 1, MTX_EXCLUSIVE | 1)) + return(0); + return (_mtx_upgrade_try(mtx)); +} + +/* + * Optimized unlock cases. 
+ */ +static __inline void +mtx_unlock(mtx_t mtx) +{ + u_int lock = mtx->mtx_lock; + + if (lock == (MTX_EXCLUSIVE | 1)) { + mtx->mtx_owner = NULL; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0) == 0) + _mtx_unlock(mtx); + } else if (lock == 1) { + if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0) == 0) + _mtx_unlock(mtx); + } else { + _mtx_unlock(mtx); + } +} + +static __inline void +mtx_unlock_ex(mtx_t mtx) +{ + u_int lock = mtx->mtx_lock; + + if (lock == (MTX_EXCLUSIVE | 1)) { + mtx->mtx_owner = NULL; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0) == 0) + _mtx_unlock(mtx); + } else { + _mtx_unlock(mtx); + } +} + +static __inline void +mtx_unlock_sh(mtx_t mtx) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 1, 0) == 0) + _mtx_unlock(mtx); +} + +/* + * Bump the lock's ref count. This field is independent of the lock. + */ +static __inline void +mtx_hold(mtx_t mtx) +{ + atomic_add_acq_int(&mtx->mtx_refs, 1); +} + +/* + * Drop the lock's ref count. This field is independent of the lock. + * + * Returns the previous ref count, interlocked so testing against + * 1 means you won the 1->0 transition + */ +static __inline int +mtx_drop(mtx_t mtx) +{ + return (atomic_fetchadd_int(&mtx->mtx_refs, -1)); +} + +#endif -- 2.41.0 From 54d51ad1f35de8175bbd4dfbd3a4c8b224fb8e1f Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 11:37:45 -0700 Subject: [PATCH 05/16] MPSAFE - Add a set of general blocking/spinnable mutex functions. * (serial wakeup code wasn't included in last commit) --- sys/kern/kern_mutex.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index 96a4f04a9c..ccfd3e8718 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -92,10 +92,12 @@ _mtx_lock_ex(mtx_t mtx, const char *ident, int flags) return; } else { nlock = lock | MTX_EXWANTED; - tsleep_interlock(mtx, 0); + tsleep_interlock(&mtx->mtx_owner, 0); if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { ++mtx_contention_count; - tsleep(mtx, flags, ident, 0); + tsleep(&mtx->mtx_owner, flags, ident, 0); + } else { + tsleep_remove(curthread); } } ++mtx_collision_count; @@ -124,6 +126,8 @@ _mtx_lock_sh(mtx_t mtx, const char *ident, int flags) if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { ++mtx_contention_count; tsleep(mtx, flags, ident, 0); + } else { + tsleep_remove(curthread); } } ++mtx_collision_count; @@ -331,18 +335,26 @@ _mtx_unlock(mtx_t mtx) nlock = (lock & (MTX_EXCLUSIVE | MTX_MASK)) - 1; if (nlock == 0) { if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0)) { - if (lock & (MTX_SHWANTED | MTX_EXWANTED)) { + if (lock & MTX_SHWANTED) { ++mtx_wakeup_count; wakeup(mtx); } + if (lock & MTX_EXWANTED) { + ++mtx_wakeup_count; + wakeup_one(&mtx->mtx_owner); + } } } else if (nlock == MTX_EXCLUSIVE) { mtx->mtx_owner = NULL; if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0)) { - if (lock & (MTX_SHWANTED | MTX_EXWANTED)) { + if (lock & MTX_SHWANTED) { ++mtx_wakeup_count; wakeup(mtx); } + if (lock & MTX_EXWANTED) { + ++mtx_wakeup_count; + wakeup_one(&mtx->mtx_owner); + } break; } } else { -- 2.41.0 From 7355baa5bbf0684a5284f52d4d2726e00c2893d8 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 12:04:28 -0700 Subject: [PATCH 06/16] MPSAFE - mutex enhancements. * Add timeout to mtx_lock_{sh,ex} * Add quick version, mtx_lock_{sh,ex}_quick() which does not take a flags or timeout parameter. 
--- sys/kern/kern_mutex.c | 81 ++++++++++++++++++++++++++++++++++--------- sys/sys/mutex.h | 6 ++-- sys/sys/mutex2.h | 37 ++++++++++++++++---- 3 files changed, 99 insertions(+), 25 deletions(-) diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index ccfd3e8718..18f60b1f9b 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -69,69 +69,116 @@ SYSCTL_QUAD(_kern, OID_AUTO, mtx_wakeup_count, CTLFLAG_RW, /* * Exclusive-lock a mutex, block until acquired. Recursion is allowed. + * + * Returns 0 on success, or the tsleep() return code on failure. + * An error can only be returned if PCATCH is specified in the flags. */ -void -_mtx_lock_ex(mtx_t mtx, const char *ident, int flags) +static __inline int +__mtx_lock_ex(mtx_t mtx, const char *ident, int flags, int to) { u_int lock; u_int nlock; + int error; for (;;) { lock = mtx->mtx_lock; if (lock == 0) { nlock = MTX_EXCLUSIVE | 1; if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { - /* mtx_owner set by caller */ - return; + mtx->mtx_owner = curthread; + error = 0; + break; } } else if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) { KKASSERT((lock & MTX_MASK) != MTX_MASK); nlock = (lock + 1); - if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) - return; + if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { + error = 0; + break; + } } else { nlock = lock | MTX_EXWANTED; tsleep_interlock(&mtx->mtx_owner, 0); if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { + error = tsleep(&mtx->mtx_owner, flags, + ident, to); ++mtx_contention_count; - tsleep(&mtx->mtx_owner, flags, ident, 0); + if (error) { + ++mtx_wakeup_count; + wakeup_one(&mtx->mtx_owner); + break; + } } else { tsleep_remove(curthread); } } ++mtx_collision_count; } + return (error); +} + +int +_mtx_lock_ex(mtx_t mtx, const char *ident, int flags, int to) +{ + return(__mtx_lock_ex(mtx, ident, flags, to)); +} + +int +_mtx_lock_ex_quick(mtx_t mtx, const char *ident) +{ + return(__mtx_lock_ex(mtx, ident, 0, 0)); } /* * Share-lock a mutex, block until acquired. Recursion is allowed. + * + * Returns 0 on success, or the tsleep() return code on failure. + * An error can only be returned if PCATCH is specified in the flags. 
*/ -void -_mtx_lock_sh(mtx_t mtx, const char *ident, int flags) +static __inline int +__mtx_lock_sh(mtx_t mtx, const char *ident, int flags, int to) { u_int lock; u_int nlock; + int error; for (;;) { lock = mtx->mtx_lock; if ((lock & MTX_EXCLUSIVE) == 0) { KKASSERT((lock & MTX_MASK) != MTX_MASK); nlock = lock + 1; - if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) - return; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { + error = 0; + break; + } } else { nlock = lock | MTX_SHWANTED; tsleep_interlock(mtx, 0); if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { + error = tsleep(mtx, flags, ident, to); + if (error) + break; ++mtx_contention_count; - tsleep(mtx, flags, ident, 0); } else { tsleep_remove(curthread); } } ++mtx_collision_count; } + return (error); +} + +int +_mtx_lock_sh(mtx_t mtx, const char *ident, int flags, int to) +{ + return (__mtx_lock_sh(mtx, ident, flags, to)); +} + +int +_mtx_lock_sh_quick(mtx_t mtx, const char *ident) +{ + return (__mtx_lock_sh(mtx, ident, 0, 0)); } void @@ -273,8 +320,8 @@ _mtx_downgrade(mtx_t mtx) nlock = lock & ~(MTX_EXCLUSIVE | MTX_SHWANTED); if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { if (lock & MTX_SHWANTED) { - ++mtx_wakeup_count; wakeup(mtx); + ++mtx_wakeup_count; } break; } @@ -336,24 +383,24 @@ _mtx_unlock(mtx_t mtx) if (nlock == 0) { if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0)) { if (lock & MTX_SHWANTED) { - ++mtx_wakeup_count; wakeup(mtx); + ++mtx_wakeup_count; } if (lock & MTX_EXWANTED) { - ++mtx_wakeup_count; wakeup_one(&mtx->mtx_owner); + ++mtx_wakeup_count; } } } else if (nlock == MTX_EXCLUSIVE) { mtx->mtx_owner = NULL; if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0)) { if (lock & MTX_SHWANTED) { - ++mtx_wakeup_count; wakeup(mtx); + ++mtx_wakeup_count; } if (lock & MTX_EXWANTED) { - ++mtx_wakeup_count; wakeup_one(&mtx->mtx_owner); + ++mtx_wakeup_count; } break; } diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h index 82a43c914b..9c07992cbc 100644 --- a/sys/sys/mutex.h +++ b/sys/sys/mutex.h @@ -82,8 +82,10 @@ typedef struct mtx { */ #ifdef _KERNEL -void _mtx_lock_ex(mtx_t mtx, const char *ident, int flags); -void _mtx_lock_sh(mtx_t mtx, const char *ident, int flags); +int _mtx_lock_ex(mtx_t mtx, const char *ident, int flags, int to); +int _mtx_lock_sh(mtx_t mtx, const char *ident, int flags, int to); +int _mtx_lock_ex_quick(mtx_t mtx, const char *ident); +int _mtx_lock_sh_quick(mtx_t mtx, const char *ident); void _mtx_spinlock_ex(mtx_t mtx); void _mtx_spinlock_sh(mtx_t mtx); int _mtx_lock_ex_try(mtx_t mtx); diff --git a/sys/sys/mutex2.h b/sys/sys/mutex2.h index 24ee26f383..af63c259f4 100644 --- a/sys/sys/mutex2.h +++ b/sys/sys/mutex2.h @@ -64,23 +64,48 @@ mtx_uninit(mtx_t mtx) /* * Exclusive-lock a mutex, block until acquired. Recursion is allowed. + * + * Returns 0 on success, or the tsleep() return code on failure. + * An error can only be returned if PCATCH is specified in the flags. */ -static __inline void -mtx_lock_ex(mtx_t mtx, const char *ident, int flags) +static __inline int +mtx_lock_ex(mtx_t mtx, const char *ident, int flags, int to) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 0, MTX_EXCLUSIVE | 1) == 0) + return(_mtx_lock_ex(mtx, ident, flags, to)); + mtx->mtx_owner = curthread; + return(0); +} + +static __inline int +mtx_lock_ex_quick(mtx_t mtx, const char *ident) { if (atomic_cmpset_int(&mtx->mtx_lock, 0, MTX_EXCLUSIVE | 1) == 0) - _mtx_lock_ex(mtx, ident, flags); + return(_mtx_lock_ex_quick(mtx, ident)); mtx->mtx_owner = curthread; + return(0); } /* * Share-lock a mutex, block until acquired. 
Recursion is allowed. + * + * Returns 0 on success, or the tsleep() return code on failure. + * An error can only be returned if PCATCH is specified in the flags. */ -static __inline void -mtx_lock_sh(mtx_t mtx, const char *ident, int flags) +static __inline int +mtx_lock_sh(mtx_t mtx, const char *ident, int flags, int to) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 0, 1) == 0) + return(_mtx_lock_sh(mtx, ident, flags, to)); + return(0); +} + +static __inline int +mtx_lock_sh_quick(mtx_t mtx, const char *ident) { if (atomic_cmpset_int(&mtx->mtx_lock, 0, 1) == 0) - _mtx_lock_sh(mtx, ident, flags); + return(_mtx_lock_sh_quick(mtx, ident)); + return(0); } /* -- 2.41.0 From 17386740d7e37fd918f769304323d08cfde748e2 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 12:13:13 -0700 Subject: [PATCH 07/16] MPSAFE - mutexes * Add additional inline functions to test the lock state. --- sys/sys/mutex2.h | 75 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/sys/sys/mutex2.h b/sys/sys/mutex2.h index af63c259f4..e7196eb0b5 100644 --- a/sys/sys/mutex2.h +++ b/sys/sys/mutex2.h @@ -228,6 +228,81 @@ mtx_unlock_sh(mtx_t mtx) _mtx_unlock(mtx); } +/* + * Return TRUE (non-zero) if the mutex is locked shared or exclusive by + * anyone, including the owner. + */ +static __inline int +mtx_islocked(mtx_t mtx) +{ + return(mtx->mtx_lock != 0); +} + +/* + * Return TRUE (non-zero) if the mutex is locked exclusively by anyone, + * including the owner. + * + * The mutex may in an unlocked or shared lock state. + */ +static __inline int +mtx_islocked_ex(mtx_t mtx) +{ + return((mtx->mtx_lock & MTX_EXCLUSIVE) != 0); +} + +/* + * Return TRUE (non-zero) if the mutex is not locked. + */ +static __inline int +mtx_notlocked(mtx_t mtx) +{ + return(mtx->mtx_lock == 0); +} + +/* + * Return TRUE (non-zero) if the mutex is not locked exclusively. + * The mutex may in an unlocked or shared lock state. + */ +static __inline int +mtx_notlocked_ex(mtx_t mtx) +{ + return((mtx->mtx_lock & MTX_EXCLUSIVE) != 0); +} + +/* + * Return TRUE (non-zero) if the mutex is exclusively locked by + * the caller. + */ +static __inline int +mtx_owned(mtx_t mtx) +{ + return((mtx->mtx_lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread); +} + +/* + * Return TRUE (non-zero) if the mutex is not exclusively locked by + * the caller. + */ +static __inline int +mtx_notowned(mtx_t mtx) +{ + return((mtx->mtx_lock & MTX_EXCLUSIVE) == 0 || + mtx->mtx_owner != curthread); +} + +/* + * Return the shared or exclusive lock count. A return value of 0 + * indicate that the mutex is not locked. + * + * NOTE: If the mutex is held exclusively by someone other then the + * caller the lock count for the other owner is still returned. + */ +static __inline int +mtx_lockrefs(mtx_t mtx) +{ + return(mtx->mtx_lock & MTX_MASK); +} + /* * Bump the lock's ref count. This field is independent of the lock. */ -- 2.41.0 From 1a493ad9ba51b534173a869f3ecb8df58350a57a Mon Sep 17 00:00:00 2001 From: Sascha Wildner Date: Wed, 15 Jul 2009 22:23:43 +0200 Subject: [PATCH 08/16] Fix LINT build. 
--- sys/kern/vfs_aio.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c index a68802e795..2b3cea4586 100644 --- a/sys/kern/vfs_aio.c +++ b/sys/kern/vfs_aio.c @@ -1008,11 +1008,8 @@ aio_fphysio(struct aiocblist *iocb) bp = iocb->bp; error = biowait_timeout(&bp->b_bio1, "physstr", aiod_timeout); - if (error) { - if (error == EWOULDBLOCK) - return EINPROGRESS; - break; - } + if (error == EWOULDBLOCK) + return EINPROGRESS; /* Release mapping into kernel space. */ vunmapbuf(bp); -- 2.41.0 From 685ebdab61459f9a21b78b3260fa119879e5c0fc Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 19:38:39 -0700 Subject: [PATCH 09/16] MPSAFE - mutex - better exclusive lock sequencer, bug fixes, abort * Redo the exclusive lock chaining algorithm. Use an explicit link structure and directly pass ownership to the next thread waiting on an exclusive lock. * Exclusive locks can be aborted via mtx_lock_ex_link() and mtx_abort_ex_link(). * Lots of misc bug fixes. --- sys/kern/kern_mutex.c | 426 ++++++++++++++++++++++++++++++++++++++---- sys/sys/mutex.h | 27 ++- sys/sys/mutex2.h | 28 +++ 3 files changed, 435 insertions(+), 46 deletions(-) diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c index 18f60b1f9b..d7895cae20 100644 --- a/sys/kern/kern_mutex.c +++ b/sys/kern/kern_mutex.c @@ -67,6 +67,9 @@ SYSCTL_QUAD(_kern, OID_AUTO, mtx_collision_count, CTLFLAG_RW, SYSCTL_QUAD(_kern, OID_AUTO, mtx_wakeup_count, CTLFLAG_RW, &mtx_wakeup_count, 0, ""); +static void mtx_chain_link(mtx_t mtx); +static void mtx_delete_link(mtx_t mtx, mtx_link_t link); + /* * Exclusive-lock a mutex, block until acquired. Recursion is allowed. * @@ -74,7 +77,7 @@ SYSCTL_QUAD(_kern, OID_AUTO, mtx_wakeup_count, CTLFLAG_RW, * An error can only be returned if PCATCH is specified in the flags. */ static __inline int -__mtx_lock_ex(mtx_t mtx, const char *ident, int flags, int to) +__mtx_lock_ex(mtx_t mtx, mtx_link_t link, const char *ident, int flags, int to) { u_int lock; u_int nlock; @@ -92,25 +95,97 @@ __mtx_lock_ex(mtx_t mtx, const char *ident, int flags, int to) } else if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) { KKASSERT((lock & MTX_MASK) != MTX_MASK); - nlock = (lock + 1); - if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { + nlock = lock + 1; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { error = 0; break; } } else { - nlock = lock | MTX_EXWANTED; - tsleep_interlock(&mtx->mtx_owner, 0); - if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { - error = tsleep(&mtx->mtx_owner, flags, - ident, to); + /* + * Clearing MTX_EXLINK in lock causes us to loop until + * MTX_EXLINK is available. However, to avoid + * unnecessary cpu cache traffic we poll instead. + * + * Setting MTX_EXLINK in nlock causes us to loop until + * we can acquire MTX_EXLINK. + * + * Also set MTX_EXWANTED coincident with EXLINK, if + * not already set. + */ + if (lock & MTX_EXLINK) { + cpu_pause(); + ++mtx_collision_count; + continue; + } + /*lock &= ~MTX_EXLINK;*/ + nlock = lock | MTX_EXWANTED | MTX_EXLINK; + ++mycpu->gd_spinlocks_wr; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { + /* + * Check for early abort + */ + if (link->state == MTX_LINK_ABORTED) { + atomic_clear_int(&mtx->mtx_lock, + MTX_EXLINK); + --mycpu->gd_spinlocks_wr; + error = ENOLCK; + if (mtx->mtx_link == NULL) { + atomic_clear_int(&mtx->mtx_lock, + MTX_EXWANTED); + } + break; + } + + /* + * Success. Link in our structure then + * release EXLINK and sleep. 
+ */ + link->owner = curthread; + link->state = MTX_LINK_LINKED; + if (mtx->mtx_link) { + link->next = mtx->mtx_link; + link->prev = link->next->prev; + link->next->prev = link; + link->prev->next = link; + } else { + link->next = link; + link->prev = link; + mtx->mtx_link = link; + } + tsleep_interlock(link, 0); + atomic_clear_int(&mtx->mtx_lock, MTX_EXLINK); + --mycpu->gd_spinlocks_wr; + + error = tsleep(link, flags, ident, to); ++mtx_contention_count; - if (error) { - ++mtx_wakeup_count; - wakeup_one(&mtx->mtx_owner); + + /* + * Normal unlink, we should own the exclusive + * lock now. + */ + if (link->state == MTX_LINK_LINKED) + mtx_delete_link(mtx, link); + if (link->state == MTX_LINK_ACQUIRED) { + KKASSERT(mtx->mtx_owner == link->owner); + error = 0; break; } + + /* + * Aborted lock (mtx_abort_ex called). + */ + if (link->state == MTX_LINK_ABORTED) { + error = ENOLCK; + break; + } + + /* + * tsleep error, else retry. + */ + if (error) + break; } else { - tsleep_remove(curthread); + --mycpu->gd_spinlocks_wr; } } ++mtx_collision_count; @@ -118,16 +193,29 @@ __mtx_lock_ex(mtx_t mtx, const char *ident, int flags, int to) return (error); } +int +_mtx_lock_ex_link(mtx_t mtx, mtx_link_t link, + const char *ident, int flags, int to) +{ + return(__mtx_lock_ex(mtx, link, ident, flags, to)); +} + int _mtx_lock_ex(mtx_t mtx, const char *ident, int flags, int to) { - return(__mtx_lock_ex(mtx, ident, flags, to)); + struct mtx_link link; + + mtx_link_init(&link); + return(__mtx_lock_ex(mtx, &link, ident, flags, to)); } int _mtx_lock_ex_quick(mtx_t mtx, const char *ident) { - return(__mtx_lock_ex(mtx, ident, 0, 0)); + struct mtx_link link; + + mtx_link_init(&link); + return(__mtx_lock_ex(mtx, &link, ident, 0, 0)); } /* @@ -135,6 +223,9 @@ _mtx_lock_ex_quick(mtx_t mtx, const char *ident) * * Returns 0 on success, or the tsleep() return code on failure. * An error can only be returned if PCATCH is specified in the flags. + * + * NOTE: Shared locks get a mass-wakeup so if the tsleep fails we + * do not have to chain the wakeup(). 
*/ static __inline int __mtx_lock_sh(mtx_t mtx, const char *ident, int flags, int to) @@ -160,6 +251,7 @@ __mtx_lock_sh(mtx_t mtx, const char *ident, int flags, int to) if (error) break; ++mtx_contention_count; + /* retry */ } else { tsleep_remove(curthread); } @@ -194,15 +286,15 @@ _mtx_spinlock_ex(mtx_t mtx) if (lock == 0) { nlock = MTX_EXCLUSIVE | 1; if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { - /* mtx_owner set by caller */ - return; + mtx->mtx_owner = curthread; + break; } } else if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) { KKASSERT((lock & MTX_MASK) != MTX_MASK); - nlock = (lock + 1); - if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) - return; + nlock = lock + 1; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) + break; } else { /* MWAIT here */ if (bb < 1000) @@ -212,6 +304,7 @@ _mtx_spinlock_ex(mtx_t mtx) ; ++mtx_contention_count; } + cpu_pause(); ++mtx_collision_count; } } @@ -230,7 +323,7 @@ _mtx_spinlock_sh(mtx_t mtx) KKASSERT((lock & MTX_MASK) != MTX_MASK); nlock = lock + 1; if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) - return; + break; } else { /* MWAIT here */ if (bb < 1000) @@ -240,6 +333,7 @@ _mtx_spinlock_sh(mtx_t mtx) ; ++mtx_contention_count; } + cpu_pause(); ++mtx_collision_count; } } @@ -256,19 +350,20 @@ _mtx_lock_ex_try(mtx_t mtx) if (lock == 0) { nlock = MTX_EXCLUSIVE | 1; if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) { - /* mtx_owner set by caller */ + mtx->mtx_owner = curthread; break; } } else if ((lock & MTX_EXCLUSIVE) && mtx->mtx_owner == curthread) { KKASSERT((lock & MTX_MASK) != MTX_MASK); - nlock = (lock + 1); - if (atomic_cmpset_int(&mtx->mtx_lock, 0, nlock)) + nlock = lock + 1; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) break; } else { error = EAGAIN; break; } + cpu_pause(); ++mtx_collision_count; } return (error); @@ -292,6 +387,7 @@ _mtx_lock_sh_try(mtx_t mtx) error = EAGAIN; break; } + cpu_pause(); ++mtx_collision_count; } return (error); @@ -325,6 +421,7 @@ _mtx_downgrade(mtx_t mtx) } break; } + cpu_pause(); ++mtx_collision_count; } } @@ -363,6 +460,7 @@ _mtx_upgrade_try(mtx_t mtx) error = EDEADLK; break; } + cpu_pause(); ++mtx_collision_count; } return (error); @@ -370,6 +468,10 @@ _mtx_upgrade_try(mtx_t mtx) /* * Unlock a lock. The caller must hold the lock either shared or exclusive. + * + * Any release which makes the lock available when others want an exclusive + * lock causes us to chain the owner to the next exclusive lock instead of + * releasing the lock. */ void _mtx_unlock(mtx_t mtx) @@ -379,37 +481,279 @@ _mtx_unlock(mtx_t mtx) for (;;) { lock = mtx->mtx_lock; - nlock = (lock & (MTX_EXCLUSIVE | MTX_MASK)) - 1; - if (nlock == 0) { - if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0)) { - if (lock & MTX_SHWANTED) { - wakeup(mtx); - ++mtx_wakeup_count; - } - if (lock & MTX_EXWANTED) { - wakeup_one(&mtx->mtx_owner); - ++mtx_wakeup_count; - } - } - } else if (nlock == MTX_EXCLUSIVE) { + nlock = lock & ~(MTX_SHWANTED | MTX_EXLINK); + + if (nlock == 1) { + /* + * Last release, shared lock, no exclusive waiters. + */ + nlock = lock & MTX_EXLINK; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) + break; + } else if (nlock == (MTX_EXCLUSIVE | 1)) { + /* + * Last release, exclusive lock, no exclusive waiters. + * Wake up any shared waiters. 
+ */ mtx->mtx_owner = NULL; - if (atomic_cmpset_int(&mtx->mtx_lock, lock, 0)) { + nlock = lock & MTX_EXLINK; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { if (lock & MTX_SHWANTED) { wakeup(mtx); ++mtx_wakeup_count; } - if (lock & MTX_EXWANTED) { - wakeup_one(&mtx->mtx_owner); - ++mtx_wakeup_count; - } break; } + } else if (nlock == (MTX_EXWANTED | 1)) { + /* + * Last release, shared lock, with exclusive + * waiters. + * + * Wait for EXLINK to clear, then acquire it. + * We could use the cmpset for this but polling + * is better on the cpu caches. + * + * Acquire an exclusive lock leaving the lockcount + * set to 1, and get EXLINK for access to mtx_link. + */ + if (lock & MTX_EXLINK) { + cpu_pause(); + ++mtx_collision_count; + continue; + } + /*lock &= ~MTX_EXLINK;*/ + nlock |= MTX_EXLINK | MTX_EXCLUSIVE; + nlock |= (lock & MTX_SHWANTED); + ++mycpu->gd_spinlocks_wr; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { + mtx_chain_link(mtx); + --mycpu->gd_spinlocks_wr; + break; + } + --mycpu->gd_spinlocks_wr; + } else if (nlock == (MTX_EXCLUSIVE | MTX_EXWANTED | 1)) { + /* + * Last release, exclusive lock, with exclusive + * waiters. + * + * leave the exclusive lock intact and the lockcount + * set to 1, and get EXLINK for access to mtx_link. + */ + if (lock & MTX_EXLINK) { + cpu_pause(); + ++mtx_collision_count; + continue; + } + /*lock &= ~MTX_EXLINK;*/ + nlock |= MTX_EXLINK; + nlock |= (lock & MTX_SHWANTED); + ++mycpu->gd_spinlocks_wr; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { + mtx_chain_link(mtx); + --mycpu->gd_spinlocks_wr; + break; + } + --mycpu->gd_spinlocks_wr; } else { + /* + * Not the last release (shared or exclusive) + */ nlock = lock - 1; KKASSERT((nlock & MTX_MASK) != MTX_MASK); if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) break; } + cpu_pause(); + ++mtx_collision_count; + } +} + +/* + * Chain mtx_chain_link. Called with the lock held exclusively with a + * single ref count, and also with MTX_EXLINK held. + */ +static void +mtx_chain_link(mtx_t mtx) +{ + mtx_link_t link; + u_int lock; + u_int nlock; + u_int clock; /* bits we own and want to clear */ + + /* + * Chain the exclusive lock to the next link. The caller cleared + * SHWANTED so if there is no link we have to wake up any shared + * waiters. + */ + clock = MTX_EXLINK; + if ((link = mtx->mtx_link) != NULL) { + KKASSERT(link->state == MTX_LINK_LINKED); + if (link->next == link) { + mtx->mtx_link = NULL; + clock |= MTX_EXWANTED; + } else { + mtx->mtx_link = link->next; + link->next->prev = link->prev; + link->prev->next = link->next; + } + link->state = MTX_LINK_ACQUIRED; + mtx->mtx_owner = link->owner; + } else { + /* + * Chain was empty, release the exclusive lock's last count + * as well the bits shown. + */ + clock |= MTX_EXCLUSIVE | MTX_EXWANTED | MTX_SHWANTED | 1; + } + + /* + * We have to uset cmpset here to deal with MTX_SHWANTED. If + * we just clear the bits we can miss a wakeup or, worse, + * leave mtx_lock unlocked with MTX_SHWANTED still set. + */ + for (;;) { + lock = mtx->mtx_lock; + nlock = lock & ~clock; + + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) { + if (link) { + /* + * Wakeup new exclusive holder. Leave + * SHWANTED intact. + */ + wakeup(link); + } else if (lock & MTX_SHWANTED) { + /* + * Signal any shared waiters (and we also + * clear SHWANTED). + */ + mtx->mtx_owner = NULL; + wakeup(mtx); + ++mtx_wakeup_count; + } + break; + } + cpu_pause(); ++mtx_collision_count; } } + +/* + * Delete a link structure after tsleep has failed. 
This code is not + * in the critical path as most exclusive waits are chained. + */ +static +void +mtx_delete_link(mtx_t mtx, mtx_link_t link) +{ + u_int lock; + u_int nlock; + + /* + * Acquire MTX_EXLINK. + * + * Do not use cmpxchg to wait for EXLINK to clear as this might + * result in too much cpu cache traffic. + */ + ++mycpu->gd_spinlocks_wr; + for (;;) { + lock = mtx->mtx_lock; + if (lock & MTX_EXLINK) { + cpu_pause(); + ++mtx_collision_count; + continue; + } + /* lock &= ~MTX_EXLINK; */ + nlock = lock | MTX_EXLINK; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) + break; + cpu_pause(); + ++mtx_collision_count; + } + + /* + * Delete the link and release EXLINK. + */ + if (link->state == MTX_LINK_LINKED) { + if (link->next == link) { + mtx->mtx_link = NULL; + } else { + mtx->mtx_link = link->next; + link->next->prev = link->prev; + link->prev->next = link->next; + } + link->state = MTX_LINK_IDLE; + } + atomic_clear_int(&mtx->mtx_lock, MTX_EXLINK); + --mycpu->gd_spinlocks_wr; +} + +/* + * Abort a mutex locking operation, causing mtx_lock_ex_link() to + * return ENOLCK. This may be called at any time after the + * mtx_link is initialized, including both before and after the call + * to mtx_lock_ex_link(). + */ +void +mtx_abort_ex_link(mtx_t mtx, mtx_link_t link) +{ + u_int lock; + u_int nlock; + + /* + * Acquire MTX_EXLINK + */ + ++mycpu->gd_spinlocks_wr; + for (;;) { + lock = mtx->mtx_lock; + if (lock & MTX_EXLINK) { + cpu_pause(); + ++mtx_collision_count; + continue; + } + /* lock &= ~MTX_EXLINK; */ + nlock = lock | MTX_EXLINK; + if (atomic_cmpset_int(&mtx->mtx_lock, lock, nlock)) + break; + cpu_pause(); + ++mtx_collision_count; + } + + /* + * Do the abort + */ + switch(link->state) { + case MTX_LINK_IDLE: + /* + * Link not started yet + */ + link->state = MTX_LINK_ABORTED; + break; + case MTX_LINK_LINKED: + /* + * de-link, mark aborted, and wakeup the thread. + */ + if (link->next == link) { + mtx->mtx_link = NULL; + } else { + mtx->mtx_link = link->next; + link->next->prev = link->prev; + link->prev->next = link->next; + } + link->state = MTX_LINK_ABORTED; + wakeup(link); + break; + case MTX_LINK_ACQUIRED: + /* + * Too late, the lock was acquired. Let it complete. + */ + break; + default: + /* + * link already aborted, do nothing. 
+ */ + break; + } + atomic_clear_int(&mtx->mtx_lock, MTX_EXLINK); + --mycpu->gd_spinlocks_wr; +} diff --git a/sys/sys/mutex.h b/sys/sys/mutex.h index 9c07992cbc..4cc3e019c6 100644 --- a/sys/sys/mutex.h +++ b/sys/sys/mutex.h @@ -56,18 +56,28 @@ */ struct thread; -typedef struct mtx { +struct mtx_link { + struct mtx_link *next; + struct mtx_link *prev; + struct thread *owner; + int state; +}; + +typedef struct mtx_link *mtx_link_t; + +struct mtx { volatile u_int mtx_lock; int mtx_refs; struct thread *mtx_owner; -#if LONG_BIT == 32 - int mtx_unused; -#endif -} *mtx_t; + mtx_link_t mtx_link; +} __cachealign; + +typedef struct mtx *mtx_t; #define MTX_EXCLUSIVE 0x80000000 #define MTX_SHWANTED 0x40000000 #define MTX_EXWANTED 0x20000000 +#define MTX_EXLINK 0x10000000 #define MTX_MASK 0x0FFFFFFF #define MTX_PCATCH 0x00000001 @@ -75,6 +85,11 @@ typedef struct mtx { #define MTX_OWNER_NONE NULL #define MTX_OWNER_ANON (struct thread *)-2) +#define MTX_LINK_IDLE 0 +#define MTX_LINK_ABORTED -1 +#define MTX_LINK_LINKED 1 +#define MTX_LINK_ACQUIRED 2 + #endif /* @@ -82,6 +97,7 @@ typedef struct mtx { */ #ifdef _KERNEL +int _mtx_lock_ex_link(mtx_t mtx, mtx_link_t link, const char *ident, int flags, int to); int _mtx_lock_ex(mtx_t mtx, const char *ident, int flags, int to); int _mtx_lock_sh(mtx_t mtx, const char *ident, int flags, int to); int _mtx_lock_ex_quick(mtx_t mtx, const char *ident); @@ -93,6 +109,7 @@ int _mtx_lock_sh_try(mtx_t mtx); void _mtx_downgrade(mtx_t mtx); int _mtx_upgrade_try(mtx_t mtx); void _mtx_unlock(mtx_t mtx); +void mtx_abort_ex_link(mtx_t mtx, mtx_link_t link); #endif diff --git a/sys/sys/mutex2.h b/sys/sys/mutex2.h index e7196eb0b5..ae0d4782ac 100644 --- a/sys/sys/mutex2.h +++ b/sys/sys/mutex2.h @@ -51,6 +51,13 @@ mtx_init(mtx_t mtx) mtx->mtx_lock = 0; mtx->mtx_refs = 0; mtx->mtx_owner = NULL; + mtx->mtx_link = NULL; +} + +static __inline void +mtx_link_init(mtx_link_t link) +{ + link->state = MTX_LINK_IDLE; } /* @@ -62,6 +69,27 @@ mtx_uninit(mtx_t mtx) /* empty */ } +/* + * Exclusive-lock a mutex, block until acquired or aborted. Recursion + * is allowed. + * + * This version of the function allows the mtx_link to be passed in, thus + * giving the caller visibility for the link structure which is required + * when calling mtx_abort_ex_link(). + * + * The mutex may be aborted at any time while the passed link structure + * is valid. + */ +static __inline int +mtx_lock_ex_link(mtx_t mtx, struct mtx_link *link, + const char *ident, int flags, int to) +{ + if (atomic_cmpset_int(&mtx->mtx_lock, 0, MTX_EXCLUSIVE | 1) == 0) + return(_mtx_lock_ex_link(mtx, link, ident, flags, to)); + mtx->mtx_owner = curthread; + return(0); +} + /* * Exclusive-lock a mutex, block until acquired. Recursion is allowed. * -- 2.41.0 From 8684e6f9ef55e0d000f3b02a5a1e822364ee9450 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 19:41:34 -0700 Subject: [PATCH 10/16] NFS - Use mutex API, begin refactoring the state machine. * Use the mtx_*() API instead of roll-your-own locks for the send and receive locks. * Refactor nfs_request(). Break the procedure up into multiple pieces for upcoming nfsiod/nfsd work (as in: getting rid of them). The main thing here is to split off the 'setup', 'send', 'receive', and 'process reply' parts. This will make it easier to construct a kernel thread to i.e. just do the 'setup/send' part, and another to do the 'receive/reply' part. 
--- sys/vfs/nfs/nfs.h | 32 ++++- sys/vfs/nfs/nfs_socket.c | 273 ++++++++++++++++++++++++------------- sys/vfs/nfs/nfs_syscalls.c | 36 ++--- sys/vfs/nfs/nfs_vfsops.c | 3 + sys/vfs/nfs/nfsmount.h | 5 +- 5 files changed, 233 insertions(+), 116 deletions(-) diff --git a/sys/vfs/nfs/nfs.h b/sys/vfs/nfs/nfs.h index b6aea259eb..c2c8f8ce14 100644 --- a/sys/vfs/nfs/nfs.h +++ b/sys/vfs/nfs/nfs.h @@ -45,6 +45,8 @@ #include "opt_nfs.h" #endif +#include + /* * Tunable constants for nfs */ @@ -175,10 +177,10 @@ struct nfs_args { #define NFSSTA_MNTD 0x00200000 /* Mnt server for mnt point */ #define NFSSTA_DISMINPROG 0x00400000 /* Dismount in progress */ #define NFSSTA_DISMNT 0x00800000 /* Dismounted */ -#define NFSSTA_SNDLOCK 0x01000000 /* Send socket lock */ -#define NFSSTA_WANTSND 0x02000000 /* Want above */ -#define NFSSTA_RCVLOCK 0x04000000 /* Rcv socket lock */ -#define NFSSTA_WANTRCV 0x08000000 /* Want above */ +#define NFSSTA_UNUSED24 0x01000000 +#define NFSSTA_UNUSED25 0x02000000 +#define NFSSTA_UNUSED26 0x04000000 +#define NFSSTA_UNUSED27 0x08000000 #define NFSSTA_WAITAUTH 0x10000000 /* Wait for authentication */ #define NFSSTA_HASAUTH 0x20000000 /* Has authenticator */ #define NFSSTA_WANTAUTH 0x40000000 /* Wants an authenticator */ @@ -339,6 +341,7 @@ struct nlookupdata; */ struct nfsreq { TAILQ_ENTRY(nfsreq) r_chain; + struct mtx_link r_link; struct mbuf *r_mreq; struct mbuf *r_mrep; struct mbuf *r_md; @@ -353,6 +356,15 @@ struct nfsreq { u_int32_t r_procnum; /* NFS procedure number */ int r_rtt; /* RTT for rpc */ struct thread *r_td; /* Thread that did I/O system call */ + struct mbuf *r_mrest; + struct mbuf *r_mheadend; + struct mbuf **r_mrp; + struct mbuf **r_mdp; + caddr_t *r_dposp; + int r_mrest_len; + int r_failed_auth; + NFSKERBKEY_T r_key; + struct ucred *r_cred; }; /* @@ -441,7 +453,7 @@ struct nfssvc_sock { struct mbuf *ns_frag; int ns_numrec; int ns_flag; - int ns_solock; + struct mtx ns_solock; int ns_cc; int ns_reclen; int ns_numuids; @@ -623,6 +635,16 @@ int netaddr_match (int, union nethostaddr *, struct sockaddr *); int nfs_request (struct vnode *, struct mbuf *, int, struct thread *, struct ucred *, struct mbuf **, struct mbuf **, caddr_t *); +int nfs_request_setup(struct vnode *vp, struct mbuf *mrest, int procnum, + struct thread *td, struct ucred *cred, struct nfsreq **repp); +int nfs_request_auth(struct nfsreq *rep); +int nfs_request_try(struct nfsreq *rep); +int nfs_request_waitreply(struct nfsreq *rep); +int nfs_request_processreply(struct nfsreq *rep, int error); + + + + int nfs_loadattrcache (struct vnode **, struct mbuf **, caddr_t *, struct vattr *, int); int nfs_namei (struct nlookupdata *, struct ucred *, int, diff --git a/sys/vfs/nfs/nfs_socket.c b/sys/vfs/nfs/nfs_socket.c index 1b620c732b..1cc8779521 100644 --- a/sys/vfs/nfs/nfs_socket.c +++ b/sys/vfs/nfs/nfs_socket.c @@ -61,7 +61,10 @@ #include #include #include +#include + #include +#include #include #include @@ -434,6 +437,7 @@ nfs_safedisconnect(struct nfsmount *nmp) bzero(&dummyreq, sizeof(dummyreq)); dummyreq.r_nmp = nmp; dummyreq.r_td = NULL; + mtx_link_init(&dummyreq.r_link); nfs_rcvlock(&dummyreq); nfs_disconnect(nmp); nfs_rcvunlock(&dummyreq); @@ -780,6 +784,7 @@ nfs_reply(struct nfsreq *myrep) * sbwait() after someone else has received my reply for me. * Also necessary for connection based protocols to avoid * race conditions during a reconnect. + * * If nfs_rcvlock() returns EALREADY, that means that * the reply has already been recieved by another * process and we can return immediately. 
In this @@ -836,10 +841,8 @@ nfsmout: */ crit_enter(); TAILQ_FOREACH(rep, &nfs_reqq, r_chain) { - if (rep->r_mrep == NULL && rxid == rep->r_xid) { - rep->r_mrep = mrep; + if (rep->r_mrep == NULL && rxid == rep->r_xid) break; - } } crit_exit(); @@ -907,6 +910,8 @@ nfsmout: NFS_SDRTT(rep) += t1; } nmp->nm_timeouts = 0; + rep->r_mrep = mrep; + mtx_abort_ex_link(&rep->r_nmp->nm_rxlock, &rep->r_link); } /* * If not matched to a request, drop it. @@ -925,6 +930,35 @@ nfsmout: } } +int +nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum, + struct thread *td, struct ucred *cred, struct mbuf **mrp, + struct mbuf **mdp, caddr_t *dposp) +{ + struct nfsreq *rep = NULL; + int error; + + error = nfs_request_setup(vp, mrest, procnum, td, cred, &rep); + if (error) + return (error); + rep->r_mrp = mrp; + rep->r_mdp = mdp; + rep->r_dposp = dposp; +needauth: + error = nfs_request_auth(rep); + if (error) + return (error); +tryagain: + error = nfs_request_try(rep); /* error ignored */ + error = nfs_request_waitreply(rep); /* pass to process */ + error = nfs_request_processreply(rep, error); + if (error == ENEEDAUTH) + goto needauth; + if (error == EAGAIN) + goto tryagain; + return (error); +} + /* * nfs_request - goes something like this * - fill in request struct @@ -936,25 +970,14 @@ nfsmout: * nb: always frees up mreq mbuf list */ int -nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum, - struct thread *td, struct ucred *cred, struct mbuf **mrp, - struct mbuf **mdp, caddr_t *dposp) +nfs_request_setup(struct vnode *vp, struct mbuf *mrest, int procnum, + struct thread *td, struct ucred *cred, + struct nfsreq **repp) { - struct mbuf *mrep, *m2; struct nfsreq *rep; - u_int32_t *tl; - int i; struct nfsmount *nmp; - struct mbuf *m, *md, *mheadend; - char nickv[RPCX_NICKVERF]; - time_t waituntil; - caddr_t dpos, cp2; - int t1, error = 0, mrest_len, auth_len, auth_type; - int trylater_delay = 15, trylater_cnt = 0, failed_auth = 0; - int verf_len, verf_type; - u_int32_t xid; - char *auth_str, *verf_str; - NFSKERBKEY_T key; /* save session key */ + struct mbuf *m; + int i; /* Reject requests while attempting a forced unmount. */ if (vp->v_mount->mnt_kern_flag & MNTK_UNMOUNTF) { @@ -974,25 +997,46 @@ nfs_request(struct vnode *vp, struct mbuf *mrest, int procnum, i += m->m_len; m = m->m_next; } - mrest_len = i; + rep->r_mrest = mrest; + rep->r_mrest_len = i; + rep->r_cred = cred; + *repp = rep; + return(0); +} + +int +nfs_request_auth(struct nfsreq *rep) +{ + struct nfsmount *nmp = rep->r_nmp; + struct mbuf *m; + char nickv[RPCX_NICKVERF]; + int error = 0, auth_len, auth_type; + int verf_len; + u_int32_t xid; + char *auth_str, *verf_str; + struct ucred *cred; + + cred = rep->r_cred; + rep->r_failed_auth = 0; /* * Get the RPC header with authorization. 
*/ -kerbauth: verf_str = auth_str = NULL; if (nmp->nm_flag & NFSMNT_KERB) { verf_str = nickv; verf_len = sizeof (nickv); auth_type = RPCAUTH_KERB4; - bzero((caddr_t)key, sizeof (key)); - if (failed_auth || nfs_getnickauth(nmp, cred, &auth_str, - &auth_len, verf_str, verf_len)) { + bzero((caddr_t)rep->r_key, sizeof(rep->r_key)); + if (rep->r_failed_auth || + nfs_getnickauth(nmp, cred, &auth_str, &auth_len, + verf_str, verf_len)) { error = nfs_getauth(nmp, rep, cred, &auth_str, - &auth_len, verf_str, &verf_len, key); + &auth_len, verf_str, &verf_len, rep->r_key); if (error) { + m_freem(rep->r_mrest); + rep->r_mrest = NULL; kfree((caddr_t)rep, M_NFSREQ); - m_freem(mrest); return (error); } } @@ -1004,8 +1048,10 @@ kerbauth: nmp->nm_numgrps : (cred->cr_ngroups - 1)) << 2) + 5 * NFSX_UNSIGNED; } - m = nfsm_rpchead(cred, nmp->nm_flag, procnum, auth_type, auth_len, - auth_str, verf_len, verf_str, mrest, mrest_len, &mheadend, &xid); + m = nfsm_rpchead(cred, nmp->nm_flag, rep->r_procnum, auth_type, + auth_len, auth_str, verf_len, verf_str, + rep->r_mrest, rep->r_mrest_len, &rep->r_mheadend, &xid); + rep->r_mrest = NULL; if (auth_str) kfree(auth_str, M_TEMP); @@ -1023,13 +1069,22 @@ kerbauth: } rep->r_mreq = m; rep->r_xid = xid; -tryagain: + return (0); +} + +int +nfs_request_try(struct nfsreq *rep) +{ + struct nfsmount *nmp = rep->r_nmp; + struct mbuf *m2; + int error; + if (nmp->nm_flag & NFSMNT_SOFT) rep->r_retry = nmp->nm_retry; else rep->r_retry = NFS_MAXREXMIT + 1; /* past clip limit */ rep->r_rtt = rep->r_rexmit = 0; - if (proct[procnum] > 0) + if (proct[rep->r_procnum] > 0) rep->r_flags = R_TIMING | R_MASKTIMER; else rep->r_flags = R_MASKTIMER; @@ -1050,6 +1105,9 @@ tryagain: */ crit_enter(); TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain); + mtx_link_init(&rep->r_link); + + error = 0; /* * If backing off another request or avoiding congestion, don't @@ -1068,7 +1126,7 @@ tryagain: if (nmp->nm_soflags & PR_CONNREQUIRED) error = nfs_sndlock(rep); if (!error) { - m2 = m_copym(m, 0, M_COPYALL, MB_WAIT); + m2 = m_copym(rep->r_mreq, 0, M_COPYALL, MB_WAIT); error = nfs_send(nmp->nm_so, nmp->nm_nam, m2, rep); if (nmp->nm_soflags & PR_CONNREQUIRED) nfs_sndunlock(rep); @@ -1083,17 +1141,27 @@ tryagain: } else { rep->r_rtt = -1; } - + if (error == EPIPE) + error = 0; /* * Let the timer do what it will with the request, then * wait for the reply from our send or the timer's. */ - if (!error || error == EPIPE) { + if (error == 0) rep->r_flags &= ~R_MASKTIMER; - crit_exit(); - error = nfs_reply(rep); - crit_enter(); - } + crit_exit(); + return (error); +} + +int +nfs_request_waitreply(struct nfsreq *rep) +{ + struct nfsmount *nmp = rep->r_nmp; + int error; + + + error = nfs_reply(rep); + crit_enter(); /* * RPC done, unlink the request, but don't rip it out from under @@ -1114,6 +1182,29 @@ tryagain: } crit_exit(); + return (error); +} + +/* + * Process reply with error returned from nfs_requet_waitreply(). + * + * Returns EAGAIN if it wants us to loop up to nfs_request_try() again. + * Returns ENEEDAUTH if it wants us to loop up to nfs_request_auth() again. + */ +int +nfs_request_processreply(struct nfsreq *rep, int error) +{ + struct nfsmount *nmp = rep->r_nmp; + time_t waituntil; + caddr_t dpos, cp2; + struct mbuf *mrep; + struct mbuf *md; + u_int32_t *tl; + int trylater_delay = 15, trylater_cnt = 0; + int verf_type; + int t1; + int i; + /* * If there was a successful reply and a tprintf msg. * tprintf a response. 
@@ -1135,19 +1226,22 @@ tryagain: */ nfsm_dissect(tl, u_int32_t *, 3 * NFSX_UNSIGNED); if (*tl++ == rpc_msgdenied) { - if (*tl == rpc_mismatch) + if (*tl == rpc_mismatch) { error = EOPNOTSUPP; - else if ((nmp->nm_flag & NFSMNT_KERB) && *tl++ == rpc_autherr) { - if (!failed_auth) { - failed_auth++; - mheadend->m_next = NULL; + } else if ((nmp->nm_flag & NFSMNT_KERB) && + *tl++ == rpc_autherr) { + if (!rep->r_failed_auth) { + rep->r_failed_auth++; + rep->r_mheadend->m_next = NULL; m_freem(mrep); m_freem(rep->r_mreq); - goto kerbauth; - } else + return (ENEEDAUTH); + } else { error = EAUTH; - } else + } + } else { error = EACCES; + } m_freem(mrep); m_freem(rep->r_mreq); kfree((caddr_t)rep, M_NFSREQ); @@ -1160,7 +1254,8 @@ tryagain: verf_type = fxdr_unsigned(int, *tl++); i = fxdr_unsigned(int32_t, *tl); if ((nmp->nm_flag & NFSMNT_KERB) && verf_type == RPCAUTH_KERB4) { - error = nfs_savenickauth(nmp, cred, i, key, &md, &dpos, mrep); + error = nfs_savenickauth(nmp, rep->r_cred, i, + rep->r_key, &md, &dpos, mrep); if (error) goto nfsmout; } else if (i > 0) @@ -1182,7 +1277,8 @@ tryagain: trylater_delay *= nfs_backoff[trylater_cnt]; if (trylater_cnt < 7) trylater_cnt++; - goto tryagain; + rep->r_flags &= ~R_MASKTIMER; + return (EAGAIN); /* goto tryagain */ } /* @@ -1193,6 +1289,7 @@ tryagain: * release the vnode lock if we hold it. */ if (error == ESTALE) { + struct vnode *vp = rep->r_vp; int ltype; ltype = lockstatus(&vp->v_lock, curthread); @@ -1203,9 +1300,9 @@ tryagain: lockmgr(&vp->v_lock, ltype); } if (nmp->nm_flag & NFSMNT_NFSV3) { - *mrp = mrep; - *mdp = md; - *dposp = dpos; + *rep->r_mrp = mrep; + *rep->r_mdp = md; + *rep->r_dposp = dpos; error |= NFSERR_RETERR; } else m_freem(mrep); @@ -1214,9 +1311,9 @@ tryagain: return (error); } - *mrp = mrep; - *mdp = md; - *dposp = dpos; + *rep->r_mrp = mrep; + *rep->r_mdp = md; + *rep->r_dposp = dpos; m_freem(rep->r_mreq); FREE((caddr_t)rep, M_NFSREQ); return (0); @@ -1593,7 +1690,7 @@ nfs_sigintr(struct nfsmount *nmp, struct nfsreq *rep, struct thread *td) int nfs_sndlock(struct nfsreq *rep) { - int *statep = &rep->r_nmp->nm_state; + mtx_t mtx = &rep->r_nmp->nm_txlock; struct thread *td; int slptimeo; int slpflag; @@ -1605,26 +1702,25 @@ nfs_sndlock(struct nfsreq *rep) if (rep->r_nmp->nm_flag & NFSMNT_INT) slpflag = PCATCH; - error = 0; - crit_enter(); - while (*statep & NFSSTA_SNDLOCK) { - *statep |= NFSSTA_WANTSND; + while ((error = mtx_lock_ex_try(mtx)) != 0) { if (nfs_sigintr(rep->r_nmp, rep, td)) { error = EINTR; break; } - tsleep((caddr_t)statep, slpflag, "nfsndlck", slptimeo); + error = mtx_lock_ex(mtx, "nfsndlck", slpflag, slptimeo); + if (error == 0) + break; if (slpflag == PCATCH) { slpflag = 0; slptimeo = 2 * hz; } } /* Always fail if our request has been cancelled. 
*/ - if ((rep->r_flags & R_SOFTTERM)) + if (rep->r_flags & R_SOFTTERM) { + if (error == 0) + mtx_unlock(mtx); error = EINTR; - if (error == 0) - *statep |= NFSSTA_SNDLOCK; - crit_exit(); + } return (error); } @@ -1634,23 +1730,15 @@ nfs_sndlock(struct nfsreq *rep) void nfs_sndunlock(struct nfsreq *rep) { - int *statep = &rep->r_nmp->nm_state; + mtx_t mtx = &rep->r_nmp->nm_txlock; - if ((*statep & NFSSTA_SNDLOCK) == 0) - panic("nfs sndunlock"); - crit_enter(); - *statep &= ~NFSSTA_SNDLOCK; - if (*statep & NFSSTA_WANTSND) { - *statep &= ~NFSSTA_WANTSND; - wakeup((caddr_t)statep); - } - crit_exit(); + mtx_unlock(mtx); } static int nfs_rcvlock(struct nfsreq *rep) { - int *statep = &rep->r_nmp->nm_state; + mtx_t mtx = &rep->r_nmp->nm_rxlock; int slpflag; int slptimeo; int error; @@ -1672,9 +1760,8 @@ nfs_rcvlock(struct nfsreq *rep) else slpflag = 0; slptimeo = 0; - error = 0; - crit_enter(); - while (*statep & NFSSTA_RCVLOCK) { + + while ((error = mtx_lock_ex_try(mtx)) != 0) { if (nfs_sigintr(rep->r_nmp, rep, rep->r_td)) { error = EINTR; break; @@ -1683,8 +1770,16 @@ nfs_rcvlock(struct nfsreq *rep) error = EALREADY; break; } - *statep |= NFSSTA_WANTRCV; - tsleep((caddr_t)statep, slpflag, "nfsrcvlk", slptimeo); + + /* + * NOTE: can return ENOLCK, but in that case rep->r_mrep + * will already be set. + */ + error = mtx_lock_ex_link(mtx, &rep->r_link, "nfsrcvlk", + slpflag, slptimeo); + if (error == 0) + break; + /* * If our reply was recieved while we were sleeping, * then just return without taking the lock to avoid a @@ -1701,10 +1796,11 @@ nfs_rcvlock(struct nfsreq *rep) } } if (error == 0) { - *statep |= NFSSTA_RCVLOCK; - rep->r_nmp->nm_rcvlock_td = curthread; /* DEBUGGING */ + if (rep->r_mrep != NULL) { + error = EALREADY; + mtx_unlock(mtx); + } } - crit_exit(); return (error); } @@ -1714,18 +1810,9 @@ nfs_rcvlock(struct nfsreq *rep) static void nfs_rcvunlock(struct nfsreq *rep) { - int *statep = &rep->r_nmp->nm_state; + mtx_t mtx = &rep->r_nmp->nm_rxlock; - if ((*statep & NFSSTA_RCVLOCK) == 0) - panic("nfs rcvunlock"); - crit_enter(); - rep->r_nmp->nm_rcvlock_td = (void *)-1; /* DEBUGGING */ - *statep &= ~NFSSTA_RCVLOCK; - if (*statep & NFSSTA_WANTRCV) { - *statep &= ~NFSSTA_WANTRCV; - wakeup((caddr_t)statep); - } - crit_exit(); + mtx_unlock(mtx); } /* diff --git a/sys/vfs/nfs/nfs_syscalls.c b/sys/vfs/nfs/nfs_syscalls.c index 95374e8115..1acd9a45f3 100644 --- a/sys/vfs/nfs/nfs_syscalls.c +++ b/sys/vfs/nfs/nfs_syscalls.c @@ -58,8 +58,11 @@ #include #include #include +#include #include +#include + #include #include #include "xdr_subs.h" @@ -403,6 +406,7 @@ nfssvc_addsock(struct file *fp, struct sockaddr *mynam, struct thread *td) slp = (struct nfssvc_sock *)kmalloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK | M_ZERO); + mtx_init(&slp->ns_solock); STAILQ_INIT(&slp->ns_rec); TAILQ_INIT(&slp->ns_uidlruhead); TAILQ_INSERT_TAIL(&nfssvc_sockhead, slp, ns_chain); @@ -768,20 +772,22 @@ nfsrv_slpderef(struct nfssvc_sock *slp) /* * Lock a socket against others. + * + * Returns 0 on failure, 1 on success. 
*/ int nfs_slplock(struct nfssvc_sock *slp, int wait) { - int *statep = &slp->ns_solock; + mtx_t mtx = &slp->ns_solock; - if (!wait && (*statep & NFSSTA_SNDLOCK)) - return(0); /* already locked, fail */ - while (*statep & NFSSTA_SNDLOCK) { - *statep |= NFSSTA_WANTSND; - (void) tsleep((caddr_t)statep, 0, "nfsslplck", 0); + if (wait) { + mtx_lock_ex(mtx, "nfsslplck", 0, 0); + return(1); + } else if (mtx_lock_ex_try(mtx) == 0) { + return(1); + } else { + return(0); } - *statep |= NFSSTA_SNDLOCK; - return (1); } /* @@ -790,15 +796,9 @@ nfs_slplock(struct nfssvc_sock *slp, int wait) void nfs_slpunlock(struct nfssvc_sock *slp) { - int *statep = &slp->ns_solock; - - if ((*statep & NFSSTA_SNDLOCK) == 0) - panic("nfs slpunlock"); - *statep &= ~NFSSTA_SNDLOCK; - if (*statep & NFSSTA_WANTSND) { - *statep &= ~NFSSTA_WANTSND; - wakeup((caddr_t)statep); - } + mtx_t mtx = &slp->ns_solock; + + mtx_unlock(mtx); } /* @@ -838,12 +838,14 @@ nfsrv_init(int terminating) #if 0 nfs_udpsock = (struct nfssvc_sock *) kmalloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK | M_ZERO); + mtx_init(&nfs_udpsock->ns_solock); STAILQ_INIT(&nfs_udpsock->ns_rec); TAILQ_INIT(&nfs_udpsock->ns_uidlruhead); TAILQ_INSERT_HEAD(&nfssvc_sockhead, nfs_udpsock, ns_chain); nfs_cltpsock = (struct nfssvc_sock *) kmalloc(sizeof (struct nfssvc_sock), M_NFSSVC, M_WAITOK | M_ZERO); + mtx_init(&nfs_cltpsock->ns_solock); STAILQ_INIT(&nfs_cltpsock->ns_rec); TAILQ_INIT(&nfs_cltpsock->ns_uidlruhead); TAILQ_INSERT_TAIL(&nfssvc_sockhead, nfs_cltpsock, ns_chain); diff --git a/sys/vfs/nfs/nfs_vfsops.c b/sys/vfs/nfs/nfs_vfsops.c index 4a05d64a60..34fa2119e9 100644 --- a/sys/vfs/nfs/nfs_vfsops.c +++ b/sys/vfs/nfs/nfs_vfsops.c @@ -64,6 +64,7 @@ #include #include +#include #include "rpcv2.h" #include "nfsproto.h" @@ -897,6 +898,8 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, } else { nmp = zalloc(nfsmount_zone); bzero((caddr_t)nmp, sizeof (struct nfsmount)); + mtx_init(&nmp->nm_rxlock); + mtx_init(&nmp->nm_txlock); TAILQ_INIT(&nmp->nm_uidlruhead); TAILQ_INIT(&nmp->nm_bioq); mp->mnt_data = (qaddr_t)nmp; diff --git a/sys/vfs/nfs/nfsmount.h b/sys/vfs/nfs/nfsmount.h index 12888da6a9..a458fa7e05 100644 --- a/sys/vfs/nfs/nfsmount.h +++ b/sys/vfs/nfs/nfsmount.h @@ -42,6 +42,8 @@ #ifndef _NFS_NFSMOUNT_H_ #define _NFS_NFSMOUNT_H_ +#include + /* * Mount structure. * One allocated on every NFS mount. @@ -50,6 +52,8 @@ struct nfsmount { int nm_flag; /* Flags for soft/hard... */ int nm_state; /* Internal state flags */ + struct mtx nm_rxlock; /* receive socket lock */ + struct mtx nm_txlock; /* send socket lock */ struct mount *nm_mountp; /* Vfs structure for this filesystem */ int nm_numgrps; /* Max. size of groupslist */ u_char nm_fh[NFSX_V3FHMAX]; /* File handle of root dir */ @@ -93,7 +97,6 @@ struct nfsmount { int nm_bioqiods; /* number of iods processing queue */ u_int64_t nm_maxfilesize; /* maximum file size */ struct ucred *nm_cred; /* 'root' credential */ - struct thread *nm_rcvlock_td; /* debugging */ }; -- 2.41.0 From e21aec5b97250bf976585792dcedc4bfc00cfc4f Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 21:14:04 -0700 Subject: [PATCH 11/16] NFS - move nfs_reqq from global to per-nfsmount. 
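In data-structure terms (sketch only, using the names the hunks below introduce): every struct nfsmount now carries its own TAILQ of outstanding nfsreq's (nm_reqq) and is itself linked onto a global nfs_mountq, so code that used to scan one flat nfs_reqq list now does a two-level walk, as the timer does:

static void
nfs_timer_scan_sketch(void)
{
	struct nfsmount *nmp;
	struct nfsreq *req;

	TAILQ_FOREACH(nmp, &nfs_mountq, nm_entry) {		/* every NFS mount */
		TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) {	/* its pending RPCs */
			KKASSERT(req->r_nmp == nmp);
			/* per-request timeout / retransmit work goes here */
		}
	}
}

Besides dropping the r_nmp comparisons, this keeps one mount's request traffic from having to walk every other mount's requests.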
--- sys/vfs/nfs/nfs.h | 7 +- sys/vfs/nfs/nfs_socket.c | 227 +++++++++++++++++++++------------------ sys/vfs/nfs/nfs_subs.c | 4 +- sys/vfs/nfs/nfs_vfsops.c | 1 + sys/vfs/nfs/nfsmount.h | 2 + 5 files changed, 127 insertions(+), 114 deletions(-) diff --git a/sys/vfs/nfs/nfs.h b/sys/vfs/nfs/nfs.h index c2c8f8ce14..2395adec20 100644 --- a/sys/vfs/nfs/nfs.h +++ b/sys/vfs/nfs/nfs.h @@ -368,11 +368,8 @@ struct nfsreq { }; /* - * Queue head for nfsreq's + * Flag values for r_flags */ -extern TAILQ_HEAD(nfs_reqq, nfsreq) nfs_reqq; - -/* Flag values for r_flags */ #define R_TIMING 0x0001 /* timing request (in mntp) */ #define R_SENT 0x0002 /* request has been sent */ #define R_SOFTTERM 0x0004 /* soft mnt, too many retries */ @@ -474,6 +471,8 @@ struct nfssvc_sock { extern TAILQ_HEAD(nfssvc_sockhead, nfssvc_sock) nfssvc_sockhead; extern int nfssvc_sockhead_flag; +extern TAILQ_HEAD(, nfsmount) nfs_mountq; + #define SLP_INIT 0x01 #define SLP_WANTINIT 0x02 diff --git a/sys/vfs/nfs/nfs_socket.c b/sys/vfs/nfs/nfs_socket.c index 1cc8779521..03a28890f2 100644 --- a/sys/vfs/nfs/nfs_socket.c +++ b/sys/vfs/nfs/nfs_socket.c @@ -167,6 +167,7 @@ static void nfs_softterm (struct nfsreq *rep); static int nfs_reconnect (struct nfsreq *rep); #ifndef NFS_NOSERVER static int nfsrv_getstream (struct nfssvc_sock *, int, int *); +static void nfs_timer_req(struct nfsreq *req); int (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd, struct nfssvc_sock *slp, @@ -389,7 +390,7 @@ bad: static int nfs_reconnect(struct nfsreq *rep) { - struct nfsreq *rp; + struct nfsreq *req; struct nfsmount *nmp = rep->r_nmp; int error; @@ -405,9 +406,9 @@ nfs_reconnect(struct nfsreq *rep) * on old socket. */ crit_enter(); - TAILQ_FOREACH(rp, &nfs_reqq, r_chain) { - if (rp->r_nmp == nmp) - rp->r_flags |= R_MUSTRESEND; + TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) { + KKASSERT(req->r_nmp == nmp); + req->r_flags |= R_MUSTRESEND; } crit_exit(); return (0); @@ -840,7 +841,7 @@ nfsmout: * section. */ crit_enter(); - TAILQ_FOREACH(rep, &nfs_reqq, r_chain) { + TAILQ_FOREACH(rep, &nmp->nm_reqq, r_chain) { if (rep->r_mrep == NULL && rxid == rep->r_xid) break; } @@ -1104,7 +1105,7 @@ nfs_request_try(struct nfsreq *rep) * that we may block in this code so there is no atomicy guarentee. */ crit_enter(); - TAILQ_INSERT_TAIL(&nfs_reqq, rep, r_chain); + TAILQ_INSERT_TAIL(&nmp->nm_reqq, rep, r_chain); mtx_link_init(&rep->r_link); error = 0; @@ -1171,7 +1172,7 @@ nfs_request_waitreply(struct nfsreq *rep) nfs_timer_raced = 1; tsleep(&nfs_timer_raced, 0, "nfstrac", 0); } - TAILQ_REMOVE(&nfs_reqq, rep, r_chain); + TAILQ_REMOVE(&nmp->nm_reqq, rep, r_chain); /* * Decrement the outstanding request count. 
@@ -1461,113 +1462,29 @@ nfs_rephead(int siz, struct nfsrv_descript *nd, struct nfssvc_sock *slp, void nfs_timer(void *arg /* never used */) { - struct nfsreq *rep; - struct mbuf *m; - struct socket *so; struct nfsmount *nmp; - int timeo; - int error; + struct nfsreq *req; #ifndef NFS_NOSERVER struct nfssvc_sock *slp; u_quad_t cur_usec; #endif /* NFS_NOSERVER */ - struct thread *td = &thread0; /* XXX for credentials, will break if sleep */ crit_enter(); - TAILQ_FOREACH(rep, &nfs_reqq, r_chain) { - nmp = rep->r_nmp; - if (rep->r_mrep || (rep->r_flags & (R_SOFTTERM|R_MASKTIMER))) - continue; - rep->r_flags |= R_LOCKED; - if (nfs_sigintr(nmp, rep, rep->r_td)) { - nfs_softterm(rep); - goto skip; - } - if (rep->r_rtt >= 0) { - rep->r_rtt++; - if (nmp->nm_flag & NFSMNT_DUMBTIMR) - timeo = nmp->nm_timeo; - else - timeo = NFS_RTO(nmp, proct[rep->r_procnum]); - if (nmp->nm_timeouts > 0) - timeo *= nfs_backoff[nmp->nm_timeouts - 1]; - if (rep->r_rtt <= timeo) - goto skip; - if (nmp->nm_timeouts < 8) - nmp->nm_timeouts++; - } - /* - * Check for server not responding - */ - if ((rep->r_flags & R_TPRINTFMSG) == 0 && - rep->r_rexmit > nmp->nm_deadthresh) { - nfs_msg(rep->r_td, - nmp->nm_mountp->mnt_stat.f_mntfromname, - "not responding"); - rep->r_flags |= R_TPRINTFMSG; - } - if (rep->r_rexmit >= rep->r_retry) { /* too many */ - nfsstats.rpctimeouts++; - nfs_softterm(rep); - goto skip; - } - if (nmp->nm_sotype != SOCK_DGRAM) { - if (++rep->r_rexmit > NFS_MAXREXMIT) - rep->r_rexmit = NFS_MAXREXMIT; - goto skip; - } - if ((so = nmp->nm_so) == NULL) - goto skip; - - /* - * If there is enough space and the window allows.. - * Resend it - * Set r_rtt to -1 in case we fail to send it now. - */ - rep->r_rtt = -1; - if (ssb_space(&so->so_snd) >= rep->r_mreq->m_pkthdr.len && - ((nmp->nm_flag & NFSMNT_DUMBTIMR) || - (rep->r_flags & R_SENT) || - nmp->nm_sent < nmp->nm_cwnd) && - (m = m_copym(rep->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){ - if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) - error = so_pru_send(so, 0, m, NULL, NULL, td); - else - error = so_pru_send(so, 0, m, nmp->nm_nam, - NULL, td); - if (error) { - if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) - so->so_error = 0; - } else if (rep->r_mrep == NULL) { - /* - * Iff first send, start timing - * else turn timing off, backoff timer - * and divide congestion window by 2. - * - * It is possible for the so_pru_send() to - * block and for us to race a reply so we - * only do this if the reply field has not - * been filled in. R_LOCKED will prevent - * the request from being ripped out from under - * us entirely. 
- */ - if (rep->r_flags & R_SENT) { - rep->r_flags &= ~R_TIMING; - if (++rep->r_rexmit > NFS_MAXREXMIT) - rep->r_rexmit = NFS_MAXREXMIT; - nmp->nm_cwnd >>= 1; - if (nmp->nm_cwnd < NFS_CWNDSCALE) - nmp->nm_cwnd = NFS_CWNDSCALE; - nfsstats.rpcretries++; - } else { - rep->r_flags |= R_SENT; - nmp->nm_sent += NFS_CWNDSCALE; - } - rep->r_rtt = 0; + TAILQ_FOREACH(nmp, &nfs_mountq, nm_entry) { + TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) { + KKASSERT(nmp == req->r_nmp); + if (req->r_mrep || + (req->r_flags & (R_SOFTTERM|R_MASKTIMER))) { + continue; } + req->r_flags |= R_LOCKED; + if (nfs_sigintr(nmp, req, req->r_td)) { + nfs_softterm(req); + } else { + nfs_timer_req(req); + } + req->r_flags &= ~R_LOCKED; } -skip: - rep->r_flags &= ~R_LOCKED; } #ifndef NFS_NOSERVER @@ -1594,6 +1511,102 @@ skip: callout_reset(&nfs_timer_handle, nfs_ticks, nfs_timer, NULL); } +static +void +nfs_timer_req(struct nfsreq *req) +{ + struct thread *td = &thread0; /* XXX for creds, will break if sleep */ + struct nfsmount *nmp = req->r_nmp; + struct mbuf *m; + struct socket *so; + int timeo; + int error; + + if (req->r_rtt >= 0) { + req->r_rtt++; + if (nmp->nm_flag & NFSMNT_DUMBTIMR) + timeo = nmp->nm_timeo; + else + timeo = NFS_RTO(nmp, proct[req->r_procnum]); + if (nmp->nm_timeouts > 0) + timeo *= nfs_backoff[nmp->nm_timeouts - 1]; + if (req->r_rtt <= timeo) + return; + if (nmp->nm_timeouts < 8) + nmp->nm_timeouts++; + } + /* + * Check for server not responding + */ + if ((req->r_flags & R_TPRINTFMSG) == 0 && + req->r_rexmit > nmp->nm_deadthresh) { + nfs_msg(req->r_td, + nmp->nm_mountp->mnt_stat.f_mntfromname, + "not responding"); + req->r_flags |= R_TPRINTFMSG; + } + if (req->r_rexmit >= req->r_retry) { /* too many */ + nfsstats.rpctimeouts++; + nfs_softterm(req); + return; + } + if (nmp->nm_sotype != SOCK_DGRAM) { + if (++req->r_rexmit > NFS_MAXREXMIT) + req->r_rexmit = NFS_MAXREXMIT; + return; + } + if ((so = nmp->nm_so) == NULL) + return; + + /* + * If there is enough space and the window allows.. + * Resend it + * Set r_rtt to -1 in case we fail to send it now. + */ + req->r_rtt = -1; + if (ssb_space(&so->so_snd) >= req->r_mreq->m_pkthdr.len && + ((nmp->nm_flag & NFSMNT_DUMBTIMR) || + (req->r_flags & R_SENT) || + nmp->nm_sent < nmp->nm_cwnd) && + (m = m_copym(req->r_mreq, 0, M_COPYALL, MB_DONTWAIT))){ + if ((nmp->nm_flag & NFSMNT_NOCONN) == 0) + error = so_pru_send(so, 0, m, NULL, NULL, td); + else + error = so_pru_send(so, 0, m, nmp->nm_nam, + NULL, td); + if (error) { + if (NFSIGNORE_SOERROR(nmp->nm_soflags, error)) + so->so_error = 0; + } else if (req->r_mrep == NULL) { + /* + * Iff first send, start timing + * else turn timing off, backoff timer + * and divide congestion window by 2. + * + * It is possible for the so_pru_send() to + * block and for us to race a reply so we + * only do this if the reply field has not + * been filled in. R_LOCKED will prevent + * the request from being ripped out from under + * us entirely. + */ + if (req->r_flags & R_SENT) { + req->r_flags &= ~R_TIMING; + if (++req->r_rexmit > NFS_MAXREXMIT) + req->r_rexmit = NFS_MAXREXMIT; + nmp->nm_cwnd >>= 1; + if (nmp->nm_cwnd < NFS_CWNDSCALE) + nmp->nm_cwnd = NFS_CWNDSCALE; + nfsstats.rpcretries++; + } else { + req->r_flags |= R_SENT; + nmp->nm_sent += NFS_CWNDSCALE; + } + req->r_rtt = 0; + } + } +} + /* * Mark all of an nfs mount's outstanding requests with R_SOFTTERM and * wait for all requests to complete. 
This is used by forced unmounts @@ -1606,7 +1619,7 @@ nfs_nmcancelreqs(struct nfsmount *nmp) int i; crit_enter(); - TAILQ_FOREACH(req, &nfs_reqq, r_chain) { + TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) { if (nmp != req->r_nmp || req->r_mrep != NULL || (req->r_flags & R_SOFTTERM)) { continue; @@ -1617,7 +1630,7 @@ nfs_nmcancelreqs(struct nfsmount *nmp) for (i = 0; i < 30; i++) { crit_enter(); - TAILQ_FOREACH(req, &nfs_reqq, r_chain) { + TAILQ_FOREACH(req, &nmp->nm_reqq, r_chain) { if (nmp == req->r_nmp) break; } diff --git a/sys/vfs/nfs/nfs_subs.c b/sys/vfs/nfs/nfs_subs.c index f80d0cf912..c4765f235a 100644 --- a/sys/vfs/nfs/nfs_subs.c +++ b/sys/vfs/nfs/nfs_subs.c @@ -101,7 +101,7 @@ enum vtype nv3tov_type[8]= { int nfs_ticks; int nfs_pbuf_freecnt = -1; /* start out unlimited */ -struct nfs_reqq nfs_reqq; +TAILQ_HEAD(, nfsmount) nfs_mountq = TAILQ_HEAD_INITIALIZER(nfs_mountq); struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; @@ -1069,8 +1069,6 @@ nfs_init(struct vfsconf *vfsp) /* * Initialize reply list and start timer */ - TAILQ_INIT(&nfs_reqq); - nfs_timer(0); nfs_prev_nfssvc_sy_narg = sysent[SYS_nfssvc].sy_narg; diff --git a/sys/vfs/nfs/nfs_vfsops.c b/sys/vfs/nfs/nfs_vfsops.c index 34fa2119e9..7a2a45e464 100644 --- a/sys/vfs/nfs/nfs_vfsops.c +++ b/sys/vfs/nfs/nfs_vfsops.c @@ -902,6 +902,7 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, mtx_init(&nmp->nm_txlock); TAILQ_INIT(&nmp->nm_uidlruhead); TAILQ_INIT(&nmp->nm_bioq); + TAILQ_INIT(&nmp->nm_reqq); mp->mnt_data = (qaddr_t)nmp; } vfs_getnewfsid(mp); diff --git a/sys/vfs/nfs/nfsmount.h b/sys/vfs/nfs/nfsmount.h index a458fa7e05..54ae6346a8 100644 --- a/sys/vfs/nfs/nfsmount.h +++ b/sys/vfs/nfs/nfsmount.h @@ -52,6 +52,7 @@ struct nfsmount { int nm_flag; /* Flags for soft/hard... */ int nm_state; /* Internal state flags */ + TAILQ_ENTRY(nfsmount) nm_entry; /* entry in nfsmountq */ struct mtx nm_rxlock; /* receive socket lock */ struct mtx nm_txlock; /* send socket lock */ struct mount *nm_mountp; /* Vfs structure for this filesystem */ @@ -92,6 +93,7 @@ struct nfsmount { TAILQ_HEAD(, nfsuid) nm_uidlruhead; /* Lists of nfsuid mappings */ LIST_HEAD(, nfsuid) nm_uidhashtbl[NFS_MUIDHASHSIZ]; TAILQ_HEAD(, bio) nm_bioq; /* async io buffer queue */ + TAILQ_HEAD(, nfsreq) nm_reqq; /* nfsreq queue */ short nm_bioqlen; /* number of buffers in queue */ short nm_bioqwant; /* process wants to add to the queue */ int nm_bioqiods; /* number of iods processing queue */ -- 2.41.0 From 7f5d7ed72f25a050b246978877ae10a3a6de9d35 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 22:25:40 -0700 Subject: [PATCH 12/16] tsleep_interlock - Fix bug with TDF_TSLEEPQ * LWKT previously disallowed the scheduling of a thread flagged TDF_TSLEEPQ because the tsleep queue was the same as the threadq. Now that they are different, it is possible for the thread to still be on a tsleepq when descheduled and later rescheduled. Remove the check, allowing the thread to be rescheduled by LWKT. This fixes issues with processes just locking up in "D"isk wait. 
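For reference, the effect of the one-line change below (sketch only, reasoning paraphrased from the message above): TDF_TSLEEPQ is dropped from the set of flags that forbids enqueueing, because a thread may now legitimately still be linked on a tsleep queue at the moment something makes it runnable again.

static __inline void
lwkt_enqueue_sketch(thread_t td)
{
	/* TDF_TSLEEPQ is intentionally no longer tested here */
	if ((td->td_flags & (TDF_RUNQ | TDF_MIGRATING | TDF_BLOCKQ)) == 0) {
		/*
		 * Safe to put td back on its per-cpu LWKT run queue even if
		 * it is still on a tsleep queue; the tsleep code owns
		 * TDF_TSLEEPQ and clears it when the thread resumes.
		 * Refusing here is what left woken processes stuck in
		 * "D"isk wait.
		 */
		/* ... place td on its priority run queue ... */
	}
}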
--- sys/kern/lwkt_thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sys/kern/lwkt_thread.c b/sys/kern/lwkt_thread.c index 8878c67ba0..3b497d872b 100644 --- a/sys/kern/lwkt_thread.c +++ b/sys/kern/lwkt_thread.c @@ -182,7 +182,7 @@ static __inline void _lwkt_enqueue(thread_t td) { - if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_TSLEEPQ|TDF_BLOCKQ)) == 0) { + if ((td->td_flags & (TDF_RUNQ|TDF_MIGRATING|TDF_BLOCKQ)) == 0) { int nq = td->td_pri & TDPRI_MASK; struct globaldata *gd = td->td_gd; -- 2.41.0 From 49433307020e928128fc0204644354153b61a4a2 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 22:37:37 -0700 Subject: [PATCH 13/16] NFS - Minor compile fix --- sys/vfs/nfs/nfs.h | 2 +- sys/vfs/nfs/nfs_subs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sys/vfs/nfs/nfs.h b/sys/vfs/nfs/nfs.h index 2395adec20..6a64f62078 100644 --- a/sys/vfs/nfs/nfs.h +++ b/sys/vfs/nfs/nfs.h @@ -471,7 +471,7 @@ struct nfssvc_sock { extern TAILQ_HEAD(nfssvc_sockhead, nfssvc_sock) nfssvc_sockhead; extern int nfssvc_sockhead_flag; -extern TAILQ_HEAD(, nfsmount) nfs_mountq; +extern TAILQ_HEAD(nfsmount_head, nfsmount) nfs_mountq; #define SLP_INIT 0x01 #define SLP_WANTINIT 0x02 diff --git a/sys/vfs/nfs/nfs_subs.c b/sys/vfs/nfs/nfs_subs.c index c4765f235a..61e39b8501 100644 --- a/sys/vfs/nfs/nfs_subs.c +++ b/sys/vfs/nfs/nfs_subs.c @@ -101,7 +101,7 @@ enum vtype nv3tov_type[8]= { int nfs_ticks; int nfs_pbuf_freecnt = -1; /* start out unlimited */ -TAILQ_HEAD(, nfsmount) nfs_mountq = TAILQ_HEAD_INITIALIZER(nfs_mountq); +struct nfsmount_head nfs_mountq = TAILQ_HEAD_INITIALIZER(nfs_mountq); struct nfssvc_sockhead nfssvc_sockhead; int nfssvc_sockhead_flag; struct nfsd_head nfsd_head; -- 2.41.0 From 52e1cf57a5de67bef07f7942efb1b3c1d31d4cc3 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 23:21:34 -0700 Subject: [PATCH 14/16] NFS - Remove old nfsiod, start adding new kernel thread infrastructure * Remove all the nfsiod junk. * Add two per-mount threads, one for reading from the socket, one for writing to the socket, in a new file nfs_iod.c * Implement a quick and dirty synchronous, single threaded nfs_doio() loop in the writer thread to test basic mechanics. 
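The shape of the new per-mount threads, condensed from the nfs_iod.c added below (sketch only; the reader side is still a stub in this patch, and error handling plus the request-queue depth check are omitted). Each thread runs a small state machine on nm_txstate/nm_rxstate: producers flip the state to PENDING and wakeup() the thread, the thread parks in WAITING when idle, and nfssvc_iod_stop() requests STOPPING and waits for DONE.

void
nfssvc_iod_writer_sketch(void *arg)
{
	struct nfsmount *nmp = arg;
	struct bio *bio;
	struct vnode *vp;

	if (nmp->nm_txstate == NFSSVC_INIT)
		nmp->nm_txstate = NFSSVC_PENDING;
	for (;;) {
		if (nmp->nm_txstate == NFSSVC_WAITING) {
			/* idle: park until nfssvc_iod_writer_wakeup() pokes us */
			tsleep(&nmp->nm_txstate, 0, "nfsidl", 0);
			continue;
		}
		if (nmp->nm_txstate != NFSSVC_PENDING)
			break;				/* STOPPING: exit the loop */
		nmp->nm_txstate = NFSSVC_WAITING;

		/* drain nm_bioq; for now one synchronous nfs_doio() per BIO */
		while ((bio = TAILQ_FIRST(&nmp->nm_bioq)) != NULL) {
			TAILQ_REMOVE(&nmp->nm_bioq, bio, bio_act);
			nmp->nm_bioqlen--;
			vp = bio->bio_driver_info;
			nfs_doio(vp, bio, NULL);
		}
	}
	nmp->nm_txthread = NULL;
	nmp->nm_txstate = NFSSVC_DONE;		/* handshake with nfssvc_iod_stop() */
	wakeup(&nmp->nm_txthread);
}

nfs_asyncio() now only queues the BIO on nm_bioq and calls nfssvc_iod_writer_wakeup(), instead of hunting for a free global nfsiod.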
--- sys/conf/files | 1 + sys/vfs/nfs/Makefile | 2 +- sys/vfs/nfs/nfs.h | 6 +- sys/vfs/nfs/nfs_bio.c | 124 ++++------------------------- sys/vfs/nfs/nfs_iod.c | 158 +++++++++++++++++++++++++++++++++++++ sys/vfs/nfs/nfs_kerb.c | 1 + sys/vfs/nfs/nfs_subs.c | 7 -- sys/vfs/nfs/nfs_syscalls.c | 84 +------------------- sys/vfs/nfs/nfs_vfsops.c | 12 ++- sys/vfs/nfs/nfs_vnops.c | 3 - sys/vfs/nfs/nfsmount.h | 17 +++- sys/vfs/nfs/nfsnode.h | 2 - 12 files changed, 206 insertions(+), 211 deletions(-) create mode 100644 sys/vfs/nfs/nfs_iod.c diff --git a/sys/conf/files b/sys/conf/files index 345c452fc6..022fa58426 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1264,6 +1264,7 @@ vfs/nfs/nfs_syscalls.c optional nfs vfs/nfs/nfs_vfsops.c optional nfs vfs/nfs/nfs_vnops.c optional nfs vfs/nfs/nfs_kerb.c optional nfs +vfs/nfs/nfs_iod.c optional nfs vfs/nfs/bootp_subr.c optional bootp vfs/nfs/nfs_mountrpc.c optional nfs vfs/nfs/krpc_subr.c optional nfs diff --git a/sys/vfs/nfs/Makefile b/sys/vfs/nfs/Makefile index 0ceb928f82..67df6283cc 100644 --- a/sys/vfs/nfs/Makefile +++ b/sys/vfs/nfs/Makefile @@ -3,7 +3,7 @@ KMOD= nfs SRCS= nfs_bio.c nfs_node.c nfs_kerb.c nfs_serv.c nfs_socket.c \ - nfs_srvcache.c nfs_subs.c nfs_syscalls.c nfs_vfsops.c \ + nfs_srvcache.c nfs_subs.c nfs_syscalls.c nfs_vfsops.c nfs_iod.c \ nfs_vnops.c opt_inet.h opt_nfs.h opt_vmpage.h opt_bootp.h \ opt_nfsroot.h NFS_INET?= 1 # 0/1 - requires INET to be configured in kernel diff --git a/sys/vfs/nfs/nfs.h b/sys/vfs/nfs/nfs.h index 6a64f62078..4c38f4232c 100644 --- a/sys/vfs/nfs/nfs.h +++ b/sys/vfs/nfs/nfs.h @@ -87,7 +87,6 @@ #define NFS_DEFRAHEAD 4 /* Def. read ahead # blocks */ #define NFS_MAXRAHEAD 32 /* Max. read ahead # blocks */ #define NFS_MAXUIDHASH 64 /* Max. # of hashed uid entries/mp */ -#define NFS_MAXASYNCDAEMON 64 /* Max. number async_daemons runnable */ #define NFS_MAXGATHERDELAY 100 /* Max. write gather delay (msec) */ #ifndef NFS_GATHERDELAY #define NFS_GATHERDELAY 20 /* Default write gather delay (msec) */ @@ -754,7 +753,10 @@ int nfs_meta_setsize (struct vnode *vp, struct thread *td, u_quad_t nsize); int nfs_clientd(struct nfsmount *nmp, struct ucred *cred, struct nfsd_cargs *ncd, int flag, caddr_t argp, struct thread *td); - +void nfssvc_iod_reader(void *arg); +void nfssvc_iod_writer(void *arg); +void nfssvc_iod_stop(struct nfsmount *nmp); +void nfssvc_iod_writer_wakeup(struct nfsmount *nmp); #endif /* _KERNEL */ diff --git a/sys/vfs/nfs/nfs_bio.c b/sys/vfs/nfs/nfs_bio.c index bbc026104d..1b572d131c 100644 --- a/sys/vfs/nfs/nfs_bio.c +++ b/sys/vfs/nfs/nfs_bio.c @@ -71,7 +71,6 @@ static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset, static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen); static void nfsiodone_sync(struct bio *bio); -extern int nfs_numasync; extern int nfs_pbuf_freecnt; extern struct nfsstats nfsstats; @@ -425,7 +424,7 @@ nfs_bioread(struct vnode *vp, struct uio *uio, int ioflag) /* * Start the read ahead(s), as required. */ - if (nfs_numasync > 0 && nmp->nm_readahead > 0) { + if (nmp->nm_readahead > 0) { for (nra = 0; nra < nmp->nm_readahead && nra < seqcount && (off_t)(lbn + 1 + nra) * biosize < np->n_size; nra++) { rabn = lbn + 1 + nra; @@ -621,7 +620,7 @@ again: * (You need the current block first, so that you have the * directory offset cookie of the next block.) 
*/ - if (nfs_numasync > 0 && nmp->nm_readahead > 0 && + if (nmp->nm_readahead > 0 && (bp->b_flags & B_INVAL) == 0 && (np->n_direofoffset == 0 || loffset + NFS_DIRBLKSIZ < np->n_direofoffset) && @@ -1200,122 +1199,29 @@ nfs_asyncio(struct vnode *vp, struct bio *bio, struct thread *td) { struct buf *bp = bio->bio_buf; struct nfsmount *nmp; - int i; - int gotiod; - int slpflag = 0; - int slptimeo = 0; - int error; - - /* - * If no async daemons then return EIO to force caller to run the rpc - * synchronously. - */ - if (nfs_numasync == 0) - return (EIO); KKASSERT(vp->v_tag == VT_NFS); nmp = VFSTONFS(vp->v_mount); /* - * Commits are usually short and sweet so lets save some cpu and - * leave the async daemons for more important rpc's (such as reads - * and writes). - */ - if (bp->b_cmd == BUF_CMD_WRITE && (bp->b_flags & B_NEEDCOMMIT) && - (nmp->nm_bioqiods > nfs_numasync / 2)) { - return(EIO); - } - -again: - if (nmp->nm_flag & NFSMNT_INT) - slpflag = PCATCH; - gotiod = FALSE; - - /* - * Find a free iod to process this request. - */ - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) - if (nfs_iodwant[i]) { - /* - * Found one, so wake it up and tell it which - * mount to process. - */ - NFS_DPF(ASYNCIO, - ("nfs_asyncio: waking iod %d for mount %p\n", - i, nmp)); - nfs_iodwant[i] = NULL; - nfs_iodmount[i] = nmp; - nmp->nm_bioqiods++; - wakeup((caddr_t)&nfs_iodwant[i]); - gotiod = TRUE; - break; - } - - /* - * If none are free, we may already have an iod working on this mount - * point. If so, it will process our request. - */ - if (!gotiod) { - if (nmp->nm_bioqiods > 0) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: %d iods are already processing mount %p\n", - nmp->nm_bioqiods, nmp)); - gotiod = TRUE; - } - } - - /* - * If we have an iod which can process the request, then queue - * the buffer. + * If no async daemons then return EIO to force caller to run the rpc + * synchronously. */ - if (gotiod) { - /* - * Ensure that the queue never grows too large. We still want - * to asynchronize so we block rather then return EIO. - */ - while (nmp->nm_bioqlen >= 2*nfs_numasync) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: waiting for mount %p queue to drain\n", nmp)); - nmp->nm_bioqwant = TRUE; - error = tsleep(&nmp->nm_bioq, slpflag, - "nfsaio", slptimeo); - if (error) { - if (nfs_sigintr(nmp, NULL, td)) - return (EINTR); - if (slpflag == PCATCH) { - slpflag = 0; - slptimeo = 2 * hz; - } - } - /* - * We might have lost our iod while sleeping, - * so check and loop if nescessary. - */ - if (nmp->nm_bioqiods == 0) { - NFS_DPF(ASYNCIO, - ("nfs_asyncio: no iods after mount %p queue was drained, looping\n", nmp)); - goto again; - } - } - BUF_KERNPROC(bp); + if (nmp->nm_rxstate > NFSSVC_PENDING) + return (EIO); - /* - * The passed bio's buffer is not necessary associated with - * the NFS vnode it is being written to. Store the NFS vnode - * in the BIO driver info. - */ - bio->bio_driver_info = vp; - TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act); - nmp->nm_bioqlen++; - return (0); - } + BUF_KERNPROC(bp); /* - * All the iods are busy on other mounts, so return EIO to - * force the caller to process the i/o synchronously. + * The passed bio's buffer is not necessary associated with + * the NFS vnode it is being written to. Store the NFS vnode + * in the BIO driver info. 
*/ - NFS_DPF(ASYNCIO, ("nfs_asyncio: no iods available, i/o is synchronous\n")); - return (EIO); + bio->bio_driver_info = vp; + TAILQ_INSERT_TAIL(&nmp->nm_bioq, bio, bio_act); + nmp->nm_bioqlen++; + nfssvc_iod_writer_wakeup(nmp); + return (0); } /* diff --git a/sys/vfs/nfs/nfs_iod.c b/sys/vfs/nfs/nfs_iod.c new file mode 100644 index 0000000000..7ec8ea5f7e --- /dev/null +++ b/sys/vfs/nfs/nfs_iod.c @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2009 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * NFSIOD operations - now built into the kernel. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + +#include "rpcv2.h" +#include "nfsproto.h" +#include "nfs.h" +#include "xdr_subs.h" +#include "nfsm_subs.h" +#include "nfsmount.h" +#include "nfsnode.h" +#include "nfsrtt.h" + +void +nfssvc_iod_reader(void *arg) +{ + struct nfsmount *nmp = arg; + + if (nmp->nm_rxstate == NFSSVC_INIT) + nmp->nm_rxstate = NFSSVC_PENDING; + for (;;) { + if (nmp->nm_rxstate == NFSSVC_WAITING) { + tsleep(&nmp->nm_rxstate, 0, "nfsidl", 0); + continue; + } + if (nmp->nm_rxstate != NFSSVC_PENDING) + break; + nmp->nm_rxstate = NFSSVC_WAITING; + +#if 0 + error = tsleep((caddr_t)&nfs_iodwant[myiod], + PCATCH, "nfsidl", 0); +#endif + } + nmp->nm_rxthread = NULL; + nmp->nm_rxstate = NFSSVC_DONE; + wakeup(&nmp->nm_rxthread); +} + +/* + * The writer sits on the send side of the client's socket and + * does both the initial processing of BIOs and also transmission + * and retransmission of nfsreq's. 
+ */ +void +nfssvc_iod_writer(void *arg) +{ + struct nfsmount *nmp = arg; + struct bio *bio; + struct vnode *vp; + + if (nmp->nm_txstate == NFSSVC_INIT) + nmp->nm_txstate = NFSSVC_PENDING; + for (;;) { + if (nmp->nm_txstate == NFSSVC_WAITING) { + tsleep(&nmp->nm_txstate, 0, "nfsidl", 0); + continue; + } + if (nmp->nm_txstate != NFSSVC_PENDING) + break; + nmp->nm_txstate = NFSSVC_WAITING; + + while (nmp->nm_bioqlen && nmp->nm_reqqlen < 32) { + bio = TAILQ_FIRST(&nmp->nm_bioq); + KKASSERT(bio); + TAILQ_REMOVE(&nmp->nm_bioq, bio, bio_act); + nmp->nm_bioqlen--; + vp = bio->bio_driver_info; + nfs_doio(vp, bio, NULL); + } + } + nmp->nm_txthread = NULL; + nmp->nm_txstate = NFSSVC_DONE; + wakeup(&nmp->nm_txthread); +} + +void +nfssvc_iod_stop(struct nfsmount *nmp) +{ + nmp->nm_txstate = NFSSVC_STOPPING; + wakeup(&nmp->nm_txstate); + while (nmp->nm_txthread) + tsleep(&nmp->nm_txthread, 0, "nfssttx", 0); + + nmp->nm_rxstate = NFSSVC_STOPPING; + wakeup(&nmp->nm_rxstate); + while (nmp->nm_rxthread) + tsleep(&nmp->nm_rxthread, 0, "nfsstrx", 0); +} + +void +nfssvc_iod_writer_wakeup(struct nfsmount *nmp) +{ + if (nmp->nm_txstate == NFSSVC_WAITING) { + nmp->nm_txstate = NFSSVC_PENDING; + wakeup(&nmp->nm_txstate); + } +} diff --git a/sys/vfs/nfs/nfs_kerb.c b/sys/vfs/nfs/nfs_kerb.c index 2ac745ff54..8efe9c2e3e 100644 --- a/sys/vfs/nfs/nfs_kerb.c +++ b/sys/vfs/nfs/nfs_kerb.c @@ -144,6 +144,7 @@ nfs_clientd(struct nfsmount *nmp, struct ucred *cred, struct nfsd_cargs *ncd, TAILQ_REMOVE(&nmp->nm_uidlruhead, nuidp, nu_lru); kfree((caddr_t)nuidp, M_NFSUID); } + nfssvc_iod_stop(nmp); nfs_free_mount(nmp); if (error == EWOULDBLOCK) error = 0; diff --git a/sys/vfs/nfs/nfs_subs.c b/sys/vfs/nfs/nfs_subs.c index 61e39b8501..0931c9fdf5 100644 --- a/sys/vfs/nfs/nfs_subs.c +++ b/sys/vfs/nfs/nfs_subs.c @@ -1032,8 +1032,6 @@ nfsm_strtmbuf(struct mbuf **mb, char **bpos, const char *cp, long siz) int nfs_init(struct vfsconf *vfsp) { - int i; - callout_init(&nfs_timer_handle); nfsmount_zone = zinit("NFSMOUNT", sizeof(struct nfsmount), 0, 0, 1); @@ -1055,11 +1053,6 @@ nfs_init(struct vfsconf *vfsp) nfs_ticks = (hz * NFS_TICKINTVL + 500) / 1000; if (nfs_ticks < 1) nfs_ticks = 1; - /* Ensure async daemons disabled */ - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) { - nfs_iodwant[i] = NULL; - nfs_iodmount[i] = NULL; - } nfs_nhinit(); /* Init the nfsnode table */ #ifndef NFS_NOSERVER nfsrv_init(0); /* Init server data structures */ diff --git a/sys/vfs/nfs/nfs_syscalls.c b/sys/vfs/nfs/nfs_syscalls.c index 1acd9a45f3..9a43465d9a 100644 --- a/sys/vfs/nfs/nfs_syscalls.c +++ b/sys/vfs/nfs/nfs_syscalls.c @@ -94,13 +94,10 @@ static int nuidhash_max = NFS_MAXUIDHASH; #ifndef NFS_NOSERVER static void nfsrv_zapsock (struct nfssvc_sock *slp); #endif -static int nfssvc_iod (struct thread *); #define TRUE 1 #define FALSE 0 -static int nfs_asyncdaemon[NFS_MAXASYNCDAEMON]; - SYSCTL_DECL(_vfs_nfs); #ifndef NFS_NOSERVER @@ -165,7 +162,7 @@ sys_nfssvc(struct nfssvc_args *uap) (void) tsleep((caddr_t)&nfssvc_sockhead, 0, "nfsd init", 0); } if (uap->flag & NFSSVC_BIOD) - error = nfssvc_iod(td); + error = ENXIO; /* no longer need nfsiod's */ #ifdef NFS_NOSERVER else error = ENXIO; @@ -885,85 +882,6 @@ nfsd_rt(int sotype, struct nfsrv_descript *nd, int cacherep) static int nfs_defect = 0; SYSCTL_INT(_vfs_nfs, OID_AUTO, defect, CTLFLAG_RW, &nfs_defect, 0, ""); -/* - * Asynchronous I/O daemons for client nfs. - * They do read-ahead and write-behind operations on the block I/O cache. - * Never returns unless it fails or gets killed. 
- */ -static int -nfssvc_iod(struct thread *td) -{ - struct bio *bio; - int i, myiod; - struct nfsmount *nmp; - int error = 0; - - /* - * Assign my position or return error if too many already running - */ - myiod = -1; - for (i = 0; i < NFS_MAXASYNCDAEMON; i++) - if (nfs_asyncdaemon[i] == 0) { - nfs_asyncdaemon[i]++; - myiod = i; - break; - } - if (myiod == -1) - return (EBUSY); - nfs_numasync++; - /* - * Just loop around doin our stuff until SIGKILL - */ - for (;;) { - while (((nmp = nfs_iodmount[myiod]) == NULL - || TAILQ_EMPTY(&nmp->nm_bioq)) - && error == 0) { - if (nmp) - nmp->nm_bioqiods--; - nfs_iodwant[myiod] = td; - nfs_iodmount[myiod] = NULL; - error = tsleep((caddr_t)&nfs_iodwant[myiod], - PCATCH, "nfsidl", 0); - } - if (error) { - nfs_asyncdaemon[myiod] = 0; - if (nmp) - nmp->nm_bioqiods--; - nfs_iodwant[myiod] = NULL; - nfs_iodmount[myiod] = NULL; - nfs_numasync--; - return (error); - } - while ((bio = TAILQ_FIRST(&nmp->nm_bioq)) != NULL) { - /* - * Take one off the front of the list. The BIO's - * block number is normalized for DEV_BSIZE. - */ - TAILQ_REMOVE(&nmp->nm_bioq, bio, bio_act); - nmp->nm_bioqlen--; - if (nmp->nm_bioqwant && nmp->nm_bioqlen <= nfs_numasync) { - nmp->nm_bioqwant = FALSE; - wakeup(&nmp->nm_bioq); - } - nfs_doio((struct vnode *)bio->bio_driver_info, bio, NULL); - - /* - * If there are more than one iod on this mount, then defect - * so that the iods can be shared out fairly between the mounts - */ - if (nfs_defect && nmp->nm_bioqiods > 1) { - NFS_DPF(ASYNCIO, - ("nfssvc_iod: iod %d defecting from mount %p\n", - myiod, nmp)); - nfs_iodmount[myiod] = NULL; - nmp->nm_bioqiods--; - break; - } - } - } -} - - /* * Get an authorization string for the uid by having the mount_nfs sitting * on this mount point porpous out of the kernel and do it. diff --git a/sys/vfs/nfs/nfs_vfsops.c b/sys/vfs/nfs/nfs_vfsops.c index 7a2a45e464..1d3eb8f3df 100644 --- a/sys/vfs/nfs/nfs_vfsops.c +++ b/sys/vfs/nfs/nfs_vfsops.c @@ -996,6 +996,14 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, */ vn_unlock(*vpp); + /* + * Start the reader and writer threads. + */ + lwkt_create(nfssvc_iod_reader, nmp, &nmp->nm_rxthread, + NULL, 0, -1, "nfsiod_rx"); + lwkt_create(nfssvc_iod_writer, nmp, &nmp->nm_txthread, + NULL, 0, -1, "nfsiod_tx"); + return (0); bad: nfs_disconnect(nmp); @@ -1050,8 +1058,10 @@ nfs_unmount(struct mount *mp, int mntflags) nfs_disconnect(nmp); FREE(nmp->nm_nam, M_SONAME); - if ((nmp->nm_flag & NFSMNT_KERB) == 0) + if ((nmp->nm_flag & NFSMNT_KERB) == 0) { + nfssvc_iod_stop(nmp); nfs_free_mount(nmp); + } return (0); } diff --git a/sys/vfs/nfs/nfs_vnops.c b/sys/vfs/nfs/nfs_vnops.c index a9fdc815fa..79eae8b292 100644 --- a/sys/vfs/nfs/nfs_vnops.c +++ b/sys/vfs/nfs/nfs_vnops.c @@ -220,9 +220,6 @@ extern u_int32_t nfs_true, nfs_false; extern u_int32_t nfs_xdrneg1; extern struct nfsstats nfsstats; extern nfstype nfsv3_type[9]; -struct thread *nfs_iodwant[NFS_MAXASYNCDAEMON]; -struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; -int nfs_numasync = 0; SYSCTL_DECL(_vfs_nfs); diff --git a/sys/vfs/nfs/nfsmount.h b/sys/vfs/nfs/nfsmount.h index 54ae6346a8..0fbfb3aaa1 100644 --- a/sys/vfs/nfs/nfsmount.h +++ b/sys/vfs/nfs/nfsmount.h @@ -44,6 +44,14 @@ #include +enum nfssvc_state { + NFSSVC_INIT, + NFSSVC_WAITING, + NFSSVC_PENDING, + NFSSVC_STOPPING, + NFSSVC_DONE +}; + /* * Mount structure. * One allocated on every NFS mount. 
@@ -55,6 +63,10 @@ struct nfsmount { TAILQ_ENTRY(nfsmount) nm_entry; /* entry in nfsmountq */ struct mtx nm_rxlock; /* receive socket lock */ struct mtx nm_txlock; /* send socket lock */ + thread_t nm_rxthread; + thread_t nm_txthread; + enum nfssvc_state nm_rxstate; + enum nfssvc_state nm_txstate; struct mount *nm_mountp; /* Vfs structure for this filesystem */ int nm_numgrps; /* Max. size of groupslist */ u_char nm_fh[NFSX_V3FHMAX]; /* File handle of root dir */ @@ -94,9 +106,8 @@ struct nfsmount { LIST_HEAD(, nfsuid) nm_uidhashtbl[NFS_MUIDHASHSIZ]; TAILQ_HEAD(, bio) nm_bioq; /* async io buffer queue */ TAILQ_HEAD(, nfsreq) nm_reqq; /* nfsreq queue */ - short nm_bioqlen; /* number of buffers in queue */ - short nm_bioqwant; /* process wants to add to the queue */ - int nm_bioqiods; /* number of iods processing queue */ + int nm_bioqlen; /* number of buffers in queue */ + int nm_reqqlen; /* number of nfsreqs in queue */ u_int64_t nm_maxfilesize; /* maximum file size */ struct ucred *nm_cred; /* 'root' credential */ }; diff --git a/sys/vfs/nfs/nfsnode.h b/sys/vfs/nfs/nfsnode.h index 87c8aa39cf..f1b4034939 100644 --- a/sys/vfs/nfs/nfsnode.h +++ b/sys/vfs/nfs/nfsnode.h @@ -175,8 +175,6 @@ struct nfsnode { * Queue head for nfsiod's */ extern TAILQ_HEAD(nfs_bufq, buf) nfs_bufq; -extern struct thread *nfs_iodwant[NFS_MAXASYNCDAEMON]; -extern struct nfsmount *nfs_iodmount[NFS_MAXASYNCDAEMON]; #if defined(_KERNEL) -- 2.41.0 From 9a702b271591ff8d015cccf6157ae36a9f15b395 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Wed, 15 Jul 2009 23:46:07 -0700 Subject: [PATCH 15/16] NFS - install nfsmount on nfs_mountq, for timeout handling. * This way the timeout callout can find all the requests. --- sys/vfs/nfs/nfs_vfsops.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sys/vfs/nfs/nfs_vfsops.c b/sys/vfs/nfs/nfs_vfsops.c index 7a2a45e464..04f68d7ce3 100644 --- a/sys/vfs/nfs/nfs_vfsops.c +++ b/sys/vfs/nfs/nfs_vfsops.c @@ -995,6 +995,7 @@ mountnfs(struct nfs_args *argp, struct mount *mp, struct sockaddr *nam, * Lose the lock but keep the ref. */ vn_unlock(*vpp); + TAILQ_INSERT_TAIL(&nfs_mountq, nmp, nm_entry); return (0); bad: @@ -1049,6 +1050,7 @@ nfs_unmount(struct mount *mp, int mntflags) nfs_disconnect(nmp); FREE(nmp->nm_nam, M_SONAME); + TAILQ_REMOVE(&nfs_mountq, nmp, nm_entry); if ((nmp->nm_flag & NFSMNT_KERB) == 0) nfs_free_mount(nmp); -- 2.41.0 From e97453f3e38fa7883847b71c7a911843b2ea1113 Mon Sep 17 00:00:00 2001 From: Matthew Dillon Date: Thu, 16 Jul 2009 08:19:14 -0700 Subject: [PATCH 16/16] NFS - create nfsm_subs.c, clean up externs * Move nfsm_*() procedures from nfs_subs.c to nfsm_subs.c * Clean up externs improperly embedded in .c files. 
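The extern clean-up follows the usual pattern (sketch only, using nfsstats as the example): shared globals get one authoritative extern declaration in nfs.h, the single definition stays in whichever .c file already provides it, and the ad hoc extern lines repeated in nfs_serv.c, nfs_socket.c, nfs_srvcache.c, nfs_bio.c and friends are deleted.

/* sys/vfs/nfs/nfs.h -- one declaration, visible to every user of the header */
extern struct nfsstats	nfsstats;

/*
 * nfs_serv.c, nfs_socket.c, nfs_srvcache.c, nfs_bio.c, ... -- per-file
 * copies like the following are removed; those files now pick the
 * declaration up from nfs.h instead:
 *
 *	extern struct nfsstats nfsstats;	(deleted)
 */

Keeping a single declaration next to the type means a future change to the object is caught by the compiler in every consumer rather than silently diverging.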
--- sys/conf/files | 1 + sys/vfs/nfs/Makefile | 9 +- sys/vfs/nfs/bootp_subr.c | 1 - sys/vfs/nfs/nfs.h | 26 +- sys/vfs/nfs/nfs_bio.c | 3 - sys/vfs/nfs/nfs_mountrpc.c | 2 - sys/vfs/nfs/nfs_node.c | 2 + sys/vfs/nfs/nfs_serv.c | 5 - sys/vfs/nfs/nfs_socket.c | 10 - sys/vfs/nfs/nfs_srvcache.c | 2 - sys/vfs/nfs/nfs_subs.c | 637 +------------------------------- sys/vfs/nfs/nfs_syscalls.c | 10 - sys/vfs/nfs/nfs_vfsops.c | 3 - sys/vfs/nfs/nfs_vnops.c | 8 - sys/vfs/nfs/nfsm_subs.c | 728 +++++++++++++++++++++++++++++++++++++ 15 files changed, 766 insertions(+), 681 deletions(-) create mode 100644 sys/vfs/nfs/nfsm_subs.c diff --git a/sys/conf/files b/sys/conf/files index 022fa58426..afaaf3fcd4 100644 --- a/sys/conf/files +++ b/sys/conf/files @@ -1265,6 +1265,7 @@ vfs/nfs/nfs_vfsops.c optional nfs vfs/nfs/nfs_vnops.c optional nfs vfs/nfs/nfs_kerb.c optional nfs vfs/nfs/nfs_iod.c optional nfs +vfs/nfs/nfsm_subs.c optional nfs vfs/nfs/bootp_subr.c optional bootp vfs/nfs/nfs_mountrpc.c optional nfs vfs/nfs/krpc_subr.c optional nfs diff --git a/sys/vfs/nfs/Makefile b/sys/vfs/nfs/Makefile index 67df6283cc..f3e8e4921c 100644 --- a/sys/vfs/nfs/Makefile +++ b/sys/vfs/nfs/Makefile @@ -4,9 +4,12 @@ KMOD= nfs SRCS= nfs_bio.c nfs_node.c nfs_kerb.c nfs_serv.c nfs_socket.c \ nfs_srvcache.c nfs_subs.c nfs_syscalls.c nfs_vfsops.c nfs_iod.c \ - nfs_vnops.c opt_inet.h opt_nfs.h opt_vmpage.h opt_bootp.h \ - opt_nfsroot.h -NFS_INET?= 1 # 0/1 - requires INET to be configured in kernel + nfsm_subs.c nfs_vnops.c \ + opt_inet.h opt_nfs.h opt_vmpage.h opt_bootp.h opt_nfsroot.h + +# 0/1 - requires INET to be configured in kernel +# +NFS_INET?= 1 opt_inet.h: touch ${.TARGET} diff --git a/sys/vfs/nfs/bootp_subr.c b/sys/vfs/nfs/bootp_subr.c index 8e50a48dbb..2e1b9c7c04 100644 --- a/sys/vfs/nfs/bootp_subr.c +++ b/sys/vfs/nfs/bootp_subr.c @@ -212,7 +212,6 @@ struct bootpc_globalcontext { #define DHCP_REQUEST 3 #define DHCP_ACK 5 -extern struct nfsv3_diskless nfsv3_diskless; static char bootp_cookie[128]; SYSCTL_STRING(_kern, OID_AUTO, bootp_cookie, CTLFLAG_RD, bootp_cookie, 0, "Cookie (T134) supplied by bootp server"); diff --git a/sys/vfs/nfs/nfs.h b/sys/vfs/nfs/nfs.h index 4c38f4232c..bf80f832d6 100644 --- a/sys/vfs/nfs/nfs.h +++ b/sys/vfs/nfs/nfs.h @@ -45,6 +45,7 @@ #include "opt_nfs.h" #endif +#include #include /* @@ -401,8 +402,6 @@ struct nfsreq { #endif #define NMUIDHASH(nmp, uid) \ (&(nmp)->nm_uidhashtbl[(uid) % NFS_MUIDHASHSIZ]) -#define NFSNOHASH(fhsum) \ - (&nfsnodehashtbl[(fhsum) & nfsnodehash]) /* * Network address hash list element @@ -597,6 +596,29 @@ extern int nfs_debug; #endif +extern u_int32_t nfs_xdrneg1; +extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers; +extern u_int32_t rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr; +extern u_int32_t rpc_auth_kerb; +extern u_int32_t nfs_prog, nfs_true, nfs_false; +extern struct nfsstats nfsstats; +extern nfstype nfsv2_type[9]; +extern nfstype nfsv3_type[9]; +extern int nfsv2_procid[NFS_NPROCS]; +extern enum vtype nv3tov_type[8]; +extern int nfsv3_procid[NFS_NPROCS]; +extern int nfs_ticks; +extern struct nfsrtt nfsrtt; +extern int nfsrtton; +extern int nfsrvw_procrastinate; +extern int nfsrvw_procrastinate_v3; +extern int32_t (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd, + struct nfssvc_sock *slp, + struct thread *td, + struct mbuf **mreqp); + +extern struct nfsv3_diskless nfsv3_diskless; + u_quad_t nfs_curusec (void); int nfs_init (struct vfsconf *vfsp); int nfs_uninit (struct vfsconf *vfsp); diff --git a/sys/vfs/nfs/nfs_bio.c 
b/sys/vfs/nfs/nfs_bio.c index 1b572d131c..a10aa70536 100644 --- a/sys/vfs/nfs/nfs_bio.c +++ b/sys/vfs/nfs/nfs_bio.c @@ -71,9 +71,6 @@ static struct buf *nfs_getcacheblk(struct vnode *vp, off_t loffset, static int nfs_check_dirent(struct nfs_dirent *dp, int maxlen); static void nfsiodone_sync(struct bio *bio); -extern int nfs_pbuf_freecnt; -extern struct nfsstats nfsstats; - /* * Vnode op for VM getpages. * diff --git a/sys/vfs/nfs/nfs_mountrpc.c b/sys/vfs/nfs/nfs_mountrpc.c index 668bacadd2..233cbc93e9 100644 --- a/sys/vfs/nfs/nfs_mountrpc.c +++ b/sys/vfs/nfs/nfs_mountrpc.c @@ -86,8 +86,6 @@ * first complaint will happen after (1+2+3+4+5)=15 seconds. */ -extern struct nfsv3_diskless nfsv3_diskless; - static int getdec(char **ptr); static char *substr(char *a,char *b); static int xdr_opaque_decode(struct mbuf **ptr, u_char *buf, int len); diff --git a/sys/vfs/nfs/nfs_node.c b/sys/vfs/nfs/nfs_node.c index 63f676ded7..78952a31d6 100644 --- a/sys/vfs/nfs/nfs_node.c +++ b/sys/vfs/nfs/nfs_node.c @@ -63,6 +63,8 @@ static u_long nfsnodehash; #define TRUE 1 #define FALSE 0 +#define NFSNOHASH(fhsum) (&nfsnodehashtbl[(fhsum) & nfsnodehash]) + /* * Initialize hash links for nfsnodes * and build nfsnode free list. diff --git a/sys/vfs/nfs/nfs_serv.c b/sys/vfs/nfs/nfs_serv.c index 2aba4d5d16..220db22c64 100644 --- a/sys/vfs/nfs/nfs_serv.c +++ b/sys/vfs/nfs/nfs_serv.c @@ -122,11 +122,6 @@ nfstype nfsv3_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFSOCK, #ifndef NFS_NOSERVER nfstype nfsv2_type[9] = { NFNON, NFREG, NFDIR, NFBLK, NFCHR, NFLNK, NFNON, NFCHR, NFNON }; -/* Global vars */ -extern u_int32_t nfs_xdrneg1; -extern u_int32_t nfs_false, nfs_true; -extern enum vtype nv3tov_type[8]; -extern struct nfsstats nfsstats; int nfsrvw_procrastinate = NFS_GATHERDELAY * 1000; int nfsrvw_procrastinate_v3 = 0; diff --git a/sys/vfs/nfs/nfs_socket.c b/sys/vfs/nfs/nfs_socket.c index 03a28890f2..1bbfefad7d 100644 --- a/sys/vfs/nfs/nfs_socket.c +++ b/sys/vfs/nfs/nfs_socket.c @@ -101,16 +101,6 @@ ((((n)->nm_srtt[t-1] + 7) >> 3) + (n)->nm_sdrtt[t-1] + 1))) #define NFS_SRTT(r) (r)->r_nmp->nm_srtt[proct[(r)->r_procnum] - 1] #define NFS_SDRTT(r) (r)->r_nmp->nm_sdrtt[proct[(r)->r_procnum] - 1] -/* - * External data, mostly RPC constants in XDR form - */ -extern u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers, - rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr, - rpc_auth_kerb; -extern u_int32_t nfs_prog; -extern struct nfsstats nfsstats; -extern int nfsv3_procid[NFS_NPROCS]; -extern int nfs_ticks; /* * Defines which timer to use for the procnum. 
diff --git a/sys/vfs/nfs/nfs_srvcache.c b/sys/vfs/nfs/nfs_srvcache.c index a0c55588af..00cd75bc3d 100644 --- a/sys/vfs/nfs/nfs_srvcache.c +++ b/sys/vfs/nfs/nfs_srvcache.c @@ -58,8 +58,6 @@ #include "nfsrvcache.h" #ifndef NFS_NOSERVER -extern struct nfsstats nfsstats; -extern int nfsv2_procid[NFS_NPROCS]; static long numnfsrvcache; static long desirednfsrvcache = NFSRVCACHESIZ; diff --git a/sys/vfs/nfs/nfs_subs.c b/sys/vfs/nfs/nfs_subs.c index 0931c9fdf5..4051d75de3 100644 --- a/sys/vfs/nfs/nfs_subs.c +++ b/sys/vfs/nfs/nfs_subs.c @@ -84,13 +84,12 @@ * This is kinda hokey, but may save a little time doing byte swaps */ u_int32_t nfs_xdrneg1; -u_int32_t rpc_call, rpc_vers, rpc_reply, rpc_msgdenied, rpc_autherr, - rpc_mismatch, rpc_auth_unix, rpc_msgaccepted, - rpc_auth_kerb; +u_int32_t rpc_reply, rpc_msgdenied, rpc_mismatch, rpc_vers; +u_int32_t rpc_auth_unix, rpc_msgaccepted, rpc_call, rpc_autherr; +u_int32_t rpc_auth_kerb; u_int32_t nfs_prog, nfs_true, nfs_false; /* And other global data */ -static u_int32_t nfs_xid = 0; static enum vtype nv2tov_type[8]= { VNON, VREG, VDIR, VBLK, VCHR, VLNK, VNON, VNON }; @@ -99,7 +98,8 @@ enum vtype nv3tov_type[8]= { }; int nfs_ticks; -int nfs_pbuf_freecnt = -1; /* start out unlimited */ + +static int nfs_pbuf_freecnt = -1; /* start out unlimited */ struct nfsmount_head nfs_mountq = TAILQ_HEAD_INITIALIZER(nfs_mountq); struct nfssvc_sockhead nfssvc_sockhead; @@ -541,18 +541,9 @@ static short *nfsrv_v3errmap[] = { #endif /* NFS_NOSERVER */ -extern struct nfsrtt nfsrtt; -extern struct nfsstats nfsstats; -extern nfstype nfsv2_type[9]; -extern nfstype nfsv3_type[9]; -extern struct nfsnodehashhead *nfsnodehashtbl; -extern u_long nfsnodehash; - struct nfssvc_args; extern int sys_nfssvc(struct proc *, struct nfssvc_args *, int *); -LIST_HEAD(nfsnodehashhead, nfsnode); - /* * This needs to return a monotonically increasing or close to monotonically * increasing result, otherwise the write gathering queues won't work @@ -567,465 +558,6 @@ nfs_curusec(void) return ((u_quad_t)tv.tv_sec * 1000000 + (u_quad_t)tv.tv_usec); } -/* - * Create the header for an rpc request packet - * The hsiz is the size of the rest of the nfs request header. - * (just used to decide if a cluster is a good idea) - */ -struct mbuf * -nfsm_reqh(struct vnode *vp, u_long procid, int hsiz, caddr_t *bposp) -{ - struct mbuf *mb; - caddr_t bpos; - - mb = m_getl(hsiz, MB_WAIT, MT_DATA, 0, NULL); - mb->m_len = 0; - bpos = mtod(mb, caddr_t); - - /* Finally, return values */ - *bposp = bpos; - return (mb); -} - -/* - * Build the RPC header and fill in the authorization info. - * The authorization string argument is only used when the credentials - * come from outside of the kernel. - * Returns the head of the mbuf list. - */ -struct mbuf * -nfsm_rpchead(struct ucred *cr, int nmflag, int procid, int auth_type, - int auth_len, char *auth_str, int verf_len, char *verf_str, - struct mbuf *mrest, int mrest_len, struct mbuf **mbp, - u_int32_t *xidp) -{ - struct mbuf *mb; - u_int32_t *tl; - caddr_t bpos; - int i; - struct mbuf *mreq, *mb2; - int siz, grpsiz, authsiz, dsiz; - - authsiz = nfsm_rndup(auth_len); - dsiz = authsiz + 10 * NFSX_UNSIGNED; - mb = m_getl(dsiz, MB_WAIT, MT_DATA, M_PKTHDR, NULL); - if (dsiz < MINCLSIZE) { - if (dsiz < MHLEN) - MH_ALIGN(mb, dsiz); - else - MH_ALIGN(mb, 8 * NFSX_UNSIGNED); - } - mb->m_len = mb->m_pkthdr.len = 0; - mreq = mb; - bpos = mtod(mb, caddr_t); - - /* - * First the RPC header. 
- */ - nfsm_build(tl, u_int32_t *, 8 * NFSX_UNSIGNED); - - /* Get a pretty random xid to start with */ - if (!nfs_xid) - nfs_xid = krandom(); - /* - * Skip zero xid if it should ever happen. - */ - if (++nfs_xid == 0) - nfs_xid++; - - *tl++ = *xidp = txdr_unsigned(nfs_xid); - *tl++ = rpc_call; - *tl++ = rpc_vers; - *tl++ = txdr_unsigned(NFS_PROG); - if (nmflag & NFSMNT_NFSV3) - *tl++ = txdr_unsigned(NFS_VER3); - else - *tl++ = txdr_unsigned(NFS_VER2); - if (nmflag & NFSMNT_NFSV3) - *tl++ = txdr_unsigned(procid); - else - *tl++ = txdr_unsigned(nfsv2_procid[procid]); - - /* - * And then the authorization cred. - */ - *tl++ = txdr_unsigned(auth_type); - *tl = txdr_unsigned(authsiz); - switch (auth_type) { - case RPCAUTH_UNIX: - nfsm_build(tl, u_int32_t *, auth_len); - *tl++ = 0; /* stamp ?? */ - *tl++ = 0; /* NULL hostname */ - *tl++ = txdr_unsigned(cr->cr_uid); - *tl++ = txdr_unsigned(cr->cr_groups[0]); - grpsiz = (auth_len >> 2) - 5; - *tl++ = txdr_unsigned(grpsiz); - for (i = 1; i <= grpsiz; i++) - *tl++ = txdr_unsigned(cr->cr_groups[i]); - break; - case RPCAUTH_KERB4: - siz = auth_len; - while (siz > 0) { - if (M_TRAILINGSPACE(mb) == 0) { - mb2 = m_getl(siz, MB_WAIT, MT_DATA, 0, NULL); - mb2->m_len = 0; - mb->m_next = mb2; - mb = mb2; - bpos = mtod(mb, caddr_t); - } - i = min(siz, M_TRAILINGSPACE(mb)); - bcopy(auth_str, bpos, i); - mb->m_len += i; - auth_str += i; - bpos += i; - siz -= i; - } - if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { - for (i = 0; i < siz; i++) - *bpos++ = '\0'; - mb->m_len += siz; - } - break; - }; - - /* - * And the verifier... - */ - nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); - if (verf_str) { - *tl++ = txdr_unsigned(RPCAUTH_KERB4); - *tl = txdr_unsigned(verf_len); - siz = verf_len; - while (siz > 0) { - if (M_TRAILINGSPACE(mb) == 0) { - mb2 = m_getl(siz, MB_WAIT, MT_DATA, 0, NULL); - mb2->m_len = 0; - mb->m_next = mb2; - mb = mb2; - bpos = mtod(mb, caddr_t); - } - i = min(siz, M_TRAILINGSPACE(mb)); - bcopy(verf_str, bpos, i); - mb->m_len += i; - verf_str += i; - bpos += i; - siz -= i; - } - if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { - for (i = 0; i < siz; i++) - *bpos++ = '\0'; - mb->m_len += siz; - } - } else { - *tl++ = txdr_unsigned(RPCAUTH_NULL); - *tl = 0; - } - mb->m_next = mrest; - mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; - mreq->m_pkthdr.rcvif = NULL; - *mbp = mb; - return (mreq); -} - -/* - * copies mbuf chain to the uio scatter/gather list - */ -int -nfsm_mbuftouio(struct mbuf **mrep, struct uio *uiop, int siz, caddr_t *dpos) -{ - char *mbufcp, *uiocp; - int xfer, left, len; - struct mbuf *mp; - long uiosiz, rem; - int error = 0; - - mp = *mrep; - mbufcp = *dpos; - len = mtod(mp, caddr_t)+mp->m_len-mbufcp; - rem = nfsm_rndup(siz)-siz; - while (siz > 0) { - if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) - return (EFBIG); - left = uiop->uio_iov->iov_len; - uiocp = uiop->uio_iov->iov_base; - if (left > siz) - left = siz; - uiosiz = left; - while (left > 0) { - while (len == 0) { - mp = mp->m_next; - if (mp == NULL) - return (EBADRPC); - mbufcp = mtod(mp, caddr_t); - len = mp->m_len; - } - xfer = (left > len) ? len : left; -#ifdef notdef - /* Not Yet.. 
*/ - if (uiop->uio_iov->iov_op != NULL) - (*(uiop->uio_iov->iov_op)) - (mbufcp, uiocp, xfer); - else -#endif - if (uiop->uio_segflg == UIO_SYSSPACE) - bcopy(mbufcp, uiocp, xfer); - else - copyout(mbufcp, uiocp, xfer); - left -= xfer; - len -= xfer; - mbufcp += xfer; - uiocp += xfer; - uiop->uio_offset += xfer; - uiop->uio_resid -= xfer; - } - if (uiop->uio_iov->iov_len <= siz) { - uiop->uio_iovcnt--; - uiop->uio_iov++; - } else { - uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + uiosiz; - uiop->uio_iov->iov_len -= uiosiz; - } - siz -= uiosiz; - } - *dpos = mbufcp; - *mrep = mp; - if (rem > 0) { - if (len < rem) - error = nfs_adv(mrep, dpos, rem, len); - else - *dpos += rem; - } - return (error); -} - -/* - * copies a uio scatter/gather list to an mbuf chain. - * NOTE: can ony handle iovcnt == 1 - */ -int -nfsm_uiotombuf(struct uio *uiop, struct mbuf **mq, int siz, caddr_t *bpos) -{ - char *uiocp; - struct mbuf *mp, *mp2; - int xfer, left, mlen; - int uiosiz, rem; - boolean_t getcluster; - char *cp; - -#ifdef DIAGNOSTIC - if (uiop->uio_iovcnt != 1) - panic("nfsm_uiotombuf: iovcnt != 1"); -#endif - - if (siz >= MINCLSIZE) - getcluster = TRUE; - else - getcluster = FALSE; - rem = nfsm_rndup(siz) - siz; - mp = mp2 = *mq; - while (siz > 0) { - left = uiop->uio_iov->iov_len; - uiocp = uiop->uio_iov->iov_base; - if (left > siz) - left = siz; - uiosiz = left; - while (left > 0) { - mlen = M_TRAILINGSPACE(mp); - if (mlen == 0) { - if (getcluster) - mp = m_getcl(MB_WAIT, MT_DATA, 0); - else - mp = m_get(MB_WAIT, MT_DATA); - mp->m_len = 0; - mp2->m_next = mp; - mp2 = mp; - mlen = M_TRAILINGSPACE(mp); - } - xfer = (left > mlen) ? mlen : left; -#ifdef notdef - /* Not Yet.. */ - if (uiop->uio_iov->iov_op != NULL) - (*(uiop->uio_iov->iov_op)) - (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); - else -#endif - if (uiop->uio_segflg == UIO_SYSSPACE) - bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); - else - copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); - mp->m_len += xfer; - left -= xfer; - uiocp += xfer; - uiop->uio_offset += xfer; - uiop->uio_resid -= xfer; - } - uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + uiosiz; - uiop->uio_iov->iov_len -= uiosiz; - siz -= uiosiz; - } - if (rem > 0) { - if (rem > M_TRAILINGSPACE(mp)) { - MGET(mp, MB_WAIT, MT_DATA); - mp->m_len = 0; - mp2->m_next = mp; - } - cp = mtod(mp, caddr_t)+mp->m_len; - for (left = 0; left < rem; left++) - *cp++ = '\0'; - mp->m_len += rem; - *bpos = cp; - } else - *bpos = mtod(mp, caddr_t)+mp->m_len; - *mq = mp; - return (0); -} - -/* - * Help break down an mbuf chain by setting the first siz bytes contiguous - * pointed to by returned val. - * This is used by the macros nfsm_dissect and nfsm_dissecton for tough - * cases. (The macros use the vars. 
dpos and dpos2) - */ -int -nfsm_disct(struct mbuf **mdp, caddr_t *dposp, int siz, int left, caddr_t *cp2) -{ - struct mbuf *mp, *mp2; - int siz2, xfer; - caddr_t p; - - mp = *mdp; - while (left == 0) { - *mdp = mp = mp->m_next; - if (mp == NULL) - return (EBADRPC); - left = mp->m_len; - *dposp = mtod(mp, caddr_t); - } - if (left >= siz) { - *cp2 = *dposp; - *dposp += siz; - } else if (mp->m_next == NULL) { - return (EBADRPC); - } else if (siz > MHLEN) { - panic("nfs S too big"); - } else { - MGET(mp2, MB_WAIT, MT_DATA); - mp2->m_next = mp->m_next; - mp->m_next = mp2; - mp->m_len -= left; - mp = mp2; - *cp2 = p = mtod(mp, caddr_t); - bcopy(*dposp, p, left); /* Copy what was left */ - siz2 = siz-left; - p += left; - mp2 = mp->m_next; - /* Loop around copying up the siz2 bytes */ - while (siz2 > 0) { - if (mp2 == NULL) - return (EBADRPC); - xfer = (siz2 > mp2->m_len) ? mp2->m_len : siz2; - if (xfer > 0) { - bcopy(mtod(mp2, caddr_t), p, xfer); - NFSMADV(mp2, xfer); - mp2->m_len -= xfer; - p += xfer; - siz2 -= xfer; - } - if (siz2 > 0) - mp2 = mp2->m_next; - } - mp->m_len = siz; - *mdp = mp2; - *dposp = mtod(mp2, caddr_t); - } - return (0); -} - -/* - * Advance the position in the mbuf chain. - */ -int -nfs_adv(struct mbuf **mdp, caddr_t *dposp, int offs, int left) -{ - struct mbuf *m; - int s; - - m = *mdp; - s = left; - while (s < offs) { - offs -= s; - m = m->m_next; - if (m == NULL) - return (EBADRPC); - s = m->m_len; - } - *mdp = m; - *dposp = mtod(m, caddr_t)+offs; - return (0); -} - -/* - * Copy a string into mbufs for the hard cases... - */ -int -nfsm_strtmbuf(struct mbuf **mb, char **bpos, const char *cp, long siz) -{ - struct mbuf *m1 = NULL, *m2; - long left, xfer, len, tlen; - u_int32_t *tl; - int putsize; - - putsize = 1; - m2 = *mb; - left = M_TRAILINGSPACE(m2); - if (left > 0) { - tl = ((u_int32_t *)(*bpos)); - *tl++ = txdr_unsigned(siz); - putsize = 0; - left -= NFSX_UNSIGNED; - m2->m_len += NFSX_UNSIGNED; - if (left > 0) { - bcopy(cp, (caddr_t) tl, left); - siz -= left; - cp += left; - m2->m_len += left; - left = 0; - } - } - /* Loop around adding mbufs */ - while (siz > 0) { - int msize; - - m1 = m_getl(siz, MB_WAIT, MT_DATA, 0, &msize); - m1->m_len = msize; - m2->m_next = m1; - m2 = m1; - tl = mtod(m1, u_int32_t *); - tlen = 0; - if (putsize) { - *tl++ = txdr_unsigned(siz); - m1->m_len -= NFSX_UNSIGNED; - tlen = NFSX_UNSIGNED; - putsize = 0; - } - if (siz < m1->m_len) { - len = nfsm_rndup(siz); - xfer = siz; - if (xfer < len) - *(tl+(xfer>>2)) = 0; - } else { - xfer = len = m1->m_len; - } - bcopy(cp, (caddr_t) tl, xfer); - m1->m_len = len+tlen; - siz -= xfer; - cp += xfer; - } - *mb = m1; - *bpos = mtod(m1, caddr_t)+m1->m_len; - return (0); -} - /* * Called once to initialize data structures... */ @@ -1627,164 +1159,6 @@ out: return (error); } -/* - * A fiddled version of m_adj() that ensures null fill to a long - * boundary and only trims off the back end - */ -void -nfsm_adj(struct mbuf *mp, int len, int nul) -{ - struct mbuf *m; - int count, i; - char *cp; - - /* - * Trim from tail. Scan the mbuf chain, - * calculating its length and finding the last mbuf. - * If the adjustment only affects this mbuf, then just - * adjust and return. Otherwise, rescan and truncate - * after the remaining size. 
- */ - count = 0; - m = mp; - for (;;) { - count += m->m_len; - if (m->m_next == NULL) - break; - m = m->m_next; - } - if (m->m_len > len) { - m->m_len -= len; - if (nul > 0) { - cp = mtod(m, caddr_t)+m->m_len-nul; - for (i = 0; i < nul; i++) - *cp++ = '\0'; - } - return; - } - count -= len; - if (count < 0) - count = 0; - /* - * Correct length for chain is "count". - * Find the mbuf with last data, adjust its length, - * and toss data from remaining mbufs on chain. - */ - for (m = mp; m; m = m->m_next) { - if (m->m_len >= count) { - m->m_len = count; - if (nul > 0) { - cp = mtod(m, caddr_t)+m->m_len-nul; - for (i = 0; i < nul; i++) - *cp++ = '\0'; - } - break; - } - count -= m->m_len; - } - for (m = m->m_next;m;m = m->m_next) - m->m_len = 0; -} - -/* - * Make these functions instead of macros, so that the kernel text size - * doesn't get too big... - */ -void -nfsm_srvwcc(struct nfsrv_descript *nfsd, int before_ret, - struct vattr *before_vap, int after_ret, struct vattr *after_vap, - struct mbuf **mbp, char **bposp) -{ - struct mbuf *mb = *mbp, *mb2; - char *bpos = *bposp; - u_int32_t *tl; - - /* - * before_ret is 0 if before_vap is valid, non-zero if it isn't. - */ - if (before_ret) { - nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); - *tl = nfs_false; - } else { - nfsm_build(tl, u_int32_t *, 7 * NFSX_UNSIGNED); - *tl++ = nfs_true; - txdr_hyper(before_vap->va_size, tl); - tl += 2; - txdr_nfsv3time(&(before_vap->va_mtime), tl); - tl += 2; - txdr_nfsv3time(&(before_vap->va_ctime), tl); - } - *bposp = bpos; - *mbp = mb; - nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); -} - -void -nfsm_srvpostopattr(struct nfsrv_descript *nfsd, int after_ret, - struct vattr *after_vap, struct mbuf **mbp, char **bposp) -{ - struct mbuf *mb = *mbp, *mb2; - char *bpos = *bposp; - u_int32_t *tl; - struct nfs_fattr *fp; - - if (after_ret) { - nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); - *tl = nfs_false; - } else { - nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_V3FATTR); - *tl++ = nfs_true; - fp = (struct nfs_fattr *)tl; - nfsm_srvfattr(nfsd, after_vap, fp); - } - *mbp = mb; - *bposp = bpos; -} - -void -nfsm_srvfattr(struct nfsrv_descript *nfsd, struct vattr *vap, - struct nfs_fattr *fp) -{ - /* - * NFS seems to truncate nlink to 16 bits, don't let it overflow. 
- */ - if (vap->va_nlink > 65535) - fp->fa_nlink = 65535; - else - fp->fa_nlink = txdr_unsigned(vap->va_nlink); - fp->fa_uid = txdr_unsigned(vap->va_uid); - fp->fa_gid = txdr_unsigned(vap->va_gid); - if (nfsd->nd_flag & ND_NFSV3) { - fp->fa_type = vtonfsv3_type(vap->va_type); - fp->fa_mode = vtonfsv3_mode(vap->va_mode); - txdr_hyper(vap->va_size, &fp->fa3_size); - txdr_hyper(vap->va_bytes, &fp->fa3_used); - fp->fa3_rdev.specdata1 = txdr_unsigned(vap->va_rmajor); - fp->fa3_rdev.specdata2 = txdr_unsigned(vap->va_rminor); - fp->fa3_fsid.nfsuquad[0] = 0; - fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); - txdr_hyper(vap->va_fileid, &fp->fa3_fileid); - txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); - txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); - txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); - } else { - fp->fa_type = vtonfsv2_type(vap->va_type); - fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); - fp->fa2_size = txdr_unsigned(vap->va_size); - fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); - if (vap->va_type == VFIFO) - fp->fa2_rdev = 0xffffffff; - else - fp->fa2_rdev = txdr_unsigned(makeudev(vap->va_rmajor, vap->va_rminor)); - fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); - fp->fa2_fsid = txdr_unsigned(vap->va_fsid); - fp->fa2_fileid = txdr_unsigned(vap->va_fileid); - txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); - txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); - txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); - } -} - /* * nfsrv_fhtovp() - convert a fh to a vnode ptr (optionally locked) * - look up fsid in mount list (if not found ret error) @@ -1864,7 +1238,6 @@ nfsrv_fhtovp(fhandle_t *fhp, int lockflag, return (0); } - /* * WebNFS: check if a filehandle is a public filehandle. For v3, this * means a length of 0, for v2 it means all zeroes. nfsm_srvmtofh has diff --git a/sys/vfs/nfs/nfs_syscalls.c b/sys/vfs/nfs/nfs_syscalls.c index 9a43465d9a..82a54e6385 100644 --- a/sys/vfs/nfs/nfs_syscalls.c +++ b/sys/vfs/nfs/nfs_syscalls.c @@ -79,16 +79,6 @@ static MALLOC_DEFINE(M_NFSSVC, "NFS srvsock", "Nfs server structure"); -/* Global defs. 
*/ -extern int32_t (*nfsrv3_procs[NFS_NPROCS]) (struct nfsrv_descript *nd, - struct nfssvc_sock *slp, - struct thread *td, - struct mbuf **mreqp); -extern int nfs_numasync; -extern int nfsrtton; -extern struct nfsstats nfsstats; -extern int nfsrvw_procrastinate; -extern int nfsrvw_procrastinate_v3; static int nuidhash_max = NFS_MAXUIDHASH; #ifndef NFS_NOSERVER diff --git a/sys/vfs/nfs/nfs_vfsops.c b/sys/vfs/nfs/nfs_vfsops.c index ff622556c1..dbb4e68cda 100644 --- a/sys/vfs/nfs/nfs_vfsops.c +++ b/sys/vfs/nfs/nfs_vfsops.c @@ -79,7 +79,6 @@ extern int nfs_mountroot(struct mount *mp); extern void bootpc_init(void); -extern int nfs_ticks; extern struct vop_ops nfsv2_vnode_vops; extern struct vop_ops nfsv2_fifo_vops; extern struct vop_ops nfsv2_spec_vops; @@ -1128,8 +1127,6 @@ nfs_root(struct mount *mp, struct vnode **vpp) return (error); } -extern int syncprt; - struct scaninfo { int rescan; int waitfor; diff --git a/sys/vfs/nfs/nfs_vnops.c b/sys/vfs/nfs/nfs_vnops.c index 79eae8b292..b12425b8f1 100644 --- a/sys/vfs/nfs/nfs_vnops.c +++ b/sys/vfs/nfs/nfs_vnops.c @@ -213,14 +213,6 @@ static int nfs_renameit (struct vnode *sdvp, struct componentname *scnp, struct sillyrename *sp); -/* - * Global variables - */ -extern u_int32_t nfs_true, nfs_false; -extern u_int32_t nfs_xdrneg1; -extern struct nfsstats nfsstats; -extern nfstype nfsv3_type[9]; - SYSCTL_DECL(_vfs_nfs); static int nfs_flush_on_rename = 1; diff --git a/sys/vfs/nfs/nfsm_subs.c b/sys/vfs/nfs/nfsm_subs.c new file mode 100644 index 0000000000..7e803c3f2b --- /dev/null +++ b/sys/vfs/nfs/nfsm_subs.c @@ -0,0 +1,728 @@ +/* + * Copyright (c) 2009 The DragonFly Project. All rights reserved. + * + * This code is derived from software contributed to The DragonFly Project + * by Matthew Dillon + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * 3. Neither the name of The DragonFly Project nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific, prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, + * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Rick Macklem at The University of Guelph. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * These functions support the macros and help fiddle mbuf chains for + * the nfs op functions. They do things like create the rpc header and + * copy data between mbuf chains and uio lists. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "rpcv2.h" +#include "nfsproto.h" +#include "nfs.h" +#include "nfsmount.h" +#include "nfsnode.h" +#include "xdr_subs.h" +#include "nfsm_subs.h" +#include "nfsrtt.h" + +#include + +static u_int32_t nfs_xid = 0; + +/* + * Create the header for an rpc request packet + * The hsiz is the size of the rest of the nfs request header. + * (just used to decide if a cluster is a good idea) + */ +struct mbuf * +nfsm_reqh(struct vnode *vp, u_long procid, int hsiz, caddr_t *bposp) +{ + struct mbuf *mb; + caddr_t bpos; + + mb = m_getl(hsiz, MB_WAIT, MT_DATA, 0, NULL); + mb->m_len = 0; + bpos = mtod(mb, caddr_t); + + /* Finally, return values */ + *bposp = bpos; + return (mb); +} + +/* + * Build the RPC header and fill in the authorization info. + * The authorization string argument is only used when the credentials + * come from outside of the kernel. + * Returns the head of the mbuf list. 
+ */ +struct mbuf * +nfsm_rpchead(struct ucred *cr, int nmflag, int procid, int auth_type, + int auth_len, char *auth_str, int verf_len, char *verf_str, + struct mbuf *mrest, int mrest_len, struct mbuf **mbp, + u_int32_t *xidp) +{ + struct mbuf *mb; + u_int32_t *tl; + caddr_t bpos; + int i; + struct mbuf *mreq, *mb2; + int siz, grpsiz, authsiz, dsiz; + + authsiz = nfsm_rndup(auth_len); + dsiz = authsiz + 10 * NFSX_UNSIGNED; + mb = m_getl(dsiz, MB_WAIT, MT_DATA, M_PKTHDR, NULL); + if (dsiz < MINCLSIZE) { + if (dsiz < MHLEN) + MH_ALIGN(mb, dsiz); + else + MH_ALIGN(mb, 8 * NFSX_UNSIGNED); + } + mb->m_len = mb->m_pkthdr.len = 0; + mreq = mb; + bpos = mtod(mb, caddr_t); + + /* + * First the RPC header. + */ + nfsm_build(tl, u_int32_t *, 8 * NFSX_UNSIGNED); + + /* Get a pretty random xid to start with */ + if (!nfs_xid) + nfs_xid = krandom(); + /* + * Skip zero xid if it should ever happen. + */ + if (++nfs_xid == 0) + nfs_xid++; + + *tl++ = *xidp = txdr_unsigned(nfs_xid); + *tl++ = rpc_call; + *tl++ = rpc_vers; + *tl++ = txdr_unsigned(NFS_PROG); + if (nmflag & NFSMNT_NFSV3) + *tl++ = txdr_unsigned(NFS_VER3); + else + *tl++ = txdr_unsigned(NFS_VER2); + if (nmflag & NFSMNT_NFSV3) + *tl++ = txdr_unsigned(procid); + else + *tl++ = txdr_unsigned(nfsv2_procid[procid]); + + /* + * And then the authorization cred. + */ + *tl++ = txdr_unsigned(auth_type); + *tl = txdr_unsigned(authsiz); + switch (auth_type) { + case RPCAUTH_UNIX: + nfsm_build(tl, u_int32_t *, auth_len); + *tl++ = 0; /* stamp ?? */ + *tl++ = 0; /* NULL hostname */ + *tl++ = txdr_unsigned(cr->cr_uid); + *tl++ = txdr_unsigned(cr->cr_groups[0]); + grpsiz = (auth_len >> 2) - 5; + *tl++ = txdr_unsigned(grpsiz); + for (i = 1; i <= grpsiz; i++) + *tl++ = txdr_unsigned(cr->cr_groups[i]); + break; + case RPCAUTH_KERB4: + siz = auth_len; + while (siz > 0) { + if (M_TRAILINGSPACE(mb) == 0) { + mb2 = m_getl(siz, MB_WAIT, MT_DATA, 0, NULL); + mb2->m_len = 0; + mb->m_next = mb2; + mb = mb2; + bpos = mtod(mb, caddr_t); + } + i = min(siz, M_TRAILINGSPACE(mb)); + bcopy(auth_str, bpos, i); + mb->m_len += i; + auth_str += i; + bpos += i; + siz -= i; + } + if ((siz = (nfsm_rndup(auth_len) - auth_len)) > 0) { + for (i = 0; i < siz; i++) + *bpos++ = '\0'; + mb->m_len += siz; + } + break; + }; + + /* + * And the verifier... 
+ */ + nfsm_build(tl, u_int32_t *, 2 * NFSX_UNSIGNED); + if (verf_str) { + *tl++ = txdr_unsigned(RPCAUTH_KERB4); + *tl = txdr_unsigned(verf_len); + siz = verf_len; + while (siz > 0) { + if (M_TRAILINGSPACE(mb) == 0) { + mb2 = m_getl(siz, MB_WAIT, MT_DATA, 0, NULL); + mb2->m_len = 0; + mb->m_next = mb2; + mb = mb2; + bpos = mtod(mb, caddr_t); + } + i = min(siz, M_TRAILINGSPACE(mb)); + bcopy(verf_str, bpos, i); + mb->m_len += i; + verf_str += i; + bpos += i; + siz -= i; + } + if ((siz = (nfsm_rndup(verf_len) - verf_len)) > 0) { + for (i = 0; i < siz; i++) + *bpos++ = '\0'; + mb->m_len += siz; + } + } else { + *tl++ = txdr_unsigned(RPCAUTH_NULL); + *tl = 0; + } + mb->m_next = mrest; + mreq->m_pkthdr.len = authsiz + 10 * NFSX_UNSIGNED + mrest_len; + mreq->m_pkthdr.rcvif = NULL; + *mbp = mb; + return (mreq); +} + +/* + * copies mbuf chain to the uio scatter/gather list + */ +int +nfsm_mbuftouio(struct mbuf **mrep, struct uio *uiop, int siz, caddr_t *dpos) +{ + char *mbufcp, *uiocp; + int xfer, left, len; + struct mbuf *mp; + long uiosiz, rem; + int error = 0; + + mp = *mrep; + mbufcp = *dpos; + len = mtod(mp, caddr_t)+mp->m_len-mbufcp; + rem = nfsm_rndup(siz)-siz; + while (siz > 0) { + if (uiop->uio_iovcnt <= 0 || uiop->uio_iov == NULL) + return (EFBIG); + left = uiop->uio_iov->iov_len; + uiocp = uiop->uio_iov->iov_base; + if (left > siz) + left = siz; + uiosiz = left; + while (left > 0) { + while (len == 0) { + mp = mp->m_next; + if (mp == NULL) + return (EBADRPC); + mbufcp = mtod(mp, caddr_t); + len = mp->m_len; + } + xfer = (left > len) ? len : left; +#ifdef notdef + /* Not Yet.. */ + if (uiop->uio_iov->iov_op != NULL) + (*(uiop->uio_iov->iov_op)) + (mbufcp, uiocp, xfer); + else +#endif + if (uiop->uio_segflg == UIO_SYSSPACE) + bcopy(mbufcp, uiocp, xfer); + else + copyout(mbufcp, uiocp, xfer); + left -= xfer; + len -= xfer; + mbufcp += xfer; + uiocp += xfer; + uiop->uio_offset += xfer; + uiop->uio_resid -= xfer; + } + if (uiop->uio_iov->iov_len <= siz) { + uiop->uio_iovcnt--; + uiop->uio_iov++; + } else { + uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + uiosiz; + uiop->uio_iov->iov_len -= uiosiz; + } + siz -= uiosiz; + } + *dpos = mbufcp; + *mrep = mp; + if (rem > 0) { + if (len < rem) + error = nfs_adv(mrep, dpos, rem, len); + else + *dpos += rem; + } + return (error); +} + +/* + * copies a uio scatter/gather list to an mbuf chain. + * NOTE: can ony handle iovcnt == 1 + */ +int +nfsm_uiotombuf(struct uio *uiop, struct mbuf **mq, int siz, caddr_t *bpos) +{ + char *uiocp; + struct mbuf *mp, *mp2; + int xfer, left, mlen; + int uiosiz, rem; + boolean_t getcluster; + char *cp; + +#ifdef DIAGNOSTIC + if (uiop->uio_iovcnt != 1) + panic("nfsm_uiotombuf: iovcnt != 1"); +#endif + + if (siz >= MINCLSIZE) + getcluster = TRUE; + else + getcluster = FALSE; + rem = nfsm_rndup(siz) - siz; + mp = mp2 = *mq; + while (siz > 0) { + left = uiop->uio_iov->iov_len; + uiocp = uiop->uio_iov->iov_base; + if (left > siz) + left = siz; + uiosiz = left; + while (left > 0) { + mlen = M_TRAILINGSPACE(mp); + if (mlen == 0) { + if (getcluster) + mp = m_getcl(MB_WAIT, MT_DATA, 0); + else + mp = m_get(MB_WAIT, MT_DATA); + mp->m_len = 0; + mp2->m_next = mp; + mp2 = mp; + mlen = M_TRAILINGSPACE(mp); + } + xfer = (left > mlen) ? mlen : left; +#ifdef notdef + /* Not Yet.. 
*/ + if (uiop->uio_iov->iov_op != NULL) + (*(uiop->uio_iov->iov_op)) + (uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + else +#endif + if (uiop->uio_segflg == UIO_SYSSPACE) + bcopy(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + else + copyin(uiocp, mtod(mp, caddr_t)+mp->m_len, xfer); + mp->m_len += xfer; + left -= xfer; + uiocp += xfer; + uiop->uio_offset += xfer; + uiop->uio_resid -= xfer; + } + uiop->uio_iov->iov_base = (char *)uiop->uio_iov->iov_base + uiosiz; + uiop->uio_iov->iov_len -= uiosiz; + siz -= uiosiz; + } + if (rem > 0) { + if (rem > M_TRAILINGSPACE(mp)) { + MGET(mp, MB_WAIT, MT_DATA); + mp->m_len = 0; + mp2->m_next = mp; + } + cp = mtod(mp, caddr_t)+mp->m_len; + for (left = 0; left < rem; left++) + *cp++ = '\0'; + mp->m_len += rem; + *bpos = cp; + } else + *bpos = mtod(mp, caddr_t)+mp->m_len; + *mq = mp; + return (0); +} + +/* + * Help break down an mbuf chain by setting the first siz bytes contiguous + * pointed to by returned val. + * This is used by the macros nfsm_dissect and nfsm_dissecton for tough + * cases. (The macros use the vars. dpos and dpos2) + */ +int +nfsm_disct(struct mbuf **mdp, caddr_t *dposp, int siz, int left, caddr_t *cp2) +{ + struct mbuf *mp, *mp2; + int siz2, xfer; + caddr_t p; + + mp = *mdp; + while (left == 0) { + *mdp = mp = mp->m_next; + if (mp == NULL) + return (EBADRPC); + left = mp->m_len; + *dposp = mtod(mp, caddr_t); + } + if (left >= siz) { + *cp2 = *dposp; + *dposp += siz; + } else if (mp->m_next == NULL) { + return (EBADRPC); + } else if (siz > MHLEN) { + panic("nfs S too big"); + } else { + MGET(mp2, MB_WAIT, MT_DATA); + mp2->m_next = mp->m_next; + mp->m_next = mp2; + mp->m_len -= left; + mp = mp2; + *cp2 = p = mtod(mp, caddr_t); + bcopy(*dposp, p, left); /* Copy what was left */ + siz2 = siz-left; + p += left; + mp2 = mp->m_next; + /* Loop around copying up the siz2 bytes */ + while (siz2 > 0) { + if (mp2 == NULL) + return (EBADRPC); + xfer = (siz2 > mp2->m_len) ? mp2->m_len : siz2; + if (xfer > 0) { + bcopy(mtod(mp2, caddr_t), p, xfer); + NFSMADV(mp2, xfer); + mp2->m_len -= xfer; + p += xfer; + siz2 -= xfer; + } + if (siz2 > 0) + mp2 = mp2->m_next; + } + mp->m_len = siz; + *mdp = mp2; + *dposp = mtod(mp2, caddr_t); + } + return (0); +} + +/* + * Advance the position in the mbuf chain. + */ +int +nfs_adv(struct mbuf **mdp, caddr_t *dposp, int offs, int left) +{ + struct mbuf *m; + int s; + + m = *mdp; + s = left; + while (s < offs) { + offs -= s; + m = m->m_next; + if (m == NULL) + return (EBADRPC); + s = m->m_len; + } + *mdp = m; + *dposp = mtod(m, caddr_t)+offs; + return (0); +} + +/* + * Copy a string into mbufs for the hard cases... 
+ */ +int +nfsm_strtmbuf(struct mbuf **mb, char **bpos, const char *cp, long siz) +{ + struct mbuf *m1 = NULL, *m2; + long left, xfer, len, tlen; + u_int32_t *tl; + int putsize; + + putsize = 1; + m2 = *mb; + left = M_TRAILINGSPACE(m2); + if (left > 0) { + tl = ((u_int32_t *)(*bpos)); + *tl++ = txdr_unsigned(siz); + putsize = 0; + left -= NFSX_UNSIGNED; + m2->m_len += NFSX_UNSIGNED; + if (left > 0) { + bcopy(cp, (caddr_t) tl, left); + siz -= left; + cp += left; + m2->m_len += left; + left = 0; + } + } + /* Loop around adding mbufs */ + while (siz > 0) { + int msize; + + m1 = m_getl(siz, MB_WAIT, MT_DATA, 0, &msize); + m1->m_len = msize; + m2->m_next = m1; + m2 = m1; + tl = mtod(m1, u_int32_t *); + tlen = 0; + if (putsize) { + *tl++ = txdr_unsigned(siz); + m1->m_len -= NFSX_UNSIGNED; + tlen = NFSX_UNSIGNED; + putsize = 0; + } + if (siz < m1->m_len) { + len = nfsm_rndup(siz); + xfer = siz; + if (xfer < len) + *(tl+(xfer>>2)) = 0; + } else { + xfer = len = m1->m_len; + } + bcopy(cp, (caddr_t) tl, xfer); + m1->m_len = len+tlen; + siz -= xfer; + cp += xfer; + } + *mb = m1; + *bpos = mtod(m1, caddr_t)+m1->m_len; + return (0); +} + +/* + * A fiddled version of m_adj() that ensures null fill to a long + * boundary and only trims off the back end + */ +void +nfsm_adj(struct mbuf *mp, int len, int nul) +{ + struct mbuf *m; + int count, i; + char *cp; + + /* + * Trim from tail. Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. + */ + count = 0; + m = mp; + for (;;) { + count += m->m_len; + if (m->m_next == NULL) + break; + m = m->m_next; + } + if (m->m_len > len) { + m->m_len -= len; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + for (m = mp; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + if (nul > 0) { + cp = mtod(m, caddr_t)+m->m_len-nul; + for (i = 0; i < nul; i++) + *cp++ = '\0'; + } + break; + } + count -= m->m_len; + } + for (m = m->m_next;m;m = m->m_next) + m->m_len = 0; +} + +/* + * Make these functions instead of macros, so that the kernel text size + * doesn't get too big... + */ +void +nfsm_srvwcc(struct nfsrv_descript *nfsd, int before_ret, + struct vattr *before_vap, int after_ret, struct vattr *after_vap, + struct mbuf **mbp, char **bposp) +{ + struct mbuf *mb = *mbp, *mb2; + char *bpos = *bposp; + u_int32_t *tl; + + /* + * before_ret is 0 if before_vap is valid, non-zero if it isn't. 
+ */ + if (before_ret) { + nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); + *tl = nfs_false; + } else { + nfsm_build(tl, u_int32_t *, 7 * NFSX_UNSIGNED); + *tl++ = nfs_true; + txdr_hyper(before_vap->va_size, tl); + tl += 2; + txdr_nfsv3time(&(before_vap->va_mtime), tl); + tl += 2; + txdr_nfsv3time(&(before_vap->va_ctime), tl); + } + *bposp = bpos; + *mbp = mb; + nfsm_srvpostopattr(nfsd, after_ret, after_vap, mbp, bposp); +} + +void +nfsm_srvpostopattr(struct nfsrv_descript *nfsd, int after_ret, + struct vattr *after_vap, struct mbuf **mbp, char **bposp) +{ + struct mbuf *mb = *mbp, *mb2; + char *bpos = *bposp; + u_int32_t *tl; + struct nfs_fattr *fp; + + if (after_ret) { + nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED); + *tl = nfs_false; + } else { + nfsm_build(tl, u_int32_t *, NFSX_UNSIGNED + NFSX_V3FATTR); + *tl++ = nfs_true; + fp = (struct nfs_fattr *)tl; + nfsm_srvfattr(nfsd, after_vap, fp); + } + *mbp = mb; + *bposp = bpos; +} + +void +nfsm_srvfattr(struct nfsrv_descript *nfsd, struct vattr *vap, + struct nfs_fattr *fp) +{ + /* + * NFS seems to truncate nlink to 16 bits, don't let it overflow. + */ + if (vap->va_nlink > 65535) + fp->fa_nlink = 65535; + else + fp->fa_nlink = txdr_unsigned(vap->va_nlink); + fp->fa_uid = txdr_unsigned(vap->va_uid); + fp->fa_gid = txdr_unsigned(vap->va_gid); + if (nfsd->nd_flag & ND_NFSV3) { + fp->fa_type = vtonfsv3_type(vap->va_type); + fp->fa_mode = vtonfsv3_mode(vap->va_mode); + txdr_hyper(vap->va_size, &fp->fa3_size); + txdr_hyper(vap->va_bytes, &fp->fa3_used); + fp->fa3_rdev.specdata1 = txdr_unsigned(vap->va_rmajor); + fp->fa3_rdev.specdata2 = txdr_unsigned(vap->va_rminor); + fp->fa3_fsid.nfsuquad[0] = 0; + fp->fa3_fsid.nfsuquad[1] = txdr_unsigned(vap->va_fsid); + txdr_hyper(vap->va_fileid, &fp->fa3_fileid); + txdr_nfsv3time(&vap->va_atime, &fp->fa3_atime); + txdr_nfsv3time(&vap->va_mtime, &fp->fa3_mtime); + txdr_nfsv3time(&vap->va_ctime, &fp->fa3_ctime); + } else { + fp->fa_type = vtonfsv2_type(vap->va_type); + fp->fa_mode = vtonfsv2_mode(vap->va_type, vap->va_mode); + fp->fa2_size = txdr_unsigned(vap->va_size); + fp->fa2_blocksize = txdr_unsigned(vap->va_blocksize); + if (vap->va_type == VFIFO) + fp->fa2_rdev = 0xffffffff; + else + fp->fa2_rdev = txdr_unsigned(makeudev(vap->va_rmajor, vap->va_rminor)); + fp->fa2_blocks = txdr_unsigned(vap->va_bytes / NFS_FABLKSIZE); + fp->fa2_fsid = txdr_unsigned(vap->va_fsid); + fp->fa2_fileid = txdr_unsigned(vap->va_fileid); + txdr_nfsv2time(&vap->va_atime, &fp->fa2_atime); + txdr_nfsv2time(&vap->va_mtime, &fp->fa2_mtime); + txdr_nfsv2time(&vap->va_ctime, &fp->fa2_ctime); + } +} -- 2.41.0
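Two small idioms recur in the nfsm helpers moved into nfsm_subs.c above: XDR quantities are always padded up to a 4-byte boundary (what the nfsm_rndup() macro computes for authsiz in nfsm_rpchead()), and the RPC transaction id is seeded from a random value once and is never allowed to take the value zero. The following is an illustrative userland sketch only, not part of the patch; the helper names xdr_rndup and next_xid are hypothetical stand-ins for the kernel's nfsm_rndup macro and the static nfs_xid handling shown in the diff.

/*
 * Illustrative-only sketch of two idioms from the moved nfsm code.
 * Hypothetical names; the kernel uses nfsm_rndup(), NFSX_UNSIGNED,
 * krandom() and the static nfs_xid instead.
 */
#include <stdint.h>
#include <stdlib.h>

#define XDR_UNIT	4		/* XDR pads to 4-byte multiples */

static uint32_t example_xid;		/* stands in for the static nfs_xid */

/* Round a byte count up to the next multiple of 4, like nfsm_rndup(). */
static inline int
xdr_rndup(int len)
{
	return ((len + XDR_UNIT - 1) & ~(XDR_UNIT - 1));
}

/* Produce the next xid: seed randomly on first use, and skip zero. */
static uint32_t
next_xid(void)
{
	if (example_xid == 0)
		example_xid = (uint32_t)random();
	if (++example_xid == 0)
		example_xid++;
	return (example_xid);
}

Under these rules an RPC call header built for auth_len bytes of credential data occupies roughly xdr_rndup(auth_len) + 10 * 4 bytes, which is the dsiz = authsiz + 10 * NFSX_UNSIGNED computation visible in nfsm_rpchead() above.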