/*
 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>. All rights reserved.
 * Copyright (c) 2006 Matthew Dillon <dillon@backplane.com>. All rights reserved.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
 * $DragonFly: src/sys/kern/kern_lockf.c,v 1.37 2007/11/01 22:48:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>

#include <sys/lockf.h>
#include <machine/limits.h>	/* for LLONG_MAX */
#include <machine/stdarg.h>

#include <sys/spinlock2.h>

#ifdef INVARIANTS
int lf_global_counter = 0;
#endif

#ifdef LOCKF_DEBUG
int lf_print_ranges = 0;

static void	_lf_print_lock(const struct lockf *);
static void	_lf_printf(const char *, ...);

#define lf_print_lock(lock)					\
	do { if (lf_print_ranges) _lf_print_lock(lock); } while (0)
#define lf_printf(ctl, args...)					\
	do { if (lf_print_ranges) _lf_printf(ctl, args); } while (0)
#else
#define lf_print_lock(lock)
#define lf_printf(ctl, args...)
#endif

static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

static void	lf_wakeup(struct lockf *, off_t, off_t);
static struct lockf_range *lf_alloc_range(void);
static void	lf_create_range(struct lockf_range *, struct proc *, int, int,
				off_t, off_t);
static void	lf_insert(struct lockf_range_list *list,
			  struct lockf_range *elm,
			  struct lockf_range *insert_point);
static void	lf_destroy_range(struct lockf_range *);

static int	lf_setlock(struct lockf *, struct proc *, int, int,
			   off_t, off_t);
static int	lf_getlock(struct flock *, struct lockf *, struct proc *,
			   int, int, off_t, off_t);

static int	lf_count_change(struct proc *, int);

/*
 * Return TRUE (non-zero) if the type and POSIX flags match.
 */
static __inline
int
lf_match(struct lockf_range *range, int type, int flags)
{
	if (range->lf_type != type)
		return(0);
	if ((range->lf_flags ^ flags) & F_POSIX)
		return(0);
	return(1);
}

/*
 * Check whether range and [start, end] overlap.
 */
static __inline
int
lf_overlap(const struct lockf_range *range, off_t start, off_t end)
{
	if (range->lf_start >= start && range->lf_start <= end)
		return(1);
	else if (start >= range->lf_start && start <= range->lf_end)
		return(1);
	else
		return(0);
}
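
/*
 * Editor's illustrative note (not from the original source): with an
 * existing range covering [10,20],
 *
 *	lf_overlap(range, 15, 30) -> 1	(15 falls inside [10,20])
 *	lf_overlap(range, 0, 12)  -> 1	(10 falls inside [0,12])
 *	lf_overlap(range, 21, 30) -> 0	(disjoint)
 *
 * Both tests are needed because either interval may begin first; a
 * single comparison would miss one of the two containment cases.
 */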


/*
 * Change the POSIX lock accounting for the given process.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
	struct uidinfo *uip;

	KKASSERT(p != NULL);

	uip = p->p_ucred->cr_uidinfo;
	spin_lock(&uip->ui_lock);

	if (increase)
		uip->ui_posixlocks += p->p_numposixlocks;
	else
		uip->ui_posixlocks -= p->p_numposixlocks;

	KASSERT(uip->ui_posixlocks >= 0,
		("Negative number of POSIX locks held by %s user: %d.",
		 increase ? "new" : "old", uip->ui_posixlocks));
	spin_unlock(&uip->ui_lock);
}

static int
lf_count_change(struct proc *owner, int diff)
{
	struct uidinfo *uip;
	int max, ret;

	/* we might actually not have a process context */
	if (owner == NULL)
		return(0);

	uip = owner->p_ucred->cr_uidinfo;

	max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
		  maxposixlocksperuid);

	spin_lock(&uip->ui_lock);
	if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
	    uip->ui_posixlocks >= max) {
		ret = 1;
	} else {
		uip->ui_posixlocks += diff;
		owner->p_numposixlocks += diff;
		KASSERT(uip->ui_posixlocks >= 0,
			("Negative number of POSIX locks held by user: %d.",
			 uip->ui_posixlocks));
		KASSERT(owner->p_numposixlocks >= 0,
			("Negative number of POSIX locks held by proc: %d.",
			 owner->p_numposixlocks));
		ret = 0;
	}
	spin_unlock(&uip->ui_lock);
	return ret;
}

/*
 * Advisory record locking support
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
	struct flock *fl = ap->a_fl;
	struct proc *owner;
	off_t start, end;
	int type, flags, error;
	lwkt_token_t token;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return(EINVAL);
	}

	flags = ap->a_flags;
	if (start < 0)
		return(EINVAL);
	if (fl->l_len == 0) {
		flags |= F_NOEND;
		end = LLONG_MAX;
	} else if (fl->l_len < 0) {
		return(EINVAL);
	} else {
		end = start + fl->l_len - 1;
		if (end < start)
			return(EINVAL);
	}
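
	/*
	 * Editor's illustrative note (not from the original source): under
	 * these rules l_whence = SEEK_SET, l_start = 100, l_len = 50 locks
	 * bytes [100,149]; l_len = 0 locks [l_start, LLONG_MAX] and sets
	 * F_NOEND so the lock keeps covering the file as it grows; and
	 * l_whence = SEEK_END, l_start = -10, l_len = 10 locks the last
	 * ten bytes, [size-10, size-1].
	 */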

	type = fl->l_type;
	/*
	 * This isn't really correct for flock-style locks,
	 * but the current handling is somewhat broken anyway.
	 */
	owner = (struct proc *)ap->a_id;

	/*
	 * Do the requested operation.
	 */
	token = lwkt_getpooltoken(lock);

	if (lock->init_done == 0) {
		TAILQ_INIT(&lock->lf_range);
		TAILQ_INIT(&lock->lf_blocked);
		lock->init_done = 1;
	}

	switch(ap->a_op) {
	case F_SETLK:
		/*
		 * NOTE: It is possible for both lf_range and lf_blocked to
		 * be empty if we block and get woken up, but another process
		 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
		 * be set after the lf_setlock() operation completes rather
		 * than before.
		 */
		error = lf_setlock(lock, owner, type, flags, start, end);
		vsetflags(ap->a_vp, VMAYHAVELOCKS);
		break;

	case F_UNLCK:
		error = lf_setlock(lock, owner, type, flags, start, end);
		if (TAILQ_EMPTY(&lock->lf_range) &&
		    TAILQ_EMPTY(&lock->lf_blocked)) {
			vclrflags(ap->a_vp, VMAYHAVELOCKS);
		}
		break;

	case F_GETLK:
		error = lf_getlock(fl, lock, owner, type, flags, start, end);
		break;

	default:
		error = EINVAL;
		break;
	}
	lwkt_reltoken(token);
	return(error);
}
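
#if 0
/*
 * Editor's illustrative sketch, not part of this file or its build: a
 * minimal userland program exercising the path that reaches lf_advlock()
 * through fcntl(2) and VOP_ADVLOCK().  Everything used here is standard
 * POSIX; the file name is made up for the demonstration.
 */
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl;
	int fd;

	if ((fd = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0644)) < 0)
		err(1, "open");

	fl.l_whence = SEEK_SET;		/* absolute offsets */
	fl.l_start = 0;
	fl.l_len = 0;			/* to EOF; the kernel sets F_NOEND */
	fl.l_type = F_WRLCK;		/* exclusive lock */
	fl.l_pid = 0;

	/* F_SETLKW is normally F_SETLK with F_WAIT semantics in-kernel */
	if (fcntl(fd, F_SETLKW, &fl) == -1)
		err(1, "fcntl(F_SETLKW)");

	fl.l_type = F_UNLCK;
	if (fcntl(fd, F_SETLK, &fl) == -1)
		err(1, "fcntl(F_UNLCK)");
	close(fd);
	return (0);
}
#endif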

static int
lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
	   off_t start, off_t end)
{
	struct lockf_range *range;
	struct lockf_range *brange;
	struct lockf_range *next;
	struct lockf_range *first_match;
	struct lockf_range *last_match;
	struct lockf_range *insert_point;
	struct lockf_range *new_range1;
	struct lockf_range *new_range2;
	int wakeup_needed;
	int double_clip;
	int unlock_override;
	int error = 0;
	int count;
	struct lockf_range_list deadlist;

	new_range1 = NULL;
	new_range2 = NULL;
	count = 0;

restart:
	/*
	 * Preallocate two ranges so we don't have to worry about blocking
	 * in the middle of the lock code.
	 */
	if (new_range1 == NULL)
		new_range1 = lf_alloc_range();
	if (new_range2 == NULL)
		new_range2 = lf_alloc_range();
	first_match = NULL;
	last_match = NULL;
	insert_point = NULL;
	wakeup_needed = 0;

	lf_print_lock(lock);

	/*
	 * Locate the insertion point for the new lock (the first range
	 * with an lf_start >= start).
	 *
	 * Locate the first and last ranges owned by us that overlap
	 * the requested range.
	 */
	TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
		if (insert_point == NULL && range->lf_start >= start)
			insert_point = range;

		/*
		 * Skip non-overlapping locks.  Locks are sorted by lf_start,
		 * so we can terminate the search when lf_start exceeds the
		 * requested range (insert_point is still guaranteed to be
		 * set properly).
		 */
		if (range->lf_end < start)
			continue;
		if (range->lf_start > end) {
			range = NULL;
			break;
		}

		/*
		 * Overlapping lock.  Set first_match and last_match if we
		 * are the owner.
		 */
		if (range->lf_owner == owner) {
			if (first_match == NULL)
				first_match = range;
			last_match = range;
			continue;
		}

		/*
		 * If we aren't the owner, check for a conflicting lock,
		 * but only when not unlocking.
		 */
		if (type != F_UNLCK) {
			if (type == F_WRLCK || range->lf_type == F_WRLCK)
				break;
		}
	}

	/*
	 * If a conflicting lock was observed, block or fail as appropriate.
	 * (this code is skipped when unlocking)
	 */
	if (range != NULL) {
		if ((flags & F_WAIT) == 0) {
			error = EAGAIN;
			goto do_cleanup;
		}

		/*
		 * We are blocked.  For POSIX locks we have to check
		 * for deadlocks and return with EDEADLK.  This is done
		 * by checking whether range->lf_owner is already
		 * blocked.
		 *
		 * Since flock-style locks cover the whole file, a
		 * deadlock between those is nearly impossible.
		 * This can only occur if a process tries to lock the
		 * same inode exclusively while holding a shared lock
		 * with another descriptor.
		 * XXX How can we cleanly detect this?
		 * XXX The current mixing of flock & fcntl/lockf is evil.
		 *
		 * Handle existing locks of flock-style like POSIX locks.
		 */
		if (flags & F_POSIX) {
			TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
				if (brange->lf_owner == range->lf_owner) {
					error = EDEADLK;
					goto do_cleanup;
				}
			}
		}
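
		/*
		 * Editor's illustrative note (not from the original
		 * source): process A holds [0,49] on this file and is
		 * already sleeping on lf_blocked waiting for [50,99];
		 * process B holds [50,99] and now requests [0,49].  B
		 * finds A's conflicting lock and A's pending entry on
		 * lf_blocked, so B fails with EDEADLK instead of
		 * sleeping forever.
		 */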

		/*
		 * For flock-style locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((flags & F_POSIX) == 0 && type == F_WRLCK)
			lf_setlock(lock, owner, F_UNLCK, 0, start, end);

		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, 0, start, end);
		TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
		error = tsleep(brange, PCATCH, "lockf", 0);

		/*
		 * We may have been awakened by a signal and/or by a
		 * debugger continuing us (in which case we must remove
		 * ourselves from the blocked list) and/or by another
		 * process releasing/downgrading a lock (in which case
		 * we have already been removed from the blocked list
		 * and our lf_flags field is 1).
		 *
		 * Sleep if it looks like we might be livelocking.
		 */
		if (brange->lf_flags == 0)
			TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
		if (count == 2)
			tsleep(brange, 0, "lockfz", 2);
		else
			++count;
		lf_destroy_range(brange);

		if (error)
			goto do_cleanup;
		goto restart;
	}

	/*
	 * If there are no overlapping locks owned by us then creating
	 * the new lock is easy.  This is the most common case.
	 */
	if (first_match == NULL) {
		if (type == F_UNLCK)
			goto do_wakeup;
		if (flags & F_POSIX) {
			if (lf_count_change(owner, 1)) {
				error = ENOLCK;
				goto do_cleanup;
			}
		}
		range = new_range1;
		new_range1 = NULL;
		lf_create_range(range, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, range, insert_point);
		goto do_wakeup;
	}

	/*
	 * double_clip - Calculate a special case where TWO locks may have
	 *		 to be added due to the new lock breaking up an
	 *		 existing incompatible lock in the middle.
	 *
	 * unlock_override - Calculate a special case where NO locks
	 *		 need to be created.  This occurs when an unlock
	 *		 does not clip any locks at the front and rear.
	 *
	 * WARNING!  closef() and fdrop() assume that an F_UNLCK of the
	 *	     entire range will always succeed so the unlock_override
	 *	     case is mandatory.
	 */
	double_clip = 0;
	unlock_override = 0;
	if (first_match->lf_start < start) {
		if (first_match == last_match && last_match->lf_end > end)
			double_clip = 1;
	} else if (type == F_UNLCK && last_match->lf_end <= end) {
		unlock_override = 1;
	}
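
	/*
	 * Editor's illustrative note (not from the original source): if we
	 * hold a write lock on [0,99] and request a read lock on [40,59],
	 * first_match == last_match, its lf_start (0) is below start (40)
	 * and its lf_end (99) is above end (59), so double_clip is set and
	 * the old lock must be split into [0,39] and [60,99] around the
	 * new range.  Conversely, an F_UNLCK of [0,199] over locks [0,99]
	 * and [100,149] clips nothing at either edge, so unlock_override
	 * is set and no new record is needed at all.
	 */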

	/*
	 * Figure out the worst case net increase in POSIX locks and account
	 * for it now before we start modifying things.  If neither the
	 * first nor the last lock matches the request we may have to
	 * create additional records.  If there is only one overlapping
	 * range which needs to be clipped on both ends we wind up having
	 * to create up to two new locks, else only one.
	 *
	 * When unlocking the worst case is always 1 new lock if our
	 * unlock request cuts the middle out of an existing lock range.
	 *
	 * count represents the 'cleanup' adjustment needed.  It starts
	 * negative, is incremented whenever we create a new POSIX lock,
	 * and decremented whenever we delete an existing one.  At the
	 * end of the day it had better be <= 0 or we didn't calculate the
	 * worst case properly here.
	 */
	count = 0;
	if ((flags & F_POSIX) && !unlock_override) {
		if (!lf_match(first_match, type, flags) &&
		    !lf_match(last_match, type, flags)
		) {
			if (double_clip && type != F_UNLCK)
				count = -2;
			else
				count = -1;
		}
		if (count && lf_count_change(owner, -count)) {
			error = ENOLCK;
			goto do_cleanup;
		}
	}
	/* else flock style lock which encompasses entire range */

	/*
	 * Create and insert the lock representing the requested range.
	 * Adjust the net POSIX lock count.  We have to move our insertion
	 * point since brange now represents the first record >= start.
	 *
	 * When unlocking, no new lock is inserted but we still clip.
	 */
	if (type != F_UNLCK) {
		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, brange, insert_point);
		insert_point = brange;
		if (flags & F_POSIX)
			++count;
	} else {
		brange = NULL;
	}

	/*
	 * Handle the double_clip case.  This is the only case where
	 * we wind up having to add TWO locks.
	 */
	if (double_clip) {
		KKASSERT(first_match == last_match);
		last_match = new_range2;
		new_range2 = NULL;
		lf_create_range(last_match, first_match->lf_owner,
				first_match->lf_type, first_match->lf_flags,
				end + 1, first_match->lf_end);
		first_match->lf_end = start - 1;
		first_match->lf_flags &= ~F_NOEND;

		/*
		 * Figure out where to insert the right side clip.
		 */
		lf_insert(&lock->lf_range, last_match, first_match);
		if (last_match->lf_flags & F_POSIX)
			++count;
	}

	/*
	 * Clip or destroy the locks between first_match and last_match,
	 * inclusive.  Ignore the primary lock we created (brange).  Note
	 * that if double-clipped, first_match and last_match will be
	 * outside our clipping range.  Otherwise first_match and last_match
	 * will be deleted.
	 *
	 * We have already taken care of any double clipping.
	 *
	 * The insert_point may become invalid as we delete records, so do
	 * not use that pointer any more.  Also, when removing something
	 * other than 'range' we have to check to see if the item we are
	 * removing is 'next' and adjust 'next' properly.
	 *
	 * NOTE: brange will be NULL if F_UNLCKing.
	 */
	TAILQ_INIT(&deadlist);
	next = first_match;

	while ((range = next) != NULL) {
		next = TAILQ_NEXT(range, lf_link);

		/*
		 * Ignore elements that we do not own and ignore the
		 * primary request range which we just created.
		 */
		if (range->lf_owner != owner || range == brange)
			continue;

		/*
		 * We may have to wake up a waiter when downgrading a lock.
		 */
		if (type == F_UNLCK)
			wakeup_needed = 1;
		if (type == F_RDLCK && range->lf_type == F_WRLCK)
			wakeup_needed = 1;

		/*
		 * Clip left.  This can only occur on first_match.
		 *
		 * Merge the left clip with brange if possible.  This must
		 * be done specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation above.
		 */
		if (range->lf_start < start) {
			KKASSERT(range == first_match);
			if (brange &&
			    range->lf_end >= start - 1 &&
			    lf_match(range, type, flags)) {
				range->lf_end = brange->lf_end;
				range->lf_flags |= brange->lf_flags & F_NOEND;
				/*
				 * Removing something other than 'range',
				 * adjust 'next' if necessary.
				 */
				if (next == brange)
					next = TAILQ_NEXT(next, lf_link);
				TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
				if (brange->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
				brange = range;
			} else if (range->lf_end >= start) {
				range->lf_end = start - 1;
				if (type != F_UNLCK)
					range->lf_flags &= ~F_NOEND;
			}
			if (range == last_match)
				break;
			continue;
		}

		/*
		 * Clip right.  This can only occur on last_match.
		 *
		 * Merge the right clip if possible.  This must be done
		 * specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation.
		 *
		 * Since we are adjusting lf_start, we have to move the
		 * record to maintain the sorted list.  Since lf_start is
		 * only getting larger we can use the next element as the
		 * insert point (we don't have to backtrack).
		 */
		if (range->lf_end > end) {
			KKASSERT(range == last_match);
			if (brange &&
			    range->lf_start <= end + 1 &&
			    lf_match(range, type, flags)) {
				brange->lf_end = range->lf_end;
				brange->lf_flags |= range->lf_flags & F_NOEND;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				if (range->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
			} else if (range->lf_start <= end) {
				range->lf_start = end + 1;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				lf_insert(&lock->lf_range, range, next);
			}
			/* range == last_match, we are done */
			break;
		}

		/*
		 * The record must be entirely enclosed.  Note that the
		 * record could be first_match or last_match, and will be
		 * deleted.
		 */
		KKASSERT(range->lf_start >= start && range->lf_end <= end);
		TAILQ_REMOVE(&lock->lf_range, range, lf_link);
		if (range->lf_flags & F_POSIX)
			--count;
		TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		if (range == last_match)
			break;
	}
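
	/*
	 * Editor's illustrative note (not from the original source):
	 * unlocking [50,150] when we own [0,99] and [120,200] exercises
	 * both clips above: [0,99] extends left of the request and is
	 * trimmed to [0,49]; [120,200] extends right of it, becomes
	 * [151,200] and is re-inserted to keep the list sorted; a lock
	 * such as [60,110] is entirely enclosed and simply deleted.
	 */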

	/*
	 * Attempt to merge locks adjacent to brange.  For example, we may
	 * have had to clip first_match and/or last_match, and they might
	 * be adjacent.  Or there might simply have been an adjacent lock
	 * already there.
	 *
	 * Don't get fancy, just check adjacent elements in the list if they
	 * happen to be owned by us.
	 *
	 * This case only gets hit if we have a situation where a shared
	 * and exclusive lock are adjacent, and the exclusive lock is
	 * downgraded to shared or the shared lock is upgraded to exclusive.
	 */
	if (brange) {
		range = TAILQ_PREV(brange, lockf_range_list, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_end == brange->lf_start - 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend range to cover brange and scrap brange.
			 */
			range->lf_end = brange->lf_end;
			range->lf_flags |= brange->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
			if (brange->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
			brange = range;
		}
		range = TAILQ_NEXT(brange, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_start == brange->lf_end + 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend brange to cover range and scrap range.
			 */
			brange->lf_end = range->lf_end;
			brange->lf_flags |= range->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, range, lf_link);
			if (range->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		}
	}
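
	/*
	 * Editor's illustrative note (not from the original source): if we
	 * hold an exclusive lock on [0,49] and a shared lock on [50,99],
	 * downgrading [0,49] to shared replaces it with a new shared
	 * range, leaving two adjacent shared ranges owned by us; the
	 * forward merge above coalesces them into a single [0,99] shared
	 * lock and queues the leftover record on the deadlist.
	 */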

	/*
	 * Destroy deleted elements.  We didn't want to do it in the loop
	 * because the free() might have blocked.
	 *
	 * Adjust the count for any POSIX locks we thought we might create
	 * but didn't.
	 */
	while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
		TAILQ_REMOVE(&deadlist, range, lf_link);
		lf_destroy_range(range);
	}

	KKASSERT(count <= 0);
	if (count < 0)
		lf_count_change(owner, count);
do_wakeup:
	lf_print_lock(lock);
	if (wakeup_needed)
		lf_wakeup(lock, start, end);
	error = 0;
do_cleanup:
	if (new_range1 != NULL)
		lf_destroy_range(new_range1);
	if (new_range2 != NULL)
		lf_destroy_range(new_range2);
	return(error);
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner,
	   int type, int flags, off_t start, off_t end)
{
	struct lockf_range *range;

	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		if (range->lf_owner != owner &&
		    lf_overlap(range, start, end) &&
		    (type == F_WRLCK || range->lf_type == F_WRLCK))
			break;
	if (range == NULL) {
		fl->l_type = F_UNLCK;
		return(0);
	}
	fl->l_type = range->lf_type;
	fl->l_whence = SEEK_SET;
	fl->l_start = range->lf_start;
	if (range->lf_flags & F_NOEND)
		fl->l_len = 0;
	else
		fl->l_len = range->lf_end - range->lf_start + 1;
	if (range->lf_owner != NULL && (range->lf_flags & F_POSIX))
		fl->l_pid = range->lf_owner->p_pid;
	else
		fl->l_pid = -1;
	return(0);
}

/*
 * Wake up pending lock attempts.  Theoretically we can stop as soon as
 * we encounter an exclusive request that covers the whole range (at least
 * insofar as the sleep code above calls lf_wakeup() if it would otherwise
 * exit instead of loop), but for now just wake up all overlapping
 * requests.  XXX
 */
static void
lf_wakeup(struct lockf *lock, off_t start, off_t end)
{
	struct lockf_range *range, *nrange;

	TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) {
		if (lf_overlap(range, start, end) == 0)
			continue;
		TAILQ_REMOVE(&lock->lf_blocked, range, lf_link);
		range->lf_flags = 1;
		wakeup(range);
	}
}

/*
 * Allocate a range structure and initialize it sufficiently such that
 * lf_destroy_range() does not barf.
 */
static struct lockf_range *
lf_alloc_range(void)
{
	struct lockf_range *range;

#ifdef INVARIANTS
	atomic_add_int(&lf_global_counter, 1);
#endif
	range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK);
	range->lf_owner = NULL;
	return(range);
}

static void
lf_insert(struct lockf_range_list *list, struct lockf_range *elm,
	  struct lockf_range *insert_point)
{
	while (insert_point && insert_point->lf_start < elm->lf_start)
		insert_point = TAILQ_NEXT(insert_point, lf_link);
	if (insert_point != NULL)
		TAILQ_INSERT_BEFORE(insert_point, elm, lf_link);
	else
		TAILQ_INSERT_TAIL(list, elm, lf_link);
}
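
/*
 * Editor's illustrative note (not from the original source): with ranges
 * starting at 10, 30 and 70 in the list and insert_point at the 30-range,
 * inserting an element whose lf_start is 50 walks forward past 30
 * (30 < 50), stops at 70 and links the element before it, preserving the
 * lf_start ordering.  A NULL insert_point sends the element to the tail.
 */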

static void
lf_create_range(struct lockf_range *range, struct proc *owner, int type,
		int flags, off_t start, off_t end)
{
	KKASSERT(start <= end);
	range->lf_type = type;
	range->lf_flags = flags;
	range->lf_start = start;
	range->lf_end = end;
	range->lf_owner = owner;

	lf_printf("lf_create_range: %lld..%lld\n",
		  range->lf_start, range->lf_end);
}

static void
lf_destroy_range(struct lockf_range *range)
{
	lf_printf("lf_destroy_range: %lld..%lld\n",
		  range->lf_start, range->lf_end);
	kfree(range, M_LOCKF);
#ifdef INVARIANTS
	atomic_add_int(&lf_global_counter, -1);
	KKASSERT(lf_global_counter >= 0);
#endif
}

#ifdef LOCKF_DEBUG

static void
_lf_printf(const char *ctl, ...)
{
	struct proc *p;
	__va_list va;

	if (lf_print_ranges) {
		if ((p = curproc) != NULL)
			kprintf("pid %d (%s): ", p->p_pid, p->p_comm);
	}
	__va_start(va, ctl);
	kvprintf(ctl, va);
	__va_end(va);
}

static void
_lf_print_lock(const struct lockf *lock)
{
	struct lockf_range *range;

	if (lf_print_ranges == 0)
		return;

	if (TAILQ_EMPTY(&lock->lf_range)) {
		lf_printf("lockf %p: no ranges locked\n", lock);
	} else {
		lf_printf("lockf %p:\n", lock);
	}
	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		kprintf("\t%lld..%lld type %s owned by %d\n",
			range->lf_start, range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1);
	if (TAILQ_EMPTY(&lock->lf_blocked))
		kprintf("no process waiting for range\n");
	else
| 899 | kprintf("blocked locks:"); |
	TAILQ_FOREACH(range, &lock->lf_blocked, lf_link)
		kprintf("\t%lld..%lld type %s waiting on %p\n",
			range->lf_start, range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range);
}
#endif	/* LOCKF_DEBUG */