kernel - Fix global counter used in lockf assertions
[dragonfly.git] / sys / kern / kern_lockf.c
CommitLineData
984263bc 1/*
0a019f0d 2 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>. All rights reserved.
a3190466 3 * Copyright (c) 2006 Matthew Dillon <dillon@backplane.com>. All rights reserved.
508ceb09 4 *
984263bc
MD
5 * Copyright (c) 1982, 1986, 1989, 1993
6 * The Regents of the University of California. All rights reserved.
7 *
8 * This code is derived from software contributed to Berkeley by
9 * Scooter Morris at Genentech Inc.
10 *
11 * Redistribution and use in source and binary forms, with or without
12 * modification, are permitted provided that the following conditions
13 * are met:
14 * 1. Redistributions of source code must retain the above copyright
15 * notice, this list of conditions and the following disclaimer.
16 * 2. Redistributions in binary form must reproduce the above copyright
17 * notice, this list of conditions and the following disclaimer in the
18 * documentation and/or other materials provided with the distribution.
19 * 3. All advertising materials mentioning features or use of this software
20 * must display the following acknowledgement:
21 * This product includes software developed by the University of
22 * California, Berkeley and its contributors.
23 * 4. Neither the name of the University nor the names of its contributors
24 * may be used to endorse or promote products derived from this software
25 * without specific prior written permission.
26 *
27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
37 * SUCH DAMAGE.
38 *
39 * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
40 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
9b725a11 41 * $DragonFly: src/sys/kern/kern_lockf.c,v 1.37 2007/11/01 22:48:16 dillon Exp $
984263bc
MD
42 */
43
984263bc
MD
44#include <sys/param.h>
45#include <sys/systm.h>
46#include <sys/kernel.h>
47#include <sys/lock.h>
48#include <sys/proc.h>
49#include <sys/unistd.h>
50#include <sys/vnode.h>
51#include <sys/malloc.h>
52#include <sys/fcntl.h>
508ceb09 53#include <sys/resourcevar.h>
984263bc
MD
54
55#include <sys/lockf.h>
508ceb09 56#include <machine/limits.h> /* for LLONG_MAX */
6449cdd4 57#include <machine/stdarg.h>
508ceb09 58
9d7a637e
AE
59#include <sys/spinlock2.h>
60
508ceb09
JS
61#ifdef INVARIANTS
62int lf_global_counter = 0;
63#endif
6449cdd4 64
508ceb09
JS
65#ifdef LOCKF_DEBUG
66int lf_print_ranges = 0;
67
6449cdd4
MD
68static void _lf_print_lock(const struct lockf *);
69static void _lf_printf(const char *, ...);
70
71#define lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock)
72#define lf_printf(ctl, args...) if (lf_print_ranges) _lf_printf(ctl, args)
73#else
74#define lf_print_lock(lock)
75#define lf_printf(ctl, args...)
508ceb09
JS
76#endif
77
78static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");
79
80static void lf_wakeup(struct lockf *, off_t, off_t);
4f0a1741
JS
81static struct lockf_range *lf_alloc_range(void);
82static void lf_create_range(struct lockf_range *, struct proc *, int, int,
cae44669 83 off_t, off_t);
a3190466
MD
84static void lf_insert(struct lockf_range_list *list,
85 struct lockf_range *elm,
86 struct lockf_range *insert_point);
cae44669 87static void lf_destroy_range(struct lockf_range *);
508ceb09
JS
88
89static int lf_setlock(struct lockf *, struct proc *, int, int,
90 off_t, off_t);
508ceb09
JS
91static int lf_getlock(struct flock *, struct lockf *, struct proc *,
92 int, int, off_t, off_t);
93
94static int lf_count_change(struct proc *, int);
984263bc
MD
95
96/*
a3190466
MD
97 * Return TRUE (non-zero) if the type and posix flags match.
98 */
99static __inline
100int
101lf_match(struct lockf_range *range, int type, int flags)
102{
103 if (range->lf_type != type)
104 return(0);
105 if ((range->lf_flags ^ flags) & F_POSIX)
106 return(0);
107 return(1);
108}
109
110/*
111 * Check whether range and [start, end] overlap.
112 */
113static __inline
114int
115lf_overlap(const struct lockf_range *range, off_t start, off_t end)
116{
117 if (range->lf_start >= start && range->lf_start <= end)
118 return(1);
119 else if (start >= range->lf_start && start <= range->lf_end)
120 return(1);
121 else
122 return(0);
123}
124
125
/*
 * Change the POSIX lock accounting for the given process.
 *
 * Called when a process's credential changes (e.g. setuid): the
 * process's entire POSIX lock count is moved onto (increase != 0)
 * or off of (increase == 0) the uidinfo structure referenced by
 * the process's current ucred.  Callers are expected to invoke this
 * once against the old uidinfo and once against the new one.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
	struct uidinfo *uip;

	KKASSERT(p != NULL);

	uip = p->p_ucred->cr_uidinfo;
	/* ui_lock serializes all updates to ui_posixlocks */
	spin_lock(&uip->ui_lock);

	if (increase)
		uip->ui_posixlocks += p->p_numposixlocks;
	else
		uip->ui_posixlocks -= p->p_numposixlocks;

	KASSERT(uip->ui_posixlocks >= 0,
		("Negative number of POSIX locks held by %s user: %d.",
		 increase ? "new" : "old", uip->ui_posixlocks));
	spin_unlock(&uip->ui_lock);
}
984263bc 149
508ceb09
JS
150static int
151lf_count_change(struct proc *owner, int diff)
152{
153 struct uidinfo *uip;
9d7a637e 154 int max, ret;
508ceb09
JS
155
156 /* we might actually not have a process context */
157 if (owner == NULL)
158 return(0);
159
160 uip = owner->p_ucred->cr_uidinfo;
161
162 max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
163 maxposixlocksperuid);
9d7a637e 164
287a8577 165 spin_lock(&uip->ui_lock);
508ceb09 166 if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
cae44669 167 uip->ui_posixlocks >= max ) {
9d7a637e
AE
168 ret = 1;
169 } else {
170 uip->ui_posixlocks += diff;
171 owner->p_numposixlocks += diff;
172 KASSERT(uip->ui_posixlocks >= 0,
173 ("Negative number of POSIX locks held by user: %d.",
174 uip->ui_posixlocks));
175 KASSERT(owner->p_numposixlocks >= 0,
176 ("Negative number of POSIX locks held by proc: %d.",
177 uip->ui_posixlocks));
178 ret = 0;
cae44669 179 }
287a8577 180 spin_unlock(&uip->ui_lock);
9d7a637e 181 return ret;
508ceb09 182}
984263bc
MD
183
/*
 * Advisory record locking support (entry point for VOP_ADVLOCK).
 *
 * Translates the caller's struct flock into an absolute [start, end]
 * byte range and dispatches to lf_setlock()/lf_getlock() under the
 * lockf structure's pool token.  'size' is the current file size,
 * used to resolve SEEK_END-relative requests.
 *
 * Returns 0 on success or an errno (EINVAL for malformed ranges,
 * plus whatever the set/get operation returns).
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
	struct flock *fl = ap->a_fl;
	struct proc *owner;
	off_t start, end;
	int type, flags, error;
	lwkt_token_t token;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return(EINVAL);
	}

	flags = ap->a_flags;
	if (start < 0)
		return(EINVAL);
	if (fl->l_len == 0) {
		/* l_len == 0 means "to end of file": open-ended range */
		flags |= F_NOEND;
		end = LLONG_MAX;
	} else if (fl->l_len < 0) {
		return(EINVAL);
	} else {
		end = start + fl->l_len - 1;
		/* reject ranges whose end wrapped past LLONG_MAX */
		if (end < start)
			return(EINVAL);
	}

	type = fl->l_type;
	/*
	 * This isn't really correct for flock-style locks,
	 * but the current handling is somewhat broken anyway.
	 */
	owner = (struct proc *)ap->a_id;

	/*
	 * Do the requested operation.  The pool token serializes all
	 * access to this lockf structure.
	 */
	token = lwkt_getpooltoken(lock);

	/* Lazily initialize the range/blocked queues on first use. */
	if (lock->init_done == 0) {
		TAILQ_INIT(&lock->lf_range);
		TAILQ_INIT(&lock->lf_blocked);
		lock->init_done = 1;
	}

	switch(ap->a_op) {
	case F_SETLK:
		/*
		 * NOTE: It is possible for both lf_range and lf_blocked to
		 * be empty if we block and get woken up, but another process
		 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
		 * be set after the lf_setlock() operation completes rather
		 * then before.
		 */
		error = lf_setlock(lock, owner, type, flags, start, end);
		vsetflags(ap->a_vp, VMAYHAVELOCKS);
		break;

	case F_UNLCK:
		error = lf_setlock(lock, owner, type, flags, start, end);
		/* clear the hint only when nothing is held or waiting */
		if (TAILQ_EMPTY(&lock->lf_range) &&
		    TAILQ_EMPTY(&lock->lf_blocked)) {
			vclrflags(ap->a_vp, VMAYHAVELOCKS);
		}
		break;

	case F_GETLK:
		error = lf_getlock(fl, lock, owner, type, flags, start, end);
		break;

	default:
		error = EINVAL;
		break;
	}
	lwkt_reltoken(token);
	return(error);
}
281
984263bc 282static int
508ceb09
JS
283lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
284 off_t start, off_t end)
984263bc 285{
a3190466
MD
286 struct lockf_range *range;
287 struct lockf_range *brange;
288 struct lockf_range *next;
289 struct lockf_range *first_match;
290 struct lockf_range *last_match;
291 struct lockf_range *insert_point;
292 struct lockf_range *new_range1;
293 struct lockf_range *new_range2;
294 int wakeup_needed;
295 int double_clip;
99acf558 296 int unlock_override;
4f0a1741 297 int error = 0;
cae44669 298 int count;
a3190466
MD
299 struct lockf_range_list deadlist;
300
301 new_range1 = NULL;
302 new_range2 = NULL;
303 count = 0;
508ceb09
JS
304
305restart:
a3190466
MD
306 /*
307 * Preallocate two ranges so we don't have to worry about blocking
308 * in the middle of the lock code.
309 */
4f0a1741
JS
310 if (new_range1 == NULL)
311 new_range1 = lf_alloc_range();
312 if (new_range2 == NULL)
313 new_range2 = lf_alloc_range();
508ceb09 314 first_match = NULL;
a3190466 315 last_match = NULL;
508ceb09
JS
316 insert_point = NULL;
317 wakeup_needed = 0;
984263bc 318
6449cdd4 319 lf_print_lock(lock);
984263bc 320
a3190466
MD
321 /*
322 * Locate the insertion point for the new lock (the first range
323 * with an lf_start >= start).
324 *
325 * Locate the first and latch ranges owned by us that overlap
326 * the requested range.
327 */
508ceb09
JS
328 TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
329 if (insert_point == NULL && range->lf_start >= start)
330 insert_point = range;
a3190466
MD
331
332 /*
333 * Skip non-overlapping locks. Locks are sorted by lf_start
334 * So we can terminate the search when lf_start exceeds the
335 * requested range (insert_point is still guarenteed to be
336 * set properly).
337 */
338 if (range->lf_end < start)
508ceb09 339 continue;
a3190466
MD
340 if (range->lf_start > end) {
341 range = NULL;
342 break;
343 }
344
345 /*
346 * Overlapping lock. Set first_match and last_match if we
347 * are the owner.
348 */
508ceb09
JS
349 if (range->lf_owner == owner) {
350 if (first_match == NULL)
351 first_match = range;
a3190466 352 last_match = range;
508ceb09 353 continue;
984263bc 354 }
a3190466
MD
355
356 /*
357 * If we aren't the owner check for a conflicting lock. Only
358 * if not unlocking.
359 */
360 if (type != F_UNLCK) {
361 if (type == F_WRLCK || range->lf_type == F_WRLCK)
362 break;
363 }
508ceb09
JS
364 }
365
a3190466
MD
366 /*
367 * If a conflicting lock was observed, block or fail as appropriate.
368 * (this code is skipped when unlocking)
369 */
508ceb09 370 if (range != NULL) {
4f0a1741
JS
371 if ((flags & F_WAIT) == 0) {
372 error = EAGAIN;
373 goto do_cleanup;
374 }
508ceb09 375
984263bc 376 /*
508ceb09
JS
377 * We are blocked. For POSIX locks we have to check
378 * for deadlocks and return with EDEADLK. This is done
3d8f95ac 379 * by checking whether range->lf_owner is already
508ceb09
JS
380 * blocked.
381 *
382 * Since flock-style locks cover the whole file, a
383 * deadlock between those is nearly impossible.
384 * This can only occur if a process tries to lock the
385 * same inode exclusively while holding a shared lock
386 * with another descriptor.
387 * XXX How can we cleanly detect this?
388 * XXX The current mixing of flock & fcntl/lockf is evil.
984263bc 389 *
508ceb09 390 * Handle existing locks of flock-style like POSIX locks.
984263bc 391 */
508ceb09 392 if (flags & F_POSIX) {
99acf558 393 TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
4f0a1741
JS
394 if (brange->lf_owner == range->lf_owner) {
395 error = EDEADLK;
396 goto do_cleanup;
397 }
99acf558 398 }
984263bc 399 }
508ceb09 400
984263bc 401 /*
508ceb09 402 * For flock-style locks, we must first remove
984263bc
MD
403 * any shared locks that we hold before we sleep
404 * waiting for an exclusive lock.
405 */
71c18fe3 406 if ((flags & F_POSIX) == 0 && type == F_WRLCK)
a3190466 407 lf_setlock(lock, owner, F_UNLCK, 0, start, end);
508ceb09 408
4f0a1741
JS
409 brange = new_range1;
410 new_range1 = NULL;
cae44669 411 lf_create_range(brange, owner, type, 0, start, end);
508ceb09
JS
412 TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
413 error = tsleep(brange, PCATCH, "lockf", 0);
414
984263bc 415 /*
508ceb09
JS
416 * We may have been awaked by a signal and/or by a
417 * debugger continuing us (in which case we must remove
984263bc 418 * ourselves from the blocked list) and/or by another
508ceb09
JS
419 * process releasing/downgrading a lock (in which case
420 * we have already been removed from the blocked list
421 * and our lf_flags field is 1).
a3190466
MD
422 *
423 * Sleep if it looks like we might be livelocking.
984263bc 424 */
508ceb09
JS
425 if (brange->lf_flags == 0)
426 TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
a3190466
MD
427 if (count == 2)
428 tsleep(brange, 0, "lockfz", 2);
429 else
430 ++count;
cae44669 431 lf_destroy_range(brange);
508ceb09
JS
432
433 if (error)
4f0a1741 434 goto do_cleanup;
508ceb09
JS
435 goto restart;
436 }
437
a3190466
MD
438 /*
439 * If there are no overlapping locks owned by us then creating
440 * the new lock is easy. This is the most common case.
441 */
508ceb09 442 if (first_match == NULL) {
a3190466
MD
443 if (type == F_UNLCK)
444 goto do_wakeup;
508ceb09 445 if (flags & F_POSIX) {
4f0a1741
JS
446 if (lf_count_change(owner, 1)) {
447 error = ENOLCK;
448 goto do_cleanup;
449 }
984263bc 450 }
4f0a1741
JS
451 range = new_range1;
452 new_range1 = NULL;
cae44669 453 lf_create_range(range, owner, type, flags, start, end);
a3190466 454 lf_insert(&lock->lf_range, range, insert_point);
508ceb09 455 goto do_wakeup;
984263bc 456 }
508ceb09 457
a3190466 458 /*
99acf558
MD
459 * double_clip - Calculate a special case where TWO locks may have
460 * to be added due to the new lock breaking up an
461 * existing incompatible lock in the middle.
462 *
463 * unlock_override - Calculate a special case where NO locks
464 * need to be created. This occurs when an unlock
465 * does not clip any locks at the front and rear.
466 *
467 * WARNING! closef() and fdrop() assume that an F_UNLCK of the
468 * entire range will always succeed so the unlock_override
469 * case is mandatory.
a3190466 470 */
99acf558
MD
471 double_clip = 0;
472 unlock_override = 0;
473 if (first_match->lf_start < start) {
474 if (first_match == last_match && last_match->lf_end > end)
475 double_clip = 1;
476 } else if (type == F_UNLCK && last_match->lf_end <= end) {
477 unlock_override = 1;
a3190466 478 }
cae44669 479
a3190466
MD
480 /*
481 * Figure out the worst case net increase in POSIX locks and account
482 * for it now before we start modifying things. If neither the
483 * first or last locks match we have an issue. If there is only
484 * one overlapping range which needs to be clipped on both ends
485 * we wind up having to create up to two new locks, else only one.
486 *
e95b513d
MD
487 * When unlocking the worst case is always 1 new lock if our
488 * unlock request cuts the middle out of an existing lock range.
489 *
a3190466
MD
490 * count represents the 'cleanup' adjustment needed. It starts
491 * negative, is incremented whenever we create a new POSIX lock,
492 * and decremented whenever we delete an existing one. At the
493 * end of the day it had better be <= 0 or we didn't calculate the
494 * worse case properly here.
495 */
496 count = 0;
99acf558 497 if ((flags & F_POSIX) && !unlock_override) {
a3190466
MD
498 if (!lf_match(first_match, type, flags) &&
499 !lf_match(last_match, type, flags)
500 ) {
e95b513d 501 if (double_clip && type != F_UNLCK)
a3190466 502 count = -2;
508ceb09 503 else
a3190466
MD
504 count = -1;
505 }
506 if (count && lf_count_change(owner, -count)) {
507 error = ENOLCK;
508 goto do_cleanup;
508ceb09 509 }
a3190466 510 }
e95b513d 511 /* else flock style lock which encompasses entire range */
a3190466
MD
512
513 /*
514 * Create and insert the lock represented the requested range.
515 * Adjust the net POSIX lock count. We have to move our insertion
516 * point since brange now represents the first record >= start.
517 *
518 * When unlocking, no new lock is inserted but we still clip.
519 */
520 if (type != F_UNLCK) {
521 brange = new_range1;
522 new_range1 = NULL;
523 lf_create_range(brange, owner, type, flags, start, end);
524 lf_insert(&lock->lf_range, brange, insert_point);
525 insert_point = brange;
526 if (flags & F_POSIX)
527 ++count;
528 } else {
529 brange = NULL;
530 }
531
532 /*
533 * Handle the double_clip case. This is the only case where
534 * we wind up having to add TWO locks.
535 */
536 if (double_clip) {
537 KKASSERT(first_match == last_match);
538 last_match = new_range2;
539 new_range2 = NULL;
540 lf_create_range(last_match, first_match->lf_owner,
541 first_match->lf_type, first_match->lf_flags,
542 end + 1, first_match->lf_end);
543 first_match->lf_end = start - 1;
544 first_match->lf_flags &= ~F_NOEND;
cae44669 545
984263bc 546 /*
a3190466 547 * Figure out where to insert the right side clip.
984263bc 548 */
a3190466
MD
549 lf_insert(&lock->lf_range, last_match, first_match);
550 if (last_match->lf_flags & F_POSIX)
551 ++count;
508ceb09 552 }
984263bc 553
cae44669 554 /*
a3190466
MD
555 * Clip or destroy the locks between first_match and last_match,
556 * inclusive. Ignore the primary lock we created (brange). Note
557 * that if double-clipped, first_match and last_match will be
558 * outside our clipping range. Otherwise first_match and last_match
559 * will be deleted.
560 *
561 * We have already taken care of any double clipping.
562 *
e95b513d
MD
563 * The insert_point may become invalid as we delete records, do not
564 * use that pointer any more. Also, when removing something other
565 * then 'range' we have to check to see if the item we are removing
566 * is 'next' and adjust 'next' properly.
a3190466
MD
567 *
568 * NOTE: brange will be NULL if F_UNLCKing.
cae44669 569 */
a3190466
MD
570 TAILQ_INIT(&deadlist);
571 next = first_match;
572
573 while ((range = next) != NULL) {
574 next = TAILQ_NEXT(range, lf_link);
575
cae44669 576 /*
a3190466
MD
577 * Ignore elements that we do not own and ignore the
578 * primary request range which we just created.
cae44669 579 */
a3190466
MD
580 if (range->lf_owner != owner || range == brange)
581 continue;
cae44669
MD
582
583 /*
a3190466 584 * We may have to wakeup a waiter when downgrading a lock.
cae44669 585 */
a3190466
MD
586 if (type == F_UNLCK)
587 wakeup_needed = 1;
588 if (type == F_RDLCK && range->lf_type == F_WRLCK)
589 wakeup_needed = 1;
508ceb09 590
a3190466 591 /*
e95b513d
MD
592 * Clip left. This can only occur on first_match.
593 *
594 * Merge the left clip with brange if possible. This must
595 * be done specifically, not in the optimized merge heuristic
596 * below, since we may have counted on it in our 'count'
597 * calculation above.
a3190466
MD
598 */
599 if (range->lf_start < start) {
600 KKASSERT(range == first_match);
e95b513d
MD
601 if (brange &&
602 range->lf_end >= start - 1 &&
603 lf_match(range, type, flags)) {
604 range->lf_end = brange->lf_end;
605 range->lf_flags |= brange->lf_flags & F_NOEND;
606 /*
607 * Removing something other then 'range',
608 * adjust 'next' if necessary.
609 */
610 if (next == brange)
611 next = TAILQ_NEXT(next, lf_link);
612 TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
613 if (brange->lf_flags & F_POSIX)
614 --count;
615 TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
616 brange = range;
617 } else if (range->lf_end >= start) {
a3190466 618 range->lf_end = start - 1;
e95b513d
MD
619 if (type != F_UNLCK)
620 range->lf_flags &= ~F_NOEND;
508ceb09 621 }
a3190466 622 if (range == last_match)
984263bc 623 break;
a3190466
MD
624 continue;
625 }
cae44669 626
a3190466 627 /*
e95b513d
MD
628 * Clip right. This can only occur on last_match.
629 *
630 * Merge the right clip if possible. This must be done
631 * specifically, not in the optimized merge heuristic
632 * below, since we may have counted on it in our 'count'
633 * calculation.
a3190466
MD
634 *
635 * Since we are adjusting lf_start, we have to move the
636 * record to maintain the sorted list. Since lf_start is
637 * only getting larger we can use the next element as the
638 * insert point (we don't have to backtrack).
639 */
640 if (range->lf_end > end) {
641 KKASSERT(range == last_match);
e95b513d
MD
642 if (brange &&
643 range->lf_start <= end + 1 &&
644 lf_match(range, type, flags)) {
645 brange->lf_end = range->lf_end;
646 brange->lf_flags |= range->lf_flags & F_NOEND;
647 TAILQ_REMOVE(&lock->lf_range, range, lf_link);
648 if (range->lf_flags & F_POSIX)
649 --count;
650 TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
651 } else if (range->lf_start <= end) {
652 range->lf_start = end + 1;
653 TAILQ_REMOVE(&lock->lf_range, range, lf_link);
654 lf_insert(&lock->lf_range, range, next);
a3190466
MD
655 }
656 /* range == last_match, we are done */
984263bc 657 break;
508ceb09 658 }
a3190466
MD
659
660 /*
661 * The record must be entirely enclosed. Note that the
662 * record could be first_match or last_match, and will be
663 * deleted.
664 */
665 KKASSERT(range->lf_start >= start && range->lf_end <= end);
666 TAILQ_REMOVE(&lock->lf_range, range, lf_link);
667 if (range->lf_flags & F_POSIX)
668 --count;
669 TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
670 if (range == last_match)
671 break;
508ceb09 672 }
984263bc 673
a3190466
MD
674 /*
675 * Attempt to merge locks adjacent to brange. For example, we may
676 * have had to clip first_match and/or last_match, and they might
677 * be adjacent. Or there might simply have been an adjacent lock
678 * already there.
679 *
680 * Don't get fancy, just check adjacent elements in the list if they
681 * happen to be owned by us.
e95b513d
MD
682 *
683 * This case only gets hit if we have a situation where a shared
684 * and exclusive lock are adjacent, and the exclusive lock is
685 * downgraded to shared or the shared lock is upgraded to exclusive.
a3190466
MD
686 */
687 if (brange) {
688 range = TAILQ_PREV(brange, lockf_range_list, lf_link);
689 if (range &&
690 range->lf_owner == owner &&
691 range->lf_end == brange->lf_start - 1 &&
692 lf_match(range, type, flags)
693 ) {
694 /*
e95b513d 695 * Extend range to cover brange and scrap brange.
a3190466
MD
696 */
697 range->lf_end = brange->lf_end;
698 range->lf_flags |= brange->lf_flags & F_NOEND;
699 TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
700 if (brange->lf_flags & F_POSIX)
701 --count;
702 TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
703 brange = range;
984263bc 704 }
a3190466
MD
705 range = TAILQ_NEXT(brange, lf_link);
706 if (range &&
707 range->lf_owner == owner &&
708 range->lf_start == brange->lf_end + 1 &&
709 lf_match(range, type, flags)
710 ) {
711 /*
e95b513d 712 * Extend brange to cover range and scrap range.
a3190466
MD
713 */
714 brange->lf_end = range->lf_end;
93df580f 715 brange->lf_flags |= range->lf_flags & F_NOEND;
a3190466
MD
716 TAILQ_REMOVE(&lock->lf_range, range, lf_link);
717 if (range->lf_flags & F_POSIX)
718 --count;
719 TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
508ceb09 720 }
984263bc 721 }
508ceb09 722
a3190466
MD
723 /*
724 * Destroy deleted elements. We didn't want to do it in the loop
725 * because the free() might have blocked.
726 *
727 * Adjust the count for any posix locks we thought we might create
728 * but didn't.
729 */
730 while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
731 TAILQ_REMOVE(&deadlist, range, lf_link);
732 lf_destroy_range(range);
733 }
734
735 KKASSERT(count <= 0);
736 if (count < 0)
737 lf_count_change(owner, count);
508ceb09 738do_wakeup:
6449cdd4 739 lf_print_lock(lock);
508ceb09
JS
740 if (wakeup_needed)
741 lf_wakeup(lock, start, end);
4f0a1741
JS
742 error = 0;
743do_cleanup:
744 if (new_range1 != NULL)
cae44669 745 lf_destroy_range(new_range1);
4f0a1741 746 if (new_range2 != NULL)
cae44669 747 lf_destroy_range(new_range2);
4f0a1741 748 return(error);
984263bc
MD
749}
750
984263bc
MD
751/*
752 * Check whether there is a blocking lock,
753 * and if so return its process identifier.
754 */
755static int
508ceb09
JS
756lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner,
757 int type, int flags, off_t start, off_t end)
984263bc 758{
508ceb09 759 struct lockf_range *range;
984263bc 760
508ceb09
JS
761 TAILQ_FOREACH(range, &lock->lf_range, lf_link)
762 if (range->lf_owner != owner &&
763 lf_overlap(range, start, end) &&
764 (type == F_WRLCK || range->lf_type == F_WRLCK))
765 break;
766 if (range == NULL) {
984263bc 767 fl->l_type = F_UNLCK;
508ceb09 768 return(0);
984263bc 769 }
508ceb09
JS
770 fl->l_type = range->lf_type;
771 fl->l_whence = SEEK_SET;
772 fl->l_start = range->lf_start;
773 if (range->lf_flags & F_NOEND)
774 fl->l_len = 0;
775 else
776 fl->l_len = range->lf_end - range->lf_start + 1;
777 if (range->lf_owner != NULL && (range->lf_flags & F_POSIX))
778 fl->l_pid = range->lf_owner->p_pid;
779 else
780 fl->l_pid = -1;
781 return(0);
984263bc
MD
782}
783
784/*
ad936517
MD
785 * Wakeup pending lock attempts. Theoretically we can stop as soon as
786 * we encounter an exclusive request that covers the whole range (at least
787 * insofar as the sleep code above calls lf_wakeup() if it would otherwise
788 * exit instead of loop), but for now just wakeup all overlapping
789 * requests. XXX
984263bc 790 */
508ceb09
JS
791static void
792lf_wakeup(struct lockf *lock, off_t start, off_t end)
984263bc 793{
508ceb09 794 struct lockf_range *range, *nrange;
ad936517 795
508ceb09
JS
796 TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) {
797 if (lf_overlap(range, start, end) == 0)
984263bc 798 continue;
508ceb09
JS
799 TAILQ_REMOVE(&lock->lf_blocked, range, lf_link);
800 range->lf_flags = 1;
801 wakeup(range);
984263bc 802 }
984263bc
MD
803}
804
eacc5025
MD
/*
 * Allocate a range structure and initialize it sufficiently such that
 * lf_destroy_range() does not barf.
 *
 * M_WAITOK guarantees the allocation never fails, so bumping the
 * INVARIANTS counter before the kmalloc() is safe.
 */
static struct lockf_range *
lf_alloc_range(void)
{
	struct lockf_range *range;

#ifdef INVARIANTS
	/* balanced by the decrement in lf_destroy_range() */
	atomic_add_int(&lf_global_counter, 1);
#endif
	range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK);
	range->lf_owner = NULL;
	return(range);
}
508ceb09 821
4f0a1741 822static void
a3190466
MD
823lf_insert(struct lockf_range_list *list, struct lockf_range *elm,
824 struct lockf_range *insert_point)
825{
826 while (insert_point && insert_point->lf_start < elm->lf_start)
827 insert_point = TAILQ_NEXT(insert_point, lf_link);
828 if (insert_point != NULL)
829 TAILQ_INSERT_BEFORE(insert_point, elm, lf_link);
830 else
831 TAILQ_INSERT_TAIL(list, elm, lf_link);
832}
833
834static void
4f0a1741 835lf_create_range(struct lockf_range *range, struct proc *owner, int type,
cae44669 836 int flags, off_t start, off_t end)
4f0a1741 837{
508ceb09 838 KKASSERT(start <= end);
508ceb09
JS
839 range->lf_type = type;
840 range->lf_flags = flags;
841 range->lf_start = start;
842 range->lf_end = end;
843 range->lf_owner = owner;
984263bc 844
6449cdd4
MD
845 lf_printf("lf_create_range: %lld..%lld\n",
846 range->lf_start, range->lf_end);
984263bc
MD
847}
848
/*
 * Free a lockf_range and decrement the global INVARIANTS counter
 * (which lf_alloc_range() incremented).  The KKASSERT catches a
 * counter underflow, i.e. more destroys than allocs.
 */
static void
lf_destroy_range(struct lockf_range *range)
{
	lf_printf("lf_destroy_range: %lld..%lld\n",
		  range->lf_start, range->lf_end);
	kfree(range, M_LOCKF);
#ifdef INVARIANTS
	atomic_add_int(&lf_global_counter, -1);
	KKASSERT(lf_global_counter >= 0);
#endif
}
860
861#ifdef LOCKF_DEBUG
6449cdd4 862
/*
 * LOCKF_DEBUG helper: kprintf-style output prefixed with the current
 * process's pid/comm when one exists.  Only reached through the
 * lf_printf() macro, which already checks lf_print_ranges.
 */
static void
_lf_printf(const char *ctl, ...)
{
	struct proc *p;
	__va_list va;

	if (lf_print_ranges) {
		if ((p = curproc) != NULL)
			kprintf("pid %d (%s): ", p->p_pid, p->p_comm);
	}
	__va_start(va, ctl);
	kvprintf(ctl, va);
	__va_end(va);
}
877
/*
 * LOCKF_DEBUG helper: dump all held ranges and all blocked waiters
 * for the given lockf structure.  No-op unless lf_print_ranges is set.
 */
static void
_lf_print_lock(const struct lockf *lock)
{
	struct lockf_range *range;

	if (lf_print_ranges == 0)
		return;

	/* held ranges */
	if (TAILQ_EMPTY(&lock->lf_range)) {
		lf_printf("lockf %p: no ranges locked\n", lock);
	} else {
		lf_printf("lockf %p:\n", lock);
	}
	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		kprintf("\t%lld..%lld type %s owned by %d\n",
		       range->lf_start, range->lf_end,
		       range->lf_type == F_RDLCK ? "shared" : "exclusive",
		       range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1);
	/* blocked waiters */
	if (TAILQ_EMPTY(&lock->lf_blocked))
		kprintf("no process waiting for range\n");
	else
		kprintf("blocked locks:");
	TAILQ_FOREACH(range, &lock->lf_blocked, lf_link)
		kprintf("\t%lld..%lld type %s waiting on %p\n",
		       range->lf_start, range->lf_end,
		       range->lf_type == F_RDLCK ? "shared" : "exclusive",
		       range);
}
2c4af4c4 906#endif /* LOCKF_DEBUG */