/*
 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>.  All rights reserved.
 * Copyright (c) 2006-2018 Matthew Dillon <dillon@backplane.com>.  All rights reserved.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
 */

#include "opt_debug_lockf.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>

#include <sys/lockf.h>
#include <machine/limits.h>	/* for LLONG_MAX */
#include <machine/stdarg.h>

#include <sys/spinlock2.h>

struct lf_pcpu {
	struct lockf_range *free1;
	struct lockf_range *free2;
} __cachealign;

static struct lf_pcpu *lf_pcpu_array;

#ifdef LOCKF_DEBUG
int lf_print_ranges = 0;

static void _lf_print_lock(const struct lockf *);
static void _lf_printf(const char *, ...) __printflike(1, 2);

#define lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock)
#define lf_printf(ctl, args...) if (lf_print_ranges) _lf_printf(ctl, args)
#else
#define lf_print_lock(lock)
#define lf_printf(ctl, args...)
#endif

static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

static void	lf_wakeup(struct lockf *, off_t, off_t);
static struct lockf_range *lf_alloc_range(void);
static void	lf_create_range(struct lockf_range *, struct proc *, int, int,
				off_t, off_t);
static void	lf_insert(struct lockf_range_list *list,
			  struct lockf_range *elm,
			  struct lockf_range *insert_point);
static void	lf_destroy_range(struct lockf_range *);

static int	lf_setlock(struct lockf *, struct proc *, int, int,
			   off_t, off_t);
static int	lf_getlock(struct flock *, struct lockf *, struct proc *,
			   int, int, off_t, off_t);

static int	lf_count_change(struct proc *, int);

/*
 * Return TRUE (non-zero) if the type and posix flags match.
 */
static __inline
int
lf_match(struct lockf_range *range, int type, int flags)
{
	if (range->lf_type != type)
		return(0);
	if ((range->lf_flags ^ flags) & F_POSIX)
		return(0);
	return(1);
}
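
/*
 * For example (illustrative only): an fcntl-style F_RDLCK range does not
 * match a flock-style F_RDLCK range because their F_POSIX flag bits
 * differ, while two fcntl F_RDLCK ranges do match.
 */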

/*
 * Check whether range and [start, end] overlap.
 */
static __inline
int
lf_overlap(const struct lockf_range *range, off_t start, off_t end)
{
	if (range->lf_start >= start && range->lf_start <= end)
		return(1);
	else if (start >= range->lf_start && start <= range->lf_end)
		return(1);
	else
		return(0);
}
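
/*
 * Note (equivalent formulation, assuming lf_start <= lf_end): the two
 * tests above reduce to the usual interval-overlap predicate
 *
 *	range->lf_start <= end && start <= range->lf_end
 *
 * e.g. [2,5] overlaps [4,9], while [2,3] does not.
 */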

/*
 * Change the POSIX lock accounting for the given process.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
	struct uidinfo *uip;
	struct uidcount *pup;
	int n;

	KKASSERT(p != NULL);

	uip = p->p_ucred->cr_uidinfo;
	pup = &uip->ui_pcpu[mycpuid];

	if (increase) {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks += p->p_uidpcpu[n].pu_posixlocks;
	} else {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks -= p->p_uidpcpu[n].pu_posixlocks;
	}

	if (pup->pu_posixlocks < -PUP_LIMIT ||
	    pup->pu_posixlocks > PUP_LIMIT) {
		atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
		pup->pu_posixlocks = 0;
	}
}

static int
lf_count_change(struct proc *owner, int diff)
{
	struct uidinfo *uip;
	int max, ret;

	/* we might actually not have a process context */
	if (owner == NULL)
		return(0);

	uip = owner->p_ucred->cr_uidinfo;

	max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
		  maxposixlocksperuid);

	if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
	    uip->ui_posixlocks >= max) {
		ret = 1;
	} else {
		struct uidcount *pup;
		int cpu = mycpuid;

		pup = &uip->ui_pcpu[cpu];
		pup->pu_posixlocks += diff;
		if (pup->pu_posixlocks < -PUP_LIMIT ||
		    pup->pu_posixlocks > PUP_LIMIT) {
			atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
			pup->pu_posixlocks = 0;
		}
		owner->p_uidpcpu[cpu].pu_posixlocks += diff;
		ret = 0;
	}
	return ret;
}
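
/*
 * Note on the batching above (illustrative): per-cpu deltas accumulate
 * in ui_pcpu[] and are folded into the shared ui_posixlocks counter only
 * when their magnitude exceeds PUP_LIMIT, so a process rapidly locking
 * and unlocking on one cpu mostly touches a per-cpu counter instead of
 * a globally shared atomic.
 */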

/*
 * Advisory record locking support
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
	struct flock *fl = ap->a_fl;
	struct proc *owner;
	off_t start, end;
	int type, flags, error;
	lwkt_token_t token;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return(EINVAL);
	}

	flags = ap->a_flags;
	if (start < 0)
		return(EINVAL);
	if (fl->l_len == 0) {
		flags |= F_NOEND;
		end = LLONG_MAX;
	} else if (fl->l_len < 0) {
		return(EINVAL);
	} else {
		end = start + fl->l_len - 1;
		if (end < start)
			return(EINVAL);
	}
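
	/*
	 * Example (illustrative, userland side): a whole-file write lock
	 * taken with fcntl(2) reaches this function via VOP_ADVLOCK and
	 * is converted by the code above to start = 0, end = LLONG_MAX,
	 * with F_NOEND set in flags:
	 *
	 *	struct flock fl;
	 *	fl.l_whence = SEEK_SET;
	 *	fl.l_start = 0;
	 *	fl.l_len = 0;			(0 means "through EOF")
	 *	fl.l_type = F_WRLCK;
	 *	fcntl(fd, F_SETLKW, &fl);	(F_SETLKW implies F_WAIT)
	 */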

	type = fl->l_type;
	/*
	 * This isn't really correct for flock-style locks,
	 * but the current handling is somewhat broken anyway.
	 */
	owner = (struct proc *)ap->a_id;

	/*
	 * Do the requested operation.
	 */
	token = lwkt_getpooltoken(lock);

	if (lock->init_done == 0) {
		TAILQ_INIT(&lock->lf_range);
		TAILQ_INIT(&lock->lf_blocked);
		lock->init_done = 1;
	}

	switch(ap->a_op) {
	case F_SETLK:
		/*
		 * NOTE: It is possible for both lf_range and lf_blocked to
		 * be empty if we block and get woken up, but another process
		 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
		 * be set after the lf_setlock() operation completes rather
		 * than before.
		 */
		error = lf_setlock(lock, owner, type, flags, start, end);
		if ((ap->a_vp->v_flag & VMAYHAVELOCKS) == 0)
			vsetflags(ap->a_vp, VMAYHAVELOCKS);
		break;

	case F_UNLCK:
		error = lf_setlock(lock, owner, type, flags, start, end);
#if 0
		/*
		 * XXX REMOVED. don't bother doing this in the critical path.
		 * close() overhead is minimal.
		 */
		if (TAILQ_EMPTY(&lock->lf_range) &&
		    TAILQ_EMPTY(&lock->lf_blocked)) {
			vclrflags(ap->a_vp, VMAYHAVELOCKS);
		}
#endif
		break;

	case F_GETLK:
		error = lf_getlock(fl, lock, owner, type, flags, start, end);
		break;

	default:
		error = EINVAL;
		break;
	}
	lwkt_reltoken(token);
	return(error);
}

static int
lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
	   off_t start, off_t end)
{
	struct lockf_range *range;
	struct lockf_range *brange;
	struct lockf_range *next;
	struct lockf_range *first_match;
	struct lockf_range *last_match;
	struct lockf_range *insert_point;
	struct lockf_range *new_range1;
	struct lockf_range *new_range2;
	int wakeup_needed;
	int double_clip;
	int unlock_override;
	int error = 0;
	int count;
	struct lockf_range_list deadlist;

	new_range1 = NULL;
	new_range2 = NULL;
	count = 0;

restart:
	/*
	 * Preallocate two ranges so we don't have to worry about blocking
	 * in the middle of the lock code.
	 */
	if (new_range1 == NULL)
		new_range1 = lf_alloc_range();
	if (new_range2 == NULL)
		new_range2 = lf_alloc_range();
	first_match = NULL;
	last_match = NULL;
	insert_point = NULL;
	wakeup_needed = 0;

	lf_print_lock(lock);

	/*
	 * Locate the insertion point for the new lock (the first range
	 * with an lf_start >= start).
	 *
	 * Locate the first and last ranges owned by us that overlap
	 * the requested range.
	 */
	TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
		if (insert_point == NULL && range->lf_start >= start)
			insert_point = range;

		/*
		 * Skip non-overlapping locks.  Locks are sorted by lf_start
		 * so we can terminate the search when lf_start exceeds the
		 * requested range (insert_point is still guaranteed to be
		 * set properly).
		 */
		if (range->lf_end < start)
			continue;
		if (range->lf_start > end) {
			range = NULL;
			break;
		}

		/*
		 * Overlapping lock.  Set first_match and last_match if we
		 * are the owner.
		 */
		if (range->lf_owner == owner) {
			if (first_match == NULL)
				first_match = range;
			last_match = range;
			continue;
		}

		/*
		 * If we aren't the owner check for a conflicting lock.  Only
		 * if not unlocking.
		 */
		if (type != F_UNLCK) {
			if (type == F_WRLCK || range->lf_type == F_WRLCK)
				break;
		}
	}

	/*
	 * If a conflicting lock was observed, block or fail as appropriate.
	 * (this code is skipped when unlocking)
	 */
	if (range != NULL) {
		if ((flags & F_WAIT) == 0) {
			error = EAGAIN;
			goto do_cleanup;
		}

		/*
		 * We are blocked.  For POSIX locks we have to check
		 * for deadlocks and return with EDEADLK.  This is done
		 * by checking whether range->lf_owner is already
		 * blocked.
		 *
		 * Since flock-style locks cover the whole file, a
		 * deadlock between those is nearly impossible.
		 * This can only occur if a process tries to lock the
		 * same inode exclusively while holding a shared lock
		 * with another descriptor.
		 * XXX How can we cleanly detect this?
		 * XXX The current mixing of flock & fcntl/lockf is evil.
		 *
		 * Handle existing locks of flock-style like POSIX locks.
		 */
		if (flags & F_POSIX) {
			TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
				if (brange->lf_owner == range->lf_owner) {
					error = EDEADLK;
					goto do_cleanup;
				}
			}
		}

		/*
		 * For flock-style locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((flags & F_POSIX) == 0 && type == F_WRLCK)
			lf_setlock(lock, owner, F_UNLCK, 0, start, end);

		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, 0, start, end);
		TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
		error = tsleep(brange, PCATCH, "lockf", 0);

		/*
		 * We may have been awakened by a signal and/or by a
		 * debugger continuing us (in which case we must remove
		 * ourselves from the blocked list) and/or by another
		 * process releasing/downgrading a lock (in which case
		 * we have already been removed from the blocked list
		 * and our lf_flags field is 1).
		 *
		 * Sleep if it looks like we might be livelocking.
		 */
		if (brange->lf_flags == 0)
			TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
		if (error == 0 && count == 2)
			tsleep(brange, 0, "lockfz", 2);
		else
			++count;
		lf_destroy_range(brange);

		if (error)
			goto do_cleanup;
		goto restart;
	}

	/*
	 * If there are no overlapping locks owned by us then creating
	 * the new lock is easy.  This is the most common case.
	 */
	if (first_match == NULL) {
		if (type == F_UNLCK)
			goto do_wakeup;
		if (flags & F_POSIX) {
			if (lf_count_change(owner, 1)) {
				error = ENOLCK;
				goto do_cleanup;
			}
		}
		range = new_range1;
		new_range1 = NULL;
		lf_create_range(range, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, range, insert_point);
		goto do_wakeup;
	}

	/*
	 * double_clip - Calculate a special case where TWO locks may have
	 *		 to be added due to the new lock breaking up an
	 *		 existing incompatible lock in the middle.
	 *
	 * unlock_override - Calculate a special case where NO locks
	 *		 need to be created.  This occurs when an unlock
	 *		 does not clip any locks at the front and rear.
	 *
	 * WARNING!  closef() and fdrop() assume that an F_UNLCK of the
	 *	     entire range will always succeed so the unlock_override
	 *	     case is mandatory.
	 */
	double_clip = 0;
	unlock_override = 0;
	if (first_match->lf_start < start) {
		if (first_match == last_match && last_match->lf_end > end)
			double_clip = 1;
	} else if (type == F_UNLCK && last_match->lf_end <= end) {
		unlock_override = 1;
	}

	/*
	 * Figure out the worst case net increase in POSIX locks and account
	 * for it now before we start modifying things.  If neither the
	 * first nor the last lock matches we have an issue.  If there is
	 * only one overlapping range which needs to be clipped on both ends
	 * we wind up having to create up to two new locks, else only one.
	 *
	 * When unlocking the worst case is always 1 new lock if our
	 * unlock request cuts the middle out of an existing lock range.
	 *
	 * count represents the 'cleanup' adjustment needed.  It starts
	 * negative, is incremented whenever we create a new POSIX lock,
	 * and decremented whenever we delete an existing one.  At the
	 * end of the day it had better be <= 0 or we didn't calculate the
	 * worst case properly here.
	 */
	count = 0;
	if ((flags & F_POSIX) && !unlock_override) {
		if (!lf_match(first_match, type, flags) &&
		    !lf_match(last_match, type, flags)
		) {
			if (double_clip && type != F_UNLCK)
				count = -2;
			else
				count = -1;
		}
		if (count && lf_count_change(owner, -count)) {
			error = ENOLCK;
			goto do_cleanup;
		}
	}
	/* else flock style lock which encompasses entire range */

	/*
	 * Create and insert the lock representing the requested range.
	 * Adjust the net POSIX lock count.  We have to move our insertion
	 * point since brange now represents the first record >= start.
	 *
	 * When unlocking, no new lock is inserted but we still clip.
	 */
	if (type != F_UNLCK) {
		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, brange, insert_point);
		insert_point = brange;
		if (flags & F_POSIX)
			++count;
	} else {
		brange = NULL;
	}

	/*
	 * Handle the double_clip case.  This is the only case where
	 * we wind up having to add TWO locks.
	 */
	if (double_clip) {
		KKASSERT(first_match == last_match);
		last_match = new_range2;
		new_range2 = NULL;
		lf_create_range(last_match, first_match->lf_owner,
				first_match->lf_type, first_match->lf_flags,
				end + 1, first_match->lf_end);
		first_match->lf_end = start - 1;
		first_match->lf_flags &= ~F_NOEND;

		/*
		 * Figure out where to insert the right side clip.
		 */
		lf_insert(&lock->lf_range, last_match, first_match);
		if (last_match->lf_flags & F_POSIX)
			++count;
	}

	/*
	 * Clip or destroy the locks between first_match and last_match,
	 * inclusive.  Ignore the primary lock we created (brange).  Note
	 * that if double-clipped, first_match and last_match will be
	 * outside our clipping range.  Otherwise first_match and last_match
	 * will be deleted.
	 *
	 * We have already taken care of any double clipping.
	 *
	 * The insert_point may become invalid as we delete records, do not
	 * use that pointer any more.  Also, when removing something other
	 * than 'range' we have to check to see if the item we are removing
	 * is 'next' and adjust 'next' properly.
	 *
	 * NOTE: brange will be NULL if F_UNLCKing.
	 */
	TAILQ_INIT(&deadlist);
	next = first_match;

	while ((range = next) != NULL) {
		next = TAILQ_NEXT(range, lf_link);

		/*
		 * Ignore elements that we do not own and ignore the
		 * primary request range which we just created.
		 */
		if (range->lf_owner != owner || range == brange)
			continue;

		/*
		 * We may have to wakeup a waiter when downgrading a lock.
		 */
		if (type == F_UNLCK)
			wakeup_needed = 1;
		if (type == F_RDLCK && range->lf_type == F_WRLCK)
			wakeup_needed = 1;

		/*
		 * Clip left.  This can only occur on first_match.
		 *
		 * Merge the left clip with brange if possible.  This must
		 * be done specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation above.
		 */
		if (range->lf_start < start) {
			KKASSERT(range == first_match);
			if (brange &&
			    range->lf_end >= start - 1 &&
			    lf_match(range, type, flags)) {
				range->lf_end = brange->lf_end;
				range->lf_flags |= brange->lf_flags & F_NOEND;
				/*
				 * Removing something other than 'range',
				 * adjust 'next' if necessary.
				 */
				if (next == brange)
					next = TAILQ_NEXT(next, lf_link);
				TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
				if (brange->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
				brange = range;
			} else if (range->lf_end >= start) {
				range->lf_end = start - 1;
				if (type != F_UNLCK)
					range->lf_flags &= ~F_NOEND;
			}
			if (range == last_match)
				break;
			continue;
		}

		/*
		 * Clip right.  This can only occur on last_match.
		 *
		 * Merge the right clip if possible.  This must be done
		 * specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation.
		 *
		 * Since we are adjusting lf_start, we have to move the
		 * record to maintain the sorted list.  Since lf_start is
		 * only getting larger we can use the next element as the
		 * insert point (we don't have to backtrack).
		 */
		if (range->lf_end > end) {
			KKASSERT(range == last_match);
			if (brange &&
			    range->lf_start <= end + 1 &&
			    lf_match(range, type, flags)) {
				brange->lf_end = range->lf_end;
				brange->lf_flags |= range->lf_flags & F_NOEND;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				if (range->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
			} else if (range->lf_start <= end) {
				range->lf_start = end + 1;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				lf_insert(&lock->lf_range, range, next);
			}
			/* range == last_match, we are done */
			break;
		}

		/*
		 * The record must be entirely enclosed.  Note that the
		 * record could be first_match or last_match, and will be
		 * deleted.
		 */
		KKASSERT(range->lf_start >= start && range->lf_end <= end);
		TAILQ_REMOVE(&lock->lf_range, range, lf_link);
		if (range->lf_flags & F_POSIX)
			--count;
		TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		if (range == last_match)
			break;
	}

	/*
	 * Attempt to merge locks adjacent to brange.  For example, we may
	 * have had to clip first_match and/or last_match, and they might
	 * be adjacent.  Or there might simply have been an adjacent lock
	 * already there.
	 *
	 * Don't get fancy, just check adjacent elements in the list if they
	 * happen to be owned by us.
	 *
	 * This case only gets hit if we have a situation where a shared
	 * and exclusive lock are adjacent, and the exclusive lock is
	 * downgraded to shared or the shared lock is upgraded to exclusive.
	 */
	if (brange) {
		range = TAILQ_PREV(brange, lockf_range_list, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_end == brange->lf_start - 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend range to cover brange and scrap brange.
			 */
			range->lf_end = brange->lf_end;
			range->lf_flags |= brange->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
			if (brange->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
			brange = range;
		}
		range = TAILQ_NEXT(brange, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_start == brange->lf_end + 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend brange to cover range and scrap range.
			 */
			brange->lf_end = range->lf_end;
			brange->lf_flags |= range->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, range, lf_link);
			if (range->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		}
	}

	/*
	 * Destroy deleted elements.  We didn't want to do it in the loop
	 * because the free() might have blocked.
	 *
	 * Adjust the count for any posix locks we thought we might create
	 * but didn't.
	 */
	while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
		TAILQ_REMOVE(&deadlist, range, lf_link);
		lf_destroy_range(range);
	}

	KKASSERT(count <= 0);
	if (count < 0)
		lf_count_change(owner, count);
do_wakeup:
	lf_print_lock(lock);
	if (wakeup_needed)
		lf_wakeup(lock, start, end);
	error = 0;
do_cleanup:
	if (new_range1 != NULL)
		lf_destroy_range(new_range1);
	if (new_range2 != NULL)
		lf_destroy_range(new_range2);
	return(error);
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner,
	   int type, int flags, off_t start, off_t end)
{
	struct lockf_range *range;

	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		if (range->lf_owner != owner &&
		    lf_overlap(range, start, end) &&
		    (type == F_WRLCK || range->lf_type == F_WRLCK))
			break;
	if (range == NULL) {
		fl->l_type = F_UNLCK;
		return(0);
	}
	fl->l_type = range->lf_type;
	fl->l_whence = SEEK_SET;
	fl->l_start = range->lf_start;
	if (range->lf_flags & F_NOEND)
		fl->l_len = 0;
	else
		fl->l_len = range->lf_end - range->lf_start + 1;
	if (range->lf_owner != NULL && (range->lf_flags & F_POSIX))
		fl->l_pid = range->lf_owner->p_pid;
	else
		fl->l_pid = -1;
	return(0);
}

/*
 * Wakeup pending lock attempts.  Theoretically we can stop as soon as
 * we encounter an exclusive request that covers the whole range (at least
 * insofar as the sleep code above calls lf_wakeup() if it would otherwise
 * exit instead of loop), but for now just wakeup all overlapping
 * requests.  XXX
 */
static void
lf_wakeup(struct lockf *lock, off_t start, off_t end)
{
	struct lockf_range *range, *nrange;

	TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) {
		if (lf_overlap(range, start, end) == 0)
			continue;
		TAILQ_REMOVE(&lock->lf_blocked, range, lf_link);
		range->lf_flags = 1;
		wakeup(range);
	}
}

/*
 * Allocate a range structure and initialize it sufficiently such that
 * lf_destroy_range() does not barf.
 *
 * Most use cases are temporary, so implement a small 2-entry-per-cpu
 * cache.
 */
static struct lockf_range *
lf_alloc_range(void)
{
	struct lockf_range *range;
	struct lf_pcpu *lfpc;

	lfpc = &lf_pcpu_array[mycpuid];
	if ((range = lfpc->free1) != NULL) {
		lfpc->free1 = NULL;
		return range;
	}
	if ((range = lfpc->free2) != NULL) {
		lfpc->free2 = NULL;
		return range;
	}
	range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK);
	range->lf_owner = NULL;

	return(range);
}

static void
lf_insert(struct lockf_range_list *list, struct lockf_range *elm,
	  struct lockf_range *insert_point)
{
	while (insert_point && insert_point->lf_start < elm->lf_start)
		insert_point = TAILQ_NEXT(insert_point, lf_link);
	if (insert_point != NULL)
		TAILQ_INSERT_BEFORE(insert_point, elm, lf_link);
	else
		TAILQ_INSERT_TAIL(list, elm, lf_link);
}
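
/*
 * Usage note (illustrative): lf_insert() keeps the list sorted by
 * lf_start.  insert_point is only a hint at or before the correct
 * position; the loop walks forward from it, so inserting [50,59] with
 * a hint pointing at [10,19] skips past [10,19] and [30,39] and lands
 * before [70,79].  A NULL hint appends to the tail.
 */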

static void
lf_create_range(struct lockf_range *range, struct proc *owner, int type,
		int flags, off_t start, off_t end)
{
	KKASSERT(start <= end);
	range->lf_type = type;
	range->lf_flags = flags;
	range->lf_start = start;
	range->lf_end = end;
	range->lf_owner = owner;

	lf_printf("lf_create_range: %ju..%ju\n",
		  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);
}

static void
lf_destroy_range(struct lockf_range *range)
{
	struct lf_pcpu *lfpc;

	lf_printf("lf_destroy_range: %ju..%ju\n",
		  (uintmax_t)range->lf_start, (uintmax_t)range->lf_end);

	lfpc = &lf_pcpu_array[mycpuid];
	if (lfpc->free1 == NULL) {
		range->lf_owner = NULL;
		lfpc->free1 = range;
		return;
	}
	if (lfpc->free2 == NULL) {
		range->lf_owner = NULL;
		lfpc->free2 = range;
		return;
	}
	kfree(range, M_LOCKF);
}

#ifdef LOCKF_DEBUG

static void
_lf_printf(const char *ctl, ...)
{
	struct proc *p;
	__va_list va;

	if (lf_print_ranges) {
		if ((p = curproc) != NULL)
			kprintf("pid %d (%s): ", p->p_pid, p->p_comm);
	}
	__va_start(va, ctl);
	kvprintf(ctl, va);
	__va_end(va);
}

static void
_lf_print_lock(const struct lockf *lock)
{
	struct lockf_range *range;

	if (lf_print_ranges == 0)
		return;

	if (TAILQ_EMPTY(&lock->lf_range)) {
		lf_printf("lockf %p: no ranges locked\n", lock);
	} else {
		lf_printf("lockf %p:\n", lock);
	}
	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		kprintf("\t%jd..%jd type %s owned by %d\n",
			(intmax_t)range->lf_start, (intmax_t)range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1);
	if (TAILQ_EMPTY(&lock->lf_blocked))
		kprintf("no process waiting for range\n");
	else
		kprintf("blocked locks:\n");
	TAILQ_FOREACH(range, &lock->lf_blocked, lf_link)
		kprintf("\t%jd..%jd type %s waiting on %p\n",
			(intmax_t)range->lf_start, (intmax_t)range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range);
}
#endif /* LOCKF_DEBUG */

static void
lf_init(void *dummy __unused)
{
	lf_pcpu_array = kmalloc(sizeof(*lf_pcpu_array) * ncpus,
				M_LOCKF, M_WAITOK | M_ZERO);
}

SYSINIT(lockf, SI_BOOT2_MACHDEP, SI_ORDER_ANY, lf_init, NULL);