/*
 * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>. All rights reserved.
 * Copyright (c) 2006 Matthew Dillon <dillon@backplane.com>. All rights reserved.
 *
 * Copyright (c) 1982, 1986, 1989, 1993
 *	The Regents of the University of California. All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Scooter Morris at Genentech Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_lockf.c	8.3 (Berkeley) 1/6/94
 * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $
 * $DragonFly: src/sys/kern/kern_lockf.c,v 1.37 2007/11/01 22:48:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/proc.h>
#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/malloc.h>
#include <sys/fcntl.h>
#include <sys/resourcevar.h>

#include <sys/lockf.h>
#include <machine/limits.h>	/* for LLONG_MAX */
#include <machine/stdarg.h>

#include <sys/spinlock2.h>

#ifdef INVARIANTS
int lf_global_counter = 0;
#endif

#ifdef LOCKF_DEBUG
int lf_print_ranges = 0;

static void	_lf_print_lock(const struct lockf *);
static void	_lf_printf(const char *, ...);

#define lf_print_lock(lock)					\
	do { if (lf_print_ranges) _lf_print_lock(lock); } while (0)
#define lf_printf(ctl, args...)					\
	do { if (lf_print_ranges) _lf_printf(ctl, args); } while (0)
#else
#define lf_print_lock(lock)
#define lf_printf(ctl, args...)
#endif

static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures");

static void	lf_wakeup(struct lockf *, off_t, off_t);
static struct lockf_range *lf_alloc_range(void);
static void	lf_create_range(struct lockf_range *, struct proc *, int, int,
				off_t, off_t);
static void	lf_insert(struct lockf_range_list *list,
			  struct lockf_range *elm,
			  struct lockf_range *insert_point);
static void	lf_destroy_range(struct lockf_range *);

static int	lf_setlock(struct lockf *, struct proc *, int, int,
			   off_t, off_t);
static int	lf_getlock(struct flock *, struct lockf *, struct proc *,
			   int, int, off_t, off_t);

static int	lf_count_change(struct proc *, int);

/*
 * Return TRUE (non-zero) if the type and POSIX flags match.
 */
static __inline
int
lf_match(struct lockf_range *range, int type, int flags)
{
	if (range->lf_type != type)
		return(0);
	if ((range->lf_flags ^ flags) & F_POSIX)
		return(0);
	return(1);
}

/*
 * Check whether range and [start, end] overlap.
 */
static __inline
int
lf_overlap(const struct lockf_range *range, off_t start, off_t end)
{
	if (range->lf_start >= start && range->lf_start <= end)
		return(1);
	else if (start >= range->lf_start && start <= range->lf_end)
		return(1);
	else
		return(0);
}
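
/*
 * Editor's illustrative note (not from the original source): with an
 * existing range covering [10,20],
 *
 *	lf_overlap(range, 15, 30) -> 1	(15 falls inside [10,20])
 *	lf_overlap(range, 0, 12)  -> 1	(10 falls inside [0,12])
 *	lf_overlap(range, 21, 30) -> 0	(disjoint)
 *
 * Both tests are needed because either interval may begin first; a
 * single comparison would miss one of the two containment cases.
 */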


/*
 * Change the POSIX lock accounting for the given process.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
	struct uidinfo *uip;

	KKASSERT(p != NULL);

	uip = p->p_ucred->cr_uidinfo;
	spin_lock(&uip->ui_lock);

	if (increase)
		uip->ui_posixlocks += p->p_numposixlocks;
	else
		uip->ui_posixlocks -= p->p_numposixlocks;

	KASSERT(uip->ui_posixlocks >= 0,
		("Negative number of POSIX locks held by %s user: %d.",
		 increase ? "new" : "old", uip->ui_posixlocks));
	spin_unlock(&uip->ui_lock);
}

static int
lf_count_change(struct proc *owner, int diff)
{
	struct uidinfo *uip;
	int max, ret;

	/* we might actually not have a process context */
	if (owner == NULL)
		return(0);

	uip = owner->p_ucred->cr_uidinfo;

	max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
		  maxposixlocksperuid);

	spin_lock(&uip->ui_lock);
	if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
	    uip->ui_posixlocks >= max) {
		ret = 1;
	} else {
		uip->ui_posixlocks += diff;
		owner->p_numposixlocks += diff;
		KASSERT(uip->ui_posixlocks >= 0,
			("Negative number of POSIX locks held by user: %d.",
			 uip->ui_posixlocks));
		KASSERT(owner->p_numposixlocks >= 0,
			("Negative number of POSIX locks held by proc: %d.",
			 owner->p_numposixlocks));
		ret = 0;
	}
	spin_unlock(&uip->ui_lock);
	return ret;
}

/*
 * Advisory record locking support
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
	struct flock *fl = ap->a_fl;
	struct proc *owner;
	off_t start, end;
	int type, flags, error;
	lwkt_token_t token;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return(EINVAL);
	}

	flags = ap->a_flags;
	if (start < 0)
		return(EINVAL);
	if (fl->l_len == 0) {
		flags |= F_NOEND;
		end = LLONG_MAX;
	} else if (fl->l_len < 0) {
		return(EINVAL);
	} else {
		end = start + fl->l_len - 1;
		if (end < start)
			return(EINVAL);
	}
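
	/*
	 * Editor's illustrative note (not from the original source): under
	 * these rules l_whence = SEEK_SET, l_start = 100, l_len = 50 locks
	 * bytes [100,149]; l_len = 0 locks [l_start, LLONG_MAX] and sets
	 * F_NOEND so the lock keeps covering the file as it grows; and
	 * l_whence = SEEK_END, l_start = -10, l_len = 10 locks the last
	 * ten bytes, [size-10, size-1].
	 */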

	type = fl->l_type;
	/*
	 * This isn't really correct for flock-style locks,
	 * but the current handling is somewhat broken anyway.
	 */
	owner = (struct proc *)ap->a_id;

	/*
	 * Do the requested operation.
	 */
	token = lwkt_getpooltoken(lock);

	if (lock->init_done == 0) {
		TAILQ_INIT(&lock->lf_range);
		TAILQ_INIT(&lock->lf_blocked);
		lock->init_done = 1;
	}

	switch(ap->a_op) {
	case F_SETLK:
		/*
		 * NOTE: It is possible for both lf_range and lf_blocked to
		 * be empty if we block and get woken up, but another process
		 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
		 * be set after the lf_setlock() operation completes rather
		 * than before.
		 */
		error = lf_setlock(lock, owner, type, flags, start, end);
		vsetflags(ap->a_vp, VMAYHAVELOCKS);
		break;

	case F_UNLCK:
		error = lf_setlock(lock, owner, type, flags, start, end);
		if (TAILQ_EMPTY(&lock->lf_range) &&
		    TAILQ_EMPTY(&lock->lf_blocked)) {
			vclrflags(ap->a_vp, VMAYHAVELOCKS);
		}
		break;

	case F_GETLK:
		error = lf_getlock(fl, lock, owner, type, flags, start, end);
		break;

	default:
		error = EINVAL;
		break;
	}
	lwkt_reltoken(token);
	return(error);
}
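
#if 0
/*
 * Editor's illustrative sketch, not part of this file or its build: a
 * minimal userland program exercising the path that reaches lf_advlock()
 * through fcntl(2) and VOP_ADVLOCK().  Everything used here is standard
 * POSIX; the file name is made up for the demonstration.
 */
#include <err.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	struct flock fl;
	int fd;

	if ((fd = open("/tmp/lockdemo", O_RDWR | O_CREAT, 0644)) < 0)
		err(1, "open");

	fl.l_whence = SEEK_SET;		/* absolute offsets */
	fl.l_start = 0;
	fl.l_len = 0;			/* to EOF; the kernel sets F_NOEND */
	fl.l_type = F_WRLCK;		/* exclusive lock */
	fl.l_pid = 0;

	/* F_SETLKW is normally F_SETLK with F_WAIT semantics in-kernel */
	if (fcntl(fd, F_SETLKW, &fl) == -1)
		err(1, "fcntl(F_SETLKW)");

	fl.l_type = F_UNLCK;
	if (fcntl(fd, F_SETLK, &fl) == -1)
		err(1, "fcntl(F_UNLCK)");
	close(fd);
	return (0);
}
#endif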

static int
lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
	   off_t start, off_t end)
{
	struct lockf_range *range;
	struct lockf_range *brange;
	struct lockf_range *next;
	struct lockf_range *first_match;
	struct lockf_range *last_match;
	struct lockf_range *insert_point;
	struct lockf_range *new_range1;
	struct lockf_range *new_range2;
	int wakeup_needed;
	int double_clip;
	int unlock_override;
	int error = 0;
	int count;
	struct lockf_range_list deadlist;

	new_range1 = NULL;
	new_range2 = NULL;
	count = 0;

restart:
	/*
	 * Preallocate two ranges so we don't have to worry about blocking
	 * in the middle of the lock code.
	 */
	if (new_range1 == NULL)
		new_range1 = lf_alloc_range();
	if (new_range2 == NULL)
		new_range2 = lf_alloc_range();
	first_match = NULL;
	last_match = NULL;
	insert_point = NULL;
	wakeup_needed = 0;

	lf_print_lock(lock);

	/*
	 * Locate the insertion point for the new lock (the first range
	 * with an lf_start >= start).
	 *
	 * Locate the first and last ranges owned by us that overlap
	 * the requested range.
	 */
	TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
		if (insert_point == NULL && range->lf_start >= start)
			insert_point = range;

		/*
		 * Skip non-overlapping locks.  Locks are sorted by lf_start,
		 * so we can terminate the search when lf_start exceeds the
		 * requested range (insert_point is still guaranteed to be
		 * set properly).
		 */
		if (range->lf_end < start)
			continue;
		if (range->lf_start > end) {
			range = NULL;
			break;
		}

		/*
		 * Overlapping lock.  Set first_match and last_match if we
		 * are the owner.
		 */
		if (range->lf_owner == owner) {
			if (first_match == NULL)
				first_match = range;
			last_match = range;
			continue;
		}

		/*
		 * If we aren't the owner, check for a conflicting lock,
		 * but only when not unlocking.
		 */
		if (type != F_UNLCK) {
			if (type == F_WRLCK || range->lf_type == F_WRLCK)
				break;
		}
	}

	/*
	 * If a conflicting lock was observed, block or fail as appropriate.
	 * (this code is skipped when unlocking)
	 */
	if (range != NULL) {
		if ((flags & F_WAIT) == 0) {
			error = EAGAIN;
			goto do_cleanup;
		}

		/*
		 * We are blocked.  For POSIX locks we have to check
		 * for deadlocks and return with EDEADLK.  This is done
		 * by checking whether range->lf_owner is already
		 * blocked.
		 *
		 * Since flock-style locks cover the whole file, a
		 * deadlock between those is nearly impossible.
		 * This can only occur if a process tries to lock the
		 * same inode exclusively while holding a shared lock
		 * with another descriptor.
		 * XXX How can we cleanly detect this?
		 * XXX The current mixing of flock & fcntl/lockf is evil.
		 *
		 * Handle existing locks of flock-style like POSIX locks.
		 */
		if (flags & F_POSIX) {
			TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
				if (brange->lf_owner == range->lf_owner) {
					error = EDEADLK;
					goto do_cleanup;
				}
			}
		}
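
		/*
		 * Editor's illustrative note (not from the original
		 * source): process A holds [0,49] on this file and is
		 * already sleeping on lf_blocked waiting for [50,99];
		 * process B holds [50,99] and now requests [0,49].  B
		 * finds A's conflicting lock and A's pending entry on
		 * lf_blocked, so B fails with EDEADLK instead of
		 * sleeping forever.
		 */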

		/*
		 * For flock-style locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((flags & F_POSIX) == 0 && type == F_WRLCK)
			lf_setlock(lock, owner, F_UNLCK, 0, start, end);

		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, 0, start, end);
		TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
		error = tsleep(brange, PCATCH, "lockf", 0);

		/*
		 * We may have been awakened by a signal and/or by a
		 * debugger continuing us (in which case we must remove
		 * ourselves from the blocked list) and/or by another
		 * process releasing/downgrading a lock (in which case
		 * we have already been removed from the blocked list
		 * and our lf_flags field is 1).
		 *
		 * Sleep if it looks like we might be livelocking.
		 */
		if (brange->lf_flags == 0)
			TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
		if (count == 2)
			tsleep(brange, 0, "lockfz", 2);
		else
			++count;
		lf_destroy_range(brange);

		if (error)
			goto do_cleanup;
		goto restart;
	}

	/*
	 * If there are no overlapping locks owned by us then creating
	 * the new lock is easy.  This is the most common case.
	 */
	if (first_match == NULL) {
		if (type == F_UNLCK)
			goto do_wakeup;
		if (flags & F_POSIX) {
			if (lf_count_change(owner, 1)) {
				error = ENOLCK;
				goto do_cleanup;
			}
		}
		range = new_range1;
		new_range1 = NULL;
		lf_create_range(range, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, range, insert_point);
		goto do_wakeup;
	}

	/*
	 * double_clip - Calculate a special case where TWO locks may have
	 *		 to be added due to the new lock breaking up an
	 *		 existing incompatible lock in the middle.
	 *
	 * unlock_override - Calculate a special case where NO locks
	 *		 need to be created.  This occurs when an unlock
	 *		 does not clip any locks at the front and rear.
	 *
	 * WARNING!  closef() and fdrop() assume that an F_UNLCK of the
	 *	     entire range will always succeed so the unlock_override
	 *	     case is mandatory.
	 */
	double_clip = 0;
	unlock_override = 0;
	if (first_match->lf_start < start) {
		if (first_match == last_match && last_match->lf_end > end)
			double_clip = 1;
	} else if (type == F_UNLCK && last_match->lf_end <= end) {
		unlock_override = 1;
	}
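
	/*
	 * Editor's illustrative note (not from the original source): if we
	 * hold a write lock on [0,99] and request a read lock on [40,59],
	 * first_match == last_match, its lf_start (0) is below start (40)
	 * and its lf_end (99) is above end (59), so double_clip is set and
	 * the old lock must be split into [0,39] and [60,99] around the
	 * new range.  Conversely, an F_UNLCK of [0,199] over locks [0,99]
	 * and [100,149] clips nothing at either edge, so unlock_override
	 * is set and no new record is needed at all.
	 */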

	/*
	 * Figure out the worst case net increase in POSIX locks and account
	 * for it now before we start modifying things.  If neither the
	 * first nor the last lock matches the request we may have to
	 * create additional records.  If there is only one overlapping
	 * range which needs to be clipped on both ends we wind up having
	 * to create up to two new locks, else only one.
	 *
	 * When unlocking the worst case is always 1 new lock if our
	 * unlock request cuts the middle out of an existing lock range.
	 *
	 * count represents the 'cleanup' adjustment needed.  It starts
	 * negative, is incremented whenever we create a new POSIX lock,
	 * and decremented whenever we delete an existing one.  At the
	 * end of the day it had better be <= 0 or we didn't calculate the
	 * worst case properly here.
	 */
	count = 0;
	if ((flags & F_POSIX) && !unlock_override) {
		if (!lf_match(first_match, type, flags) &&
		    !lf_match(last_match, type, flags)
		) {
			if (double_clip && type != F_UNLCK)
				count = -2;
			else
				count = -1;
		}
		if (count && lf_count_change(owner, -count)) {
			error = ENOLCK;
			goto do_cleanup;
		}
	}
	/* else flock style lock which encompasses entire range */

	/*
	 * Create and insert the lock representing the requested range.
	 * Adjust the net POSIX lock count.  We have to move our insertion
	 * point since brange now represents the first record >= start.
	 *
	 * When unlocking, no new lock is inserted but we still clip.
	 */
	if (type != F_UNLCK) {
		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, brange, insert_point);
		insert_point = brange;
		if (flags & F_POSIX)
			++count;
	} else {
		brange = NULL;
	}

	/*
	 * Handle the double_clip case.  This is the only case where
	 * we wind up having to add TWO locks.
	 */
	if (double_clip) {
		KKASSERT(first_match == last_match);
		last_match = new_range2;
		new_range2 = NULL;
		lf_create_range(last_match, first_match->lf_owner,
				first_match->lf_type, first_match->lf_flags,
				end + 1, first_match->lf_end);
		first_match->lf_end = start - 1;
		first_match->lf_flags &= ~F_NOEND;

		/*
		 * Figure out where to insert the right side clip.
		 */
		lf_insert(&lock->lf_range, last_match, first_match);
		if (last_match->lf_flags & F_POSIX)
			++count;
	}

	/*
	 * Clip or destroy the locks between first_match and last_match,
	 * inclusive.  Ignore the primary lock we created (brange).  Note
	 * that if double-clipped, first_match and last_match will be
	 * outside our clipping range.  Otherwise first_match and last_match
	 * will be deleted.
	 *
	 * We have already taken care of any double clipping.
	 *
	 * The insert_point may become invalid as we delete records, so do
	 * not use that pointer any more.  Also, when removing something
	 * other than 'range' we have to check to see if the item we are
	 * removing is 'next' and adjust 'next' properly.
	 *
	 * NOTE: brange will be NULL if F_UNLCKing.
	 */
	TAILQ_INIT(&deadlist);
	next = first_match;

	while ((range = next) != NULL) {
		next = TAILQ_NEXT(range, lf_link);

		/*
		 * Ignore elements that we do not own and ignore the
		 * primary request range which we just created.
		 */
		if (range->lf_owner != owner || range == brange)
			continue;

		/*
		 * We may have to wake up a waiter when downgrading a lock.
		 */
		if (type == F_UNLCK)
			wakeup_needed = 1;
		if (type == F_RDLCK && range->lf_type == F_WRLCK)
			wakeup_needed = 1;

		/*
		 * Clip left.  This can only occur on first_match.
		 *
		 * Merge the left clip with brange if possible.  This must
		 * be done specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation above.
		 */
		if (range->lf_start < start) {
			KKASSERT(range == first_match);
			if (brange &&
			    range->lf_end >= start - 1 &&
			    lf_match(range, type, flags)) {
				range->lf_end = brange->lf_end;
				range->lf_flags |= brange->lf_flags & F_NOEND;
				/*
				 * Removing something other than 'range',
				 * adjust 'next' if necessary.
				 */
				if (next == brange)
					next = TAILQ_NEXT(next, lf_link);
				TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
				if (brange->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
				brange = range;
			} else if (range->lf_end >= start) {
				range->lf_end = start - 1;
				if (type != F_UNLCK)
					range->lf_flags &= ~F_NOEND;
			}
			if (range == last_match)
				break;
			continue;
		}

		/*
		 * Clip right.  This can only occur on last_match.
		 *
		 * Merge the right clip if possible.  This must be done
		 * specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation.
		 *
		 * Since we are adjusting lf_start, we have to move the
		 * record to maintain the sorted list.  Since lf_start is
		 * only getting larger we can use the next element as the
		 * insert point (we don't have to backtrack).
		 */
		if (range->lf_end > end) {
			KKASSERT(range == last_match);
			if (brange &&
			    range->lf_start <= end + 1 &&
			    lf_match(range, type, flags)) {
				brange->lf_end = range->lf_end;
				brange->lf_flags |= range->lf_flags & F_NOEND;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				if (range->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
			} else if (range->lf_start <= end) {
				range->lf_start = end + 1;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				lf_insert(&lock->lf_range, range, next);
			}
			/* range == last_match, we are done */
			break;
		}

		/*
		 * The record must be entirely enclosed.  Note that the
		 * record could be first_match or last_match, and will be
		 * deleted.
		 */
		KKASSERT(range->lf_start >= start && range->lf_end <= end);
		TAILQ_REMOVE(&lock->lf_range, range, lf_link);
		if (range->lf_flags & F_POSIX)
			--count;
		TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		if (range == last_match)
			break;
	}
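
	/*
	 * Editor's illustrative note (not from the original source):
	 * unlocking [50,150] when we own [0,99] and [120,200] exercises
	 * both clips above: [0,99] extends left of the request and is
	 * trimmed to [0,49]; [120,200] extends right of it, becomes
	 * [151,200] and is re-inserted to keep the list sorted; a lock
	 * such as [60,110] is entirely enclosed and simply deleted.
	 */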

	/*
	 * Attempt to merge locks adjacent to brange.  For example, we may
	 * have had to clip first_match and/or last_match, and they might
	 * be adjacent.  Or there might simply have been an adjacent lock
	 * already there.
	 *
	 * Don't get fancy, just check adjacent elements in the list if they
	 * happen to be owned by us.
	 *
	 * This case only gets hit if we have a situation where a shared
	 * and exclusive lock are adjacent, and the exclusive lock is
	 * downgraded to shared or the shared lock is upgraded to exclusive.
	 */
	if (brange) {
		range = TAILQ_PREV(brange, lockf_range_list, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_end == brange->lf_start - 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend range to cover brange and scrap brange.
			 */
			range->lf_end = brange->lf_end;
			range->lf_flags |= brange->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
			if (brange->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
			brange = range;
		}
		range = TAILQ_NEXT(brange, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_start == brange->lf_end + 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend brange to cover range and scrap range.
			 */
			brange->lf_end = range->lf_end;
			brange->lf_flags |= range->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, range, lf_link);
			if (range->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		}
	}
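
	/*
	 * Editor's illustrative note (not from the original source): if we
	 * hold an exclusive lock on [0,49] and a shared lock on [50,99],
	 * downgrading [0,49] to shared replaces it with a new shared
	 * range, leaving two adjacent shared ranges owned by us; the
	 * forward merge above coalesces them into a single [0,99] shared
	 * lock and queues the leftover record on the deadlist.
	 */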

	/*
	 * Destroy deleted elements.  We didn't want to do it in the loop
	 * because the free() might have blocked.
	 *
	 * Adjust the count for any POSIX locks we thought we might create
	 * but didn't.
	 */
	while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
		TAILQ_REMOVE(&deadlist, range, lf_link);
		lf_destroy_range(range);
	}

	KKASSERT(count <= 0);
	if (count < 0)
		lf_count_change(owner, count);
do_wakeup:
	lf_print_lock(lock);
	if (wakeup_needed)
		lf_wakeup(lock, start, end);
	error = 0;
do_cleanup:
	if (new_range1 != NULL)
		lf_destroy_range(new_range1);
	if (new_range2 != NULL)
		lf_destroy_range(new_range2);
	return(error);
}

/*
 * Check whether there is a blocking lock,
 * and if so return its process identifier.
 */
static int
lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner,
	   int type, int flags, off_t start, off_t end)
{
	struct lockf_range *range;

	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		if (range->lf_owner != owner &&
		    lf_overlap(range, start, end) &&
		    (type == F_WRLCK || range->lf_type == F_WRLCK))
			break;
	if (range == NULL) {
		fl->l_type = F_UNLCK;
		return(0);
	}
	fl->l_type = range->lf_type;
	fl->l_whence = SEEK_SET;
	fl->l_start = range->lf_start;
	if (range->lf_flags & F_NOEND)
		fl->l_len = 0;
	else
		fl->l_len = range->lf_end - range->lf_start + 1;
	if (range->lf_owner != NULL && (range->lf_flags & F_POSIX))
		fl->l_pid = range->lf_owner->p_pid;
	else
		fl->l_pid = -1;
	return(0);
}

/*
 * Wake up pending lock attempts.  Theoretically we can stop as soon as
 * we encounter an exclusive request that covers the whole range (at least
 * insofar as the sleep code above calls lf_wakeup() if it would otherwise
 * exit instead of loop), but for now just wake up all overlapping
 * requests.  XXX
 */
static void
lf_wakeup(struct lockf *lock, off_t start, off_t end)
{
	struct lockf_range *range, *nrange;

	TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) {
		if (lf_overlap(range, start, end) == 0)
			continue;
		TAILQ_REMOVE(&lock->lf_blocked, range, lf_link);
		range->lf_flags = 1;
		wakeup(range);
	}
}

/*
 * Allocate a range structure and initialize it sufficiently such that
 * lf_destroy_range() does not barf.
 */
static struct lockf_range *
lf_alloc_range(void)
{
	struct lockf_range *range;

#ifdef INVARIANTS
	atomic_add_int(&lf_global_counter, 1);
#endif
	range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK);
	range->lf_owner = NULL;
	return(range);
}

static void
lf_insert(struct lockf_range_list *list, struct lockf_range *elm,
	  struct lockf_range *insert_point)
{
	while (insert_point && insert_point->lf_start < elm->lf_start)
		insert_point = TAILQ_NEXT(insert_point, lf_link);
	if (insert_point != NULL)
		TAILQ_INSERT_BEFORE(insert_point, elm, lf_link);
	else
		TAILQ_INSERT_TAIL(list, elm, lf_link);
}
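
/*
 * Editor's illustrative note (not from the original source): with ranges
 * starting at 10, 30 and 70 in the list and insert_point at the 30-range,
 * inserting an element whose lf_start is 50 walks forward past 30
 * (30 < 50), stops at 70 and links the element before it, preserving the
 * lf_start ordering.  A NULL insert_point sends the element to the tail.
 */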

static void
lf_create_range(struct lockf_range *range, struct proc *owner, int type,
		int flags, off_t start, off_t end)
{
	KKASSERT(start <= end);
	range->lf_type = type;
	range->lf_flags = flags;
	range->lf_start = start;
	range->lf_end = end;
	range->lf_owner = owner;

	lf_printf("lf_create_range: %lld..%lld\n",
		  range->lf_start, range->lf_end);
}

static void
lf_destroy_range(struct lockf_range *range)
{
	lf_printf("lf_destroy_range: %lld..%lld\n",
		  range->lf_start, range->lf_end);
	kfree(range, M_LOCKF);
#ifdef INVARIANTS
	atomic_add_int(&lf_global_counter, -1);
	KKASSERT(lf_global_counter >= 0);
#endif
}

#ifdef LOCKF_DEBUG

static void
_lf_printf(const char *ctl, ...)
{
	struct proc *p;
	__va_list va;

	if (lf_print_ranges) {
		if ((p = curproc) != NULL)
			kprintf("pid %d (%s): ", p->p_pid, p->p_comm);
	}
	__va_start(va, ctl);
	kvprintf(ctl, va);
	__va_end(va);
}

static void
_lf_print_lock(const struct lockf *lock)
{
	struct lockf_range *range;

	if (lf_print_ranges == 0)
		return;

	if (TAILQ_EMPTY(&lock->lf_range)) {
		lf_printf("lockf %p: no ranges locked\n", lock);
	} else {
		lf_printf("lockf %p:\n", lock);
	}
	TAILQ_FOREACH(range, &lock->lf_range, lf_link)
		kprintf("\t%lld..%lld type %s owned by %d\n",
			range->lf_start, range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1);
	if (TAILQ_EMPTY(&lock->lf_blocked))
		kprintf("no process waiting for range\n");
	else
| 899 | kprintf("blocked locks:"); |
	TAILQ_FOREACH(range, &lock->lf_blocked, lf_link)
		kprintf("\t%lld..%lld type %s waiting on %p\n",
			range->lf_start, range->lf_end,
			range->lf_type == F_RDLCK ? "shared" : "exclusive",
			range);
}
#endif	/* LOCKF_DEBUG */