| Commit | Line | Data |
|---|---|---|
| 984263bc | 1 | /* |
| 0a019f0d | 2 | * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>. All rights reserved. |
| a3190466 | 3 | * Copyright (c) 2006 Matthew Dillon <dillon@backplane.com>. All rights reserved. |
| 508ceb09 | 4 | * |
| 984263bc MD |
5 | * Copyright (c) 1982, 1986, 1989, 1993 |
| 6 | * The Regents of the University of California. All rights reserved. | |
| 7 | * | |
| 8 | * This code is derived from software contributed to Berkeley by | |
| 9 | * Scooter Morris at Genentech Inc. | |
| 10 | * | |
| 11 | * Redistribution and use in source and binary forms, with or without | |
| 12 | * modification, are permitted provided that the following conditions | |
| 13 | * are met: | |
| 14 | * 1. Redistributions of source code must retain the above copyright | |
| 15 | * notice, this list of conditions and the following disclaimer. | |
| 16 | * 2. Redistributions in binary form must reproduce the above copyright | |
| 17 | * notice, this list of conditions and the following disclaimer in the | |
| 18 | * documentation and/or other materials provided with the distribution. | |
| 19 | * 3. All advertising materials mentioning features or use of this software | |
| 20 | * must display the following acknowledgement: | |
| 21 | * This product includes software developed by the University of | |
| 22 | * California, Berkeley and its contributors. | |
| 23 | * 4. Neither the name of the University nor the names of its contributors | |
| 24 | * may be used to endorse or promote products derived from this software | |
| 25 | * without specific prior written permission. | |
| 26 | * | |
| 27 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
| 28 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
| 29 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
| 30 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
| 31 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
| 32 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
| 33 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
| 34 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
| 35 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
| 36 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
| 37 | * SUCH DAMAGE. | |
| 38 | * | |
| 39 | * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 | |
| 40 | * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $ | |
| 9b725a11 | 41 | * $DragonFly: src/sys/kern/kern_lockf.c,v 1.37 2007/11/01 22:48:16 dillon Exp $ |
| 984263bc MD |
42 | */ |
| 43 | ||
| 984263bc MD |
44 | #include <sys/param.h> |
| 45 | #include <sys/systm.h> | |
| 46 | #include <sys/kernel.h> | |
| 47 | #include <sys/lock.h> | |
| 48 | #include <sys/proc.h> | |
| 49 | #include <sys/unistd.h> | |
| 50 | #include <sys/vnode.h> | |
| 51 | #include <sys/malloc.h> | |
| 52 | #include <sys/fcntl.h> | |
| 508ceb09 | 53 | #include <sys/resourcevar.h> |
| 984263bc MD |
54 | |
| 55 | #include <sys/lockf.h> | |
| 508ceb09 | 56 | #include <machine/limits.h> /* for LLONG_MAX */ |
| 6449cdd4 | 57 | #include <machine/stdarg.h> |
| 508ceb09 | 58 | |
| 9d7a637e AE |
59 | #include <sys/spinlock2.h> |
| 60 | ||
| 508ceb09 JS |
61 | #ifdef INVARIANTS |
| 62 | int lf_global_counter = 0; | |
| 63 | #endif | |
| 6449cdd4 | 64 | |
| 508ceb09 JS |
65 | #ifdef LOCKF_DEBUG |
| 66 | int lf_print_ranges = 0; | |
| 67 | ||
| 6449cdd4 MD |
68 | static void _lf_print_lock(const struct lockf *); |
| 69 | static void _lf_printf(const char *, ...); | |
| 70 | ||
| 71 | #define lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock) | |
| 72 | #define lf_printf(ctl, args...) if (lf_print_ranges) _lf_printf(ctl, args) | |
| 73 | #else | |
| 74 | #define lf_print_lock(lock) | |
| 75 | #define lf_printf(ctl, args...) | |
| 508ceb09 JS |
76 | #endif |
| 77 | ||
| 78 | static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures"); | |
| 79 | ||
| 80 | static void lf_wakeup(struct lockf *, off_t, off_t); | |
| 4f0a1741 JS |
81 | static struct lockf_range *lf_alloc_range(void); |
| 82 | static void lf_create_range(struct lockf_range *, struct proc *, int, int, | |
| cae44669 | 83 | off_t, off_t); |
| a3190466 MD |
84 | static void lf_insert(struct lockf_range_list *list, |
| 85 | struct lockf_range *elm, | |
| 86 | struct lockf_range *insert_point); | |
| cae44669 | 87 | static void lf_destroy_range(struct lockf_range *); |
| 508ceb09 JS |
88 | |
| 89 | static int lf_setlock(struct lockf *, struct proc *, int, int, | |
| 90 | off_t, off_t); | |
| 508ceb09 JS |
91 | static int lf_getlock(struct flock *, struct lockf *, struct proc *, |
| 92 | int, int, off_t, off_t); | |
| 93 | ||
| 94 | static int lf_count_change(struct proc *, int); | |
| 984263bc MD |
95 | |
| 96 | /* | |
| a3190466 MD |
97 | * Return TRUE (non-zero) if the type and posix flags match. |
| 98 | */ | |
| 99 | static __inline | |
| 100 | int | |
| 101 | lf_match(struct lockf_range *range, int type, int flags) | |
| 102 | { | |
| 103 | if (range->lf_type != type) | |
| 104 | return(0); | |
| 105 | if ((range->lf_flags ^ flags) & F_POSIX) | |
| 106 | return(0); | |
| 107 | return(1); | |
| 108 | } | |
| 109 | ||
| 110 | /* | |
| 111 | * Check whether range and [start, end] overlap. | |
| 112 | */ | |
| 113 | static __inline | |
| 114 | int | |
| 115 | lf_overlap(const struct lockf_range *range, off_t start, off_t end) | |
| 116 | { | |
| 117 | if (range->lf_start >= start && range->lf_start <= end) | |
| 118 | return(1); | |
| 119 | else if (start >= range->lf_start && start <= range->lf_end) | |
| 120 | return(1); | |
| 121 | else | |
| 122 | return(0); | |
| 123 | } | |
| 124 | ||
| 125 | ||
| 126 | /* | |
| 508ceb09 | 127 | * Change the POSIX lock accounting for the given process. |
| 984263bc | 128 | */ |
| 508ceb09 | 129 | void |
| b76a3852 | 130 | lf_count_adjust(struct proc *p, int increase) |
| 508ceb09 JS |
131 | { |
| 132 | struct uidinfo *uip; | |
| 984263bc | 133 | |
| 508ceb09 | 134 | KKASSERT(p != NULL); |
| 984263bc | 135 | |
| 508ceb09 | 136 | uip = p->p_ucred->cr_uidinfo; |
| 9d7a637e | 137 | spin_lock_wr(&uip->ui_lock); |
| 984263bc | 138 | |
| b76a3852 JS |
139 | if (increase) |
| 140 | uip->ui_posixlocks += p->p_numposixlocks; | |
| 141 | else | |
| 142 | uip->ui_posixlocks -= p->p_numposixlocks; | |
| 984263bc | 143 | |
| 508ceb09 | 144 | KASSERT(uip->ui_posixlocks >= 0, |
| b76a3852 JS |
145 | ("Negative number of POSIX locks held by %s user: %d.", |
| 146 | increase ? "new" : "old", uip->ui_posixlocks)); | |
| 9d7a637e | 147 | spin_unlock_wr(&uip->ui_lock); |
| 508ceb09 | 148 | } |
| 984263bc | 149 | |
| 508ceb09 JS |
150 | static int |
| 151 | lf_count_change(struct proc *owner, int diff) | |
| 152 | { | |
| 153 | struct uidinfo *uip; | |
| 9d7a637e | 154 | int max, ret; |
| 508ceb09 JS |
155 | |
| 156 | /* we might actually not have a process context */ | |
| 157 | if (owner == NULL) | |
| 158 | return(0); | |
| 159 | ||
| 160 | uip = owner->p_ucred->cr_uidinfo; | |
| 161 | ||
| 162 | max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur, | |
| 163 | maxposixlocksperuid); | |
| 9d7a637e AE |
164 | |
| 165 | spin_lock_wr(&uip->ui_lock); | |
| 508ceb09 | 166 | if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 && |
| cae44669 | 167 | uip->ui_posixlocks >= max ) { |
| 9d7a637e AE |
168 | ret = 1; |
| 169 | } else { | |
| 170 | uip->ui_posixlocks += diff; | |
| 171 | owner->p_numposixlocks += diff; | |
| 172 | KASSERT(uip->ui_posixlocks >= 0, | |
| 173 | ("Negative number of POSIX locks held by user: %d.", | |
| 174 | uip->ui_posixlocks)); | |
| 175 | KASSERT(owner->p_numposixlocks >= 0, | |
| 176 | ("Negative number of POSIX locks held by proc: %d.", | |
| 177 | uip->ui_posixlocks)); | |
| 178 | ret = 0; | |
| cae44669 | 179 | } |
| 9d7a637e AE |
180 | spin_unlock_wr(&uip->ui_lock); |
| 181 | return ret; | |
| 508ceb09 | 182 | } |
| 984263bc MD |
183 | |
| 184 | /* | |
| 185 | * Advisory record locking support | |
| 186 | */ | |
| 187 | int | |
| 508ceb09 | 188 | lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size) |
| 984263bc | 189 | { |
| 1fd87d54 | 190 | struct flock *fl = ap->a_fl; |
| 508ceb09 | 191 | struct proc *owner; |
| 984263bc | 192 | off_t start, end; |
| 97ef7fc2 | 193 | int type, flags, error; |
| 3b998fa9 | 194 | lwkt_token_t token; |
| 508ceb09 | 195 | |
| 984263bc MD |
196 | /* |
| 197 | * Convert the flock structure into a start and end. | |
| 198 | */ | |
| 199 | switch (fl->l_whence) { | |
| 984263bc MD |
200 | case SEEK_SET: |
| 201 | case SEEK_CUR: | |
| 202 | /* | |
| 203 | * Caller is responsible for adding any necessary offset | |
| 204 | * when SEEK_CUR is used. | |
| 205 | */ | |
| 206 | start = fl->l_start; | |
| 207 | break; | |
| 208 | ||
| 209 | case SEEK_END: | |
| 210 | start = size + fl->l_start; | |
| 211 | break; | |
| 212 | ||
| 213 | default: | |
| 508ceb09 | 214 | return(EINVAL); |
| 984263bc | 215 | } |
| 9b725a11 MD |
216 | |
| 217 | flags = ap->a_flags; | |
| 984263bc | 218 | if (start < 0) |
| 508ceb09 JS |
219 | return(EINVAL); |
| 220 | if (fl->l_len == 0) { | |
| 221 | flags |= F_NOEND; | |
| 86fc1543 | 222 | end = LLONG_MAX; |
| 508ceb09 | 223 | } else { |
| 984263bc MD |
224 | end = start + fl->l_len - 1; |
| 225 | if (end < start) | |
| 508ceb09 | 226 | return(EINVAL); |
| 984263bc | 227 | } |
| 508ceb09 | 228 | |
| 508ceb09 | 229 | type = fl->l_type; |
| 984263bc | 230 | /* |
| 508ceb09 JS |
231 | * This isn't really correct for flock-style locks, |
| 232 | * but the current handling is somewhat broken anyway. | |
| 984263bc | 233 | */ |
| 508ceb09 JS |
234 | owner = (struct proc *)ap->a_id; |
| 235 | ||
| 984263bc MD |
236 | /* |
| 237 | * Do the requested operation. | |
| 238 | */ | |
| 3b998fa9 | 239 | token = lwkt_getpooltoken(lock); |
| 0e23bc1f JS |
240 | |
| 241 | if (lock->init_done == 0) { | |
| 242 | TAILQ_INIT(&lock->lf_range); | |
| 243 | TAILQ_INIT(&lock->lf_blocked); | |
| 244 | lock->init_done = 1; | |
| 245 | } | |
| 246 | ||
| 984263bc MD |
247 | switch(ap->a_op) { |
| 248 | case F_SETLK: | |
| 93df580f MD |
249 | /* |
| 250 | * NOTE: It is possible for both lf_range and lf_blocked to | |
| 251 | * be empty if we block and get woken up, but another process | |
| 252 | * then gets in and issues an unlock. So VMAYHAVELOCKS must | |
| 253 | * be set after the lf_setlock() operation completes rather | |
| 254 | * then before. | |
| 255 | */ | |
| 97ef7fc2 | 256 | error = lf_setlock(lock, owner, type, flags, start, end); |
| 2247fe02 | 257 | vsetflags(ap->a_vp, VMAYHAVELOCKS); |
| 97ef7fc2 | 258 | break; |
| 984263bc MD |
259 | |
| 260 | case F_UNLCK: | |
| a3190466 | 261 | error = lf_setlock(lock, owner, type, flags, start, end); |
| 2792dfce MD |
262 | if (TAILQ_EMPTY(&lock->lf_range) && |
| 263 | TAILQ_EMPTY(&lock->lf_blocked)) { | |
| 2247fe02 | 264 | vclrflags(ap->a_vp, VMAYHAVELOCKS); |
| 2792dfce | 265 | } |
| 97ef7fc2 | 266 | break; |
| 984263bc MD |
267 | |
| 268 | case F_GETLK: | |
| 97ef7fc2 JS |
269 | error = lf_getlock(fl, lock, owner, type, flags, start, end); |
| 270 | break; | |
| 984263bc MD |
271 | |
| 272 | default: | |
| 97ef7fc2 JS |
273 | error = EINVAL; |
| 274 | break; | |
| 984263bc | 275 | } |
| 3b998fa9 | 276 | lwkt_reltoken(token); |
| 97ef7fc2 | 277 | return(error); |
| 984263bc MD |
278 | } |
| 279 | ||
| 984263bc | 280 | static int |
| 508ceb09 JS |
281 | lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags, |
| 282 | off_t start, off_t end) | |
| 984263bc | 283 | { |
| a3190466 MD |
284 | struct lockf_range *range; |
| 285 | struct lockf_range *brange; | |
| 286 | struct lockf_range *next; | |
| 287 | struct lockf_range *first_match; | |
| 288 | struct lockf_range *last_match; | |
| 289 | struct lockf_range *insert_point; | |
| 290 | struct lockf_range *new_range1; | |
| 291 | struct lockf_range *new_range2; | |
| 292 | int wakeup_needed; | |
| 293 | int double_clip; | |
| 4f0a1741 | 294 | int error = 0; |
| cae44669 | 295 | int count; |
| a3190466 MD |
296 | struct lockf_range_list deadlist; |
| 297 | ||
| 298 | new_range1 = NULL; | |
| 299 | new_range2 = NULL; | |
| 300 | count = 0; | |
| 508ceb09 JS |
301 | |
| 302 | restart: | |
| a3190466 MD |
303 | /* |
| 304 | * Preallocate two ranges so we don't have to worry about blocking | |
| 305 | * in the middle of the lock code. | |
| 306 | */ | |
| 4f0a1741 JS |
307 | if (new_range1 == NULL) |
| 308 | new_range1 = lf_alloc_range(); | |
| 309 | if (new_range2 == NULL) | |
| 310 | new_range2 = lf_alloc_range(); | |
| 508ceb09 | 311 | first_match = NULL; |
| a3190466 | 312 | last_match = NULL; |
| 508ceb09 JS |
313 | insert_point = NULL; |
| 314 | wakeup_needed = 0; | |
| 984263bc | 315 | |
| 6449cdd4 | 316 | lf_print_lock(lock); |
| 984263bc | 317 | |
| a3190466 MD |
318 | /* |
| 319 | * Locate the insertion point for the new lock (the first range | |
| 320 | * with an lf_start >= start). | |
| 321 | * | |
| 322 | * Locate the first and latch ranges owned by us that overlap | |
| 323 | * the requested range. | |
| 324 | */ | |
| 508ceb09 JS |
325 | TAILQ_FOREACH(range, &lock->lf_range, lf_link) { |
| 326 | if (insert_point == NULL && range->lf_start >= start) | |
| 327 | insert_point = range; | |
| a3190466 MD |
328 | |
| 329 | /* | |
| 330 | * Skip non-overlapping locks. Locks are sorted by lf_start | |
| 331 | * So we can terminate the search when lf_start exceeds the | |
| 332 | * requested range (insert_point is still guarenteed to be | |
| 333 | * set properly). | |
| 334 | */ | |
| 335 | if (range->lf_end < start) | |
| 508ceb09 | 336 | continue; |
| a3190466 MD |
337 | if (range->lf_start > end) { |
| 338 | range = NULL; | |
| 339 | break; | |
| 340 | } | |
| 341 | ||
| 342 | /* | |
| 343 | * Overlapping lock. Set first_match and last_match if we | |
| 344 | * are the owner. | |
| 345 | */ | |
| 508ceb09 JS |
346 | if (range->lf_owner == owner) { |
| 347 | if (first_match == NULL) | |
| 348 | first_match = range; | |
| a3190466 | 349 | last_match = range; |
| 508ceb09 | 350 | continue; |
| 984263bc | 351 | } |
| a3190466 MD |
352 | |
| 353 | /* | |
| 354 | * If we aren't the owner check for a conflicting lock. Only | |
| 355 | * if not unlocking. | |
| 356 | */ | |
| 357 | if (type != F_UNLCK) { | |
| 358 | if (type == F_WRLCK || range->lf_type == F_WRLCK) | |
| 359 | break; | |
| 360 | } | |
| 508ceb09 JS |
361 | } |
| 362 | ||
| a3190466 MD |
363 | /* |
| 364 | * If a conflicting lock was observed, block or fail as appropriate. | |
| 365 | * (this code is skipped when unlocking) | |
| 366 | */ | |
| 508ceb09 | 367 | if (range != NULL) { |
| 4f0a1741 JS |
368 | if ((flags & F_WAIT) == 0) { |
| 369 | error = EAGAIN; | |
| 370 | goto do_cleanup; | |
| 371 | } | |
| 508ceb09 | 372 | |
| 984263bc | 373 | /* |
| 508ceb09 JS |
374 | * We are blocked. For POSIX locks we have to check |
| 375 | * for deadlocks and return with EDEADLK. This is done | |
| 3d8f95ac | 376 | * by checking whether range->lf_owner is already |
| 508ceb09 JS |
377 | * blocked. |
| 378 | * | |
| 379 | * Since flock-style locks cover the whole file, a | |
| 380 | * deadlock between those is nearly impossible. | |
| 381 | * This can only occur if a process tries to lock the | |
| 382 | * same inode exclusively while holding a shared lock | |
| 383 | * with another descriptor. | |
| 384 | * XXX How can we cleanly detect this? | |
| 385 | * XXX The current mixing of flock & fcntl/lockf is evil. | |
| 984263bc | 386 | * |
| 508ceb09 | 387 | * Handle existing locks of flock-style like POSIX locks. |
| 984263bc | 388 | */ |
| 508ceb09 JS |
389 | if (flags & F_POSIX) { |
| 390 | TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) | |
| 4f0a1741 JS |
391 | if (brange->lf_owner == range->lf_owner) { |
| 392 | error = EDEADLK; | |
| 393 | goto do_cleanup; | |
| 394 | } | |
| 984263bc | 395 | } |
| 508ceb09 | 396 | |
| 984263bc | 397 | /* |
| 508ceb09 | 398 | * For flock-style locks, we must first remove |
| 984263bc MD |
399 | * any shared locks that we hold before we sleep |
| 400 | * waiting for an exclusive lock. | |
| 401 | */ | |
| 71c18fe3 | 402 | if ((flags & F_POSIX) == 0 && type == F_WRLCK) |
| a3190466 | 403 | lf_setlock(lock, owner, F_UNLCK, 0, start, end); |
| 508ceb09 | 404 | |
| 4f0a1741 JS |
405 | brange = new_range1; |
| 406 | new_range1 = NULL; | |
| cae44669 | 407 | lf_create_range(brange, owner, type, 0, start, end); |
| 508ceb09 JS |
408 | TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link); |
| 409 | error = tsleep(brange, PCATCH, "lockf", 0); | |
| 410 | ||
| 984263bc | 411 | /* |
| 508ceb09 JS |
412 | * We may have been awaked by a signal and/or by a |
| 413 | * debugger continuing us (in which case we must remove | |
| 984263bc | 414 | * ourselves from the blocked list) and/or by another |
| 508ceb09 JS |
415 | * process releasing/downgrading a lock (in which case |
| 416 | * we have already been removed from the blocked list | |
| 417 | * and our lf_flags field is 1). | |
| a3190466 MD |
418 | * |
| 419 | * Sleep if it looks like we might be livelocking. | |
| 984263bc | 420 | */ |
| 508ceb09 JS |
421 | if (brange->lf_flags == 0) |
| 422 | TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link); | |
| a3190466 MD |
423 | if (count == 2) |
| 424 | tsleep(brange, 0, "lockfz", 2); | |
| 425 | else | |
| 426 | ++count; | |
| cae44669 | 427 | lf_destroy_range(brange); |
| 508ceb09 JS |
428 | |
| 429 | if (error) | |
| 4f0a1741 | 430 | goto do_cleanup; |
| 508ceb09 JS |
431 | goto restart; |
| 432 | } | |
| 433 | ||
| a3190466 MD |
434 | /* |
| 435 | * If there are no overlapping locks owned by us then creating | |
| 436 | * the new lock is easy. This is the most common case. | |
| 437 | */ | |
| 508ceb09 | 438 | if (first_match == NULL) { |
| a3190466 MD |
439 | if (type == F_UNLCK) |
| 440 | goto do_wakeup; | |
| 508ceb09 | 441 | if (flags & F_POSIX) { |
| 4f0a1741 JS |
442 | if (lf_count_change(owner, 1)) { |
| 443 | error = ENOLCK; | |
| 444 | goto do_cleanup; | |
| 445 | } | |
| 984263bc | 446 | } |
| 4f0a1741 JS |
447 | range = new_range1; |
| 448 | new_range1 = NULL; | |
| cae44669 | 449 | lf_create_range(range, owner, type, flags, start, end); |
| a3190466 | 450 | lf_insert(&lock->lf_range, range, insert_point); |
| 508ceb09 | 451 | goto do_wakeup; |
| 984263bc | 452 | } |
| 508ceb09 | 453 | |
| a3190466 MD |
454 | /* |
| 455 | * This is a special case that we need to check for in a couple | |
| 456 | * of places. | |
| 457 | */ | |
| 458 | if (first_match == last_match && first_match->lf_start < start && | |
| 459 | last_match->lf_end > end) { | |
| 460 | double_clip = 1; | |
| 461 | } else { | |
| 462 | double_clip = 0; | |
| 463 | } | |
| cae44669 | 464 | |
| a3190466 MD |
465 | /* |
| 466 | * Figure out the worst case net increase in POSIX locks and account | |
| 467 | * for it now before we start modifying things. If neither the | |
| 468 | * first or last locks match we have an issue. If there is only | |
| 469 | * one overlapping range which needs to be clipped on both ends | |
| 470 | * we wind up having to create up to two new locks, else only one. | |
| 471 | * | |
| e95b513d MD |
472 | * When unlocking the worst case is always 1 new lock if our |
| 473 | * unlock request cuts the middle out of an existing lock range. | |
| 474 | * | |
| a3190466 MD |
475 | * count represents the 'cleanup' adjustment needed. It starts |
| 476 | * negative, is incremented whenever we create a new POSIX lock, | |
| 477 | * and decremented whenever we delete an existing one. At the | |
| 478 | * end of the day it had better be <= 0 or we didn't calculate the | |
| 479 | * worse case properly here. | |
| 480 | */ | |
| 481 | count = 0; | |
| 482 | if (flags & F_POSIX) { | |
| 483 | if (!lf_match(first_match, type, flags) && | |
| 484 | !lf_match(last_match, type, flags) | |
| 485 | ) { | |
| e95b513d | 486 | if (double_clip && type != F_UNLCK) |
| a3190466 | 487 | count = -2; |
| 508ceb09 | 488 | else |
| a3190466 MD |
489 | count = -1; |
| 490 | } | |
| 491 | if (count && lf_count_change(owner, -count)) { | |
| 492 | error = ENOLCK; | |
| 493 | goto do_cleanup; | |
| 508ceb09 | 494 | } |
| a3190466 | 495 | } |
| e95b513d | 496 | /* else flock style lock which encompasses entire range */ |
| a3190466 MD |
497 | |
| 498 | /* | |
| 499 | * Create and insert the lock represented the requested range. | |
| 500 | * Adjust the net POSIX lock count. We have to move our insertion | |
| 501 | * point since brange now represents the first record >= start. | |
| 502 | * | |
| 503 | * When unlocking, no new lock is inserted but we still clip. | |
| 504 | */ | |
| 505 | if (type != F_UNLCK) { | |
| 506 | brange = new_range1; | |
| 507 | new_range1 = NULL; | |
| 508 | lf_create_range(brange, owner, type, flags, start, end); | |
| 509 | lf_insert(&lock->lf_range, brange, insert_point); | |
| 510 | insert_point = brange; | |
| 511 | if (flags & F_POSIX) | |
| 512 | ++count; | |
| 513 | } else { | |
| 514 | brange = NULL; | |
| 515 | } | |
| 516 | ||
| 517 | /* | |
| 518 | * Handle the double_clip case. This is the only case where | |
| 519 | * we wind up having to add TWO locks. | |
| 520 | */ | |
| 521 | if (double_clip) { | |
| 522 | KKASSERT(first_match == last_match); | |
| 523 | last_match = new_range2; | |
| 524 | new_range2 = NULL; | |
| 525 | lf_create_range(last_match, first_match->lf_owner, | |
| 526 | first_match->lf_type, first_match->lf_flags, | |
| 527 | end + 1, first_match->lf_end); | |
| 528 | first_match->lf_end = start - 1; | |
| 529 | first_match->lf_flags &= ~F_NOEND; | |
| cae44669 | 530 | |
| 984263bc | 531 | /* |
| a3190466 | 532 | * Figure out where to insert the right side clip. |
| 984263bc | 533 | */ |
| a3190466 MD |
534 | lf_insert(&lock->lf_range, last_match, first_match); |
| 535 | if (last_match->lf_flags & F_POSIX) | |
| 536 | ++count; | |
| 508ceb09 | 537 | } |
| 984263bc | 538 | |
| cae44669 | 539 | /* |
| a3190466 MD |
540 | * Clip or destroy the locks between first_match and last_match, |
| 541 | * inclusive. Ignore the primary lock we created (brange). Note | |
| 542 | * that if double-clipped, first_match and last_match will be | |
| 543 | * outside our clipping range. Otherwise first_match and last_match | |
| 544 | * will be deleted. | |
| 545 | * | |
| 546 | * We have already taken care of any double clipping. | |
| 547 | * | |
| e95b513d MD |
548 | * The insert_point may become invalid as we delete records, do not |
| 549 | * use that pointer any more. Also, when removing something other | |
| 550 | * then 'range' we have to check to see if the item we are removing | |
| 551 | * is 'next' and adjust 'next' properly. | |
| a3190466 MD |
552 | * |
| 553 | * NOTE: brange will be NULL if F_UNLCKing. | |
| cae44669 | 554 | */ |
| a3190466 MD |
555 | TAILQ_INIT(&deadlist); |
| 556 | next = first_match; | |
| 557 | ||
| 558 | while ((range = next) != NULL) { | |
| 559 | next = TAILQ_NEXT(range, lf_link); | |
| 560 | ||
| cae44669 | 561 | /* |
| a3190466 MD |
562 | * Ignore elements that we do not own and ignore the |
| 563 | * primary request range which we just created. | |
| cae44669 | 564 | */ |
| a3190466 MD |
565 | if (range->lf_owner != owner || range == brange) |
| 566 | continue; | |
| cae44669 MD |
567 | |
| 568 | /* | |
| a3190466 | 569 | * We may have to wakeup a waiter when downgrading a lock. |
| cae44669 | 570 | */ |
| a3190466 MD |
571 | if (type == F_UNLCK) |
| 572 | wakeup_needed = 1; | |
| 573 | if (type == F_RDLCK && range->lf_type == F_WRLCK) | |
| 574 | wakeup_needed = 1; | |
| 508ceb09 | 575 | |
| a3190466 | 576 | /* |
| e95b513d MD |
577 | * Clip left. This can only occur on first_match. |
| 578 | * | |
| 579 | * Merge the left clip with brange if possible. This must | |
| 580 | * be done specifically, not in the optimized merge heuristic | |
| 581 | * below, since we may have counted on it in our 'count' | |
| 582 | * calculation above. | |
| a3190466 MD |
583 | */ |
| 584 | if (range->lf_start < start) { | |
| 585 | KKASSERT(range == first_match); | |
| e95b513d MD |
586 | if (brange && |
| 587 | range->lf_end >= start - 1 && | |
| 588 | lf_match(range, type, flags)) { | |
| 589 | range->lf_end = brange->lf_end; | |
| 590 | range->lf_flags |= brange->lf_flags & F_NOEND; | |
| 591 | /* | |
| 592 | * Removing something other then 'range', | |
| 593 | * adjust 'next' if necessary. | |
| 594 | */ | |
| 595 | if (next == brange) | |
| 596 | next = TAILQ_NEXT(next, lf_link); | |
| 597 | TAILQ_REMOVE(&lock->lf_range, brange, lf_link); | |
| 598 | if (brange->lf_flags & F_POSIX) | |
| 599 | --count; | |
| 600 | TAILQ_INSERT_TAIL(&deadlist, brange, lf_link); | |
| 601 | brange = range; | |
| 602 | } else if (range->lf_end >= start) { | |
| a3190466 | 603 | range->lf_end = start - 1; |
| e95b513d MD |
604 | if (type != F_UNLCK) |
| 605 | range->lf_flags &= ~F_NOEND; | |
| 508ceb09 | 606 | } |
| a3190466 | 607 | if (range == last_match) |
| 984263bc | 608 | break; |
| a3190466 MD |
609 | continue; |
| 610 | } | |
| cae44669 | 611 | |
| a3190466 | 612 | /* |
| e95b513d MD |
613 | * Clip right. This can only occur on last_match. |
| 614 | * | |
| 615 | * Merge the right clip if possible. This must be done | |
| 616 | * specifically, not in the optimized merge heuristic | |
| 617 | * below, since we may have counted on it in our 'count' | |
| 618 | * calculation. | |
| a3190466 MD |
619 | * |
| 620 | * Since we are adjusting lf_start, we have to move the | |
| 621 | * record to maintain the sorted list. Since lf_start is | |
| 622 | * only getting larger we can use the next element as the | |
| 623 | * insert point (we don't have to backtrack). | |
| 624 | */ | |
| 625 | if (range->lf_end > end) { | |
| 626 | KKASSERT(range == last_match); | |
| e95b513d MD |
627 | if (brange && |
| 628 | range->lf_start <= end + 1 && | |
| 629 | lf_match(range, type, flags)) { | |
| 630 | brange->lf_end = range->lf_end; | |
| 631 | brange->lf_flags |= range->lf_flags & F_NOEND; | |
| 632 | TAILQ_REMOVE(&lock->lf_range, range, lf_link); | |
| 633 | if (range->lf_flags & F_POSIX) | |
| 634 | --count; | |
| 635 | TAILQ_INSERT_TAIL(&deadlist, range, lf_link); | |
| 636 | } else if (range->lf_start <= end) { | |
| 637 | range->lf_start = end + 1; | |
| 638 | TAILQ_REMOVE(&lock->lf_range, range, lf_link); | |
| 639 | lf_insert(&lock->lf_range, range, next); | |
| a3190466 MD |
640 | } |
| 641 | /* range == last_match, we are done */ | |
| 984263bc | 642 | break; |
| 508ceb09 | 643 | } |
| a3190466 MD |
644 | |
| 645 | /* | |
| 646 | * The record must be entirely enclosed. Note that the | |
| 647 | * record could be first_match or last_match, and will be | |
| 648 | * deleted. | |
| 649 | */ | |
| 650 | KKASSERT(range->lf_start >= start && range->lf_end <= end); | |
| 651 | TAILQ_REMOVE(&lock->lf_range, range, lf_link); | |
| 652 | if (range->lf_flags & F_POSIX) | |
| 653 | --count; | |
| 654 | TAILQ_INSERT_TAIL(&deadlist, range, lf_link); | |
| 655 | if (range == last_match) | |
| 656 | break; | |
| 508ceb09 | 657 | } |
| 984263bc | 658 | |
| a3190466 MD |
659 | /* |
| 660 | * Attempt to merge locks adjacent to brange. For example, we may | |
| 661 | * have had to clip first_match and/or last_match, and they might | |
| 662 | * be adjacent. Or there might simply have been an adjacent lock | |
| 663 | * already there. | |
| 664 | * | |
| 665 | * Don't get fancy, just check adjacent elements in the list if they | |
| 666 | * happen to be owned by us. | |
| e95b513d MD |
667 | * |
| 668 | * This case only gets hit if we have a situation where a shared | |
| 669 | * and exclusive lock are adjacent, and the exclusive lock is | |
| 670 | * downgraded to shared or the shared lock is upgraded to exclusive. | |
| a3190466 MD |
671 | */ |
| 672 | if (brange) { | |
| 673 | range = TAILQ_PREV(brange, lockf_range_list, lf_link); | |
| 674 | if (range && | |
| 675 | range->lf_owner == owner && | |
| 676 | range->lf_end == brange->lf_start - 1 && | |
| 677 | lf_match(range, type, flags) | |
| 678 | ) { | |
| 679 | /* | |
| e95b513d | 680 | * Extend range to cover brange and scrap brange. |
| a3190466 MD |
681 | */ |
| 682 | range->lf_end = brange->lf_end; | |
| 683 | range->lf_flags |= brange->lf_flags & F_NOEND; | |
| 684 | TAILQ_REMOVE(&lock->lf_range, brange, lf_link); | |
| 685 | if (brange->lf_flags & F_POSIX) | |
| 686 | --count; | |
| 687 | TAILQ_INSERT_TAIL(&deadlist, brange, lf_link); | |
| 688 | brange = range; | |
| 984263bc | 689 | } |
| a3190466 MD |
690 | range = TAILQ_NEXT(brange, lf_link); |
| 691 | if (range && | |
| 692 | range->lf_owner == owner && | |
| 693 | range->lf_start == brange->lf_end + 1 && | |
| 694 | lf_match(range, type, flags) | |
| 695 | ) { | |
| 696 | /* | |
| e95b513d | 697 | * Extend brange to cover range and scrap range. |
| a3190466 MD |
698 | */ |
| 699 | brange->lf_end = range->lf_end; | |
| 93df580f | 700 | brange->lf_flags |= range->lf_flags & F_NOEND; |
| a3190466 MD |
701 | TAILQ_REMOVE(&lock->lf_range, range, lf_link); |
| 702 | if (range->lf_flags & F_POSIX) | |
| 703 | --count; | |
| 704 | TAILQ_INSERT_TAIL(&deadlist, range, lf_link); | |
| 508ceb09 | 705 | } |
| 984263bc | 706 | } |
| 508ceb09 | 707 | |
| a3190466 MD |
708 | /* |
| 709 | * Destroy deleted elements. We didn't want to do it in the loop | |
| 710 | * because the free() might have blocked. | |
| 711 | * | |
| 712 | * Adjust the count for any posix locks we thought we might create | |
| 713 | * but didn't. | |
| 714 | */ | |
| 715 | while ((range = TAILQ_FIRST(&deadlist)) != NULL) { | |
| 716 | TAILQ_REMOVE(&deadlist, range, lf_link); | |
| 717 | lf_destroy_range(range); | |
| 718 | } | |
| 719 | ||
| 720 | KKASSERT(count <= 0); | |
| 721 | if (count < 0) | |
| 722 | lf_count_change(owner, count); | |
| 508ceb09 | 723 | do_wakeup: |
| 6449cdd4 | 724 | lf_print_lock(lock); |
| 508ceb09 JS |
725 | if (wakeup_needed) |
| 726 | lf_wakeup(lock, start, end); | |
| 4f0a1741 JS |
727 | error = 0; |
| 728 | do_cleanup: | |
| 729 | if (new_range1 != NULL) | |
| cae44669 | 730 | lf_destroy_range(new_range1); |
| 4f0a1741 | 731 | if (new_range2 != NULL) |
| cae44669 | 732 | lf_destroy_range(new_range2); |
| 4f0a1741 | 733 | return(error); |
| 984263bc MD |
734 | } |
| 735 | ||
| 984263bc MD |
736 | /* |
| 737 | * Check whether there is a blocking lock, | |
| 738 | * and if so return its process identifier. | |
| 739 | */ | |
| 740 | static int | |
| 508ceb09 JS |
741 | lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner, |
| 742 | int type, int flags, off_t start, off_t end) | |
| 984263bc | 743 | { |
| 508ceb09 | 744 | struct lockf_range *range; |
| 984263bc | 745 | |
| 508ceb09 JS |
746 | TAILQ_FOREACH(range, &lock->lf_range, lf_link) |
| 747 | if (range->lf_owner != owner && | |
| 748 | lf_overlap(range, start, end) && | |
| 749 | (type == F_WRLCK || range->lf_type == F_WRLCK)) | |
| 750 | break; | |
| 751 | if (range == NULL) { | |
| 984263bc | 752 | fl->l_type = F_UNLCK; |
| 508ceb09 | 753 | return(0); |
| 984263bc | 754 | } |
| 508ceb09 JS |
755 | fl->l_type = range->lf_type; |
| 756 | fl->l_whence = SEEK_SET; | |
| 757 | fl->l_start = range->lf_start; | |
| 758 | if (range->lf_flags & F_NOEND) | |
| 759 | fl->l_len = 0; | |
| 760 | else | |
| 761 | fl->l_len = range->lf_end - range->lf_start + 1; | |
| 762 | if (range->lf_owner != NULL && (range->lf_flags & F_POSIX)) | |
| 763 | fl->l_pid = range->lf_owner->p_pid; | |
| 764 | else | |
| 765 | fl->l_pid = -1; | |
| 766 | return(0); | |
| 984263bc MD |
767 | } |
| 768 | ||
| 769 | /* | |
| ad936517 MD |
770 | * Wakeup pending lock attempts. Theoretically we can stop as soon as |
| 771 | * we encounter an exclusive request that covers the whole range (at least | |
| 772 | * insofar as the sleep code above calls lf_wakeup() if it would otherwise | |
| 773 | * exit instead of loop), but for now just wakeup all overlapping | |
| 774 | * requests. XXX | |
| 984263bc | 775 | */ |
| 508ceb09 JS |
776 | static void |
| 777 | lf_wakeup(struct lockf *lock, off_t start, off_t end) | |
| 984263bc | 778 | { |
| 508ceb09 | 779 | struct lockf_range *range, *nrange; |
| ad936517 | 780 | |
| 508ceb09 JS |
781 | TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) { |
| 782 | if (lf_overlap(range, start, end) == 0) | |
| 984263bc | 783 | continue; |
| 508ceb09 JS |
784 | TAILQ_REMOVE(&lock->lf_blocked, range, lf_link); |
| 785 | range->lf_flags = 1; | |
| 786 | wakeup(range); | |
| 984263bc | 787 | } |
| 984263bc MD |
788 | } |
| 789 | ||
| eacc5025 MD |
790 | /* |
| 791 | * Allocate a range structure and initialize it sufficiently such that | |
| 792 | * lf_destroy_range() does not barf. | |
| 793 | */ | |
| 508ceb09 | 794 | static struct lockf_range * |
| 4f0a1741 | 795 | lf_alloc_range(void) |
| 508ceb09 | 796 | { |
| eacc5025 MD |
797 | struct lockf_range *range; |
| 798 | ||
| 4f0a1741 JS |
799 | #ifdef INVARIANTS |
| 800 | lf_global_counter++; | |
| 801 | #endif | |
| efda3bd0 | 802 | range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK); |
| eacc5025 MD |
803 | range->lf_owner = NULL; |
| 804 | return(range); | |
| 4f0a1741 | 805 | } |
| 508ceb09 | 806 | |
| 4f0a1741 | 807 | static void |
| a3190466 MD |
808 | lf_insert(struct lockf_range_list *list, struct lockf_range *elm, |
| 809 | struct lockf_range *insert_point) | |
| 810 | { | |
| 811 | while (insert_point && insert_point->lf_start < elm->lf_start) | |
| 812 | insert_point = TAILQ_NEXT(insert_point, lf_link); | |
| 813 | if (insert_point != NULL) | |
| 814 | TAILQ_INSERT_BEFORE(insert_point, elm, lf_link); | |
| 815 | else | |
| 816 | TAILQ_INSERT_TAIL(list, elm, lf_link); | |
| 817 | } | |
| 818 | ||
| 819 | static void | |
| 4f0a1741 | 820 | lf_create_range(struct lockf_range *range, struct proc *owner, int type, |
| cae44669 | 821 | int flags, off_t start, off_t end) |
| 4f0a1741 | 822 | { |
| 508ceb09 | 823 | KKASSERT(start <= end); |
| 508ceb09 JS |
824 | range->lf_type = type; |
| 825 | range->lf_flags = flags; | |
| 826 | range->lf_start = start; | |
| 827 | range->lf_end = end; | |
| 828 | range->lf_owner = owner; | |
| 984263bc | 829 | |
| 6449cdd4 MD |
830 | lf_printf("lf_create_range: %lld..%lld\n", |
| 831 | range->lf_start, range->lf_end); | |
| 984263bc MD |
832 | } |
| 833 | ||
| 984263bc | 834 | static void |
| cae44669 | 835 | lf_destroy_range(struct lockf_range *range) |
| 984263bc | 836 | { |
| 6449cdd4 | 837 | lf_printf("lf_destroy_range: %lld..%lld\n", |
| cae44669 | 838 | range->lf_start, range->lf_end); |
| efda3bd0 | 839 | kfree(range, M_LOCKF); |
| 508ceb09 JS |
840 | #ifdef INVARIANTS |
| 841 | lf_global_counter--; | |
| 842 | KKASSERT(lf_global_counter>=0); | |
| 843 | #endif | |
| 984263bc MD |
844 | } |
| 845 | ||
| 846 | #ifdef LOCKF_DEBUG | |
| 6449cdd4 | 847 | |
| 508ceb09 | 848 | static void |
| 6449cdd4 MD |
849 | _lf_printf(const char *ctl, ...) |
| 850 | { | |
| 851 | struct proc *p; | |
| 852 | __va_list va; | |
| 853 | ||
| 854 | if (lf_print_ranges) { | |
| 855 | if ((p = curproc) != NULL) | |
| 6ea70f76 | 856 | kprintf("pid %d (%s): ", p->p_pid, p->p_comm); |
| 6449cdd4 MD |
857 | } |
| 858 | __va_start(va, ctl); | |
| 379210cb | 859 | kvprintf(ctl, va); |
| 6449cdd4 MD |
860 | __va_end(va); |
| 861 | } | |
| 862 | ||
| 863 | static void | |
| 864 | _lf_print_lock(const struct lockf *lock) | |
| 984263bc | 865 | { |
| 508ceb09 | 866 | struct lockf_range *range; |
| 984263bc | 867 | |
| 6449cdd4 MD |
868 | if (lf_print_ranges == 0) |
| 869 | return; | |
| 870 | ||
| 871 | if (TAILQ_EMPTY(&lock->lf_range)) { | |
| 872 | lf_printf("lockf %p: no ranges locked\n", lock); | |
| 873 | } else { | |
| 874 | lf_printf("lockf %p:\n", lock); | |
| 875 | } | |
| 508ceb09 | 876 | TAILQ_FOREACH(range, &lock->lf_range, lf_link) |
| 6ea70f76 | 877 | kprintf("\t%lld..%lld type %s owned by %d\n", |
| 508ceb09 JS |
878 | range->lf_start, range->lf_end, |
| 879 | range->lf_type == F_RDLCK ? "shared" : "exclusive", | |
| 880 | range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1); | |
| 881 | if (TAILQ_EMPTY(&lock->lf_blocked)) | |
| 6ea70f76 | 882 | kprintf("no process waiting for range\n"); |
| 984263bc | 883 | else |
| 6ea70f76 | 884 | kprintf("blocked locks:"); |
| f99c6c54 | 885 | TAILQ_FOREACH(range, &lock->lf_blocked, lf_link) |
| 6ea70f76 | 886 | kprintf("\t%lld..%lld type %s waiting on %p\n", |
| 508ceb09 JS |
887 | range->lf_start, range->lf_end, |
| 888 | range->lf_type == F_RDLCK ? "shared" : "exclusive", | |
| 889 | range); | |
| 984263bc | 890 | } |
| 2c4af4c4 | 891 | #endif /* LOCKF_DEBUG */ |