Commit | Line | Data |
---|---|---|
984263bc | 1 | /* |
0a019f0d | 2 | * Copyright (c) 2004 Joerg Sonnenberger <joerg@bec.de>. All rights reserved. |
d6299163 | 3 | * Copyright (c) 2006-2018 Matthew Dillon <dillon@backplane.com>. All rights reserved. |
508ceb09 | 4 | * |
984263bc MD |
5 | * Copyright (c) 1982, 1986, 1989, 1993 |
6 | * The Regents of the University of California. All rights reserved. | |
7 | * | |
8 | * This code is derived from software contributed to Berkeley by | |
9 | * Scooter Morris at Genentech Inc. | |
10 | * | |
11 | * Redistribution and use in source and binary forms, with or without | |
12 | * modification, are permitted provided that the following conditions | |
13 | * are met: | |
14 | * 1. Redistributions of source code must retain the above copyright | |
15 | * notice, this list of conditions and the following disclaimer. | |
16 | * 2. Redistributions in binary form must reproduce the above copyright | |
17 | * notice, this list of conditions and the following disclaimer in the | |
18 | * documentation and/or other materials provided with the distribution. | |
dc71b7ab | 19 | * 3. Neither the name of the University nor the names of its contributors |
984263bc MD |
20 | * may be used to endorse or promote products derived from this software |
21 | * without specific prior written permission. | |
22 | * | |
23 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND | |
24 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
25 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
26 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE | |
27 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
28 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
29 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
30 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
31 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
32 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
33 | * SUCH DAMAGE. | |
34 | * | |
35 | * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 | |
36 | * $FreeBSD: src/sys/kern/kern_lockf.c,v 1.25 1999/11/16 16:28:56 phk Exp $ | |
37 | */ | |
38 | ||
c91da9dd SW |
39 | #include "opt_debug_lockf.h" |
40 | ||
984263bc MD |
41 | #include <sys/param.h> |
42 | #include <sys/systm.h> | |
43 | #include <sys/kernel.h> | |
44 | #include <sys/lock.h> | |
45 | #include <sys/proc.h> | |
46 | #include <sys/unistd.h> | |
47 | #include <sys/vnode.h> | |
48 | #include <sys/malloc.h> | |
49 | #include <sys/fcntl.h> | |
508ceb09 | 50 | #include <sys/resourcevar.h> |
984263bc MD |
51 | |
52 | #include <sys/lockf.h> | |
508ceb09 | 53 | #include <machine/limits.h> /* for LLONG_MAX */ |
6449cdd4 | 54 | #include <machine/stdarg.h> |
508ceb09 | 55 | |
9d7a637e AE |
56 | #include <sys/spinlock2.h> |
57 | ||
f6240282 MD |
/*
 * Small per-cpu cache of free lockf_range structures (two entries per
 * cpu), used by lf_alloc_range() to avoid kmalloc/kfree round-trips in
 * the common temporary-allocation case.  __cachealign keeps each cpu's
 * entry on its own cache line to avoid false sharing.
 */
struct lf_pcpu {
	struct lockf_range *free1;
	struct lockf_range *free2;
} __cachealign;

/* per-cpu cache array indexed by cpuid; allocated elsewhere (not shown) */
static struct lf_pcpu *lf_pcpu_array;
64 | ||
508ceb09 JS |
65 | #ifdef LOCKF_DEBUG |
66 | int lf_print_ranges = 0; | |
67 | ||
6449cdd4 | 68 | static void _lf_print_lock(const struct lockf *); |
4f697c93 | 69 | static void _lf_printf(const char *, ...) __printflike(1, 2); |
6449cdd4 MD |
70 | |
71 | #define lf_print_lock(lock) if (lf_print_ranges) _lf_print_lock(lock) | |
72 | #define lf_printf(ctl, args...) if (lf_print_ranges) _lf_printf(ctl, args) | |
73 | #else | |
74 | #define lf_print_lock(lock) | |
75 | #define lf_printf(ctl, args...) | |
508ceb09 JS |
76 | #endif |
77 | ||
78 | static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures"); | |
79 | ||
80 | static void lf_wakeup(struct lockf *, off_t, off_t); | |
4f0a1741 JS |
81 | static struct lockf_range *lf_alloc_range(void); |
82 | static void lf_create_range(struct lockf_range *, struct proc *, int, int, | |
cae44669 | 83 | off_t, off_t); |
a3190466 MD |
84 | static void lf_insert(struct lockf_range_list *list, |
85 | struct lockf_range *elm, | |
86 | struct lockf_range *insert_point); | |
cae44669 | 87 | static void lf_destroy_range(struct lockf_range *); |
508ceb09 JS |
88 | |
89 | static int lf_setlock(struct lockf *, struct proc *, int, int, | |
90 | off_t, off_t); | |
508ceb09 JS |
91 | static int lf_getlock(struct flock *, struct lockf *, struct proc *, |
92 | int, int, off_t, off_t); | |
93 | ||
94 | static int lf_count_change(struct proc *, int); | |
984263bc | 95 | |
a3190466 MD |
96 | /* |
97 | * Return TRUE (non-zero) if the type and posix flags match. | |
98 | */ | |
99 | static __inline | |
100 | int | |
101 | lf_match(struct lockf_range *range, int type, int flags) | |
102 | { | |
103 | if (range->lf_type != type) | |
104 | return(0); | |
105 | if ((range->lf_flags ^ flags) & F_POSIX) | |
106 | return(0); | |
107 | return(1); | |
108 | } | |
109 | ||
110 | /* | |
111 | * Check whether range and [start, end] overlap. | |
112 | */ | |
113 | static __inline | |
114 | int | |
115 | lf_overlap(const struct lockf_range *range, off_t start, off_t end) | |
116 | { | |
117 | if (range->lf_start >= start && range->lf_start <= end) | |
118 | return(1); | |
119 | else if (start >= range->lf_start && start <= range->lf_end) | |
120 | return(1); | |
121 | else | |
122 | return(0); | |
123 | } | |
124 | ||
125 | ||
/*
 * Change the POSIX lock accounting for the given process.
 *
 * Folds all of the process's per-cpu posix lock counts into (increase
 * non-zero) or out of (increase zero) the current cpu's bucket of the
 * process's per-uid accounting.  Presumably called when the process's
 * accounting must move between uids -- caller not visible here, confirm.
 *
 * The per-cpu delta is only flushed into the shared uip->ui_posixlocks
 * with an atomic op once it exceeds +/-PUP_LIMIT, batching atomics.
 */
void
lf_count_adjust(struct proc *p, int increase)
{
	struct uidinfo *uip;
	struct uidcount *pup;
	int n;

	KKASSERT(p != NULL);

	uip = p->p_ucred->cr_uidinfo;
	pup = &uip->ui_pcpu[mycpuid];	/* this cpu's accounting bucket */

	if (increase) {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks += p->p_uidpcpu[n].pu_posixlocks;
	} else {
		for (n = 0; n < ncpus; ++n)
			pup->pu_posixlocks -= p->p_uidpcpu[n].pu_posixlocks;
	}

	/* flush the batched delta into the shared counter when large */
	if (pup->pu_posixlocks < -PUP_LIMIT ||
	    pup->pu_posixlocks > PUP_LIMIT) {
		atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
		pup->pu_posixlocks = 0;
	}
}
984263bc | 155 | |
508ceb09 JS |
/*
 * Adjust the owner's POSIX lock count by 'diff', enforcing the per-uid
 * limit (RLIMIT_POSIXLOCKS capped by maxposixlocksperuid) on increases
 * for non-root users.
 *
 * Returns non-zero if an increase would exceed the limit (no change is
 * made in that case), 0 on success.  A NULL owner (no process context)
 * is treated as a successful no-op.
 */
static int
lf_count_change(struct proc *owner, int diff)
{
	struct uidinfo *uip;
	int max, ret;

	/* we might actually not have a process context */
	if (owner == NULL)
		return(0);

	uip = owner->p_ucred->cr_uidinfo;

	max = MIN(owner->p_rlimit[RLIMIT_POSIXLOCKS].rlim_cur,
		  maxposixlocksperuid);

	/* root (cr_uid == 0) and max == -1 (unlimited) bypass the check */
	if (diff > 0 && owner->p_ucred->cr_uid != 0 && max != -1 &&
	    uip->ui_posixlocks >= max ) {
		ret = 1;
	} else {
		struct uidcount *pup;
		int cpu = mycpuid;

		/*
		 * Batch the per-uid adjustment in this cpu's bucket and
		 * only flush to the shared counter (atomically) once it
		 * exceeds +/-PUP_LIMIT.
		 */
		pup = &uip->ui_pcpu[cpu];
		pup->pu_posixlocks += diff;
		if (pup->pu_posixlocks < -PUP_LIMIT ||
		    pup->pu_posixlocks > PUP_LIMIT) {
			atomic_add_int(&uip->ui_posixlocks, pup->pu_posixlocks);
			pup->pu_posixlocks = 0;
		}
		/* also track the delta per-process for lf_count_adjust() */
		owner->p_uidpcpu[cpu].pu_posixlocks += diff;
		ret = 0;
	}
	return ret;
}
984263bc MD |
190 | |
/*
 * Advisory record locking support (entry point for fcntl F_SETLK /
 * F_GETLK / F_UNLCK style operations and flock-style locks).
 *
 * Converts the caller's struct flock (ap->a_fl) into an absolute closed
 * byte range [start, end], lazily initializes the per-object lock lists,
 * and dispatches to lf_setlock()/lf_getlock() under the pool token for
 * 'lock'.
 *
 * Returns 0 on success or an errno (EINVAL for bad whence/range, or
 * whatever lf_setlock()/lf_getlock() return).
 */
int
lf_advlock(struct vop_advlock_args *ap, struct lockf *lock, u_quad_t size)
{
	struct flock *fl = ap->a_fl;
	struct proc *owner;
	off_t start, end;
	int type, flags, error;
	lwkt_token_t token;

	/*
	 * Convert the flock structure into a start and end.
	 */
	switch (fl->l_whence) {
	case SEEK_SET:
	case SEEK_CUR:
		/*
		 * Caller is responsible for adding any necessary offset
		 * when SEEK_CUR is used.
		 */
		start = fl->l_start;
		break;

	case SEEK_END:
		start = size + fl->l_start;
		break;

	default:
		return(EINVAL);
	}

	flags = ap->a_flags;
	if (start < 0)
		return(EINVAL);
	if (fl->l_len == 0) {
		/* l_len == 0 means "to EOF and beyond": open-ended range */
		flags |= F_NOEND;
		end = LLONG_MAX;
	} else if (fl->l_len < 0) {
		return(EINVAL);
	} else {
		end = start + fl->l_len - 1;
		if (end < start)
			return(EINVAL);
	}

	type = fl->l_type;
	/*
	 * This isn't really correct for flock-style locks,
	 * but the current handling is somewhat broken anyway.
	 */
	owner = (struct proc *)ap->a_id;

	/*
	 * Do the requested operation.
	 */
	token = lwkt_getpooltoken(lock);

	/* lazily initialize the active and blocked range lists */
	if (lock->init_done == 0) {
		TAILQ_INIT(&lock->lf_range);
		TAILQ_INIT(&lock->lf_blocked);
		lock->init_done = 1;
	}

	switch(ap->a_op) {
	case F_SETLK:
		/*
		 * NOTE: It is possible for both lf_range and lf_blocked to
		 * be empty if we block and get woken up, but another process
		 * then gets in and issues an unlock.  So VMAYHAVELOCKS must
		 * be set after the lf_setlock() operation completes rather
		 * then before.
		 */
		error = lf_setlock(lock, owner, type, flags, start, end);
		if ((ap->a_vp->v_flag & VMAYHAVELOCKS) == 0)
			vsetflags(ap->a_vp, VMAYHAVELOCKS);
		break;

	case F_UNLCK:
		error = lf_setlock(lock, owner, type, flags, start, end);
#if 0
		/*
		 * XXX REMOVED. don't bother doing this in the critical path.
		 * close() overhead is minimal.
		 */
		if (TAILQ_EMPTY(&lock->lf_range) &&
		    TAILQ_EMPTY(&lock->lf_blocked)) {
			vclrflags(ap->a_vp, VMAYHAVELOCKS);
		}
#endif
		break;

	case F_GETLK:
		error = lf_getlock(fl, lock, owner, type, flags, start, end);
		break;

	default:
		error = EINVAL;
		break;
	}
	lwkt_reltoken(token);
	return(error);
}
295 | ||
/*
 * Acquire, upgrade/downgrade, or release (type == F_UNLCK) a byte-range
 * lock for 'owner' covering [start, end].  This is the workhorse behind
 * lf_advlock() for F_SETLK and F_UNLCK.
 *
 * May tsleep() waiting for a conflicting lock when F_WAIT is set.
 * Returns 0 on success, EAGAIN (conflict, no F_WAIT), EDEADLK (POSIX
 * deadlock detected), or ENOLCK (per-uid lock limit).  Caller holds the
 * pool token for 'lock' (taken in lf_advlock()).
 */
static int
lf_setlock(struct lockf *lock, struct proc *owner, int type, int flags,
	   off_t start, off_t end)
{
	struct lockf_range *range;
	struct lockf_range *brange;
	struct lockf_range *next;
	struct lockf_range *first_match;
	struct lockf_range *last_match;
	struct lockf_range *insert_point;
	struct lockf_range *new_range1;
	struct lockf_range *new_range2;
	int wakeup_needed;
	int double_clip;
	int unlock_override;
	int error = 0;
	int count;
	struct lockf_range_list deadlist;

	new_range1 = NULL;
	new_range2 = NULL;
	count = 0;

restart:
	/*
	 * Preallocate two ranges so we don't have to worry about blocking
	 * in the middle of the lock code.
	 */
	if (new_range1 == NULL)
		new_range1 = lf_alloc_range();
	if (new_range2 == NULL)
		new_range2 = lf_alloc_range();
	first_match = NULL;
	last_match = NULL;
	insert_point = NULL;
	wakeup_needed = 0;

	lf_print_lock(lock);

	/*
	 * Locate the insertion point for the new lock (the first range
	 * with an lf_start >= start).
	 *
	 * Locate the first and latch ranges owned by us that overlap
	 * the requested range.
	 */
	TAILQ_FOREACH(range, &lock->lf_range, lf_link) {
		if (insert_point == NULL && range->lf_start >= start)
			insert_point = range;

		/*
		 * Skip non-overlapping locks.  Locks are sorted by lf_start
		 * So we can terminate the search when lf_start exceeds the
		 * requested range (insert_point is still guarenteed to be
		 * set properly).
		 */
		if (range->lf_end < start)
			continue;
		if (range->lf_start > end) {
			range = NULL;
			break;
		}

		/*
		 * Overlapping lock.  Set first_match and last_match if we
		 * are the owner.
		 */
		if (range->lf_owner == owner) {
			if (first_match == NULL)
				first_match = range;
			last_match = range;
			continue;
		}

		/*
		 * If we aren't the owner check for a conflicting lock.  Only
		 * if not unlocking.
		 */
		if (type != F_UNLCK) {
			if (type == F_WRLCK || range->lf_type == F_WRLCK)
				break;
		}
	}

	/*
	 * If a conflicting lock was observed, block or fail as appropriate.
	 * (this code is skipped when unlocking)
	 */
	if (range != NULL) {
		if ((flags & F_WAIT) == 0) {
			error = EAGAIN;
			goto do_cleanup;
		}

		/*
		 * We are blocked. For POSIX locks we have to check
		 * for deadlocks and return with EDEADLK. This is done
		 * by checking whether range->lf_owner is already
		 * blocked.
		 *
		 * Since flock-style locks cover the whole file, a
		 * deadlock between those is nearly impossible.
		 * This can only occur if a process tries to lock the
		 * same inode exclusively while holding a shared lock
		 * with another descriptor.
		 * XXX How can we cleanly detect this?
		 * XXX The current mixing of flock & fcntl/lockf is evil.
		 *
		 * Handle existing locks of flock-style like POSIX locks.
		 */
		if (flags & F_POSIX) {
			TAILQ_FOREACH(brange, &lock->lf_blocked, lf_link) {
				if (brange->lf_owner == range->lf_owner) {
					error = EDEADLK;
					goto do_cleanup;
				}
			}
		}

		/*
		 * For flock-style locks, we must first remove
		 * any shared locks that we hold before we sleep
		 * waiting for an exclusive lock.
		 */
		if ((flags & F_POSIX) == 0 && type == F_WRLCK)
			lf_setlock(lock, owner, F_UNLCK, 0, start, end);

		/* enqueue ourselves as a waiter and sleep on the range */
		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, 0, start, end);
		TAILQ_INSERT_TAIL(&lock->lf_blocked, brange, lf_link);
		error = tsleep(brange, PCATCH, "lockf", 0);

		/*
		 * We may have been awaked by a signal and/or by a
		 * debugger continuing us (in which case we must remove
		 * ourselves from the blocked list) and/or by another
		 * process releasing/downgrading a lock (in which case
		 * we have already been removed from the blocked list
		 * and our lf_flags field is 1).
		 *
		 * Sleep if it looks like we might be livelocking.
		 */
		if (brange->lf_flags == 0)
			TAILQ_REMOVE(&lock->lf_blocked, brange, lf_link);
		if (error == 0 && count == 2)
			tsleep(brange, 0, "lockfz", 2);
		else
			++count;
		lf_destroy_range(brange);

		if (error)
			goto do_cleanup;
		goto restart;
	}

	/*
	 * If there are no overlapping locks owned by us then creating
	 * the new lock is easy.  This is the most common case.
	 */
	if (first_match == NULL) {
		if (type == F_UNLCK)
			goto do_wakeup;
		if (flags & F_POSIX) {
			if (lf_count_change(owner, 1)) {
				error = ENOLCK;
				goto do_cleanup;
			}
		}
		range = new_range1;
		new_range1 = NULL;
		lf_create_range(range, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, range, insert_point);
		goto do_wakeup;
	}

	/*
	 * double_clip - Calculate a special case where TWO locks may have
	 *		 to be added due to the new lock breaking up an
	 *		 existing incompatible lock in the middle.
	 *
	 * unlock_override - Calculate a special case where NO locks
	 *		     need to be created.  This occurs when an unlock
	 *		     does not clip any locks at the front and rear.
	 *
	 * WARNING!  closef() and fdrop() assume that an F_UNLCK of the
	 *	     entire range will always succeed so the unlock_override
	 *	     case is mandatory.
	 */
	double_clip = 0;
	unlock_override = 0;
	if (first_match->lf_start < start) {
		if (first_match == last_match && last_match->lf_end > end)
			double_clip = 1;
	} else if (type == F_UNLCK && last_match->lf_end <= end) {
		unlock_override = 1;
	}

	/*
	 * Figure out the worst case net increase in POSIX locks and account
	 * for it now before we start modifying things.  If neither the
	 * first or last locks match we have an issue.  If there is only
	 * one overlapping range which needs to be clipped on both ends
	 * we wind up having to create up to two new locks, else only one.
	 *
	 * When unlocking the worst case is always 1 new lock if our
	 * unlock request cuts the middle out of an existing lock range.
	 *
	 * count represents the 'cleanup' adjustment needed.  It starts
	 * negative, is incremented whenever we create a new POSIX lock,
	 * and decremented whenever we delete an existing one.  At the
	 * end of the day it had better be <= 0 or we didn't calculate the
	 * worse case properly here.
	 */
	count = 0;
	if ((flags & F_POSIX) && !unlock_override) {
		if (!lf_match(first_match, type, flags) &&
		    !lf_match(last_match, type, flags)
		) {
			if (double_clip && type != F_UNLCK)
				count = -2;
			else
				count = -1;
		}
		if (count && lf_count_change(owner, -count)) {
			error = ENOLCK;
			goto do_cleanup;
		}
	}
	/* else flock style lock which encompasses entire range */

	/*
	 * Create and insert the lock represented the requested range.
	 * Adjust the net POSIX lock count.  We have to move our insertion
	 * point since brange now represents the first record >= start.
	 *
	 * When unlocking, no new lock is inserted but we still clip.
	 */
	if (type != F_UNLCK) {
		brange = new_range1;
		new_range1 = NULL;
		lf_create_range(brange, owner, type, flags, start, end);
		lf_insert(&lock->lf_range, brange, insert_point);
		insert_point = brange;
		if (flags & F_POSIX)
			++count;
	} else {
		brange = NULL;
	}

	/*
	 * Handle the double_clip case.  This is the only case where
	 * we wind up having to add TWO locks.
	 */
	if (double_clip) {
		KKASSERT(first_match == last_match);
		last_match = new_range2;
		new_range2 = NULL;
		lf_create_range(last_match, first_match->lf_owner,
				first_match->lf_type, first_match->lf_flags,
				end + 1, first_match->lf_end);
		first_match->lf_end = start - 1;
		first_match->lf_flags &= ~F_NOEND;

		/*
		 * Figure out where to insert the right side clip.
		 */
		lf_insert(&lock->lf_range, last_match, first_match);
		if (last_match->lf_flags & F_POSIX)
			++count;
	}

	/*
	 * Clip or destroy the locks between first_match and last_match,
	 * inclusive.  Ignore the primary lock we created (brange).  Note
	 * that if double-clipped, first_match and last_match will be
	 * outside our clipping range.  Otherwise first_match and last_match
	 * will be deleted.
	 *
	 * We have already taken care of any double clipping.
	 *
	 * The insert_point may become invalid as we delete records, do not
	 * use that pointer any more.  Also, when removing something other
	 * then 'range' we have to check to see if the item we are removing
	 * is 'next' and adjust 'next' properly.
	 *
	 * NOTE: brange will be NULL if F_UNLCKing.
	 */
	TAILQ_INIT(&deadlist);
	next = first_match;

	while ((range = next) != NULL) {
		next = TAILQ_NEXT(range, lf_link);

		/*
		 * Ignore elements that we do not own and ignore the
		 * primary request range which we just created.
		 */
		if (range->lf_owner != owner || range == brange)
			continue;

		/*
		 * We may have to wakeup a waiter when downgrading a lock.
		 */
		if (type == F_UNLCK)
			wakeup_needed = 1;
		if (type == F_RDLCK && range->lf_type == F_WRLCK)
			wakeup_needed = 1;

		/*
		 * Clip left.  This can only occur on first_match.
		 *
		 * Merge the left clip with brange if possible.  This must
		 * be done specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation above.
		 */
		if (range->lf_start < start) {
			KKASSERT(range == first_match);
			if (brange &&
			    range->lf_end >= start - 1 &&
			    lf_match(range, type, flags)) {
				range->lf_end = brange->lf_end;
				range->lf_flags |= brange->lf_flags & F_NOEND;
				/*
				 * Removing something other then 'range',
				 * adjust 'next' if necessary.
				 */
				if (next == brange)
					next = TAILQ_NEXT(next, lf_link);
				TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
				if (brange->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
				brange = range;
			} else if (range->lf_end >= start) {
				range->lf_end = start - 1;
				if (type != F_UNLCK)
					range->lf_flags &= ~F_NOEND;
			}
			if (range == last_match)
				break;
			continue;
		}

		/*
		 * Clip right.  This can only occur on last_match.
		 *
		 * Merge the right clip if possible.  This must be done
		 * specifically, not in the optimized merge heuristic
		 * below, since we may have counted on it in our 'count'
		 * calculation.
		 *
		 * Since we are adjusting lf_start, we have to move the
		 * record to maintain the sorted list.  Since lf_start is
		 * only getting larger we can use the next element as the
		 * insert point (we don't have to backtrack).
		 */
		if (range->lf_end > end) {
			KKASSERT(range == last_match);
			if (brange &&
			    range->lf_start <= end + 1 &&
			    lf_match(range, type, flags)) {
				brange->lf_end = range->lf_end;
				brange->lf_flags |= range->lf_flags & F_NOEND;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				if (range->lf_flags & F_POSIX)
					--count;
				TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
			} else if (range->lf_start <= end) {
				range->lf_start = end + 1;
				TAILQ_REMOVE(&lock->lf_range, range, lf_link);
				lf_insert(&lock->lf_range, range, next);
			}
			/* range == last_match, we are done */
			break;
		}

		/*
		 * The record must be entirely enclosed.  Note that the
		 * record could be first_match or last_match, and will be
		 * deleted.
		 */
		KKASSERT(range->lf_start >= start && range->lf_end <= end);
		TAILQ_REMOVE(&lock->lf_range, range, lf_link);
		if (range->lf_flags & F_POSIX)
			--count;
		TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		if (range == last_match)
			break;
	}

	/*
	 * Attempt to merge locks adjacent to brange.  For example, we may
	 * have had to clip first_match and/or last_match, and they might
	 * be adjacent.  Or there might simply have been an adjacent lock
	 * already there.
	 *
	 * Don't get fancy, just check adjacent elements in the list if they
	 * happen to be owned by us.
	 *
	 * This case only gets hit if we have a situation where a shared
	 * and exclusive lock are adjacent, and the exclusive lock is
	 * downgraded to shared or the shared lock is upgraded to exclusive.
	 */
	if (brange) {
		range = TAILQ_PREV(brange, lockf_range_list, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_end == brange->lf_start - 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend range to cover brange and scrap brange.
			 */
			range->lf_end = brange->lf_end;
			range->lf_flags |= brange->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, brange, lf_link);
			if (brange->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, brange, lf_link);
			brange = range;
		}
		range = TAILQ_NEXT(brange, lf_link);
		if (range &&
		    range->lf_owner == owner &&
		    range->lf_start == brange->lf_end + 1 &&
		    lf_match(range, type, flags)
		) {
			/*
			 * Extend brange to cover range and scrap range.
			 */
			brange->lf_end = range->lf_end;
			brange->lf_flags |= range->lf_flags & F_NOEND;
			TAILQ_REMOVE(&lock->lf_range, range, lf_link);
			if (range->lf_flags & F_POSIX)
				--count;
			TAILQ_INSERT_TAIL(&deadlist, range, lf_link);
		}
	}

	/*
	 * Destroy deleted elements.  We didn't want to do it in the loop
	 * because the free() might have blocked.
	 *
	 * Adjust the count for any posix locks we thought we might create
	 * but didn't.
	 */
	while ((range = TAILQ_FIRST(&deadlist)) != NULL) {
		TAILQ_REMOVE(&deadlist, range, lf_link);
		lf_destroy_range(range);
	}

	KKASSERT(count <= 0);
	if (count < 0)
		lf_count_change(owner, count);
do_wakeup:
	lf_print_lock(lock);
	if (wakeup_needed)
		lf_wakeup(lock, start, end);
	error = 0;
do_cleanup:
	if (new_range1 != NULL)
		lf_destroy_range(new_range1);
	if (new_range2 != NULL)
		lf_destroy_range(new_range2);
	return(error);
}
764 | ||
984263bc MD |
765 | /* |
766 | * Check whether there is a blocking lock, | |
767 | * and if so return its process identifier. | |
768 | */ | |
769 | static int | |
508ceb09 JS |
770 | lf_getlock(struct flock *fl, struct lockf *lock, struct proc *owner, |
771 | int type, int flags, off_t start, off_t end) | |
984263bc | 772 | { |
508ceb09 | 773 | struct lockf_range *range; |
984263bc | 774 | |
508ceb09 JS |
775 | TAILQ_FOREACH(range, &lock->lf_range, lf_link) |
776 | if (range->lf_owner != owner && | |
777 | lf_overlap(range, start, end) && | |
778 | (type == F_WRLCK || range->lf_type == F_WRLCK)) | |
779 | break; | |
780 | if (range == NULL) { | |
984263bc | 781 | fl->l_type = F_UNLCK; |
508ceb09 | 782 | return(0); |
984263bc | 783 | } |
508ceb09 JS |
784 | fl->l_type = range->lf_type; |
785 | fl->l_whence = SEEK_SET; | |
786 | fl->l_start = range->lf_start; | |
787 | if (range->lf_flags & F_NOEND) | |
788 | fl->l_len = 0; | |
789 | else | |
790 | fl->l_len = range->lf_end - range->lf_start + 1; | |
791 | if (range->lf_owner != NULL && (range->lf_flags & F_POSIX)) | |
792 | fl->l_pid = range->lf_owner->p_pid; | |
793 | else | |
794 | fl->l_pid = -1; | |
795 | return(0); | |
984263bc MD |
796 | } |
797 | ||
984263bc | 798 | /* |
ad936517 MD |
799 | * Wakeup pending lock attempts. Theoretically we can stop as soon as |
800 | * we encounter an exclusive request that covers the whole range (at least | |
801 | * insofar as the sleep code above calls lf_wakeup() if it would otherwise | |
802 | * exit instead of loop), but for now just wakeup all overlapping | |
803 | * requests. XXX | |
984263bc | 804 | */ |
508ceb09 JS |
805 | static void |
806 | lf_wakeup(struct lockf *lock, off_t start, off_t end) | |
984263bc | 807 | { |
508ceb09 | 808 | struct lockf_range *range, *nrange; |
ad936517 | 809 | |
508ceb09 JS |
810 | TAILQ_FOREACH_MUTABLE(range, &lock->lf_blocked, lf_link, nrange) { |
811 | if (lf_overlap(range, start, end) == 0) | |
984263bc | 812 | continue; |
508ceb09 JS |
813 | TAILQ_REMOVE(&lock->lf_blocked, range, lf_link); |
814 | range->lf_flags = 1; | |
815 | wakeup(range); | |
984263bc | 816 | } |
984263bc MD |
817 | } |
818 | ||
eacc5025 MD |
819 | /* |
820 | * Allocate a range structure and initialize it sufficiently such that | |
821 | * lf_destroy_range() does not barf. | |
f6240282 MD |
822 | * |
823 | * Most use cases are temporary, implement a small 2-entry-per-cpu | |
824 | * cache. | |
eacc5025 | 825 | */ |
508ceb09 | 826 | static struct lockf_range * |
4f0a1741 | 827 | lf_alloc_range(void) |
508ceb09 | 828 | { |
eacc5025 | 829 | struct lockf_range *range; |
f6240282 | 830 | struct lf_pcpu *lfpc; |
eacc5025 | 831 | |
f6240282 MD |
832 | lfpc = &lf_pcpu_array[mycpuid]; |
833 | if ((range = lfpc->free1) != NULL) { | |
834 | lfpc->free1 = NULL; | |
835 | return range; | |
836 | } | |
837 | if ((range = lfpc->free2) != NULL) { | |
838 | lfpc->free2 = NULL; | |
839 | return range; | |
840 | } | |
efda3bd0 | 841 | range = kmalloc(sizeof(struct lockf_range), M_LOCKF, M_WAITOK); |
eacc5025 | 842 | range->lf_owner = NULL; |
f6240282 | 843 | |
eacc5025 | 844 | return(range); |
4f0a1741 | 845 | } |
508ceb09 | 846 | |
a3190466 MD |
847 | static void |
848 | lf_insert(struct lockf_range_list *list, struct lockf_range *elm, | |
849 | struct lockf_range *insert_point) | |
850 | { | |
851 | while (insert_point && insert_point->lf_start < elm->lf_start) | |
852 | insert_point = TAILQ_NEXT(insert_point, lf_link); | |
853 | if (insert_point != NULL) | |
854 | TAILQ_INSERT_BEFORE(insert_point, elm, lf_link); | |
855 | else | |
856 | TAILQ_INSERT_TAIL(list, elm, lf_link); | |
857 | } | |
858 | ||
4f0a1741 JS |
859 | static void |
860 | lf_create_range(struct lockf_range *range, struct proc *owner, int type, | |
cae44669 | 861 | int flags, off_t start, off_t end) |
4f0a1741 | 862 | { |
508ceb09 | 863 | KKASSERT(start <= end); |
508ceb09 JS |
864 | range->lf_type = type; |
865 | range->lf_flags = flags; | |
866 | range->lf_start = start; | |
867 | range->lf_end = end; | |
868 | range->lf_owner = owner; | |
984263bc | 869 | |
4f697c93 SW |
870 | lf_printf("lf_create_range: %ju..%ju\n", |
871 | (uintmax_t)range->lf_start, (uintmax_t)range->lf_end); | |
984263bc MD |
872 | } |
873 | ||
984263bc | 874 | static void |
cae44669 | 875 | lf_destroy_range(struct lockf_range *range) |
984263bc | 876 | { |
f6240282 MD |
877 | struct lf_pcpu *lfpc; |
878 | ||
4f697c93 | 879 | lf_printf("lf_destroy_range: %ju..%ju\n", |
f6240282 MD |
880 | (uintmax_t)range->lf_start, (uintmax_t)range->lf_end); |
881 | ||
882 | lfpc = &lf_pcpu_array[mycpuid]; | |
883 | if (lfpc->free1 == NULL) { | |
884 | range->lf_owner = NULL; | |
885 | lfpc->free1 = range; | |
886 | return; | |
887 | } | |
888 | if (lfpc->free2 == NULL) { | |
889 | range->lf_owner = NULL; | |
890 | lfpc->free2 = range; | |
891 | return; | |
892 | } | |
efda3bd0 | 893 | kfree(range, M_LOCKF); |
984263bc MD |
894 | } |
895 | ||
896 | #ifdef LOCKF_DEBUG | |
6449cdd4 | 897 | |
508ceb09 | 898 | static void |
6449cdd4 MD |
899 | _lf_printf(const char *ctl, ...) |
900 | { | |
901 | struct proc *p; | |
902 | __va_list va; | |
903 | ||
904 | if (lf_print_ranges) { | |
905 | if ((p = curproc) != NULL) | |
6ea70f76 | 906 | kprintf("pid %d (%s): ", p->p_pid, p->p_comm); |
6449cdd4 MD |
907 | } |
908 | __va_start(va, ctl); | |
379210cb | 909 | kvprintf(ctl, va); |
6449cdd4 MD |
910 | __va_end(va); |
911 | } | |
912 | ||
913 | static void | |
914 | _lf_print_lock(const struct lockf *lock) | |
984263bc | 915 | { |
508ceb09 | 916 | struct lockf_range *range; |
984263bc | 917 | |
6449cdd4 MD |
918 | if (lf_print_ranges == 0) |
919 | return; | |
920 | ||
921 | if (TAILQ_EMPTY(&lock->lf_range)) { | |
922 | lf_printf("lockf %p: no ranges locked\n", lock); | |
923 | } else { | |
924 | lf_printf("lockf %p:\n", lock); | |
925 | } | |
508ceb09 | 926 | TAILQ_FOREACH(range, &lock->lf_range, lf_link) |
c91da9dd SW |
927 | kprintf("\t%jd..%jd type %s owned by %d\n", |
928 | (uintmax_t)range->lf_start, (uintmax_t)range->lf_end, | |
508ceb09 JS |
929 | range->lf_type == F_RDLCK ? "shared" : "exclusive", |
930 | range->lf_flags & F_POSIX ? range->lf_owner->p_pid : -1); | |
931 | if (TAILQ_EMPTY(&lock->lf_blocked)) | |
6ea70f76 | 932 | kprintf("no process waiting for range\n"); |
984263bc | 933 | else |
6ea70f76 | 934 | kprintf("blocked locks:"); |
f99c6c54 | 935 | TAILQ_FOREACH(range, &lock->lf_blocked, lf_link) |
c91da9dd SW |
936 | kprintf("\t%jd..%jd type %s waiting on %p\n", |
937 | (uintmax_t)range->lf_start, (uintmax_t)range->lf_end, | |
508ceb09 JS |
938 | range->lf_type == F_RDLCK ? "shared" : "exclusive", |
939 | range); | |
984263bc | 940 | } |
2c4af4c4 | 941 | #endif /* LOCKF_DEBUG */ |
f6240282 MD |
942 | |
943 | static void | |
944 | lf_init(void *dummy __unused) | |
945 | { | |
946 | lf_pcpu_array = kmalloc(sizeof(*lf_pcpu_array) * ncpus, | |
947 | M_LOCKF, M_WAITOK | M_ZERO); | |
948 | } | |
949 | ||
950 | SYSINIT(lockf, SI_BOOT2_MACHDEP, SI_ORDER_ANY, lf_init, NULL); |