kernel - Fix a race and enable the VM read shortcut feature by default
[dragonfly.git] / sys / kern / vfs_helper.c
CommitLineData
9340415c
MD
1/*
2 * (The copyright below applies to ufs_access())
3 *
4 * Copyright (c) 1982, 1986, 1989, 1993, 1995
5 * The Regents of the University of California. All rights reserved.
6 * (c) UNIX System Laboratories, Inc.
7 * All or some portions of this file are derived from material licensed
8 * to the University of California by American Telephone and Telegraph
9 * Co. or Unix System Laboratories, Inc. and are reproduced herein with
10 * the permission of UNIX System Laboratories, Inc.
11 *
12 * Redistribution and use in source and binary forms, with or without
13 * modification, are permitted provided that the following conditions
14 * are met:
15 * 1. Redistributions of source code must retain the above copyright
16 * notice, this list of conditions and the following disclaimer.
17 * 2. Redistributions in binary form must reproduce the above copyright
18 * notice, this list of conditions and the following disclaimer in the
19 * documentation and/or other materials provided with the distribution.
20 * 3. All advertising materials mentioning features or use of this software
21 * must display the following acknowledgement:
22 * This product includes software developed by the University of
23 * California, Berkeley and its contributors.
24 * 4. Neither the name of the University nor the names of its contributors
25 * may be used to endorse or promote products derived from this software
26 * without specific prior written permission.
27 *
28 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
29 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
32 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
33 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
34 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
35 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
36 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
37 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
38 * SUCH DAMAGE.
39 *
40 * @(#)ufs_vnops.c 8.27 (Berkeley) 5/27/95
f7179e23 41 * $DragonFly: src/sys/kern/vfs_helper.c,v 1.5 2008/05/25 18:34:46 dillon Exp $
9340415c
MD
42 */
43
44#include "opt_quota.h"
45#include "opt_suiddir.h"
46
47#include <sys/param.h>
48#include <sys/systm.h>
cd65363e 49#include <sys/conf.h>
9340415c
MD
50#include <sys/kernel.h>
51#include <sys/fcntl.h>
52#include <sys/stat.h>
53#include <sys/mount.h>
54#include <sys/unistd.h>
55#include <sys/vnode.h>
56#include <sys/file.h> /* XXX */
486028c8 57#include <sys/proc.h>
895c1f85 58#include <sys/priv.h>
1399fa3b 59#include <sys/jail.h>
68ad1455
MD
60#include <sys/sysctl.h>
61#include <sys/sfbuf.h>
62#include <vm/vm_extern.h>
63#include <vm/vm_object.h>
64
#ifdef LWBUF_IS_OPTIMAL

/*
 * Run-time switch for vop_helper_read_shortcut().  Defaults to enabled;
 * the helper returns immediately without touching the uio when this is 0.
 */
static int vm_read_shortcut_enable = 1;
SYSCTL_INT(_vm, OID_AUTO, read_shortcut_enable, CTLFLAG_RW,
	   &vm_read_shortcut_enable, 0, "Direct vm_object vop_read shortcut");

/*
 * Bumped once for every read that passes the initial eligibility checks
 * in vop_helper_read_shortcut().
 */
static long vm_read_shortcut_count;
SYSCTL_LONG(_vm, OID_AUTO, read_shortcut_count, CTLFLAG_RW,
	    &vm_read_shortcut_count, 0, "Statistics");

/*
 * Bumped whenever the shortcut gives up on a page (lookup failed or the
 * page is not fully valid).
 */
static long vm_read_shortcut_failed;
SYSCTL_LONG(_vm, OID_AUTO, read_shortcut_failed, CTLFLAG_RW,
	    &vm_read_shortcut_failed, 0, "Statistics");

#endif
9340415c
MD
78
79/*
80 * vop_helper_access()
81 *
82 * Provide standard UNIX semanics for VOP_ACCESS, but without the quota
83 * code. This procedure was basically pulled out of UFS.
84 */
85int
86vop_helper_access(struct vop_access_args *ap, uid_t ino_uid, gid_t ino_gid,
87 mode_t ino_mode, u_int32_t ino_flags)
88{
89 struct vnode *vp = ap->a_vp;
90 struct ucred *cred = ap->a_cred;
91 mode_t mask, mode = ap->a_mode;
92 gid_t *gp;
93 int i;
b2aed953
NT
94 uid_t proc_uid;
95 gid_t proc_gid;
96
97 if (ap->a_flags & AT_EACCESS) {
98 proc_uid = cred->cr_uid;
99 proc_gid = cred->cr_gid;
100 } else {
101 proc_uid = cred->cr_ruid;
102 proc_gid = cred->cr_rgid;
103 }
9340415c
MD
104
105 /*
106 * Disallow write attempts on read-only filesystems;
107 * unless the file is a socket, fifo, or a block or
108 * character device resident on the filesystem.
109 */
110 if (mode & VWRITE) {
111 switch (vp->v_type) {
112 case VDIR:
113 case VLNK:
114 case VREG:
115 case VDATABASE:
116 if (vp->v_mount->mnt_flag & MNT_RDONLY)
117 return (EROFS);
9340415c
MD
118 break;
119 default:
120 break;
121 }
122 }
123
124 /* If immutable bit set, nobody gets to write it. */
125 if ((mode & VWRITE) && (ino_flags & IMMUTABLE))
126 return (EPERM);
127
128 /* Otherwise, user id 0 always gets access. */
b2aed953 129 if (proc_uid == 0)
9340415c
MD
130 return (0);
131
132 mask = 0;
133
134 /* Otherwise, check the owner. */
b2aed953 135 if (proc_uid == ino_uid) {
9340415c
MD
136 if (mode & VEXEC)
137 mask |= S_IXUSR;
138 if (mode & VREAD)
139 mask |= S_IRUSR;
140 if (mode & VWRITE)
141 mask |= S_IWUSR;
142 return ((ino_mode & mask) == mask ? 0 : EACCES);
143 }
144
7ecca789
NT
145 /*
146 * Otherwise, check the groups.
b2aed953
NT
147 * We must special-case the primary group to, if needed, check against
148 * the real gid and not the effective one.
7ecca789 149 */
b2aed953 150 if (proc_gid == ino_gid) {
7ecca789
NT
151 if (mode & VEXEC)
152 mask |= S_IXGRP;
153 if (mode & VREAD)
154 mask |= S_IRGRP;
155 if (mode & VWRITE)
156 mask |= S_IWGRP;
157 return ((ino_mode & mask) == mask ? 0 : EACCES);
158 }
159 for (i = 1, gp = &cred->cr_groups[1]; i < cred->cr_ngroups; i++, gp++)
9340415c
MD
160 if (ino_gid == *gp) {
161 if (mode & VEXEC)
162 mask |= S_IXGRP;
163 if (mode & VREAD)
164 mask |= S_IRGRP;
165 if (mode & VWRITE)
166 mask |= S_IWGRP;
167 return ((ino_mode & mask) == mask ? 0 : EACCES);
168 }
169
170 /* Otherwise, check everyone else. */
171 if (mode & VEXEC)
172 mask |= S_IXOTH;
173 if (mode & VREAD)
174 mask |= S_IROTH;
175 if (mode & VWRITE)
176 mask |= S_IWOTH;
177 return ((ino_mode & mask) == mask ? 0 : EACCES);
178}
179
1399fa3b
MD
180int
181vop_helper_setattr_flags(u_int32_t *ino_flags, u_int32_t vaflags,
182 uid_t uid, struct ucred *cred)
183{
184 int error;
185
186 /*
ca3cd02d 187 * If uid doesn't match only a privileged user can change the flags
1399fa3b
MD
188 */
189 if (cred->cr_uid != uid &&
3a591c90 190 (error = priv_check_cred(cred, PRIV_VFS_SYSFLAGS, 0))) {
1399fa3b
MD
191 return(error);
192 }
193 if (cred->cr_uid == 0 &&
194 (!jailed(cred)|| jail_chflags_allowed)) {
195 if ((*ino_flags & (SF_NOUNLINK|SF_IMMUTABLE|SF_APPEND)) &&
196 securelevel > 0)
197 return (EPERM);
198 *ino_flags = vaflags;
199 } else {
200 if (*ino_flags & (SF_NOUNLINK|SF_IMMUTABLE|SF_APPEND) ||
201 (vaflags & UF_SETTABLE) != vaflags)
202 return (EPERM);
203 *ino_flags &= SF_SETTABLE;
204 *ino_flags |= vaflags & UF_SETTABLE;
205 }
206 return(0);
207}
208
f7179e23
MD
209/*
210 * This helper function may be used by VFSs to implement UNIX initial
211 * ownership semantics when creating new objects inside directories.
212 */
fb356552
MD
213uid_t
214vop_helper_create_uid(struct mount *mp, mode_t dmode, uid_t duid,
215 struct ucred *cred, mode_t *modep)
216{
217#ifdef SUIDDIR
f7179e23 218 if ((mp->mnt_flag & MNT_SUIDDIR) && (dmode & S_ISUID) &&
fb356552
MD
219 duid != cred->cr_uid && duid) {
220 *modep &= ~07111;
221 return(duid);
222 }
223#endif
224 return(cred->cr_uid);
225}
1399fa3b 226
f7179e23
MD
227/*
228 * This helper may be used by VFSs to implement unix chmod semantics.
229 */
230int
231vop_helper_chmod(struct vnode *vp, mode_t new_mode, struct ucred *cred,
232 uid_t cur_uid, gid_t cur_gid, mode_t *cur_modep)
233{
234 int error;
cd65363e 235
f7179e23 236 if (cred->cr_uid != cur_uid) {
47fac363 237 error = priv_check_cred(cred, PRIV_VFS_CHMOD, 0);
f7179e23
MD
238 if (error)
239 return (error);
240 }
241 if (cred->cr_uid) {
242 if (vp->v_type != VDIR && (*cur_modep & S_ISTXT))
243 return (EFTYPE);
244 if (!groupmember(cur_gid, cred) && (*cur_modep & S_ISGID))
245 return (EPERM);
246 }
247 *cur_modep &= ~ALLPERMS;
248 *cur_modep |= new_mode & ALLPERMS;
249 return(0);
250}
251
252/*
253 * This helper may be used by VFSs to implement unix chown semantics.
254 */
255int
256vop_helper_chown(struct vnode *vp, uid_t new_uid, gid_t new_gid,
257 struct ucred *cred,
258 uid_t *cur_uidp, gid_t *cur_gidp, mode_t *cur_modep)
259{
260 gid_t ogid;
261 uid_t ouid;
262 int error;
263
264 if (new_uid == (uid_t)VNOVAL)
265 new_uid = *cur_uidp;
266 if (new_gid == (gid_t)VNOVAL)
267 new_gid = *cur_gidp;
268
269 /*
270 * If we don't own the file, are trying to change the owner
271 * of the file, or are not a member of the target group,
ca3cd02d 272 * the caller must be privileged or the call fails.
f7179e23
MD
273 */
274 if ((cred->cr_uid != *cur_uidp || new_uid != *cur_uidp ||
275 (new_gid != *cur_gidp && !(cred->cr_gid == new_gid ||
276 groupmember(new_gid, cred)))) &&
3a591c90 277 (error = priv_check_cred(cred, PRIV_VFS_CHOWN, 0))) {
f7179e23
MD
278 return (error);
279 }
280 ogid = *cur_gidp;
281 ouid = *cur_uidp;
282 /* XXX QUOTA CODE */
283 *cur_uidp = new_uid;
284 *cur_gidp = new_gid;
285 /* XXX QUOTA CODE */
d7c75c7a
MD
286
287 /*
288 * DragonFly clears both SUID and SGID if either the owner or
289 * group is changed and root isn't doing it. If root is doing
290 * it we do not clear SUID/SGID.
291 */
f7179e23
MD
292 if (cred->cr_uid != 0 && (ouid != new_uid || ogid != new_gid))
293 *cur_modep &= ~(S_ISUID | S_ISGID);
294 return(0);
295}
296
68ad1455
MD
297#ifdef LWBUF_IS_OPTIMAL
298
299/*
300 * A VFS can call this function to try to dispose of a read request
301 * directly from the VM system, pretty much bypassing almost all VFS
302 * overhead except for atime updates.
303 *
304 * If 0 is returned some or all of the uio was handled. The caller must
305 * check the uio and handle the remainder.
306 *
307 * The caller must fail on a non-zero error.
308 */
309int
310vop_helper_read_shortcut(struct vop_read_args *ap)
311{
312 struct vnode *vp;
313 struct uio *uio;
314 struct lwbuf *lwb;
315 struct lwbuf lwb_cache;
316 vm_object_t obj;
317 vm_page_t m;
318 int offset;
319 int n;
320 int error;
321
322 vp = ap->a_vp;
323 uio = ap->a_uio;
324
325 /*
326 * We can't short-cut if there is no VM object or this is a special
327 * UIO_NOCOPY read (typically from VOP_STRATEGY()). We also can't
328 * do this if we cannot extract the filesize from the vnode.
329 */
330 if (vm_read_shortcut_enable == 0)
331 return(0);
332 if (vp->v_object == NULL || uio->uio_segflg == UIO_NOCOPY)
333 return(0);
334 if (vp->v_filesize == NOOFFSET)
335 return(0);
336 if (uio->uio_resid == 0)
337 return(0);
338
339 /*
340 * Iterate the uio on a page-by-page basis
341 *
342 * XXX can we leave the object held shared during the uiomove()?
343 */
344 ++vm_read_shortcut_count;
345 obj = vp->v_object;
346 vm_object_hold_shared(obj);
347
348 error = 0;
349 while (uio->uio_resid && error == 0) {
350 offset = (int)uio->uio_offset & PAGE_MASK;
351 n = PAGE_SIZE - offset;
352 if (n > uio->uio_resid)
353 n = uio->uio_resid;
354 if (vp->v_filesize < uio->uio_offset)
355 break;
356 if (uio->uio_offset + n > vp->v_filesize)
357 n = vp->v_filesize - uio->uio_offset;
358 if (n == 0)
359 break; /* hit EOF */
360
74d299cb
MD
361 m = vm_page_lookup_busy_try(obj, OFF_TO_IDX(uio->uio_offset),
362 FALSE, &error);
363 if (error || m == NULL) {
68ad1455 364 ++vm_read_shortcut_failed;
74d299cb 365 error = 0;
68ad1455
MD
366 break;
367 }
368 if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
369 ++vm_read_shortcut_failed;
74d299cb 370 vm_page_wakeup(m);
68ad1455
MD
371 break;
372 }
373 lwb = lwbuf_alloc(m, &lwb_cache);
374 error = uiomove((char *)lwbuf_kva(lwb) + offset, n, uio);
375 vm_page_flag_set(m, PG_REFERENCED);
376 lwbuf_free(lwb);
74d299cb 377 vm_page_wakeup(m);
68ad1455
MD
378 }
379 vm_object_drop(obj);
380
381 return (error);
382}
383
384#else
385
/*
 * If lwbuf's aren't optimal then it's best to just use the buffer
 * cache.  This variant never touches the uio and always returns 0,
 * leaving the whole request for the caller to handle.
 */
int
vop_helper_read_shortcut(struct vop_read_args *ap)
{
	(void)ap;	/* intentionally unused */

	return (0);
}
395
396#endif