kernel - Change lockmgr LK_SHARED behavior to fix improper recursion return
[dragonfly.git] / sys / vfs / ufs / ffs_rawread.c
CommitLineData
984263bc
MD
1/*-
2 * Copyright (c) 2000-2003 Tor Egge
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 *
26 * $FreeBSD: src/sys/ufs/ffs/ffs_rawread.c,v 1.3.2.2 2003/05/29 06:15:35 alc Exp $
e92ca23a 27 * $DragonFly: src/sys/vfs/ufs/ffs_rawread.c,v 1.28 2008/06/19 23:27:39 dillon Exp $
984263bc
MD
28 */
29
30#include <sys/param.h>
31#include <sys/systm.h>
32#include <sys/fcntl.h>
33#include <sys/file.h>
34#include <sys/stat.h>
35#include <sys/proc.h>
36#include <sys/mount.h>
37#include <sys/namei.h>
38#include <sys/vnode.h>
39#include <sys/conf.h>
40#include <sys/filio.h>
41#include <sys/ttycom.h>
42#include <sys/buf.h>
1f2de5d4
MD
43#include "quota.h"
44#include "inode.h"
45#include "fs.h"
984263bc
MD
46
47#include <machine/limits.h>
48#include <vm/vm.h>
49#include <vm/vm_extern.h>
50#include <vm/vm_object.h>
51#include <sys/kernel.h>
52#include <sys/sysctl.h>
53
/*
 * Forward declarations for the raw-read implementation below.
 */
static int ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t offset,
				 size_t len, struct buf *bp);
static int ffs_rawread_main(struct vnode *vp,
			    struct uio *uio);

static int ffs_rawread_sync(struct vnode *vp);

int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);

void ffs_rawread_setup(void);

SYSCTL_DECL(_vfs_ffs);

/*
 * Tunables / counters exported under vfs.ffs.
 *
 * ffsrawbufcnt is read-only from userland; it is recomputed from nswbuf
 * in ffs_rawread_setup() and acts as the pbuf quota for raw reads.
 */
static int ffsrawbufcnt = 4;
SYSCTL_INT(_vfs_ffs, OID_AUTO, ffsrawbufcnt, CTLFLAG_RD, &ffsrawbufcnt, 0,
	   "Buffers available for raw reads");

/* Master enable for the raw-read fast path (vfs.ffs.allowrawread). */
static int allowrawread = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, allowrawread, CTLFLAG_RW, &allowrawread, 0,
	   "Flag to enable raw reads");

/* Enable the one-buffer readahead pipeline in ffs_rawread_main(). */
static int rawreadahead = 1;
SYSCTL_INT(_vfs_ffs, OID_AUTO, rawreadahead, CTLFLAG_RW, &rawreadahead, 0,
	   "Flag to enable readahead for long raw reads");
79
80void
81ffs_rawread_setup(void)
82{
83 ffsrawbufcnt = (nswbuf > 100 ) ? (nswbuf - (nswbuf >> 4)) : nswbuf - 8;
84}
85
86
/*
 * Bring the vnode into a state where a raw (buffer-cache-bypassing) read
 * is coherent: clean dirty mmap pages, wait out pending writes, and flush
 * dirty buffers.  Returns 0 on success or an errno from the write-wait or
 * VOP_FSYNC.  The vnode token is held across the whole check-and-flush to
 * keep the dirty state stable.
 */
static int
ffs_rawread_sync(struct vnode *vp)
{
	int error;

	/*
	 * Check for dirty mmap, pending writes and dirty buffers
	 */
	lwkt_gettoken(&vp->v_token);
	if (bio_track_active(&vp->v_track_write) ||
	    !RB_EMPTY(&vp->v_rbdirty_tree) ||
	    (vp->v_flag & VOBJDIRTY) != 0) {
		/* Attempt to msync mmap() regions to clean dirty mmap */
		if ((vp->v_flag & VOBJDIRTY) != 0) {
			struct vm_object *obj;
			if ((obj = vp->v_object) != NULL)
				vm_object_page_clean(obj, 0, 0, OBJPC_SYNC);
		}

		/* Wait for pending writes to complete */
		error = bio_track_wait(&vp->v_track_write, 0, 0);
		if (error != 0) {
			goto done;
		}
		/* Flush dirty buffers */
		if (!RB_EMPTY(&vp->v_rbdirty_tree)) {
			if ((error = VOP_FSYNC(vp, MNT_WAIT, 0)) != 0) {
				goto done;
			}
			/*
			 * After a synchronous fsync nothing may remain
			 * dirty or in flight; anything left indicates a
			 * coherency bug, so panic rather than hand stale
			 * data to the raw-read path.
			 */
			if (bio_track_active(&vp->v_track_write) ||
			    !RB_EMPTY(&vp->v_rbdirty_tree))
				panic("ffs_rawread_sync: dirty bufs");
		}
	} else {
		/* Already clean: nothing to do. */
		error = 0;
	}
done:
	lwkt_reltoken(&vp->v_token);
	return error;
}
127
128
/*
 * Start one asynchronous raw read of up to 'len' bytes of file data at
 * logical offset 'loffset' directly into the user buffer 'udata', using
 * the pre-allocated pbuf 'bp'.
 *
 * The requested length may be clipped to the pbuf's KVA window, to the
 * filesystem block containing loffset (for holes), or to the number of
 * contiguous device blocks reported by VOP_BMAP.  Holes are satisfied
 * synchronously by zero-filling; mapped extents are issued to the device
 * vnode via vn_strategy() and completed later with biowait() by the
 * caller (ffs_rawread_main).
 *
 * Returns 0 if the I/O was started (or the hole was filled), EFAULT if
 * the user pages could not be wired, or an errno from VOP_BMAP.
 */
static int
ffs_rawread_readahead(struct vnode *vp, caddr_t udata, off_t loffset,
		      size_t len, struct buf *bp)
{
	int error;
	int iolen;
	int blockoff;
	int bsize;
	struct vnode *dp;
	int bforwards;

	bsize = vp->v_mount->mnt_stat.f_iosize;

	/*
	 * Make sure it fits into the pbuf
	 */
	iolen = (int)(intptr_t)udata & PAGE_MASK;
	if (len + iolen > bp->b_kvasize) {
		len = bp->b_kvasize;
		/* A misaligned user address costs one page of KVA. */
		if (iolen != 0)
			len -= PAGE_SIZE;
	}

	/*
	 * Raw disk address is in bio2, but we wait for it to
	 * chain to bio1.
	 */
	bp->b_flags &= ~B_ERROR;
	bp->b_loffset = loffset;
	bp->b_bio2.bio_offset = NOOFFSET;
	bp->b_bio1.bio_done = biodone_sync;
	bp->b_bio1.bio_flags |= BIO_SYNC;

	/* Offset of loffset within its filesystem block, in DEV_BSIZE units. */
	blockoff = (loffset % bsize) / DEV_BSIZE;

	error = VOP_BMAP(vp, bp->b_loffset, &bp->b_bio2.bio_offset,
			 &bforwards, NULL, BUF_CMD_READ);
	if (error != 0)
		return error;
	dp = VTOI(vp)->i_devvp;
	if (bp->b_bio2.bio_offset == NOOFFSET) {
		/*
		 * Fill holes with NULs to preserve semantics
		 */
		if (len + blockoff * DEV_BSIZE > bsize)
			len = bsize - blockoff * DEV_BSIZE;

		if (vmapbuf(bp, udata, len) < 0)
			return EFAULT;

		/* Give up the cpu before the potentially large bzero. */
		lwkt_user_yield();
		bzero(bp->b_data, bp->b_bcount);

		/* Mark operation completed (similar to bufdone()) */
		bp->b_resid = 0;
		return 0;
	}

	/* Clip to the contiguous extent VOP_BMAP reported. */
	if (len + blockoff * DEV_BSIZE > bforwards)
		len = bforwards - blockoff * DEV_BSIZE;
	bp->b_bio2.bio_offset += blockoff * DEV_BSIZE;

	if (vmapbuf(bp, udata, len) < 0)
		return EFAULT;

	/*
	 * Access the block device layer using the device vnode (dp) and
	 * the translated block number (bio2) instead of the logical block
	 * number (bio1).
	 *
	 * Even though we are bypassing the vnode layer, we still
	 * want the vnode state to indicate that an I/O on its behalf
	 * is in progress.
	 */
	bp->b_cmd = BUF_CMD_READ;
	bio_start_transaction(&bp->b_bio1, &vp->v_track_read);
	vn_strategy(dp, &bp->b_bio2);
	return 0;
}
209
984263bc 210static int
2d53bf02 211ffs_rawread_main(struct vnode *vp, struct uio *uio)
984263bc
MD
212{
213 int error, nerror;
214 struct buf *bp, *nbp, *tbp;
3591bbc6 215 int iolen;
984263bc 216 caddr_t udata;
54078292 217 int resid;
984263bc 218 off_t offset;
984263bc 219
984263bc
MD
220 udata = uio->uio_iov->iov_base;
221 resid = uio->uio_resid;
222 offset = uio->uio_offset;
223
984263bc
MD
224 error = 0;
225 nerror = 0;
226
227 bp = NULL;
228 nbp = NULL;
984263bc
MD
229
230 while (resid > 0) {
231
232 if (bp == NULL) { /* Setup first read */
233 /* XXX: Leave some bufs for swap */
9a82e536 234 bp = getpbuf_kva(&ffsrawbufcnt);
f9235b6d
MD
235 error = ffs_rawread_readahead(vp, udata, offset,
236 resid, bp);
984263bc
MD
237 if (error != 0)
238 break;
239
240 if (resid > bp->b_bufsize) { /* Setup fist readahead */
241 /* XXX: Leave bufs for swap */
242 if (rawreadahead != 0)
9a82e536 243 nbp = trypbuf_kva(&ffsrawbufcnt);
984263bc
MD
244 else
245 nbp = NULL;
246 if (nbp != NULL) {
2d53bf02
MD
247 nerror = ffs_rawread_readahead(
248 vp,
249 udata + bp->b_bufsize,
250 offset + bp->b_bufsize,
251 resid - bp->b_bufsize,
f9235b6d 252 nbp);
984263bc
MD
253 if (nerror) {
254 relpbuf(nbp, &ffsrawbufcnt);
255 nbp = NULL;
256 }
257 }
258 }
259 }
260
ae8e83e6 261 biowait(&bp->b_bio1, "rawrd");
984263bc
MD
262
263 vunmapbuf(bp);
264
265 iolen = bp->b_bcount - bp->b_resid;
266 if (iolen == 0 && (bp->b_flags & B_ERROR) == 0) {
267 nerror = 0; /* Ignore possible beyond EOF error */
268 break; /* EOF */
269 }
270
271 if ((bp->b_flags & B_ERROR) != 0) {
272 error = bp->b_error;
273 break;
274 }
81b5c339 275 clearbiocache(&bp->b_bio2);
984263bc
MD
276 resid -= iolen;
277 udata += iolen;
278 offset += iolen;
279 if (iolen < bp->b_bufsize) {
280 /* Incomplete read. Try to read remaining part */
2d53bf02
MD
281 error = ffs_rawread_readahead(
282 vp, udata, offset,
f9235b6d 283 bp->b_bufsize - iolen, bp);
984263bc
MD
284 if (error != 0)
285 break;
286 } else if (nbp != NULL) { /* Complete read with readahead */
287
288 tbp = bp;
289 bp = nbp;
290 nbp = tbp;
291
81b5c339 292 clearbiocache(&nbp->b_bio2);
984263bc
MD
293
294 if (resid <= bp->b_bufsize) { /* No more readaheads */
295 relpbuf(nbp, &ffsrawbufcnt);
296 nbp = NULL;
297 } else { /* Setup next readahead */
2d53bf02
MD
298 nerror = ffs_rawread_readahead(
299 vp, udata + bp->b_bufsize,
300 offset + bp->b_bufsize,
301 resid - bp->b_bufsize,
f9235b6d 302 nbp);
984263bc
MD
303 if (nerror != 0) {
304 relpbuf(nbp, &ffsrawbufcnt);
305 nbp = NULL;
306 }
307 }
308 } else if (nerror != 0) {/* Deferred Readahead error */
309 break;
310 } else if (resid > 0) { /* More to read, no readahead */
311 error = ffs_rawread_readahead(vp, udata, offset,
f9235b6d 312 resid, bp);
984263bc
MD
313 if (error != 0)
314 break;
315 }
316 }
317
318 if (bp != NULL)
319 relpbuf(bp, &ffsrawbufcnt);
320 if (nbp != NULL) { /* Run down readahead buffer */
ae8e83e6 321 biowait(&nbp->b_bio1, "rawrd");
984263bc
MD
322 vunmapbuf(nbp);
323 relpbuf(nbp, &ffsrawbufcnt);
324 }
325
326 if (error == 0)
327 error = nerror;
984263bc
MD
328 uio->uio_iov->iov_base = udata;
329 uio->uio_resid = resid;
330 uio->uio_offset = offset;
331 return error;
332}
333
334
335int
336ffs_rawread(struct vnode *vp,
337 struct uio *uio,
338 int *workdone)
339{
340 if (allowrawread != 0 &&
341 uio->uio_iovcnt == 1 &&
342 uio->uio_segflg == UIO_USERSPACE &&
343 uio->uio_resid == uio->uio_iov->iov_len &&
a11aaa81 344 (curthread->td_flags & TDF_DEADLKTREAT) == 0) {
984263bc
MD
345 int secsize; /* Media sector size */
346 off_t filebytes; /* Bytes left of file */
347 int blockbytes; /* Bytes left of file in full blocks */
348 int partialbytes; /* Bytes in last partial block */
349 int skipbytes; /* Bytes not to read in ffs_rawread */
350 struct inode *ip;
351 int error;
352
353
354 /* Only handle sector aligned reads */
355 ip = VTOI(vp);
356 secsize = ip->i_devvp->v_rdev->si_bsize_phys;
357 if ((uio->uio_offset & (secsize - 1)) == 0 &&
358 (uio->uio_resid & (secsize - 1)) == 0) {
359
360 /* Sync dirty pages and buffers if needed */
a11aaa81 361 error = ffs_rawread_sync(vp);
984263bc
MD
362 if (error != 0)
363 return error;
364
365 /* Check for end of file */
366 if (ip->i_size > uio->uio_offset) {
367 filebytes = ip->i_size - uio->uio_offset;
368
369 /* No special eof handling needed ? */
370 if (uio->uio_resid <= filebytes) {
371 *workdone = 1;
372 return ffs_rawread_main(vp, uio);
373 }
374
375 partialbytes = ((unsigned int) ip->i_size) %
376 ip->i_fs->fs_bsize;
377 blockbytes = (int) filebytes - partialbytes;
378 if (blockbytes > 0) {
379 skipbytes = uio->uio_resid -
380 blockbytes;
381 uio->uio_resid = blockbytes;
382 error = ffs_rawread_main(vp, uio);
383 uio->uio_resid += skipbytes;
384 if (error != 0)
385 return error;
386 /* Read remaining part using buffer */
387 }
388 }
389 }
390 }
391 *workdone = 0;
392 return 0;
393}
394