/*-
 * Copyright (c) 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)ufs_readwrite.c	8.11 (Berkeley) 5/8/95
 * $FreeBSD: src/sys/ufs/ufs/ufs_readwrite.c,v 1.65.2.14 2003/04/04 22:21:29 tegge Exp $
 * $DragonFly: src/sys/vfs/ufs/ufs_readwrite.c,v 1.3 2003/06/25 03:56:12 dillon Exp $
 */

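/*
 * The macros below parameterize this file so a single read/write
 * implementation can be compiled under different names.  Historically
 * ufs_readwrite.c was #include'd by the per-filesystem vnops source
 * (e.g. ffs_vnops.c), with READ, WRITE, FS, etc. defined appropriately
 * for each UFS variant.
 */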
#define	BLKSIZE(a, b, c)	blksize(a, b, c)
#define	FS			struct fs
#define	I_FS			i_fs
#define	READ			ffs_read
#define	READ_S			"ffs_read"
#define	WRITE			ffs_write
#define	WRITE_S			"ffs_write"

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_pager.h>
#include <vm/vm_map.h>
#include <vm/vnode_pager.h>
#include <sys/event.h>
#include <sys/vmmeter.h>
#include "opt_directio.h"

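/*
 * Post a kqueue event to any knotes registered on the vnode; used below
 * to deliver NOTE_WRITE/NOTE_EXTEND to kevent(2) watchers after a
 * successful write.
 */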
#define	VN_KNOTE(vp, b) \
	KNOTE((struct klist *)&vp->v_pollinfo.vpi_selinfo.si_note, (b))

#ifdef DIRECTIO
extern int ffs_rawread(struct vnode *vp, struct uio *uio, int *workdone);
#endif

/*
 * Vnode op for reading.
 */
/* ARGSUSED */
int
READ(ap)
	struct vop_read_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register struct vnode *vp;
	register struct inode *ip;
	register struct uio *uio;
	register FS *fs;
	struct buf *bp;
	ufs_daddr_t lbn, nextlbn;
	off_t bytesinfile;
	long size, xfersize, blkoffset;
	int error, orig_resid;
	u_short mode;
	int seqcount;
	int ioflag;
	vm_object_t object;

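	/*
	 * The caller packs its sequential-access heuristic into the upper
	 * 16 bits of a_ioflag; seqcount is used below to scale read-ahead
	 * and clustering decisions.
	 */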
	vp = ap->a_vp;
	seqcount = ap->a_ioflag >> 16;
	ip = VTOI(vp);
	mode = ip->i_mode;
	uio = ap->a_uio;
	ioflag = ap->a_ioflag;
#ifdef DIRECTIO
	if ((ioflag & IO_DIRECT) != 0) {
		int workdone;

		error = ffs_rawread(vp, uio, &workdone);
		if (error || workdone)
			return (error);
	}
#endif

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_READ)
		panic("%s: mode", READ_S);

	if (vp->v_type == VLNK) {
		if ((int)ip->i_size < vp->v_mount->mnt_maxsymlinklen)
			panic("%s: short symlink", READ_S);
	} else if (vp->v_type != VREG && vp->v_type != VDIR)
		panic("%s: type %d", READ_S, vp->v_type);
#endif
	fs = ip->I_FS;
	if ((u_int64_t)uio->uio_offset > fs->fs_maxfilesize)
		return (EFBIG);

	orig_resid = uio->uio_resid;
	if (orig_resid <= 0)
		return (0);

	object = vp->v_object;

	bytesinfile = ip->i_size - uio->uio_offset;
	if (bytesinfile <= 0) {
		if ((vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
			ip->i_flag |= IN_ACCESS;
		return (0);
	}

	if (object)
		vm_object_reference(object);

#ifdef ENABLE_VFS_IOOPT
	/*
	 * If IO optimisation is turned on, and this is NOT a VM-based IO
	 * request (i.e. it is not headed for the buffer cache) but there
	 * IS a vm object associated with the vnode, try to satisfy the
	 * read directly from the object.
	 */
	if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
		int nread, toread;

		toread = uio->uio_resid;
		if (toread > bytesinfile)
			toread = bytesinfile;
		if (toread >= PAGE_SIZE) {
			/*
			 * If it's at least a page in size, try to get
			 * the data from the object using vm tricks.
			 */
			error = uioread(toread, uio, object, &nread);
			if ((uio->uio_resid == 0) || (error != 0)) {
				/*
				 * If we finished or there was an error,
				 * finish up (the reference previously
				 * obtained on object must be released).
				 */
				if ((error == 0 ||
				    uio->uio_resid != orig_resid) &&
				    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
					ip->i_flag |= IN_ACCESS;

				if (object)
					vm_object_vndeallocate(object);
				return (error);
			}
		}
	}
#endif

	/*
	 * We couldn't do it all in one vm trick, so cycle around
	 * transferring smaller bites through the buffer cache.
	 */
	for (error = 0, bp = NULL; uio->uio_resid > 0; bp = NULL) {
		if ((bytesinfile = ip->i_size - uio->uio_offset) <= 0)
			break;
#ifdef ENABLE_VFS_IOOPT
		if ((ioflag & IO_VMIO) == 0 && (vfs_ioopt > 1) && object) {
			/*
			 * Obviously we didn't finish above, but we
			 * didn't get an error either.  Try the same trick
			 * again, but this time we are looping.
			 */
			int nread, toread;
			toread = uio->uio_resid;
			if (toread > bytesinfile)
				toread = bytesinfile;

			/*
			 * Once again, if there isn't enough for a
			 * whole page, don't try optimising.
			 */
			if (toread >= PAGE_SIZE) {
				error = uioread(toread, uio, object, &nread);
				if ((uio->uio_resid == 0) || (error != 0)) {
					/*
					 * If we finished or there was an
					 * error, finish up (the reference
					 * previously obtained on object
					 * must be released).
					 */
					if ((error == 0 ||
					    uio->uio_resid != orig_resid) &&
					    (vp->v_mount->mnt_flag &
					    MNT_NOATIME) == 0)
						ip->i_flag |= IN_ACCESS;
					if (object)
						vm_object_vndeallocate(object);
					return (error);
				}
				/*
				 * To get here we didn't finish or err.
				 * If we did get some data,
				 * loop to try another bite.
				 */
				if (nread > 0) {
					continue;
				}
			}
		}
#endif

		lbn = lblkno(fs, uio->uio_offset);
		nextlbn = lbn + 1;

		/*
		 * Size of buffer.  The buffer representing the
		 * end of the file is rounded up to the size of
		 * the block type (fragment or full block,
		 * depending).
		 */
		size = BLKSIZE(fs, ip, lbn);
		blkoffset = blkoff(fs, uio->uio_offset);

		/*
		 * The amount we want to transfer in this iteration is
		 * one FS block minus the amount of data that precedes
		 * our start point within the block.
		 */
		xfersize = fs->fs_bsize - blkoffset;

		/*
		 * But if we actually want less than the block,
		 * or the file doesn't have a whole block more of data,
		 * then use the lesser number.
		 */
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;
		if (bytesinfile < xfersize)
			xfersize = bytesinfile;

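		/*
		 * Worked example (assuming an 8K-block filesystem, i.e.
		 * fs_bsize == 8192): for uio_offset == 10000, lbn == 1,
		 * blkoffset == 1808, and xfersize starts at 8192 - 1808
		 * == 6384, then is clamped by uio_resid and bytesinfile.
		 */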
		if (lblktosize(fs, nextlbn) >= ip->i_size) {
			/*
			 * Don't do readahead if this is the end of the file.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		} else if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERR) == 0) {
			/*
			 * Otherwise if we are allowed to cluster,
			 * grab as much as we can.
			 *
			 * XXX  This may not be a win if we are not
			 * doing sequential access.
			 */
			error = cluster_read(vp, ip->i_size, lbn,
			    size, NOCRED, uio->uio_resid, seqcount, &bp);
		} else if (seqcount > 1) {
			/*
			 * If we are NOT allowed to cluster, then
			 * if we appear to be acting sequentially,
			 * fire off a request for a readahead
			 * as well as a read.  Note that the 4th and 5th
			 * arguments point to arrays of the size specified in
			 * the 6th argument.
			 */
			int nextsize = BLKSIZE(fs, ip, nextlbn);
			error = breadn(vp, lbn,
			    size, &nextlbn, &nextsize, 1, NOCRED, &bp);
		} else {
			/*
			 * Failing all of the above, just read what the
			 * user asked for.  Interestingly, the same as
			 * the first option above.
			 */
			error = bread(vp, lbn, size, NOCRED, &bp);
		}
		if (error) {
			brelse(bp);
			bp = NULL;
			break;
		}

		/*
		 * If IO_DIRECT then set B_DIRECT for the buffer.  This
		 * will cause us to attempt to release the buffer later on
		 * and will cause the buffer cache to attempt to free the
		 * underlying pages.
		 */
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;

		/*
		 * We should only get non-zero b_resid when an I/O error
		 * has occurred, which should cause us to break above.
		 * However, if the short read did not cause an error,
		 * then we want to ensure that we do not uiomove bad
		 * or uninitialized data.
		 *
		 * XXX b_resid is only valid when an actual I/O has occurred
		 * and may be incorrect if the buffer is B_CACHE or if the
		 * last op on the buffer was a failed write.  This KASSERT
		 * is a precursor to removing it from the UFS code.
		 */
		KASSERT(bp->b_resid == 0, ("bp->b_resid != 0"));
		size -= bp->b_resid;
		if (size < xfersize) {
			if (size == 0)
				break;
			xfersize = size;
		}

#ifdef ENABLE_VFS_IOOPT
		if (vfs_ioopt && object &&
		    (bp->b_flags & B_VMIO) &&
		    ((blkoffset & PAGE_MASK) == 0) &&
		    ((xfersize & PAGE_MASK) == 0)) {
			/*
			 * If VFS IO optimisation is turned on, the
			 * transfer is an exact page multiple, and this
			 * is a normal VM-based op, then use uiomoveco().
			 */
			error =
			    uiomoveco((char *)bp->b_data + blkoffset,
				(int)xfersize, uio, object);
		} else
#endif
		{
			/*
			 * Otherwise use the general form.
			 */
			error =
			    uiomove((char *)bp->b_data + blkoffset,
				(int)xfersize, uio);
		}

		if (error)
			break;

		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			/*
			 * If there are no dependencies, and it's VMIO,
			 * then we don't need the buf; mark it available
			 * for freeing.  The VM has the data.
			 */
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			/*
			 * Otherwise let whoever
			 * made the request take care of
			 * freeing it.  We just queue
			 * it onto another list.
			 */
			bqrelse(bp);
		}
	}

	/*
	 * bp can only be non-NULL here in the case of an error,
	 * because the loop above resets bp to NULL on each iteration
	 * and on normal completion has not set a new value into it,
	 * so it must have come from a 'break' statement.
	 */
	if (bp != NULL) {
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
			brelse(bp);
		} else {
			bqrelse(bp);
		}
	}

	if (object)
		vm_object_vndeallocate(object);
	if ((error == 0 || uio->uio_resid != orig_resid) &&
	    (vp->v_mount->mnt_flag & MNT_NOATIME) == 0)
		ip->i_flag |= IN_ACCESS;
	return (error);
}

/*
 * Vnode op for writing.
 */
int
WRITE(ap)
	struct vop_write_args /* {
		struct vnode *a_vp;
		struct uio *a_uio;
		int a_ioflag;
		struct ucred *a_cred;
	} */ *ap;
{
	register struct vnode *vp;
	register struct uio *uio;
	register struct inode *ip;
	register FS *fs;
	struct buf *bp;
	ufs_daddr_t lbn;
	off_t osize;
	int seqcount;
	int blkoffset, error, extended, flags, ioflag, resid, size, xfersize;
	vm_object_t object;
	struct thread *td;

	extended = 0;
	seqcount = ap->a_ioflag >> 16;
	ioflag = ap->a_ioflag;
	uio = ap->a_uio;
	vp = ap->a_vp;
	ip = VTOI(vp);

	object = vp->v_object;
	if (object)
		vm_object_reference(object);

#ifdef DIAGNOSTIC
	if (uio->uio_rw != UIO_WRITE)
		panic("%s: mode", WRITE_S);
#endif

	switch (vp->v_type) {
	case VREG:
		if (ioflag & IO_APPEND)
			uio->uio_offset = ip->i_size;
		if ((ip->i_flags & APPEND) && uio->uio_offset != ip->i_size) {
			if (object)
				vm_object_vndeallocate(object);
			return (EPERM);
		}
		/* FALLTHROUGH */
	case VLNK:
		break;
	case VDIR:
		panic("%s: dir write", WRITE_S);
		break;
	default:
		panic("%s: type %p %d (%d,%d)", WRITE_S, vp, (int)vp->v_type,
		    (int)uio->uio_offset, (int)uio->uio_resid);
	}

	fs = ip->I_FS;
	if (uio->uio_offset < 0 ||
	    (u_int64_t)uio->uio_offset + uio->uio_resid > fs->fs_maxfilesize) {
		if (object)
			vm_object_vndeallocate(object);
		return (EFBIG);
	}
	/*
	 * Maybe this should be above the vnode op call, but so long as
	 * file servers have no limits, I don't think it matters.
	 */
	td = uio->uio_td;
	if (vp->v_type == VREG && td->td_proc &&
	    uio->uio_offset + uio->uio_resid >
	    td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		psignal(td->td_proc, SIGXFSZ);
		if (object)
			vm_object_vndeallocate(object);
		return (EFBIG);
	}

	resid = uio->uio_resid;
	osize = ip->i_size;

	/*
	 * NOTE! These B_ flags are actually balloc-only flags, not buffer
	 * flags.  They are similar to the BA_ flags in -current.
	 */
	if (seqcount > B_SEQMAX)
		flags = B_SEQMAX << B_SEQSHIFT;
	else
		flags = seqcount << B_SEQSHIFT;
	if ((ioflag & IO_SYNC) && !DOINGASYNC(vp))
		flags |= B_SYNC;

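	/*
	 * If the VM object has outstanding copy-on-write optimizations
	 * (OBJ_OPT, part of the VFS IOOPT machinery), resolve them over
	 * the range we are about to dirty so no other mapping can observe
	 * the pages mid-update.
	 */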
	if (object && (object->flags & OBJ_OPT)) {
		vm_freeze_copyopts(object,
		    OFF_TO_IDX(uio->uio_offset),
		    OFF_TO_IDX(uio->uio_offset + uio->uio_resid + PAGE_MASK));
	}

	for (error = 0; uio->uio_resid > 0;) {
		lbn = lblkno(fs, uio->uio_offset);
		blkoffset = blkoff(fs, uio->uio_offset);
		xfersize = fs->fs_bsize - blkoffset;
		if (uio->uio_resid < xfersize)
			xfersize = uio->uio_resid;

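		/*
		 * If this iteration extends the file, tell the VM system
		 * the new object size first, so the vnode's VM object
		 * covers the range the buffer below will dirty.
		 */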
		if (uio->uio_offset + xfersize > ip->i_size)
			vnode_pager_setsize(vp, uio->uio_offset + xfersize);

		/*
		 * We must perform a read-before-write if the transfer
		 * size does not cover the entire buffer.
		 */
		if (fs->fs_bsize > xfersize)
			flags |= B_CLRBUF;
		else
			flags &= ~B_CLRBUF;
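		/*
		 * For example, a 100-byte write into an 8K block only
		 * dirties 100 bytes; B_CLRBUF makes balloc read (or zero,
		 * for a newly allocated block) the remaining bytes so the
		 * rest of the buffer is valid before it is written back.
		 */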
		/* XXX is uio->uio_offset the right thing here? */
		error = VOP_BALLOC(vp, uio->uio_offset, xfersize,
		    ap->a_cred, flags, &bp);
		if (error != 0)
			break;
		/*
		 * If the buffer is not valid and we did not clear garbage
		 * out above, we have to do so here even though the write
		 * covers the entire buffer, in order to avoid an
		 * mmap()/write race where another process may see the
		 * garbage prior to the uiomove() for a write replacing it.
		 */
		if ((bp->b_flags & B_CACHE) == 0 && fs->fs_bsize <= xfersize)
			vfs_bio_clrbuf(bp);
		if (ioflag & IO_DIRECT)
			bp->b_flags |= B_DIRECT;
		if (ioflag & IO_NOWDRAIN)
			bp->b_flags |= B_NOWDRAIN;
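		/*
		 * (B_NOWDRAIN asks the buffer cache not to block this write
		 * waiting for runningbufspace to drain; it exists to avoid
		 * deadlocks with stacked block devices such as vn(4) that
		 * write back through the buffer cache.)
		 */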

		if (uio->uio_offset + xfersize > ip->i_size) {
			ip->i_size = uio->uio_offset + xfersize;
			extended = 1;
		}

		size = BLKSIZE(fs, ip, lbn) - bp->b_resid;
		if (size < xfersize)
			xfersize = size;

		error =
		    uiomove((char *)bp->b_data + blkoffset, (int)xfersize, uio);
		if ((ioflag & (IO_VMIO|IO_DIRECT)) &&
		    (LIST_FIRST(&bp->b_dep) == NULL)) {
			bp->b_flags |= B_RELBUF;
		}

		/*
		 * If IO_SYNC, each buffer is written synchronously.
		 * Otherwise, if we have a severe page or dirty-buffer
		 * deficiency, or the caller asked for an async write,
		 * write the buffer asynchronously.  Otherwise try to
		 * cluster, and if that doesn't do it then either do an
		 * async write (if IO_DIRECT), or a delayed write (if not).
		 */

		if (ioflag & IO_SYNC) {
			(void)bwrite(bp);
		} else if (vm_page_count_severe() ||
		    buf_dirty_count_severe() ||
		    (ioflag & IO_ASYNC)) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else if (xfersize + blkoffset == fs->fs_bsize) {
			if ((vp->v_mount->mnt_flag & MNT_NOCLUSTERW) == 0) {
				bp->b_flags |= B_CLUSTEROK;
				cluster_write(bp, ip->i_size, seqcount);
			} else {
				bawrite(bp);
			}
		} else if (ioflag & IO_DIRECT) {
			bp->b_flags |= B_CLUSTEROK;
			bawrite(bp);
		} else {
			bp->b_flags |= B_CLUSTEROK;
			bdwrite(bp);
		}
		if (error || xfersize == 0)
			break;
		ip->i_flag |= IN_CHANGE | IN_UPDATE;
	}
	/*
	 * If we successfully wrote any data and we are not the superuser,
	 * we clear the setuid and setgid bits as a precaution against
	 * tampering.
	 */
	if (resid > uio->uio_resid && ap->a_cred && ap->a_cred->cr_uid != 0)
		ip->i_mode &= ~(ISUID | ISGID);
	if (resid > uio->uio_resid)
		VN_KNOTE(vp, NOTE_WRITE | (extended ? NOTE_EXTEND : 0));
	if (error) {
		if (ioflag & IO_UNIT) {
			(void)UFS_TRUNCATE(vp, osize,
			    ioflag & IO_SYNC, ap->a_cred, uio->uio_td);
			uio->uio_offset -= resid - uio->uio_resid;
			uio->uio_resid = resid;
		}
	} else if (resid > uio->uio_resid && (ioflag & IO_SYNC))
		error = UFS_UPDATE(vp, 1);

	if (object)
		vm_object_vndeallocate(object);

	return (error);
}

/*
 * Get page routine: fault file data in via the VM pager.
 */
int
ffs_getpages(ap)
	struct vop_getpages_args *ap;
{
	off_t foff, physoffset;
	int i, size, bsize;
	struct vnode *dp, *vp;
	vm_object_t obj;
	vm_pindex_t pindex, firstindex;
	vm_page_t mreq;
	int bbackwards, bforwards;
	int pbackwards, pforwards;
	int firstpage;
	int reqlblkno;
	daddr_t reqblkno;
	int poff;
	int pcount;
	int rtval;
	int pagesperblock;

	pcount = round_page(ap->a_count) / PAGE_SIZE;
	mreq = ap->a_m[ap->a_reqpage];
	firstindex = ap->a_m[0]->pindex;

	/*
	 * If ANY DEV_BSIZE blocks are valid on a large filesystem block,
	 * then the entire page is valid.  Since the page may be mapped,
	 * user programs might reference data beyond the actual end of file
	 * occurring within the page.  We have to zero that data.
	 */
	if (mreq->valid) {
		if (mreq->valid != VM_PAGE_BITS_ALL)
			vm_page_zero_invalid(mreq, TRUE);
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage) {
				vm_page_free(ap->a_m[i]);
			}
		}
		return (VM_PAGER_OK);
	}

	vp = ap->a_vp;
	obj = vp->v_object;
	bsize = vp->v_mount->mnt_stat.f_iosize;
	pindex = mreq->pindex;
	foff = IDX_TO_OFF(pindex) /* + ap->a_offset should be zero */;

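	/*
	 * When the filesystem block size is smaller than a page, a single
	 * page spans several blocks that need not be physically contiguous,
	 * so punt to the generic pager path.
	 */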
	if (bsize < PAGE_SIZE)
		return (vnode_pager_generic_getpages(ap->a_vp, ap->a_m,
		    ap->a_count, ap->a_reqpage));

	/*
	 * foff is the file offset of the required page
	 * reqlblkno is the logical block that contains the page
	 * poff is the index of the page into the logical block
	 */
	reqlblkno = foff / bsize;
	poff = (foff % bsize) / PAGE_SIZE;
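	/*
	 * Worked example (assuming bsize == 8192 and PAGE_SIZE == 4096):
	 * foff == 12288 gives reqlblkno == 1 and poff == 1, i.e. the
	 * second page of the second logical block.
	 */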

	if (VOP_BMAP(vp, reqlblkno, &dp, &reqblkno,
	    &bforwards, &bbackwards) || (reqblkno == -1)) {
		for (i = 0; i < pcount; i++) {
			if (i != ap->a_reqpage)
				vm_page_free(ap->a_m[i]);
		}
		if (reqblkno == -1) {
			if ((mreq->flags & PG_ZERO) == 0)
				vm_page_zero_fill(mreq);
			vm_page_undirty(mreq);
			mreq->valid = VM_PAGE_BITS_ALL;
			return (VM_PAGER_OK);
		} else {
			return (VM_PAGER_ERROR);
		}
	}

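	/*
	 * reqblkno is a device block number in DEV_BSIZE (512-byte) units,
	 * so convert it to a byte offset and add the byte offset of the
	 * requested page within the filesystem block.
	 */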
	physoffset = (off_t)reqblkno * DEV_BSIZE + poff * PAGE_SIZE;
	pagesperblock = bsize / PAGE_SIZE;
	/*
	 * find the first page that is contiguous...
	 * note that pbackwards is the number of pages that are contiguous
	 * backwards.
	 */
	firstpage = 0;
	if (ap->a_count) {
		pbackwards = poff + bbackwards * pagesperblock;
		if (ap->a_reqpage > pbackwards) {
			firstpage = ap->a_reqpage - pbackwards;
			for (i = 0; i < firstpage; i++)
				vm_page_free(ap->a_m[i]);
		}

		/*
		 * pforwards is the number of pages that are contiguous
		 * after the current page.
		 */
		pforwards = (pagesperblock - (poff + 1)) +
		    bforwards * pagesperblock;
		if (pforwards < (pcount - (ap->a_reqpage + 1))) {
			for (i = ap->a_reqpage + pforwards + 1; i < pcount; i++)
				vm_page_free(ap->a_m[i]);
			pcount = ap->a_reqpage + pforwards + 1;
		}
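
		/*
		 * Continuing the example above (pagesperblock == 2,
		 * poff == 1): if VOP_BMAP reported bbackwards == 3 and
		 * bforwards == 2, then pbackwards == 1 + 3 * 2 == 7 pages
		 * are contiguous before the requested page and
		 * pforwards == (2 - 2) + 2 * 2 == 4 pages after it.
		 */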

		/*
		 * number of pages for I/O corrected for the non-contig pages at
		 * the beginning of the array.
		 */
		pcount -= firstpage;
	}

	/*
	 * calculate the size of the transfer
	 */
	size = pcount * PAGE_SIZE;

	if ((IDX_TO_OFF(ap->a_m[firstpage]->pindex) + size) >
	    obj->un_pager.vnp.vnp_size)
		size = obj->un_pager.vnp.vnp_size -
		    IDX_TO_OFF(ap->a_m[firstpage]->pindex);

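	/*
	 * Hand the actual I/O to the underlying device vnode dp returned
	 * by VOP_BMAP above.  Biasing physoffset by -foff lets the device's
	 * getpages routine translate each page's file offset into a device
	 * offset by simple addition.
	 */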
	physoffset -= foff;
	rtval = VOP_GETPAGES(dp, &ap->a_m[firstpage], size,
	    (ap->a_reqpage - firstpage), physoffset);

	return (rtval);
}

/*
 * put page routine
 *
 * XXX By default, wimp out... note that a_offset is ignored (and always
 * XXX has been).
 */
int
ffs_putpages(ap)
	struct vop_putpages_args *ap;
{
	return (vnode_pager_generic_putpages(ap->a_vp, ap->a_m, ap->a_count,
	    ap->a_sync, ap->a_rtvals));
}