kernel - MPSAFE work - Finish tokenizing vm_page.c
[dragonfly.git] / sys / kern / vfs_vm.c
/*
 * Copyright (c) 2010 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Implements new VFS/VM coherency functions.  For conforming VFSs
 * we treat the backing VM object slightly differently.  Instead of
 * maintaining a number of pages to exactly fit the size of the file
 * we instead maintain pages to fit the entire contents of the last
 * buffer cache buffer used by the file.
 *
 * For VFSs like NFS and HAMMER which use (generally speaking) fixed-size
 * buffers this greatly reduces the complexity of VFS/VM interactions.
 *
 * Truncations no longer invalidate pages covered by the buffer cache
 * beyond the file EOF which still fit within the file's last buffer.
 * We simply unmap them and do not allow userland to fault them in.
 *
 * The VFS is no longer responsible for zero-filling buffers during a
 * truncation; the last buffer will be automatically zero-filled by
 * nvtruncbuf().
 *
 * This code is intended to (eventually) replace vtruncbuf() and
 * vnode_pager_setsize().
 */
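
/*
 * Illustrative example of the coverage rule above (numbers are made up):
 * with 4K VM pages and a VFS using fixed 8192-byte buffers, a 1000-byte
 * file is covered by a single buffer spanning offsets 0-8191.  The VM
 * object is sized to cover that entire buffer (two pages) rather than
 * just the 1000 bytes of file data.  The pages beyond EOF remain valid
 * parts of the buffer but are never mapped into userland.
 */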

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <sys/unistd.h>
#include <sys/vmmeter.h>
#include <sys/vnode.h>

#include <machine/limits.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vnode_pager.h>
#include <vm/vm_zone.h>

#include <sys/buf2.h>
#include <sys/thread2.h>
#include <sys/sysref2.h>
#include <sys/mplock2.h>

static int nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_trunc(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data);
static int nvtruncbuf_bp_metasync(struct buf *bp, void *data);

/*
 * Truncate a file's buffer and pages to a specified length.  The
 * byte-granular length of the file is specified along with the block
 * size of the buffer containing that offset.
 *
 * If the last buffer straddles the length its contents will be zero-filled
 * as appropriate.  All buffers and pages after the last buffer will be
 * destroyed.  The last buffer itself will be destroyed only if the length
 * is exactly aligned with it.
 *
 * UFS typically passes the old block size prior to the actual truncation,
 * then later resizes the block based on the new file size.  NFS uses a
 * fixed block size and doesn't care.  HAMMER uses a block size based on
 * the offset which is fixed for any particular offset.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */
int
nvtruncbuf(struct vnode *vp, off_t length, int blksize, int boff)
{
	off_t truncloffset;
	off_t truncboffset;
	const char *filename;
	struct buf *bp;
	int count;
	int error;

	/*
	 * Round up to the *next* block, then destroy the buffers in question.
	 * Since we are only removing some of the buffers we must rely on the
	 * scan count to determine whether a loop is necessary.
	 *
	 * Destroy any pages beyond the last buffer.
	 */
	if (boff < 0)
		boff = (int)(length % blksize);
	if (boff)
		truncloffset = length + (blksize - boff);
	else
		truncloffset = length;
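
	/*
	 * Example with made-up numbers: length = 1000, blksize = 8192 gives
	 * boff = 1000 and truncloffset = 8192, so the scans below destroy
	 * all buffers at loffset >= 8192 while the straddling buffer at
	 * loffset 0 survives and is zero-filled past boff further below.
	 */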
	lwkt_gettoken(&vp->v_token);
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &truncloffset);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &truncloffset);
	} while (count);

	nvnode_pager_setsize(vp, length, blksize, boff);

	/*
	 * Zero-fill the area beyond the file EOF that still fits within
	 * the last buffer.  We must mark the buffer as dirty even though
	 * the modified area is beyond EOF to avoid races where the kernel
	 * might flush the buffer before the filesystem is able to reallocate
	 * the block.
	 *
	 * The VFS is responsible for dealing with the actual truncation.
	 */
	if (boff) {
		truncboffset = length - boff;
		error = bread(vp, truncboffset, blksize, &bp);
		if (error == 0) {
			bzero(bp->b_data + boff, blksize - boff);
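			/*
			 * If the buffer was already dirty, clamp its
			 * recorded dirty range at boff so the bytes we
			 * just zeroed beyond the new EOF are not pushed
			 * out later as valid dirty file data.
			 */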
			if (bp->b_flags & B_DELWRI) {
				if (bp->b_dirtyoff > boff)
					bp->b_dirtyoff = boff;
				if (bp->b_dirtyend > boff)
					bp->b_dirtyend = boff;
			}
			bp->b_bio2.bio_offset = NOOFFSET;
			bdwrite(bp);
		}
	} else {
		error = 0;
	}

	/*
	 * For safety, fsync any remaining metadata if the file is not being
	 * truncated to 0.  Since the metadata does not represent the entire
	 * dirty list we have to rely on the hit count to ensure that we get
	 * all of it.
	 *
	 * This is typically applicable only to UFS.  NFS and HAMMER do
	 * not store indirect blocks in the per-vnode buffer cache.
	 */
	if (length > 0) {
		do {
			count = RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
					nvtruncbuf_bp_metasync_cmp,
					nvtruncbuf_bp_metasync, vp);
		} while (count);
	}

	/*
	 * It is possible to have in-progress I/O from buffers that were
	 * not part of the truncation.  This should not happen if we
	 * are truncating to 0-length.
	 */
	bio_track_wait(&vp->v_track_write, 0, 0);

	/*
	 * Debugging only
	 */
	spin_lock_wr(&vp->v_spinlock);
	filename = TAILQ_FIRST(&vp->v_namecache) ?
		   TAILQ_FIRST(&vp->v_namecache)->nc_name : "?";
	spin_unlock_wr(&vp->v_spinlock);

	/*
	 * Make sure no buffers were instantiated while we were trying
	 * to clean out the remaining VM pages.  This could occur due
	 * to busy dirty VM pages being flushed out to disk.
	 */
	do {
		count = RB_SCAN(buf_rb_tree, &vp->v_rbclean_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &truncloffset);
		count += RB_SCAN(buf_rb_tree, &vp->v_rbdirty_tree,
				nvtruncbuf_bp_trunc_cmp,
				nvtruncbuf_bp_trunc, &truncloffset);
		if (count) {
			kprintf("Warning: nvtruncbuf(): Had to re-clean %d "
				"left over buffers in %s\n", count, filename);
		}
	} while (count);

	lwkt_reltoken(&vp->v_token);

	return (error);
}
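
/*
 * Example caller (illustrative sketch only, not from any real filesystem):
 * a VFS with a fixed 8192-byte block size might truncate a vnode from its
 * VOP_SETATTR handler roughly like this, updating its own notion of the
 * file size only after the call succeeds:
 *
 *	static int
 *	myfs_truncate(struct vnode *vp, off_t nsize)	// hypothetical
 *	{
 *		struct myfs_inode *ip = VTOI(vp);	// hypothetical
 *		int error;
 *
 *		// nvtruncbuf() destroys buffers/pages beyond nsize and
 *		// zero-fills the tail of the straddling buffer.  Passing
 *		// boff = -1 derives the offset from nsize % blksize.
 *		error = nvtruncbuf(vp, nsize, 8192, -1);
 *		if (error == 0)
 *			ip->i_size = nsize;		// hypothetical field
 *		return (error);
 *	}
 */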

/*
 * The callback buffer is beyond the new file EOF and must be destroyed.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static
int
nvtruncbuf_bp_trunc_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset >= *(off_t *)data)
		return(0);
	return(-1);
}

static
int
nvtruncbuf_bp_trunc(struct buf *bp, void *data)
{
	/*
	 * Do not try to use a buffer we cannot immediately lock, but sleep
	 * anyway to prevent a livelock.  The code will loop until all buffers
	 * can be acted upon.
	 */
	if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
			BUF_UNLOCK(bp);
	} else {
		bremfree(bp);
		bp->b_flags |= (B_INVAL | B_RELBUF | B_NOCACHE);
		brelse(bp);
	}
	return(1);
}

/*
 * Fsync all meta-data after truncating a file to be non-zero.  Only metadata
 * blocks (with a negative loffset) are scanned.
 * Note that the compare function must conform to RB_SCAN's requirements.
 */
static int
nvtruncbuf_bp_metasync_cmp(struct buf *bp, void *data)
{
	if (bp->b_loffset < 0)
		return(0);
	return(1);
}

static int
nvtruncbuf_bp_metasync(struct buf *bp, void *data)
{
	struct vnode *vp = data;

	if (bp->b_flags & B_DELWRI) {
		/*
		 * Do not try to use a buffer we cannot immediately lock,
		 * but sleep anyway to prevent a livelock.  The code will
		 * loop until all buffers can be acted upon.
		 */
		if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_NOWAIT)) {
			if (BUF_LOCK(bp, LK_EXCLUSIVE | LK_SLEEPFAIL) == 0)
				BUF_UNLOCK(bp);
		} else {
			bremfree(bp);
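			/*
			 * Buffers still associated with this vnode can be
			 * pushed asynchronously; a buffer that belongs to
			 * some other vnode is written synchronously.
			 */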
			if (bp->b_vp == vp)
				bawrite(bp);
			else
				bwrite(bp);
		}
		return(1);
	} else {
		return(0);
	}
}

/*
 * Extend a file's buffer and pages to a new, larger size.  The block size
 * at both the old and new length must be passed, but buffer cache operations
 * will only be performed on the old block.  The new nlength/nblksize will
 * be used to properly set the VM object size.
 *
 * To make this explicit we require the old length to be passed even though
 * we can acquire it from vp->v_filesize, which also avoids potential
 * corruption if the filesystem and vp get desynchronized somehow.
 *
 * If the caller intends to immediately write into the newly extended
 * space pass trivial == 1.  If trivial is 0 the original buffer will be
 * zero-filled as necessary to clean out any junk in the extended space.
 *
 * When zero-filling we must bdwrite() to avoid a window of opportunity
 * where the kernel might throw away a clean buffer and the filesystem
 * then attempts to bread() it again before completing (or as part of)
 * the extension.  The filesystem is still responsible for zero-filling
 * any remainder when writing to the media in the strategy function when
 * it is able to do so without the page being mapped.  The page may still
 * be mapped by userland here.
 *
 * When modifying a buffer we must clear any cached raw disk offset.
 * bdwrite() will call BMAP on it again.  Some filesystems, like HAMMER,
 * never overwrite existing data blocks.
 */
int
nvextendbuf(struct vnode *vp, off_t olength, off_t nlength,
	    int oblksize, int nblksize, int oboff, int nboff, int trivial)
{
	off_t truncboffset;
	struct buf *bp;
	int error;

	error = 0;
	nvnode_pager_setsize(vp, nlength, nblksize, nboff);
	if (trivial == 0) {
		if (oboff < 0)
			oboff = (int)(olength % oblksize);
		truncboffset = olength - oboff;

		if (oboff) {
			error = bread(vp, truncboffset, oblksize, &bp);
			if (error == 0) {
				bzero(bp->b_data + oboff, oblksize - oboff);
				bp->b_bio2.bio_offset = NOOFFSET;
				bdwrite(bp);
			}
		}
	}
	return (error);
}
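
/*
 * Example caller (illustrative sketch only): a VFS with a fixed 8192-byte
 * block size extending a file from 1000 to 20000 bytes, where the caller
 * does not intend to immediately overwrite the new space, might do:
 *
 *	// Passing -1 for both boffs lets nvextendbuf() compute them from
 *	// length % blksize.  trivial == 0 forces the old last buffer
 *	// (offsets 0-8191) to be zero-filled from byte 1000 on, so no
 *	// stale junk in the extended space ever becomes readable.
 *	error = nvextendbuf(vp, 1000, 20000, 8192, 8192, -1, -1, 0);
 */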
367
/*
 * Set vp->v_filesize and vp->v_object->size, destroy pages beyond
 * the last buffer when truncating.
 *
 * This function does not do any zeroing or invalidating of partially
 * overlapping pages.  Zeroing is the responsibility of nvtruncbuf().
 * However, it does unmap VM pages from the user address space on a
 * page-granular (versus buffer cache granular) basis.
 *
 * If boff is passed as -1 the base offset of the buffer cache buffer is
 * calculated from length and blksize.  Filesystems such as UFS which deal
 * with fragments have to specify a boff >= 0 since the base offset cannot
 * be calculated from length and blksize.
 *
 * For UFS blksize is the 'new' blocksize, used only to determine how large
 * the VM object must become.
 */
void
nvnode_pager_setsize(struct vnode *vp, off_t length, int blksize, int boff)
{
	vm_pindex_t nobjsize;
	vm_pindex_t oobjsize;
	vm_pindex_t pi;
	vm_object_t object;
	vm_page_t m;
	off_t truncboffset;

	/*
	 * Degenerate conditions
	 */
	if ((object = vp->v_object) == NULL)
		return;
	if (length == vp->v_filesize)
		return;

	/*
	 * Calculate the size of the VM object, coverage includes
	 * the buffer straddling EOF.  If EOF is buffer-aligned
	 * we don't bother.
	 *
	 * Buffers do not have to be page-aligned.  Make sure
	 * nobjsize is beyond the last page of the buffer.
	 */
	if (boff < 0)
		boff = (int)(length % blksize);
	truncboffset = length - boff;
	oobjsize = object->size;
	if (boff)
		nobjsize = OFF_TO_IDX(truncboffset + blksize + PAGE_MASK);
	else
		nobjsize = OFF_TO_IDX(truncboffset + PAGE_MASK);
	object->size = nobjsize;
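
	/*
	 * Illustrative numbers: with 4K pages, length = 1000, blksize = 8192
	 * and boff passed as -1, we get boff = 1000, truncboffset = 0 and
	 * nobjsize = OFF_TO_IDX(0 + 8192 + PAGE_MASK) = 2 pages, i.e. the
	 * object covers the whole 8K straddling buffer, not just the
	 * 1000-byte file.
	 */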
420
	if (length < vp->v_filesize) {
		/*
		 * File has shrunk; toss any cached pages beyond
		 * the end of the buffer (blksize aligned) for the
		 * new EOF.
		 */
		vp->v_filesize = length;
		if (nobjsize < oobjsize) {
			vm_object_page_remove(object, nobjsize, oobjsize,
					      FALSE);
		}

		/*
		 * Unmap any pages (page aligned) beyond the new EOF.
		 * The pages remain part of the (last) buffer and are not
		 * invalidated.
		 */
		pi = OFF_TO_IDX(length + PAGE_MASK);
		lwkt_gettoken(&vm_token);
		while (pi < nobjsize) {
			do {
				m = vm_page_lookup(object, pi);
			} while (m && vm_page_sleep_busy(m, TRUE, "vsetsz"));
			if (m) {
				vm_page_busy(m);
				vm_page_protect(m, VM_PROT_NONE);
				vm_page_wakeup(m);
			}
			++pi;
		}
		lwkt_reltoken(&vm_token);
	} else {
		/*
		 * File has expanded.
		 */
		vp->v_filesize = length;
	}
}