| 1 | /* |
| 2 | * Copyright (c) 1982, 1986, 1989, 1993 |
| 3 | * The Regents of the University of California. All rights reserved. |
| 4 | * |
| 5 | * Redistribution and use in source and binary forms, with or without |
| 6 | * modification, are permitted provided that the following conditions |
| 7 | * are met: |
| 8 | * 1. Redistributions of source code must retain the above copyright |
| 9 | * notice, this list of conditions and the following disclaimer. |
| 10 | * 2. Redistributions in binary form must reproduce the above copyright |
| 11 | * notice, this list of conditions and the following disclaimer in the |
| 12 | * documentation and/or other materials provided with the distribution. |
| 13 | * 3. All advertising materials mentioning features or use of this software |
| 14 | * must display the following acknowledgement: |
| 15 | * This product includes software developed by the University of |
| 16 | * California, Berkeley and its contributors. |
| 17 | * 4. Neither the name of the University nor the names of its contributors |
| 18 | * may be used to endorse or promote products derived from this software |
| 19 | * without specific prior written permission. |
| 20 | * |
| 21 | * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| 22 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| 23 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| 24 | * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| 25 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| 26 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| 27 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| 28 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| 29 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| 30 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 31 | * SUCH DAMAGE. |
| 32 | * |
| 33 | * @(#)vm_swap.c 8.5 (Berkeley) 2/17/94 |
| 34 | * $FreeBSD: src/sys/vm/vm_swap.c,v 1.96.2.2 2001/10/14 18:46:47 iedowse Exp $ |
| 35 | * $DragonFly: src/sys/vm/vm_swap.c,v 1.36 2007/07/20 17:21:54 dillon Exp $ |
| 36 | */ |
| 37 | |
| 38 | #include "opt_swap.h" |
| 39 | |
| 40 | #include <sys/param.h> |
| 41 | #include <sys/systm.h> |
| 42 | #include <sys/sysproto.h> |
| 43 | #include <sys/buf.h> |
| 44 | #include <sys/proc.h> |
| 45 | #include <sys/priv.h> |
| 46 | #include <sys/nlookup.h> |
| 47 | #include <sys/dmap.h> /* XXX */ |
| 48 | #include <sys/vnode.h> |
| 49 | #include <sys/fcntl.h> |
| 50 | #include <sys/blist.h> |
| 51 | #include <sys/kernel.h> |
| 52 | #include <sys/lock.h> |
| 53 | #include <sys/conf.h> |
| 54 | #include <sys/stat.h> |
| 55 | #include <sys/thread2.h> |
| 56 | #include <vm/vm.h> |
| 57 | #include <vm/vm_extern.h> |
| 58 | #include <vm/swap_pager.h> |
| 59 | #include <vm/vm_zone.h> |
| 60 | |
| 61 | /* |
| 62 | * Indirect driver for multi-controller paging. |
| 63 | */ |
| 64 | |
| 65 | #ifndef NSWAPDEV |
| 66 | #define NSWAPDEV 4 |
| 67 | #endif |
| 68 | static struct swdevt should_be_malloced[NSWAPDEV]; |
| 69 | struct swdevt *swdevt = should_be_malloced; /* exported to pstat/systat */ |
| 70 | static swblk_t nswap; /* first block after the interleaved devs */ |
| 71 | int nswdev = NSWAPDEV; /* exported to pstat/systat */ |
| 72 | int vm_swap_size; |
| 73 | |
| 74 | static int swapdev_strategy (struct vop_strategy_args *ap); |
| 75 | struct vnode *swapdev_vp; |
| 76 | |
| 77 | /* |
| 78 | * swapdev_strategy: |
| 79 | * |
| 80 | * vn_strategy() for swapdev_vp. |
| 81 | * Perform swap strategy interleave device selection. |
| 82 | * |
| 83 | * The bp is expected to be locked and on call. |
| 84 | * |
| 85 | * (struct vnode *a_vp, struct bio *b_bio) |
| 86 | */ |
| 87 | |
| 88 | static int |
| 89 | swapdev_strategy(struct vop_strategy_args *ap) |
| 90 | { |
| 91 | struct bio *bio = ap->a_bio; |
| 92 | struct bio *nbio; |
| 93 | struct buf *bp = bio->bio_buf; |
| 94 | int sz, off, seg, index, blkno, nblkno; |
| 95 | struct swdevt *sp; |
| 96 | struct vnode *vp; |
| 97 | |
| 98 | vp = ap->a_vp; |
| 99 | sz = howmany(bp->b_bcount, PAGE_SIZE); |
| 100 | blkno = (int)(bio->bio_offset >> PAGE_SHIFT); |
| 101 | |
| 102 | /* |
| 103 | * Convert interleaved swap into per-device swap. Note that |
| 104 | * the block size is left in PAGE_SIZE'd chunks (for the newswap) |
| 105 | * here. |
| 106 | */ |
| 107 | nbio = push_bio(bio); |
| 108 | if (nswdev > 1) { |
| 109 | off = blkno % dmmax; |
| 110 | if (off + sz > dmmax) { |
| 111 | bp->b_error = EINVAL; |
| 112 | bp->b_flags |= B_ERROR; |
| 113 | biodone(bio); |
| 114 | return 0; |
| 115 | } |
| 116 | seg = blkno / dmmax; |
| 117 | index = seg % nswdev; |
| 118 | seg /= nswdev; |
| 119 | nbio->bio_offset = (off_t)(seg * dmmax + off) << PAGE_SHIFT; |
| 120 | } else { |
| 121 | index = 0; |
| 122 | nbio->bio_offset = bio->bio_offset; |
| 123 | } |
| 124 | nblkno = (int)(nbio->bio_offset >> PAGE_SHIFT); |
| 125 | sp = &swdevt[index]; |
| 126 | if (nblkno + sz > sp->sw_nblks) { |
| 127 | bp->b_error = EINVAL; |
| 128 | bp->b_flags |= B_ERROR; |
| 129 | /* I/O was never started on nbio, must biodone(bio) */ |
| 130 | biodone(bio); |
| 131 | return 0; |
| 132 | } |
| 133 | if (sp->sw_vp == NULL) { |
| 134 | bp->b_error = ENODEV; |
| 135 | bp->b_flags |= B_ERROR; |
| 136 | /* I/O was never started on nbio, must biodone(bio) */ |
| 137 | biodone(bio); |
| 138 | return 0; |
| 139 | } |
| 140 | |
| 141 | /* |
| 142 | * Issue a strategy call on the appropriate swap vnode. Note that |
| 143 | * bp->b_vp is not modified. Strategy code is always supposed to |
| 144 | * use the passed vp. |
| 145 | * |
| 146 | * We have to use vn_strategy() here even if we know we have a |
| 147 | * device in order to properly break up requests which exceed the |
| 148 | * device's DMA limits. |
| 149 | */ |
| 150 | vn_strategy(sp->sw_vp, nbio); |
| 151 | return 0; |
| 152 | } |
| 153 | |
| 154 | /* |
| 155 | * Create a special vnode op vector for swapdev_vp - we only use |
| 156 | * vn_strategy(), everything else returns an error. |
| 157 | */ |
| 158 | static struct vop_ops swapdev_vnode_vops = { |
| 159 | .vop_default = vop_defaultop, |
| 160 | .vop_strategy = swapdev_strategy |
| 161 | }; |
| 162 | static struct vop_ops *swapdev_vnode_vops_p = &swapdev_vnode_vops; |
| 163 | |
| 164 | VNODEOP_SET(swapdev_vnode_vops); |
| 165 | |
| 166 | /* |
| 167 | * swapon_args(char *name) |
| 168 | * |
| 169 | * System call swapon(name) enables swapping on device name, |
| 170 | * which must be in the swdevsw. Return EBUSY |
| 171 | * if already swapping on this device. |
| 172 | * |
| 173 | * MPALMOSTSAFE |
| 174 | */ |
| 175 | int |
| 176 | sys_swapon(struct swapon_args *uap) |
| 177 | { |
| 178 | struct thread *td = curthread; |
| 179 | struct vattr attr; |
| 180 | struct vnode *vp; |
| 181 | struct nlookupdata nd; |
| 182 | int error; |
| 183 | struct ucred *cred; |
| 184 | |
| 185 | cred = td->td_ucred; |
| 186 | |
| 187 | error = priv_check(td, PRIV_ROOT); |
| 188 | if (error) |
| 189 | return (error); |
| 190 | |
| 191 | get_mplock(); |
| 192 | vp = NULL; |
| 193 | error = nlookup_init(&nd, uap->name, UIO_USERSPACE, NLC_FOLLOW); |
| 194 | if (error == 0) |
| 195 | error = nlookup(&nd); |
| 196 | if (error == 0) |
| 197 | error = cache_vref(&nd.nl_nch, nd.nl_cred, &vp); |
| 198 | nlookup_done(&nd); |
| 199 | if (error) { |
| 200 | rel_mplock(); |
| 201 | return (error); |
| 202 | } |
| 203 | |
| 204 | if (vn_isdisk(vp, &error)) { |
| 205 | error = swaponvp(td, vp, 0); |
| 206 | } else if (vp->v_type == VREG && vp->v_tag == VT_NFS && |
| 207 | (error = VOP_GETATTR(vp, &attr)) == 0) { |
| 208 | /* |
| 209 | * Allow direct swapping to NFS regular files in the same |
| 210 | * way that nfs_mountroot() sets up diskless swapping. |
| 211 | */ |
| 212 | error = swaponvp(td, vp, attr.va_size / DEV_BSIZE); |
| 213 | } |
| 214 | if (error) |
| 215 | vrele(vp); |
| 216 | rel_mplock(); |
| 217 | |
| 218 | return (error); |
| 219 | } |
| 220 | |
| 221 | /* |
| 222 | * Swfree(index) frees the index'th portion of the swap map. |
| 223 | * Each of the nswdev devices provides 1/nswdev'th of the swap |
| 224 | * space, which is laid out with blocks of dmmax pages circularly |
| 225 | * among the devices. |
| 226 | * |
| 227 | * The new swap code uses page-sized blocks. The old swap code used |
| 228 | * DEV_BSIZE'd chunks. |
| 229 | * |
| 230 | * XXX locking when multiple swapon's run in parallel |
| 231 | */ |
| 232 | int |
| 233 | swaponvp(struct thread *td, struct vnode *vp, u_quad_t nblks) |
| 234 | { |
| 235 | swblk_t aligned_nblks; |
| 236 | int64_t dpsize; |
| 237 | struct ucred *cred; |
| 238 | struct swdevt *sp; |
| 239 | swblk_t vsbase; |
| 240 | swblk_t dvbase; |
| 241 | cdev_t dev; |
| 242 | int index; |
| 243 | int error; |
| 244 | long blk; |
| 245 | |
| 246 | cred = td->td_ucred; |
| 247 | |
| 248 | if (!swapdev_vp) { |
| 249 | error = getspecialvnode(VT_NON, NULL, &swapdev_vnode_vops_p, |
| 250 | &swapdev_vp, 0, 0); |
| 251 | if (error) |
| 252 | panic("Cannot get vnode for swapdev"); |
| 253 | swapdev_vp->v_type = VNON; /* Untyped */ |
| 254 | vx_unlock(swapdev_vp); |
| 255 | } |
| 256 | |
| 257 | for (sp = swdevt, index = 0 ; index < nswdev; index++, sp++) { |
| 258 | if (sp->sw_vp == vp) |
| 259 | return EBUSY; |
| 260 | if (!sp->sw_vp) |
| 261 | goto found; |
| 262 | |
| 263 | } |
| 264 | return EINVAL; |
| 265 | found: |
| 266 | vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); |
| 267 | error = VOP_OPEN(vp, FREAD | FWRITE, cred, NULL); |
| 268 | vn_unlock(vp); |
| 269 | if (error) |
| 270 | return (error); |
| 271 | |
| 272 | /* |
| 273 | * v_rdev is not valid until after the VOP_OPEN() call. dev_psize() |
| 274 | * must be supported if a character device has been specified. |
| 275 | */ |
| 276 | if (vp->v_type == VCHR) |
| 277 | dev = vp->v_rdev; |
| 278 | else |
| 279 | dev = NULL; |
| 280 | |
| 281 | if (nblks == 0 && dev != NULL) { |
| 282 | dpsize = dev_dpsize(dev); |
| 283 | if (dpsize == -1) { |
| 284 | VOP_CLOSE(vp, FREAD | FWRITE); |
| 285 | return (ENXIO); |
| 286 | } |
| 287 | nblks = (u_quad_t)dpsize; |
| 288 | } |
| 289 | if (nblks == 0) { |
| 290 | VOP_CLOSE(vp, FREAD | FWRITE); |
| 291 | return (ENXIO); |
| 292 | } |
| 293 | |
| 294 | /* |
| 295 | * nblks is in DEV_BSIZE'd chunks, convert to PAGE_SIZE'd chunks. |
| 296 | * First chop nblks off to page-align it, then convert. |
| 297 | * |
| 298 | * sw->sw_nblks is in page-sized chunks now too. |
| 299 | */ |
| 300 | nblks &= ~(u_quad_t)(ctodb(1) - 1); |
| 301 | nblks = dbtoc(nblks); |
| 302 | |
| 303 | /* |
| 304 | * Post-conversion nblks must not be >= BLIST_MAXBLKS, and |
| 305 | * we impose a 4-swap-device limit so we have to divide it out |
| 306 | * further. Going beyond this will result in overflows in the |
| 307 | * blist code. |
| 308 | * |
| 309 | * Post-conversion nblks must fit within a (swblk_t), which |
| 310 | * this test also ensures. |
| 311 | */ |
| 312 | if (nblks > BLIST_MAXBLKS / nswdev) { |
| 313 | kprintf("exceeded maximum of %d blocks per swap unit\n", |
| 314 | (int)BLIST_MAXBLKS / nswdev); |
| 315 | VOP_CLOSE(vp, FREAD | FWRITE); |
| 316 | return (ENXIO); |
| 317 | } |
| 318 | |
| 319 | sp->sw_vp = vp; |
| 320 | sp->sw_dev = dev2udev(dev); |
| 321 | sp->sw_device = dev; |
| 322 | sp->sw_flags |= SW_FREED; |
| 323 | sp->sw_nblks = (swblk_t)nblks; |
| 324 | |
| 325 | /* |
| 326 | * nblks, nswap, and dmmax are PAGE_SIZE'd parameters now, not |
| 327 | * DEV_BSIZE'd. aligned_nblks is used to calculate the |
| 328 | * size of the swap bitmap, taking into account the stripe size. |
| 329 | */ |
| 330 | aligned_nblks = (swblk_t)((nblks + (dmmax - 1)) & ~(u_long)(dmmax - 1)); |
| 331 | |
| 332 | if (aligned_nblks * nswdev > nswap) |
| 333 | nswap = aligned_nblks * nswdev; |
| 334 | |
| 335 | if (swapblist == NULL) |
| 336 | swapblist = blist_create(nswap); |
| 337 | else |
| 338 | blist_resize(&swapblist, nswap, 0); |
| 339 | |
| 340 | for (dvbase = dmmax; dvbase < nblks; dvbase += dmmax) { |
| 341 | blk = min(nblks - dvbase, dmmax); |
| 342 | vsbase = index * dmmax + dvbase * nswdev; |
| 343 | blist_free(swapblist, vsbase, blk); |
| 344 | vm_swap_size += blk; |
| 345 | } |
| 346 | swap_pager_newswap(); |
| 347 | |
| 348 | return (0); |
| 349 | } |