| 1 | /* |
| 2 | * Copyright (c) 2004 The DragonFly Project. All rights reserved. |
| 3 | * |
| 4 | * This code is derived from software contributed to The DragonFly Project |
| 5 | * by Matthew Dillon <dillon@backplane.com> |
| 6 | * |
| 7 | * Redistribution and use in source and binary forms, with or without |
| 8 | * modification, are permitted provided that the following conditions |
| 9 | * are met: |
| 10 | * |
| 11 | * 1. Redistributions of source code must retain the above copyright |
| 12 | * notice, this list of conditions and the following disclaimer. |
| 13 | * 2. Redistributions in binary form must reproduce the above copyright |
| 14 | * notice, this list of conditions and the following disclaimer in |
| 15 | * the documentation and/or other materials provided with the |
| 16 | * distribution. |
| 17 | * 3. Neither the name of The DragonFly Project nor the names of its |
| 18 | * contributors may be used to endorse or promote products derived |
| 19 | * from this software without specific, prior written permission. |
| 20 | * |
| 21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| 22 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| 23 | * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS |
| 24 | * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE |
| 25 | * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, |
| 26 | * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, |
| 27 | * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
| 28 | * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED |
| 29 | * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| 30 | * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| 31 | * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| 32 | * SUCH DAMAGE. |
| 33 | * |
| 34 | * $DragonFly: src/sys/kern/vfs_journal.c,v 1.3 2004/12/29 02:40:02 dillon Exp $ |
| 35 | */ |
| 36 | /* |
| 37 | * Each mount point may have zero or more independently configured journals |
| 38 | * attached to it. Each journal is represented by a memory FIFO and worker |
| 39 | * thread. Journal events are streamed through the FIFO to the thread, |
| 40 | * batched up (typically on one-second intervals), and written out by the |
| 41 | * thread. |
| 42 | * |
| 43 | * Journal vnode ops are executed instead of mnt_vn_norm_ops when one or |
| 44 | * more journals have been installed on a mount point. It becomes the |
| 45 | * responsibility of the journal op to call the underlying normal op as |
| 46 | * appropriate. |
| 47 | * |
| 48 | * The journaling protocol is intended to evolve into a two-way stream |
| 49 | * whereby transaction IDs can be acknowledged by the journaling target |
| 50 | * when the data has been committed to hard storage. Both implicit and |
| 51 | * explicit acknowledgement schemes will be supported, depending on the |
| 52 | * sophistication of the journaling stream, plus resynchronization and |
| 53 | * restart when a journaling stream is interrupted. This information will |
| 54 | * also be made available to journaling-aware filesystems to allow better |
| 55 | * management of their own physical storage synchronization mechanisms as |
| 56 | * well as to allow such filesystems to take direct advantage of the kernel's |
| 57 | * journaling layer so they don't have to roll their own. |
| 58 | * |
| 59 | * In addition, the journaling thread will have access to much larger |
| 60 | * spooling areas than the memory buffer is able to provide by e.g. |
| 61 | * reserving swap space, in order to absorb potentially long interruptions |
| 62 | * of off-site journaling streams, and to prevent 'slow' off-site linkages |
| 63 | * from radically slowing down local filesystem operations. |
| 64 | * |
| 65 | * Because of the non-trivial algorithms the journaling system will be |
| 66 | * required to support, use of a worker thread is mandatory. Efficiencies |
| 67 | * are maintained by utilizing the memory FIFO to batch transactions when |
| 68 | * possible, reducing the number of gratuitous thread switches and taking |
| 69 | * advantage of cpu caches through the use of shorter batched code paths |
| 70 | * rather than trying to do everything in the context of the process |
| 71 | * originating the filesystem op. |
| 72 | */ |
| 73 | |
| 74 | #include <sys/param.h> |
| 75 | #include <sys/systm.h> |
| 76 | #include <sys/buf.h> |
| 77 | #include <sys/conf.h> |
| 78 | #include <sys/kernel.h> |
| 79 | #include <sys/lock.h> |
| 80 | #include <sys/malloc.h> |
| 81 | #include <sys/mount.h> |
| 82 | #include <sys/unistd.h> |
| 83 | #include <sys/vnode.h> |
| 84 | #include <sys/poll.h> |
| 85 | #include <sys/mountctl.h> |
| 86 | #include <sys/file.h> |
| 87 | |
| 88 | #include <machine/limits.h> |
| 89 | |
| 90 | #include <vm/vm.h> |
| 91 | #include <vm/vm_object.h> |
| 92 | #include <vm/vm_page.h> |
| 93 | #include <vm/vm_pager.h> |
| 94 | #include <vm/vnode_pager.h> |
| 95 | |
| 96 | #include <sys/file2.h> |
| 97 | #include <sys/thread2.h> |
| 98 | |
/*
 * Forward declarations of the file-local journaling routines.
 */

/* Attach/detach the journaling vnode ops on a mount point */
static int  journal_attach(struct mount *mp);
static void journal_detach(struct mount *mp);

/* Handlers for the individual mountctl journal requests */
static int  journal_install_vfs_journal(struct mount *mp, struct file *fp,
                const struct mountctl_install_journal *info);
static int  journal_remove_vfs_journal(struct mount *mp,
                const struct mountctl_remove_journal *info);
static int  journal_resync_vfs_journal(struct mount *mp, const void *ctl);

/* Per-journal worker thread and the routines feeding its memory FIFO */
static void journal_thread(void *info);
static void journal_write_record(struct journal *jo, const char *ctl,
                const char *buf, int bytes);
static void journal_write(struct journal *jo, const char *buf, int bytes);

/* Journaled vnode operations */
static int  journal_nmkdir(struct vop_nmkdir_args *ap);
| 112 | |
| 113 | static struct vnodeopv_entry_desc journal_vnodeop_entries[] = { |
| 114 | { &vop_default_desc, vop_journal_operate_ap }, |
| 115 | { &vop_mountctl_desc, (void *)journal_mountctl }, |
| 116 | { &vop_nmkdir_desc, (void *)journal_nmkdir }, |
| 117 | { NULL, NULL } |
| 118 | }; |
| 119 | |
| 120 | static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structure"); |
| 121 | static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO"); |
| 122 | |
| 123 | int |
| 124 | journal_mountctl(struct vop_mountctl_args *ap) |
| 125 | { |
| 126 | struct mount *mp; |
| 127 | int error = 0; |
| 128 | |
| 129 | mp = ap->a_head.a_ops->vv_mount; |
| 130 | KKASSERT(mp); |
| 131 | |
| 132 | if (mp->mnt_vn_journal_ops == NULL) { |
| 133 | switch(ap->a_op) { |
| 134 | case MOUNTCTL_INSTALL_VFS_JOURNAL: |
| 135 | error = journal_attach(mp); |
| 136 | if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal)) |
| 137 | error = EINVAL; |
| 138 | if (error == 0 && ap->a_fp == NULL) |
| 139 | error = EBADF; |
| 140 | if (error == 0) |
| 141 | error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl); |
| 142 | if (TAILQ_EMPTY(&mp->mnt_jlist)) |
| 143 | journal_detach(mp); |
| 144 | break; |
| 145 | case MOUNTCTL_REMOVE_VFS_JOURNAL: |
| 146 | case MOUNTCTL_RESYNC_VFS_JOURNAL: |
| 147 | error = EINVAL; |
| 148 | break; |
| 149 | default: |
| 150 | error = EOPNOTSUPP; |
| 151 | break; |
| 152 | } |
| 153 | } else { |
| 154 | switch(ap->a_op) { |
| 155 | case MOUNTCTL_INSTALL_VFS_JOURNAL: |
| 156 | if (ap->a_ctllen != sizeof(struct mountctl_install_journal)) |
| 157 | error = EINVAL; |
| 158 | if (error == 0 && ap->a_fp == NULL) |
| 159 | error = EBADF; |
| 160 | if (error == 0) |
| 161 | error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl); |
| 162 | break; |
| 163 | case MOUNTCTL_REMOVE_VFS_JOURNAL: |
| 164 | if (ap->a_ctllen != sizeof(struct mountctl_remove_journal)) |
| 165 | error = EINVAL; |
| 166 | if (error == 0) |
| 167 | error = journal_remove_vfs_journal(mp, ap->a_ctl); |
| 168 | if (TAILQ_EMPTY(&mp->mnt_jlist)) |
| 169 | journal_detach(mp); |
| 170 | break; |
| 171 | case MOUNTCTL_RESYNC_VFS_JOURNAL: |
| 172 | if (ap->a_ctllen != 0) |
| 173 | error = EINVAL; |
| 174 | error = journal_resync_vfs_journal(mp, ap->a_ctl); |
| 175 | break; |
| 176 | default: |
| 177 | error = EOPNOTSUPP; |
| 178 | break; |
| 179 | } |
| 180 | } |
| 181 | return (error); |
| 182 | } |
| 183 | |
| 184 | /* |
| 185 | * High level mount point setup. When a |
| 186 | */ |
| 187 | static int |
| 188 | journal_attach(struct mount *mp) |
| 189 | { |
| 190 | vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops, journal_vnodeop_entries); |
| 191 | return(0); |
| 192 | } |
| 193 | |
| 194 | static void |
| 195 | journal_detach(struct mount *mp) |
| 196 | { |
| 197 | if (mp->mnt_vn_journal_ops) |
| 198 | vfs_rm_vnodeops(&mp->mnt_vn_journal_ops); |
| 199 | } |
| 200 | |
| 201 | /* |
| 202 | * Install a journal on a mount point |
| 203 | */ |
| 204 | static int |
| 205 | journal_install_vfs_journal(struct mount *mp, struct file *fp, |
| 206 | const struct mountctl_install_journal *info) |
| 207 | { |
| 208 | struct journal *jo; |
| 209 | int error = 0; |
| 210 | int size; |
| 211 | |
| 212 | jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK|M_ZERO); |
| 213 | bcopy(info->id, jo->id, sizeof(jo->id)); |
| 214 | jo->flags = info->flags & ~(MC_JOURNAL_ACTIVE | MC_JOURNAL_STOP_REQ); |
| 215 | |
| 216 | /* |
| 217 | * Memory FIFO size, round to nearest power of 2 |
| 218 | */ |
| 219 | if (info->flags & MC_JOURNAL_MBSIZE_PROVIDED) { |
| 220 | if (info->membufsize < 65536) |
| 221 | size = 65536; |
| 222 | else if (info->membufsize > 128 * 1024 * 1024) |
| 223 | size = 128 * 1024 * 1024; |
| 224 | else |
| 225 | size = (int)info->membufsize; |
| 226 | } else { |
| 227 | size = 1024 * 1024; |
| 228 | } |
| 229 | jo->fifo.size = 1; |
| 230 | while (jo->fifo.size < size) |
| 231 | jo->fifo.size <<= 1; |
| 232 | |
| 233 | /* |
| 234 | * Other parameters. If not specified the starting transaction id |
| 235 | * will be the current date. |
| 236 | */ |
| 237 | if (info->flags & MC_JOURNAL_TRANSID_PROVIDED) { |
| 238 | jo->transid = info->transid; |
| 239 | } else { |
| 240 | struct timespec ts; |
| 241 | getnanotime(&ts); |
| 242 | jo->transid = ((int64_t)ts.tv_sec << 30) | ts.tv_nsec; |
| 243 | } |
| 244 | |
| 245 | jo->fp = fp; |
| 246 | |
| 247 | /* |
| 248 | * Allocate the memory FIFO |
| 249 | */ |
| 250 | jo->fifo.mask = jo->fifo.size - 1; |
| 251 | jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK|M_ZERO|M_NULLOK); |
| 252 | if (jo->fifo.membase == NULL) |
| 253 | error = ENOMEM; |
| 254 | |
| 255 | if (error) { |
| 256 | free(jo, M_JOURNAL); |
| 257 | } else { |
| 258 | fhold(fp); |
| 259 | jo->flags |= MC_JOURNAL_ACTIVE; |
| 260 | lwkt_create(journal_thread, jo, NULL, &jo->thread, |
| 261 | TDF_STOPREQ, -1, "journal %.*s", JIDMAX, jo->id); |
| 262 | lwkt_setpri(&jo->thread, TDPRI_KERN_DAEMON); |
| 263 | lwkt_schedule(&jo->thread); |
| 264 | journal_write_record(jo, "INSTALL", NULL, 0); |
| 265 | |
| 266 | TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry); |
| 267 | } |
| 268 | return(error); |
| 269 | } |
| 270 | |
| 271 | static int |
| 272 | journal_remove_vfs_journal(struct mount *mp, const struct mountctl_remove_journal *info) |
| 273 | { |
| 274 | struct journal *jo; |
| 275 | int error; |
| 276 | |
| 277 | TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) { |
| 278 | if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0) |
| 279 | break; |
| 280 | } |
| 281 | if (jo) { |
| 282 | error = 0; |
| 283 | TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry); |
| 284 | journal_write_record(jo, "REMOVE", NULL, 0); /* XXX sequencing */ |
| 285 | jo->flags |= MC_JOURNAL_STOP_REQ | (info->flags & MC_JOURNAL_STOP_IMM); |
| 286 | wakeup(&jo->fifo); |
| 287 | while (jo->flags & MC_JOURNAL_ACTIVE) { |
| 288 | tsleep(jo, 0, "jwait", 0); |
| 289 | } |
| 290 | lwkt_free_thread(&jo->thread); /* XXX SMP */ |
| 291 | if (jo->fp) |
| 292 | fdrop(jo->fp, curthread); |
| 293 | if (jo->fifo.membase) |
| 294 | free(jo->fifo.membase, M_JFIFO); |
| 295 | free(jo, M_JOURNAL); |
| 296 | } else { |
| 297 | error = EINVAL; |
| 298 | } |
| 299 | return (error); |
| 300 | } |
| 301 | |
/*
 * Resynchronize a journal.  Not implemented: both arguments are ignored
 * and every request is rejected with EINVAL.
 */
static int
journal_resync_vfs_journal(struct mount *mp, const void *ctl)
{
    int error = EINVAL;

    return (error);
}
| 307 | |
| 308 | static void |
| 309 | journal_thread(void *info) |
| 310 | { |
| 311 | struct journal *jo = info; |
| 312 | int bytes; |
| 313 | int error; |
| 314 | int res; |
| 315 | |
| 316 | for (;;) { |
| 317 | bytes = (jo->fifo.windex - jo->fifo.rindex) & jo->fifo.mask; |
| 318 | if (bytes == 0 || (jo->flags & MC_JOURNAL_STOP_IMM)) { |
| 319 | if (jo->flags & MC_JOURNAL_STOP_REQ) |
| 320 | break; |
| 321 | tsleep(&jo->fifo, 0, "jfifo", 0); /* XXX add heartbeat */ |
| 322 | } |
| 323 | if (bytes > jo->fifo.size - jo->fifo.windex) |
| 324 | bytes = jo->fifo.size - jo->fifo.windex; |
| 325 | error = fp_write(jo->fp, jo->fifo.membase + jo->fifo.rindex, bytes, &res); |
| 326 | if (error) { |
| 327 | printf("journal_thread(%s) write, error %d\n", jo->id, error); |
| 328 | jo->fifo.rindex = jo->fifo.windex; /* XXX flag out-of-sync */ |
| 329 | } else { |
| 330 | printf("journal_thread(%s) write %d\n", jo->id, res); |
| 331 | jo->fifo.rindex = (jo->fifo.rindex + res) & jo->fifo.mask; |
| 332 | if (jo->flags & MC_JOURNAL_WWAIT) { |
| 333 | jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */ |
| 334 | wakeup(&jo->fifo.windex); |
| 335 | } |
| 336 | } |
| 337 | } |
| 338 | jo->flags &= ~MC_JOURNAL_ACTIVE; |
| 339 | wakeup(jo); |
| 340 | wakeup(&jo->fifo.windex); |
| 341 | } |
| 342 | |
| 343 | static |
| 344 | void |
| 345 | journal_write_record(struct journal *jo, const char *ctl, |
| 346 | const char *buf, int bytes) |
| 347 | { |
| 348 | char head[64]; |
| 349 | |
| 350 | if (jo->flags & MC_JOURNAL_STOP_REQ) |
| 351 | return; |
| 352 | snprintf(head, sizeof(head), "%016llx %s\n", jo->transid, ctl); |
| 353 | /* XXX locking (token) or cmpexgl space reservation */ |
| 354 | ++jo->transid; /* XXX embed nanotime, force monotonic */ |
| 355 | journal_write(jo, head, strlen(head)); |
| 356 | if (bytes) |
| 357 | journal_write(jo, buf, bytes); |
| 358 | } |
| 359 | |
| 360 | static |
| 361 | void |
| 362 | journal_write(struct journal *jo, const char *buf, int bytes) |
| 363 | { |
| 364 | int avail; |
| 365 | |
| 366 | while (bytes && (jo->flags & MC_JOURNAL_ACTIVE)) { |
| 367 | avail = (jo->fifo.windex - jo->fifo.rindex) & jo->fifo.mask; |
| 368 | avail = jo->fifo.size - avail - 1; |
| 369 | if (avail == 0) { |
| 370 | if (jo->flags & MC_JOURNAL_STOP_IMM) |
| 371 | break; |
| 372 | jo->flags |= MC_JOURNAL_WWAIT; |
| 373 | tsleep(&jo->fifo.windex, 0, "jwrite", 0); |
| 374 | continue; |
| 375 | } |
| 376 | if (avail > jo->fifo.size - jo->fifo.windex) |
| 377 | avail = jo->fifo.size - jo->fifo.windex; |
| 378 | if (avail > bytes) |
| 379 | avail = bytes; |
| 380 | bcopy(buf, jo->fifo.membase + jo->fifo.windex, avail); |
| 381 | bytes -= avail; |
| 382 | jo->fifo.windex = (jo->fifo.windex + avail) & jo->fifo.mask; |
| 383 | tsleep(&jo->fifo, 0, "jfifo", 0); /* XXX hysteresis */ |
| 384 | } |
| 385 | } |
| 386 | |
| 387 | /************************************************************************ |
| 388 | * JOURNAL VNOPS * |
| 389 | ************************************************************************/ |
| 390 | |
| 391 | static |
| 392 | int |
| 393 | journal_nmkdir(struct vop_nmkdir_args *ap) |
| 394 | { |
| 395 | int error; |
| 396 | |
| 397 | printf("JMKDIR %s\n", ap->a_ncp->nc_name); |
| 398 | error = vop_journal_operate_ap(&ap->a_head); |
| 399 | return (error); |
| 400 | } |
| 401 | |