2 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
34 * $DragonFly: src/sys/kern/vfs_jops.c,v 1.3 2004/12/29 02:40:02 dillon Exp $
37 * Each mount point may have zero or more independently configured journals
38 * attached to it. Each journal is represented by a memory FIFO and worker
39 * thread. Journal events are streamed through the FIFO to the thread,
40 * batched up (typically on one-second intervals), and written out by the
43 * Journal vnode ops are executed instead of mnt_vn_norm_ops when one or
44 * more journals have been installed on a mount point. It becomes the
45 * responsibility of the journal op to call the underlying normal op as
48 * The journaling protocol is intended to evolve into a two-way stream
49 * whereby transaction IDs can be acknowledged by the journaling target
50 * when the data has been committed to hard storage. Both implicit and
51 * explicit acknowledgement schemes will be supported, depending on the
52 * sophistication of the journaling stream, plus resynchronization and
53 * restart when a journaling stream is interrupted. This information will
54 * also be made available to journaling-aware filesystems to allow better
55 * management of their own physical storage synchronization mechanisms as
56 * well as to allow such filesystems to take direct advantage of the kernel's
57 * journaling layer so they don't have to roll their own.
59 * In addition, the journaling thread will have access to much larger
60 * spooling areas than the memory buffer is able to provide by e.g.
61 * reserving swap space, in order to absorb potentially long interruptions
62 * of off-site journaling streams, and to prevent 'slow' off-site linkages
63 * from radically slowing down local filesystem operations.
65 * Because of the non-trivial algorithms the journaling system will be
66 * required to support, use of a worker thread is mandatory. Efficiencies
67 * are maintained by utilizing the memory FIFO to batch transactions when
68 * possible, reducing the number of gratuitous thread switches and taking
69 * advantage of cpu caches through the use of shorter batched code paths
70 * rather than trying to do everything in the context of the process
71 * originating the filesystem op.
74 #include <sys/param.h>
75 #include <sys/systm.h>
78 #include <sys/kernel.h>
80 #include <sys/malloc.h>
81 #include <sys/mount.h>
82 #include <sys/unistd.h>
83 #include <sys/vnode.h>
85 #include <sys/mountctl.h>
88 #include <machine/limits.h>
91 #include <vm/vm_object.h>
92 #include <vm/vm_page.h>
93 #include <vm/vm_pager.h>
94 #include <vm/vnode_pager.h>
96 #include <sys/file2.h>
97 #include <sys/thread2.h>
/*
 * Forward declarations for the file-local (static) journaling routines
 * defined further down, so they can be referenced before their definitions.
 *
 * NOTE(review): journal_mountctl is referenced by the vnodeop table below
 * but no prototype for it is visible in this span -- confirm one exists.
 */
99 static int journal_attach(struct mount *mp);
100 static void journal_detach(struct mount *mp);
101 static int journal_install_vfs_journal(struct mount *mp, struct file *fp,
102 const struct mountctl_install_journal *info);
103 static int journal_remove_vfs_journal(struct mount *mp,
104 const struct mountctl_remove_journal *info);
105 static int journal_resync_vfs_journal(struct mount *mp, const void *ctl);
106 static void journal_thread(void *info);
107 static void journal_write_record(struct journal *jo, const char *ctl,
108 const char *buf, int bytes);
109 static void journal_write(struct journal *jo, const char *buf, int bytes);
/* Per-operation journaling hooks (only nmkdir is declared in this span). */
111 static int journal_nmkdir(struct vop_nmkdir_args *ap);
/*
 * Vnode operation table handed to vfs_add_vnodeops() by journal_attach().
 * mountctl and nmkdir are routed to the journal_* handlers in this file;
 * the vop_default_desc slot points at vop_journal_operate_ap (presumably
 * the fallback for every operation without a dedicated entry -- confirm).
 *
 * NOTE(review): the table terminator and closing brace are not visible in
 * this span.
 */
113 static struct vnodeopv_entry_desc journal_vnodeop_entries[] = {
114 { &vop_default_desc, vop_journal_operate_ap },
115 { &vop_mountctl_desc, (void *)journal_mountctl },
116 { &vop_nmkdir_desc, (void *)journal_nmkdir },
/*
 * Kernel malloc types used by this file: M_JOURNAL tags the struct journal
 * allocation and M_JFIFO tags the memory FIFO buffer (both allocated in
 * journal_install_vfs_journal()).
 */
120 static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structure");
121 static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO");
/*
 * mountctl entry point for the journaling layer.
 *
 * Resolves the mount point from the vnode ops vector and handles the
 * MOUNTCTL_{INSTALL,REMOVE,RESYNC}_VFS_JOURNAL requests.  Two situations
 * are distinguished: the mount has no journal vnode ops yet
 * (mnt_vn_journal_ops == NULL), or journaling is already attached.
 *
 * NOTE(review): several lines of this function (the switch headers, the
 * error assignments following the length/fp checks, the break/default
 * handling and the return) are not visible in this span.
 */
124 journal_mountctl(struct vop_mountctl_args *ap)
129 mp = ap->a_head.a_ops->vv_mount;
/* No journal ops installed on this mount yet. */
132 if (mp->mnt_vn_journal_ops == NULL) {
134 case MOUNTCTL_INSTALL_VFS_JOURNAL:
/* First journal: attach the journal vnode ops before installing. */
135 error = journal_attach(mp);
/* The control structure must be exactly the expected size. */
136 if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal))
/* A file pointer for the journal stream is required. */
138 if (error == 0 && ap->a_fp == NULL)
141 error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
/*
 * Journal list still empty after the install attempt (presumably the
 * attach is undone here -- the statement is not visible; confirm).
 */
142 if (TAILQ_EMPTY(&mp->mnt_jlist))
/* Remove/resync requested while nothing is attached (handling not visible). */
145 case MOUNTCTL_REMOVE_VFS_JOURNAL:
146 case MOUNTCTL_RESYNC_VFS_JOURNAL:
/* Journaling already attached: add another journal. */
155 case MOUNTCTL_INSTALL_VFS_JOURNAL:
156 if (ap->a_ctllen != sizeof(struct mountctl_install_journal))
158 if (error == 0 && ap->a_fp == NULL)
161 error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
163 case MOUNTCTL_REMOVE_VFS_JOURNAL:
164 if (ap->a_ctllen != sizeof(struct mountctl_remove_journal))
167 error = journal_remove_vfs_journal(mp, ap->a_ctl);
/*
 * Last journal removed (presumably journal_detach() is called here --
 * the statement is not visible; confirm).
 */
168 if (TAILQ_EMPTY(&mp->mnt_jlist))
171 case MOUNTCTL_RESYNC_VFS_JOURNAL:
/* Resync takes no control data: any non-zero length is checked for. */
172 if (ap->a_ctllen != 0)
174 error = journal_resync_vfs_journal(mp, ap->a_ctl);
185 * High level mount point setup. When a
/*
 * Attach the journaling layer to a mount point by registering
 * journal_vnodeop_entries as the mount's journal vnode ops
 * (mp->mnt_vn_journal_ops).
 *
 * Declared to return int; the returned value is not visible in this span.
 */
188 journal_attach(struct mount *mp)
190 vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops, journal_vnodeop_entries);
/*
 * Detach the journaling layer from a mount point: remove the journal
 * vnode ops if they are currently installed.  Does nothing when
 * mp->mnt_vn_journal_ops is NULL.
 */
195 journal_detach(struct mount *mp)
197 if (mp->mnt_vn_journal_ops)
198 vfs_rm_vnodeops(&mp->mnt_vn_journal_ops);
202 * Install a journal on a mount point
/*
 * Create a new journal from the caller-supplied install request and add
 * it to the mount point's journal list.
 *
 *   mp   - mount point receiving the journal
 *   fp   - file pointer for the journal stream
 *   info - install request (id, flags, optional membufsize and transid)
 *
 * Steps visible here: allocate and zero the journal, copy id and flags,
 * size and allocate the memory FIFO, pick the starting transaction id,
 * start the worker thread, emit an "INSTALL" record and link the journal
 * onto mp->mnt_jlist.
 *
 * NOTE(review): the error handling after the FIFO allocation check, the
 * setup of jo->fp (used by journal_thread) and the return statement are
 * not visible in this span -- confirm jo->fp is set before the thread is
 * scheduled.
 */
205 journal_install_vfs_journal(struct mount *mp, struct file *fp,
206 const struct mountctl_install_journal *info)
212 jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK|M_ZERO);
213 bcopy(info->id, jo->id, sizeof(jo->id));
/* ACTIVE and STOP_REQ are internal state bits; never take them from the caller. */
214 jo->flags = info->flags & ~(MC_JOURNAL_ACTIVE | MC_JOURNAL_STOP_REQ);
217 * Memory FIFO size, round to nearest power of 2
/* A caller-provided size is bounded to the range 64KB..128MB. */
219 if (info->flags & MC_JOURNAL_MBSIZE_PROVIDED) {
220 if (info->membufsize < 65536)
222 else if (info->membufsize > 128 * 1024 * 1024)
223 size = 128 * 1024 * 1024;
225 size = (int)info->membufsize;
/*
 * Grow fifo.size until it is at least the requested size.
 * NOTE(review): the loop body is not visible; with a "< size" condition
 * this rounds up rather than to the "nearest" power of 2 -- confirm.
 */
230 while (jo->fifo.size < size)
234 * Other parameters. If not specified the starting transaction id
235 * will be the current date.
237 if (info->flags & MC_JOURNAL_TRANSID_PROVIDED) {
238 jo->transid = info->transid;
/* Default transid: seconds in the upper bits, nanoseconds in the low 30 bits. */
242 jo->transid = ((int64_t)ts.tv_sec << 30) | ts.tv_nsec;
248 * Allocate the memory FIFO
/* mask = size - 1 is used throughout to wrap the FIFO indices. */
250 jo->fifo.mask = jo->fifo.size - 1;
/* M_NULLOK is passed, so the allocation result is checked for NULL. */
251 jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK|M_ZERO|M_NULLOK);
252 if (jo->fifo.membase == NULL)
/* Mark the journal active, then create and schedule its worker thread. */
259 jo->flags |= MC_JOURNAL_ACTIVE;
260 lwkt_create(journal_thread, jo, NULL, &jo->thread,
261 TDF_STOPREQ, -1, "journal %.*s", JIDMAX, jo->id);
262 lwkt_setpri(&jo->thread, TDPRI_KERN_DAEMON);
263 lwkt_schedule(&jo->thread);
/* First record in the stream announces the installation. */
264 journal_write_record(jo, "INSTALL", NULL, 0);
266 TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry);
/*
 * Remove the journal whose id matches info->id from the mount point,
 * stop its worker thread and release its resources.
 *
 *   mp   - mount point owning the journal list
 *   info - remove request (id, flags; MC_JOURNAL_STOP_IMM is honored)
 *
 * NOTE(review): the handling of "no journal with that id" (jo == NULL
 * after the search), the wakeup that prompts the worker to notice the
 * stop request, the release of the journal structure itself and the
 * return value are not visible in this span -- confirm.
 */
272 journal_remove_vfs_journal(struct mount *mp, const struct mountctl_remove_journal *info)
/* Search the mount's journal list by full-width binary compare of the id. */
277 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
278 if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0)
283 TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry);
/*
 * Emit the final record before setting STOP_REQ: journal_write_record()
 * tests STOP_REQ on entry.
 */
284 journal_write_record(jo, "REMOVE", NULL, 0); /* XXX sequencing */
285 jo->flags |= MC_JOURNAL_STOP_REQ | (info->flags & MC_JOURNAL_STOP_IMM);
/* journal_thread clears ACTIVE on exit; sleep on jo until it has done so. */
287 while (jo->flags & MC_JOURNAL_ACTIVE) {
288 tsleep(jo, 0, "jwait", 0);
290 lwkt_free_thread(&jo->thread); /* XXX SMP */
/* Drop the file reference and free the FIFO memory. */
292 fdrop(jo->fp, curthread);
293 if (jo->fifo.membase)
294 free(jo->fifo.membase, M_JFIFO);
/*
 * Handler for MOUNTCTL_RESYNC_VFS_JOURNAL, invoked from journal_mountctl()
 * with the mount point and the raw control pointer.
 *
 * NOTE(review): the body of this function is not visible in this span.
 */
303 journal_resync_vfs_journal(struct mount *mp, const void *ctl)
/*
 * Per-journal worker thread: drains the memory FIFO to the journal's file
 * pointer with fp_write().
 *
 *   info - the struct journal this thread services
 *
 * The thread sleeps on &jo->fifo while the FIFO is empty (or an immediate
 * stop was requested), wakes writers blocked on &jo->fifo.windex after
 * making room, and on exit clears MC_JOURNAL_ACTIVE.
 *
 * NOTE(review): the loop header, the exit statement after the STOP_REQ
 * test, the error/else structure around the two printf calls and a
 * statement between the final flag clear and the last wakeup are not
 * visible in this span.
 */
309 journal_thread(void *info)
311 struct journal *jo = info;
/* Number of bytes currently queued between the read and write indices. */
317 bytes = (jo->fifo.windex - jo->fifo.rindex) & jo->fifo.mask;
318 if (bytes == 0 || (jo->flags & MC_JOURNAL_STOP_IMM)) {
319 if (jo->flags & MC_JOURNAL_STOP_REQ)
321 tsleep(&jo->fifo, 0, "jfifo", 0); /* XXX add heartbeat */
/*
 * Limit the write to one contiguous span of the buffer.
 * NOTE(review): the write below starts at membase + rindex, so the
 * contiguous span runs from rindex to the end of the buffer
 * (size - rindex).  The clamp here uses windex instead; when the FIFO has
 * wrapped (windex < rindex) this can leave bytes larger than size - rindex
 * and make fp_write() read past the end of membase.  Looks like this
 * should use rindex -- confirm and fix.
 */
323 if (bytes > jo->fifo.size - jo->fifo.windex)
324 bytes = jo->fifo.size - jo->fifo.windex;
325 error = fp_write(jo->fp, jo->fifo.membase + jo->fifo.rindex, bytes, &res);
/*
 * NOTE(review): jo->id is filled with a fixed-size bcopy and the thread
 * name is formatted with "%.*s" and JIDMAX, which suggests the id may not
 * be NUL-terminated; the plain "%s" below would then over-read -- confirm.
 */
327 printf("journal_thread(%s) write, error %d\n", jo->id, error);
/* On a write error the queued data is discarded by jumping rindex to windex. */
328 jo->fifo.rindex = jo->fifo.windex; /* XXX flag out-of-sync */
330 printf("journal_thread(%s) write %d\n", jo->id, res);
/* Advance by what was actually written (res), wrapping with the mask. */
331 jo->fifo.rindex = (jo->fifo.rindex + res) & jo->fifo.mask;
/* Space was freed: release any writer waiting in journal_write(). */
332 if (jo->flags & MC_JOURNAL_WWAIT) {
333 jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
334 wakeup(&jo->fifo.windex);
/* Thread is exiting: journal_remove_vfs_journal() polls this flag. */
338 jo->flags &= ~MC_JOURNAL_ACTIVE;
/* journal_write() loops only while ACTIVE is set; let blocked writers re-check. */
340 wakeup(&jo->fifo.windex);
/*
 * Append one record to a journal: a text header line followed by the
 * caller's payload.
 *
 *   jo    - target journal
 *   ctl   - record type string placed in the header (e.g. "INSTALL")
 *   buf   - payload bytes (callers in this file pass NULL with bytes == 0)
 *   bytes - payload length
 *
 * The header is the 16-hex-digit transaction id, a space, ctl and a
 * newline.  The transaction id is incremented once per record.
 *
 * NOTE(review): the declaration of head, the statement taken when
 * STOP_REQ is set (presumably an early return) and any condition guarding
 * the payload write are not visible in this span.
 */
345 journal_write_record(struct journal *jo, const char *ctl,
346 const char *buf, int bytes)
/* Checked before anything is written, once a stop has been requested. */
350 if (jo->flags & MC_JOURNAL_STOP_REQ)
352 snprintf(head, sizeof(head), "%016llx %s\n", jo->transid, ctl);
353 /* XXX locking (token) or cmpexgl space reservation */
354 ++jo->transid; /* XXX embed nanotime, force monotonic */
355 journal_write(jo, head, strlen(head));
357 journal_write(jo, buf, bytes);
/*
 * Copy raw bytes into a journal's memory FIFO, blocking while it is full.
 *
 *   jo    - target journal
 *   buf   - source bytes
 *   bytes - number of bytes to queue
 *
 * Runs only while the journal is ACTIVE.  One slot is always kept free
 * (available = size - used - 1).
 *
 * NOTE(review): the "no space" test, the statement taken on STOP_IMM, the
 * clamp of avail to bytes and the update of bytes after the copy are not
 * visible in this span.  No advance of buf is visible either -- confirm
 * that multi-pass copies do not re-copy the start of buf.
 */
362 journal_write(struct journal *jo, const char *buf, int bytes)
366 while (bytes && (jo->flags & MC_JOURNAL_ACTIVE)) {
/* First the number of bytes in use, then the free space minus one slot. */
367 avail = (jo->fifo.windex - jo->fifo.rindex) & jo->fifo.mask;
368 avail = jo->fifo.size - avail - 1;
370 if (jo->flags & MC_JOURNAL_STOP_IMM)
/* Ask journal_thread for a wakeup on &jo->fifo.windex once space is freed. */
372 jo->flags |= MC_JOURNAL_WWAIT;
373 tsleep(&jo->fifo.windex, 0, "jwrite", 0);
/* Copy at most up to the end of the buffer; the loop handles the wrap. */
376 if (avail > jo->fifo.size - jo->fifo.windex)
377 avail = jo->fifo.size - jo->fifo.windex;
380 bcopy(buf, jo->fifo.membase + jo->fifo.windex, avail);
382 jo->fifo.windex = (jo->fifo.windex + avail) & jo->fifo.mask;
/*
 * NOTE(review): this sleeps on &jo->fifo, the same channel journal_thread
 * sleeps on while the FIFO is empty.  A wakeup(&jo->fifo) to kick the
 * worker after queuing data looks like the intent; as written the writer
 * blocks after every chunk -- confirm.
 */
383 tsleep(&jo->fifo, 0, "jfifo", 0); /* XXX hysteresis */
387 /************************************************************************
389 ************************************************************************/
/*
 * Journaling hook for the nmkdir vnode operation.  Currently it only logs
 * the name from the namecache entry and then passes the operation on via
 * vop_journal_operate_ap().
 *
 * NOTE(review): the rest of the function (including the return of error)
 * is not visible in this span.
 */
393 journal_nmkdir(struct vop_nmkdir_args *ap)
397 printf("JMKDIR %s\n", ap->a_ncp->nc_name);
398 error = vop_journal_operate_ap(&ap->a_head);