- Bump WARNS to 6
[dragonfly.git] / sys / kern / vfs_journal.c
... / ...
CommitLineData
1/*
2 * Copyright (c) 2004 The DragonFly Project. All rights reserved.
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
34 * $DragonFly: src/sys/kern/vfs_journal.c,v 1.3 2004/12/29 02:40:02 dillon Exp $
35 */
/*
 * Each mount point may have zero or more independently configured journals
 * attached to it.  Each journal is represented by a memory FIFO and worker
 * thread.  Journal events are streamed through the FIFO to the thread,
 * batched up (typically on one-second intervals), and written out by the
 * thread.
 *
 * Journal vnode ops are executed instead of mnt_vn_norm_ops when one or
 * more journals have been installed on a mount point.  It becomes the
 * responsibility of the journal op to call the underlying normal op as
 * appropriate.
 *
 * The journaling protocol is intended to evolve into a two-way stream
 * whereby transaction IDs can be acknowledged by the journaling target
 * when the data has been committed to hard storage.  Both implicit and
 * explicit acknowledgement schemes will be supported, depending on the
 * sophistication of the journaling stream, plus resynchronization and
 * restart when a journaling stream is interrupted.  This information will
 * also be made available to journaling-aware filesystems to allow better
 * management of their own physical storage synchronization mechanisms as
 * well as to allow such filesystems to take direct advantage of the kernel's
 * journaling layer so they don't have to roll their own.
 *
 * In addition, the journaling thread will have access to much larger
 * spooling areas than the memory buffer is able to provide by e.g.
 * reserving swap space, in order to absorb potentially long interruptions
 * of off-site journaling streams, and to prevent 'slow' off-site linkages
 * from radically slowing down local filesystem operations.
 *
 * Because of the non-trivial algorithms the journaling system will be
 * required to support, use of a worker thread is mandatory.  Efficiencies
 * are maintained by utilizing the memory FIFO to batch transactions when
 * possible, reducing the number of gratuitous thread switches and taking
 * advantage of cpu caches through the use of shorter batched code paths
 * rather than trying to do everything in the context of the process
 * originating the filesystem op.
 */
73
74#include <sys/param.h>
75#include <sys/systm.h>
76#include <sys/buf.h>
77#include <sys/conf.h>
78#include <sys/kernel.h>
79#include <sys/lock.h>
80#include <sys/malloc.h>
81#include <sys/mount.h>
82#include <sys/unistd.h>
83#include <sys/vnode.h>
84#include <sys/poll.h>
85#include <sys/mountctl.h>
86#include <sys/file.h>
87
88#include <machine/limits.h>
89
90#include <vm/vm.h>
91#include <vm/vm_object.h>
92#include <vm/vm_page.h>
93#include <vm/vm_pager.h>
94#include <vm/vnode_pager.h>
95
96#include <sys/file2.h>
97#include <sys/thread2.h>
98
99static int journal_attach(struct mount *mp);
100static void journal_detach(struct mount *mp);
101static int journal_install_vfs_journal(struct mount *mp, struct file *fp,
102 const struct mountctl_install_journal *info);
103static int journal_remove_vfs_journal(struct mount *mp,
104 const struct mountctl_remove_journal *info);
105static int journal_resync_vfs_journal(struct mount *mp, const void *ctl);
106static void journal_thread(void *info);
107static void journal_write_record(struct journal *jo, const char *ctl,
108 const char *buf, int bytes);
109static void journal_write(struct journal *jo, const char *buf, int bytes);
110
111static int journal_nmkdir(struct vop_nmkdir_args *ap);
112
113static struct vnodeopv_entry_desc journal_vnodeop_entries[] = {
114 { &vop_default_desc, vop_journal_operate_ap },
115 { &vop_mountctl_desc, (void *)journal_mountctl },
116 { &vop_nmkdir_desc, (void *)journal_nmkdir },
117 { NULL, NULL }
118};
119
120static MALLOC_DEFINE(M_JOURNAL, "journal", "Journaling structure");
121static MALLOC_DEFINE(M_JFIFO, "journal-fifo", "Journal FIFO");
122
123int
124journal_mountctl(struct vop_mountctl_args *ap)
125{
126 struct mount *mp;
127 int error = 0;
128
129 mp = ap->a_head.a_ops->vv_mount;
130 KKASSERT(mp);
131
132 if (mp->mnt_vn_journal_ops == NULL) {
133 switch(ap->a_op) {
134 case MOUNTCTL_INSTALL_VFS_JOURNAL:
135 error = journal_attach(mp);
136 if (error == 0 && ap->a_ctllen != sizeof(struct mountctl_install_journal))
137 error = EINVAL;
138 if (error == 0 && ap->a_fp == NULL)
139 error = EBADF;
140 if (error == 0)
141 error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
142 if (TAILQ_EMPTY(&mp->mnt_jlist))
143 journal_detach(mp);
144 break;
145 case MOUNTCTL_REMOVE_VFS_JOURNAL:
146 case MOUNTCTL_RESYNC_VFS_JOURNAL:
147 error = EINVAL;
148 break;
149 default:
150 error = EOPNOTSUPP;
151 break;
152 }
153 } else {
154 switch(ap->a_op) {
155 case MOUNTCTL_INSTALL_VFS_JOURNAL:
156 if (ap->a_ctllen != sizeof(struct mountctl_install_journal))
157 error = EINVAL;
158 if (error == 0 && ap->a_fp == NULL)
159 error = EBADF;
160 if (error == 0)
161 error = journal_install_vfs_journal(mp, ap->a_fp, ap->a_ctl);
162 break;
163 case MOUNTCTL_REMOVE_VFS_JOURNAL:
164 if (ap->a_ctllen != sizeof(struct mountctl_remove_journal))
165 error = EINVAL;
166 if (error == 0)
167 error = journal_remove_vfs_journal(mp, ap->a_ctl);
168 if (TAILQ_EMPTY(&mp->mnt_jlist))
169 journal_detach(mp);
170 break;
171 case MOUNTCTL_RESYNC_VFS_JOURNAL:
172 if (ap->a_ctllen != 0)
173 error = EINVAL;
174 error = journal_resync_vfs_journal(mp, ap->a_ctl);
175 break;
176 default:
177 error = EOPNOTSUPP;
178 break;
179 }
180 }
181 return (error);
182}
183
184/*
185 * High level mount point setup. When a
186 */
187static int
188journal_attach(struct mount *mp)
189{
190 vfs_add_vnodeops(mp, &mp->mnt_vn_journal_ops, journal_vnodeop_entries);
191 return(0);
192}
193
194static void
195journal_detach(struct mount *mp)
196{
197 if (mp->mnt_vn_journal_ops)
198 vfs_rm_vnodeops(&mp->mnt_vn_journal_ops);
199}
200
201/*
202 * Install a journal on a mount point
203 */
204static int
205journal_install_vfs_journal(struct mount *mp, struct file *fp,
206 const struct mountctl_install_journal *info)
207{
208 struct journal *jo;
209 int error = 0;
210 int size;
211
212 jo = malloc(sizeof(struct journal), M_JOURNAL, M_WAITOK|M_ZERO);
213 bcopy(info->id, jo->id, sizeof(jo->id));
214 jo->flags = info->flags & ~(MC_JOURNAL_ACTIVE | MC_JOURNAL_STOP_REQ);
215
216 /*
217 * Memory FIFO size, round to nearest power of 2
218 */
219 if (info->flags & MC_JOURNAL_MBSIZE_PROVIDED) {
220 if (info->membufsize < 65536)
221 size = 65536;
222 else if (info->membufsize > 128 * 1024 * 1024)
223 size = 128 * 1024 * 1024;
224 else
225 size = (int)info->membufsize;
226 } else {
227 size = 1024 * 1024;
228 }
229 jo->fifo.size = 1;
230 while (jo->fifo.size < size)
231 jo->fifo.size <<= 1;
232
233 /*
234 * Other parameters. If not specified the starting transaction id
235 * will be the current date.
236 */
237 if (info->flags & MC_JOURNAL_TRANSID_PROVIDED) {
238 jo->transid = info->transid;
239 } else {
240 struct timespec ts;
241 getnanotime(&ts);
242 jo->transid = ((int64_t)ts.tv_sec << 30) | ts.tv_nsec;
243 }
244
245 jo->fp = fp;
246
247 /*
248 * Allocate the memory FIFO
249 */
250 jo->fifo.mask = jo->fifo.size - 1;
251 jo->fifo.membase = malloc(jo->fifo.size, M_JFIFO, M_WAITOK|M_ZERO|M_NULLOK);
252 if (jo->fifo.membase == NULL)
253 error = ENOMEM;
254
255 if (error) {
256 free(jo, M_JOURNAL);
257 } else {
258 fhold(fp);
259 jo->flags |= MC_JOURNAL_ACTIVE;
260 lwkt_create(journal_thread, jo, NULL, &jo->thread,
261 TDF_STOPREQ, -1, "journal %.*s", JIDMAX, jo->id);
262 lwkt_setpri(&jo->thread, TDPRI_KERN_DAEMON);
263 lwkt_schedule(&jo->thread);
264 journal_write_record(jo, "INSTALL", NULL, 0);
265
266 TAILQ_INSERT_TAIL(&mp->mnt_jlist, jo, jentry);
267 }
268 return(error);
269}
270
271static int
272journal_remove_vfs_journal(struct mount *mp, const struct mountctl_remove_journal *info)
273{
274 struct journal *jo;
275 int error;
276
277 TAILQ_FOREACH(jo, &mp->mnt_jlist, jentry) {
278 if (bcmp(jo->id, info->id, sizeof(jo->id)) == 0)
279 break;
280 }
281 if (jo) {
282 error = 0;
283 TAILQ_REMOVE(&mp->mnt_jlist, jo, jentry);
284 journal_write_record(jo, "REMOVE", NULL, 0); /* XXX sequencing */
285 jo->flags |= MC_JOURNAL_STOP_REQ | (info->flags & MC_JOURNAL_STOP_IMM);
286 wakeup(&jo->fifo);
287 while (jo->flags & MC_JOURNAL_ACTIVE) {
288 tsleep(jo, 0, "jwait", 0);
289 }
290 lwkt_free_thread(&jo->thread); /* XXX SMP */
291 if (jo->fp)
292 fdrop(jo->fp, curthread);
293 if (jo->fifo.membase)
294 free(jo->fifo.membase, M_JFIFO);
295 free(jo, M_JOURNAL);
296 } else {
297 error = EINVAL;
298 }
299 return (error);
300}
301
302static int
303journal_resync_vfs_journal(struct mount *mp, const void *ctl)
304{
305 return(EINVAL);
306}
307
308static void
309journal_thread(void *info)
310{
311 struct journal *jo = info;
312 int bytes;
313 int error;
314 int res;
315
316 for (;;) {
317 bytes = (jo->fifo.windex - jo->fifo.rindex) & jo->fifo.mask;
318 if (bytes == 0 || (jo->flags & MC_JOURNAL_STOP_IMM)) {
319 if (jo->flags & MC_JOURNAL_STOP_REQ)
320 break;
321 tsleep(&jo->fifo, 0, "jfifo", 0); /* XXX add heartbeat */
322 }
323 if (bytes > jo->fifo.size - jo->fifo.windex)
324 bytes = jo->fifo.size - jo->fifo.windex;
325 error = fp_write(jo->fp, jo->fifo.membase + jo->fifo.rindex, bytes, &res);
326 if (error) {
327 printf("journal_thread(%s) write, error %d\n", jo->id, error);
328 jo->fifo.rindex = jo->fifo.windex; /* XXX flag out-of-sync */
329 } else {
330 printf("journal_thread(%s) write %d\n", jo->id, res);
331 jo->fifo.rindex = (jo->fifo.rindex + res) & jo->fifo.mask;
332 if (jo->flags & MC_JOURNAL_WWAIT) {
333 jo->flags &= ~MC_JOURNAL_WWAIT; /* XXX hysteresis */
334 wakeup(&jo->fifo.windex);
335 }
336 }
337 }
338 jo->flags &= ~MC_JOURNAL_ACTIVE;
339 wakeup(jo);
340 wakeup(&jo->fifo.windex);
341}
342
343static
344void
345journal_write_record(struct journal *jo, const char *ctl,
346 const char *buf, int bytes)
347{
348 char head[64];
349
350 if (jo->flags & MC_JOURNAL_STOP_REQ)
351 return;
352 snprintf(head, sizeof(head), "%016llx %s\n", jo->transid, ctl);
353 /* XXX locking (token) or cmpexgl space reservation */
354 ++jo->transid; /* XXX embed nanotime, force monotonic */
355 journal_write(jo, head, strlen(head));
356 if (bytes)
357 journal_write(jo, buf, bytes);
358}
359
360static
361void
362journal_write(struct journal *jo, const char *buf, int bytes)
363{
364 int avail;
365
366 while (bytes && (jo->flags & MC_JOURNAL_ACTIVE)) {
367 avail = (jo->fifo.windex - jo->fifo.rindex) & jo->fifo.mask;
368 avail = jo->fifo.size - avail - 1;
369 if (avail == 0) {
370 if (jo->flags & MC_JOURNAL_STOP_IMM)
371 break;
372 jo->flags |= MC_JOURNAL_WWAIT;
373 tsleep(&jo->fifo.windex, 0, "jwrite", 0);
374 continue;
375 }
376 if (avail > jo->fifo.size - jo->fifo.windex)
377 avail = jo->fifo.size - jo->fifo.windex;
378 if (avail > bytes)
379 avail = bytes;
380 bcopy(buf, jo->fifo.membase + jo->fifo.windex, avail);
381 bytes -= avail;
382 jo->fifo.windex = (jo->fifo.windex + avail) & jo->fifo.mask;
383 tsleep(&jo->fifo, 0, "jfifo", 0); /* XXX hysteresis */
384 }
385}
386
387/************************************************************************
388 * JOURNAL VNOPS *
389 ************************************************************************/
390
391static
392int
393journal_nmkdir(struct vop_nmkdir_args *ap)
394{
395 int error;
396
397 printf("JMKDIR %s\n", ap->a_ncp->nc_name);
398 error = vop_journal_operate_ap(&ap->a_head);
399 return (error);
400}
401