HAMMER VFS - Implement swapcache for HAMMER data in double_buffer mode
dragonfly.git: sys/vfs/hammer/hammer_vnops.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

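/*
 * HAMMER provides three vnode operations vectors: hammer_vnode_vops for
 * regular files, directories and softlinks, hammer_spec_vops for device
 * special files, and hammer_fifo_vops for fifos.
 */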
struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime =	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};

struct vop_ops hammer_spec_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		vop_stdnoread,
	.vop_write =		vop_stdnowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_close,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};

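/*
 * Post kqueue notifications (NOTE_WRITE, NOTE_EXTEND, NOTE_LINK, ...) on a
 * vnode, but only when the caller actually accumulated any flags.
 */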
static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			   struct vnode *dvp, struct ucred *cred,
			   int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *	 operation.
 *
 *	 Ultimately the combination of a REDO log and use of fast storage
 *	 to front-end cluster caches will make fsync fast, but it ain't
 *	 here yet.  And, in any case, we need real transactional
 *	 all-or-nothing features which are not restricted to a single file.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);
	hammer_mount_t hmp = ip->hmp;
	int waitfor = ap->a_waitfor;
	int mode;

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Fsync rule relaxation (default is either full synchronous flush
	 * or REDO semantics with synchronous flush).
	 */
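	/*
	 * Summary of hammer_fsync_mode as handled by the switch below:
	 *
	 *	0 - full synchronous flush, no REDO
	 *	1 - full asynchronous flush, no REDO
	 *	2 - REDO semantics with synchronous flush
	 *	    (version 4+ volumes only, else falls back to mode 0)
	 *	3 - REDO semantics with relaxed asynchronous flush
	 *	    (version 4+ volumes only, else falls back to mode 1)
	 *	4 - ignore the fsync() system call entirely
	 */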
	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
		switch(hammer_fsync_mode) {
		case 0:
mode0:
			/* no REDO, full synchronous flush */
			goto skip;
		case 1:
mode1:
			/* no REDO, full asynchronous flush */
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			goto skip;
		case 2:
			/* REDO semantics, synchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode0;
			mode = HAMMER_FLUSH_UNDOS_AUTO;
			break;
		case 3:
			/* REDO semantics, relaxed asynchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode1;
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		case 4:
			/* ignore the fsync() system call */
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		default:
			/* we have to do something */
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		}

		/*
		 * Fast fsync only needs to flush the UNDO/REDO fifo if
		 * HAMMER_INODE_REDO is non-zero and the only modifications
		 * made to the file are write or write-extends.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) &&
		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
		) {
			++hammer_count_fsyncs;
			hammer_flusher_flush_undos(hmp, mode);
			ip->redo_count = 0;
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		}

		/*
		 * REDO is enabled by fsync(), the idea being we really only
		 * want to lay down REDO records when programs are using
		 * fsync() heavily.  The first fsync() on the file starts
		 * the gravy train going and later fsync()s keep it hot by
		 * resetting the redo_count.
		 *
		 * We weren't running REDOs before now so we have to fall
		 * through and do a full fsync of what we have.
		 */
		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
			ip->flags |= HAMMER_INODE_REDO;
			ip->redo_count = 0;
		}
	}
skip:

	/*
	 * Do a full flush sequence.
	 */
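	/*
	 * vfsync() pushes out the vnode's dirty buffers and
	 * hammer_flush_inode() queues the inode to the backend flusher.
	 * For MNT_WAIT we wait for the flush to complete with the vnode
	 * unlocked, presumably so the flusher is not stalled against our
	 * vnode lock while we sleep.
	 */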
	++hammer_count_fsyncs;
	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (waitfor == MNT_WAIT) {
		vn_unlock(ap->a_vp);
		hammer_wait_inode(ip);
		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
	}
	lwkt_reltoken(&hmp->fs_token);
	return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (the cache-safe path does not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;
	int bigread;
	int got_fstoken;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
	ioseqcount = (ap->a_ioflag >> 16);
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigread = (uio->uio_resid > 100 * 1024 * 1024);
	got_fstoken = 0;

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 *
	 * XXX Temporary hack, delay the start transaction while we remain
	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
	 *     locked-shared.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
			break;

		/*
		 * MPSAFE
		 */
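		/*
		 * If the buffer is already resident getcacheblk() returns
		 * it and we can copy it out below without taking
		 * hmp->fs_token or opening a transaction; the transaction
		 * is only started when we have to go to the media.
		 */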
		bp = getcacheblk(ap->a_vp, base_offset, blksize);
		if (bp) {
			error = 0;
			goto skip;
		} else {
			if (ap->a_ioflag & IO_NRDELAY)
				return (EWOULDBLOCK);
		}

		/*
		 * MPUNSAFE
		 */
		if (got_fstoken == 0) {
			lwkt_gettoken(&hmp->fs_token);
			got_fstoken = 1;
			hammer_start_transaction(&trans, ip->hmp);
		}

		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, uio->uio_resid,
					     seqcount * BKVASIZE, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			brelse(bp);
			break;
		}
skip:
		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
			kprintf("doff %016jx read file %016jx@%016jx\n",
				(intmax_t)bp->b_bio2.bio_offset,
				(intmax_t)ip->obj_id,
				(intmax_t)bp->b_loffset);
		}
		bp->b_flags &= ~B_IODEBUG;

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		if (got_fstoken)
			lwkt_reltoken(&hmp->fs_token);

		/*
		 * Set B_AGE, data has a lower priority than meta-data.
		 *
		 * Use a hold/unlock/drop sequence to run the uiomove
		 * with the buffer unlocked, avoiding deadlocks against
		 * read()s on mmap()'d spaces.
		 */
		bp->b_flags |= B_AGE;
		bqhold(bp);
		bqrelse(bp);
		error = uiomove((char *)bp->b_data + offset, n, uio);
		bqdrop(bp);

		if (got_fstoken)
			lwkt_gettoken(&hmp->fs_token);

		if (error)
			break;
		hammer_stats_file_read += n;
	}

	/*
	 * XXX only update the atime if we had to get the MP lock.
	 * XXX hack hack hack, fixme.
	 */
	if (got_fstoken) {
		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
			ip->ino_data.atime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
		}
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
	}
	return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;
	off_t base_offset;
	struct buf *bp;
	int kflags;
	int error;
	int n;
	int flags;
	int seqcount;
	int bigwrite;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_off assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
		return (EFBIG);
	}

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 *
	 * Preset redo_count so we stop generating REDOs earlier if the
	 * limit is exceeded.
	 */
	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
	if ((ip->flags & HAMMER_INODE_REDO) &&
	    ip->redo_count < hammer_limit_redo) {
		ip->redo_count += uio->uio_resid;
	}

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;
		int blksize;
		int blkmask;
		int trivial;
		int endofblk;
		off_t nsize;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;
		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Control the number of pending records associated with
		 * this inode.  If too many have accumulated start a
		 * flush.  Try to maintain a pipeline with the flusher.
		 *
		 * NOTE: It is possible for other sources to grow the
		 *	 records but not necessarily issue another flush,
		 *	 so use a timeout and ensure that a re-flush occurs.
		 */
		if (ip->rsv_recs >= hammer_limit_inode_recs) {
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
				ip->flags |= HAMMER_INODE_RECSW;
				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			}
		}

#if 0
		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.  Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster than the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}
#endif

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid) {
			n = uio->uio_resid;
			endofblk = 0;
		} else {
			endofblk = 1;
		}
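		/*
		 * If the write extends the file, grow the buffer/VM
		 * representation first.  'trivial' is set when the write
		 * begins at or before the old EOF, i.e. no zero-filled
		 * gap needs to be created between the old EOF and the
		 * newly written data.
		 */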
		nsize = uio->uio_offset + n;
		if (nsize > ip->ino_data.size) {
			if (uio->uio_offset > ip->ino_data.size)
				trivial = 0;
			else
				trivial = 1;
			nvextendbuf(ap->a_vp,
				    ip->ino_data.size,
				    nsize,
				    hammer_blocksize(ip->ino_data.size),
				    hammer_blocksize(nsize),
				    hammer_blockoff(ip->ino_data.size),
				    hammer_blockoff(nsize),
				    trivial);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

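		/*
		 * Acquire the buffer to copy into.  The cases below decide
		 * whether the existing contents must be read in from the
		 * media, can simply be zeroed, or can be left alone.
		 */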
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			lwkt_reltoken(&hmp->fs_token);
			error = uiomove(bp->b_data + offset, n, uio);
			lwkt_gettoken(&hmp->fs_token);
		}

		/*
		 * Generate REDO records if enabled and redo_count will not
		 * exceed the limit.
		 *
		 * If redo_count exceeds the limit we stop generating records
		 * and clear HAMMER_INODE_REDO.  This will cause the next
		 * fsync() to do a full meta-data sync instead of just an
		 * UNDO/REDO fifo update.
		 *
		 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
		 * will still be tracked.  The tracks will be terminated
		 * when the related meta-data (including possible data
		 * modifications which are not tracked via REDO) is
		 * flushed.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
			if (ip->redo_count < hammer_limit_redo) {
				bp->b_flags |= B_VFSFLAG1;
				error = hammer_generate_redo(&trans, ip,
						     base_offset + offset,
						     HAMMER_REDO_WRITE,
						     bp->b_data + offset,
						     (size_t)n);
			} else {
				ip->flags &= ~HAMMER_INODE_REDO;
			}
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				nvtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size),
					  hammer_blockoff(ip->ino_data.size));
			}
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_SDIRTY;
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(&trans, ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 *
		 * Because meta-data updates are deferred, HAMMER is
		 * especially sensitive to excessive bdwrite()s because
		 * the I/O stream is not broken up by disk reads.  So the
		 * buffer cache simply cannot keep up.
		 *
		 * WARNING! blksize is variable.  cluster_write() is
		 *	    expected to not blow up if it encounters
		 *	    buffers that do not match the passed blksize.
		 *
		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
		 *	  The ip->rsv_recs check should burst-flush the data.
		 *	  If we queue it immediately the buf could be left
		 *	  locked on the device queue for a very long time.
		 *
		 * NOTE!  To avoid degenerate stalls due to mismatched block
		 *	  sizes we only honor IO_DIRECT on the write which
		 *	  abuts the end of the buffer.  However, we must
		 *	  honor IO_SYNC in case someone is silly enough to
		 *	  configure a HAMMER file as swap, or when HAMMER
		 *	  is serving NFS (for commits).  Ick ick.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
			bawrite(bp);
		} else {
#if 0
		if (offset + n == blksize) {
			if (hammer_cluster_enable == 0 ||
			    (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
				bawrite(bp);
			} else {
				cluster_write(bp, ip->ino_data.size,
					      blksize, seqcount);
			}
		} else {
#endif
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;
	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
		if (vn_islocked(vp) == LK_EXCLUSIVE &&
		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
			if (ip->flags & HAMMER_INODE_CLOSESYNC)
				waitfor = MNT_WAIT;
			else
				waitfor = MNT_NOWAIT;
			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
				       HAMMER_INODE_CLOSEASYNC);
			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
		}
	}
#endif
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	hammer_lock_sh(&ip->lock);
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
	 * or for MAX_TID is "@@-1:%05d" == 10 bytes.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
			    vap->va_size = 26;
		    else
			    vap->va_size = 10;
	}

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
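	/*
	 * va_bytes is reported rounded up to the allocation granularity
	 * in effect for the file: extra-large files round to the large
	 * (XBUF) buffer size, mid-sized files to HAMMER_BUFSIZE, and
	 * very small files to a 16 byte boundary.
	 */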
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}

	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0;	/* XXX */
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	hammer_unlock(&ip->lock);
	return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_mount_t hmp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;
	hmp = dip->hmp;

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;
	hmp = dip->hmp;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	lwkt_gettoken(&hmp->fs_token);
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		    asof != hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			lwkt_reltoken(&hmp->fs_token);
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);
	hmp = dip->hmp;

	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_LINK);
	hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 *
 * MPSAFE (does not require fs_token)
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;
	hmp = ip->hmp;

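	/*
	 * Allocate directory cookies if the caller wants them (e.g. for
	 * NFS).  Size the array conservatively at one cookie per 16 bytes
	 * of uio space, capped at 1024 cookies.
	 */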
1640 if (ap->a_ncookies) {
1641 ncookies = uio->uio_resid / 16 + 1;
1642 if (ncookies > 1024)
1643 ncookies = 1024;
1644 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1645 cookie_index = 0;
1646 } else {
1647 ncookies = -1;
1648 cookies = NULL;
1649 cookie_index = 0;
1650 }
1651
b0aab9b9
MD
1652 lwkt_gettoken(&hmp->fs_token);
1653 hammer_simple_transaction(&trans, hmp);
36f82b23 1654
b3deaf57
MD
1655 /*
1656 * Handle artificial entries
4c286c36
MD
1657 *
1658 * It should be noted that the minimum value for a directory
1659 * hash key on-media is 0x0000000100000000, so we can use anything
1660 * less than that to represent our 'special' key space.
b3deaf57
MD
1661 */
1662 error = 0;
1663 if (saveoff == 0) {
1664 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1665 if (r)
1666 goto done;
1667 if (cookies)
1668 cookies[cookie_index] = saveoff;
1669 ++saveoff;
1670 ++cookie_index;
1671 if (cookie_index == ncookies)
1672 goto done;
1673 }
1674 if (saveoff == 1) {
1675 if (ip->ino_data.parent_obj_id) {
1676 r = vop_write_dirent(&error, uio,
1677 ip->ino_data.parent_obj_id,
1678 DT_DIR, 2, "..");
1679 } else {
1680 r = vop_write_dirent(&error, uio,
1681 ip->obj_id, DT_DIR, 2, "..");
1682 }
1683 if (r)
1684 goto done;
1685 if (cookies)
1686 cookies[cookie_index] = saveoff;
1687 ++saveoff;
1688 ++cookie_index;
1689 if (cookie_index == ncookies)
1690 goto done;
1691 }
6b4f890b
MD
1692
1693 /*
1694 * Key range (begin and end inclusive) to scan. Directory keys
1695 * directly translate to a 64 bit 'seek' position.
1696 */
bcac4bbb 1697 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
5a930e66 1698 cursor.key_beg.localization = ip->obj_localization +
beec5dc4 1699 hammer_dir_localization(ip);
6b4f890b 1700 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 1701 cursor.key_beg.create_tid = 0;
6b4f890b
MD
1702 cursor.key_beg.delete_tid = 0;
1703 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1704 cursor.key_beg.obj_type = 0;
b3deaf57 1705 cursor.key_beg.key = saveoff;
6b4f890b
MD
1706
1707 cursor.key_end = cursor.key_beg;
1708 cursor.key_end.key = HAMMER_MAX_KEY;
d5530d22
MD
1709 cursor.asof = ip->obj_asof;
1710 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
6b4f890b 1711
4e17f465 1712 error = hammer_ip_first(&cursor);
6b4f890b
MD
1713
1714 while (error == 0) {
11ad5ade 1715 error = hammer_ip_resolve_data(&cursor);
6b4f890b
MD
1716 if (error)
1717 break;
11ad5ade 1718 base = &cursor.leaf->base;
6b4f890b 1719 saveoff = base->key;
11ad5ade 1720 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
6b4f890b 1721
7a04d74f
MD
1722 if (base->obj_id != ip->obj_id)
1723 panic("readdir: bad record at %p", cursor.node);
1724
ea434b6f
MD
1725 /*
1726 * Convert pseudo-filesystems into softlinks
1727 */
1728 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
6b4f890b 1729 r = vop_write_dirent(
11ad5ade 1730 &error, uio, cursor.data->entry.obj_id,
ea434b6f 1731 dtype,
11ad5ade
MD
1732 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1733 (void *)cursor.data->entry.name);
6b4f890b
MD
1734 if (r)
1735 break;
1736 ++saveoff;
1737 if (cookies)
1738 cookies[cookie_index] = base->key;
1739 ++cookie_index;
1740 if (cookie_index == ncookies)
1741 break;
1742 error = hammer_ip_next(&cursor);
1743 }
1744 hammer_done_cursor(&cursor);
1745
b3deaf57 1746done:
b84de5af 1747 hammer_done_transaction(&trans);
36f82b23 1748
6b4f890b
MD
1749 if (ap->a_eofflag)
1750 *ap->a_eofflag = (error == ENOENT);
6b4f890b
MD
1751 uio->uio_offset = saveoff;
1752 if (error && cookie_index == 0) {
b3deaf57
MD
1753 if (error == ENOENT)
1754 error = 0;
6b4f890b
MD
1755 if (cookies) {
1756 kfree(cookies, M_TEMP);
1757 *ap->a_ncookies = 0;
1758 *ap->a_cookies = NULL;
1759 }
1760 } else {
7a04d74f
MD
1761 if (error == ENOENT)
1762 error = 0;
6b4f890b
MD
1763 if (cookies) {
1764 *ap->a_ncookies = cookie_index;
1765 *ap->a_cookies = cookies;
1766 }
1767 }
b0aab9b9 1768 lwkt_reltoken(&hmp->fs_token);
6b4f890b 1769 return(error);
427e5fc6
MD
1770}
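/*
 * Illustrative sketch (hypothetical helper, not an existing HAMMER
 * function): hammer_vop_readdir() above reserves cookie values 0 and 1
 * for the synthetic "." and ".." entries.  Because the minimum on-media
 * directory hash key is 0x0000000100000000, any saved offset below that
 * value can be recognized as belonging to the artificial key space when
 * a readdir is resumed from a cookie.
 */
#if 0
static __inline int
hammer_cookie_is_artificial(off_t saveoff)
{
	return (saveoff >= 0 && saveoff < 0x0000000100000000LL);
}
#endif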
1771
66325755
MD
1772/*
1773 * hammer_vop_readlink { vp, uio, cred }
1774 */
427e5fc6
MD
1775static
1776int
66325755 1777hammer_vop_readlink(struct vop_readlink_args *ap)
427e5fc6 1778{
36f82b23 1779 struct hammer_transaction trans;
7a04d74f
MD
1780 struct hammer_cursor cursor;
1781 struct hammer_inode *ip;
b0aab9b9 1782 hammer_mount_t hmp;
ea434b6f
MD
1783 char buf[32];
1784 u_int32_t localization;
1785 hammer_pseudofs_inmem_t pfsm;
7a04d74f
MD
1786 int error;
1787
1788 ip = VTOI(ap->a_vp);
b0aab9b9
MD
1789 hmp = ip->hmp;
1790
1791 lwkt_gettoken(&hmp->fs_token);
36f82b23 1792
2f85fa4d
MD
1793 /*
1794 * Shortcut if the symlink data was stuffed into ino_data.
ea434b6f 1795 *
842e7a70
MD
1796 * Also expand special "@@PFS%05d" softlinks (expansion only
1797 * occurs for non-historical (current) accesses made from the
1798 * primary filesystem).
2f85fa4d
MD
1799 */
1800 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
ea434b6f
MD
1801 char *ptr;
1802 int bytes;
1803
1804 ptr = ip->ino_data.ext.symlink;
1805 bytes = (int)ip->ino_data.size;
842e7a70
MD
1806 if (bytes == 10 &&
1807 ip->obj_asof == HAMMER_MAX_TID &&
1808 ip->obj_localization == 0 &&
1809 strncmp(ptr, "@@PFS", 5) == 0) {
b0aab9b9 1810 hammer_simple_transaction(&trans, hmp);
ea434b6f
MD
1811 bcopy(ptr + 5, buf, 5);
1812 buf[5] = 0;
1813 localization = strtoul(buf, NULL, 10) << 16;
1814 pfsm = hammer_load_pseudofs(&trans, localization,
1815 &error);
1816 if (error == 0) {
4c038e17
MD
1817 if (pfsm->pfsd.mirror_flags &
1818 HAMMER_PFSD_SLAVE) {
cb3c760c 1819 /* vap->va_size == 26 */
4c038e17
MD
1820 ksnprintf(buf, sizeof(buf),
1821 "@@0x%016llx:%05d",
973c11b9 1822 (long long)pfsm->pfsd.sync_end_tid,
4c038e17
MD
1823 localization >> 16);
1824 } else {
cb3c760c
MD
1825 /* vap->va_size == 10 */
1826 ksnprintf(buf, sizeof(buf),
1827 "@@-1:%05d",
1828 localization >> 16);
1829#if 0
4c038e17
MD
1830 ksnprintf(buf, sizeof(buf),
1831 "@@0x%016llx:%05d",
973c11b9 1832 (long long)HAMMER_MAX_TID,
4c038e17 1833 localization >> 16);
cb3c760c 1834#endif
4c038e17 1835 }
ea434b6f
MD
1836 ptr = buf;
1837 bytes = strlen(buf);
1838 }
1839 if (pfsm)
b0aab9b9 1840 hammer_rel_pseudofs(hmp, pfsm);
ea434b6f
MD
1841 hammer_done_transaction(&trans);
1842 }
1843 error = uiomove(ptr, bytes, ap->a_uio);
b0aab9b9 1844 lwkt_reltoken(&hmp->fs_token);
2f85fa4d
MD
1845 return(error);
1846 }
36f82b23 1847
2f85fa4d
MD
1848 /*
1849 * Long version
1850 */
b0aab9b9 1851 hammer_simple_transaction(&trans, hmp);
ce0138a6 1852 ++hammer_stats_file_iopsr;
bcac4bbb 1853 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
7a04d74f
MD
1854
1855 /*
1856 * Key range (begin and end inclusive) to scan. Directory keys
1857 * directly translate to a 64 bit 'seek' position.
1858 */
5a930e66
MD
1859 cursor.key_beg.localization = ip->obj_localization +
1860 HAMMER_LOCALIZE_MISC;
7a04d74f 1861 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 1862 cursor.key_beg.create_tid = 0;
7a04d74f
MD
1863 cursor.key_beg.delete_tid = 0;
1864 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1865 cursor.key_beg.obj_type = 0;
1866 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
d5530d22
MD
1867 cursor.asof = ip->obj_asof;
1868 cursor.flags |= HAMMER_CURSOR_ASOF;
7a04d74f 1869
45a014dc 1870 error = hammer_ip_lookup(&cursor);
7a04d74f
MD
1871 if (error == 0) {
1872 error = hammer_ip_resolve_data(&cursor);
1873 if (error == 0) {
11ad5ade
MD
1874 KKASSERT(cursor.leaf->data_len >=
1875 HAMMER_SYMLINK_NAME_OFF);
1876 error = uiomove(cursor.data->symlink.name,
1877 cursor.leaf->data_len -
1878 HAMMER_SYMLINK_NAME_OFF,
7a04d74f
MD
1879 ap->a_uio);
1880 }
1881 }
1882 hammer_done_cursor(&cursor);
b84de5af 1883 hammer_done_transaction(&trans);
b0aab9b9 1884 lwkt_reltoken(&hmp->fs_token);
7a04d74f 1885 return(error);
427e5fc6
MD
1886}
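/*
 * Illustrative sketch (not part of the driver): what the @@PFS softlink
 * expansion above looks like from userland.  A PFS link whose stored
 * target is "@@PFS00001" reads back as "@@-1:00001" for a master PFS or
 * as "@@0x<sync_end_tid>:00001" for a slave, which namei can then resolve
 * to the appropriate snapshot transaction id.  The path below is
 * hypothetical and the fragment is userland code, not kernel code.
 */
#if 0
	char lnkbuf[32];
	ssize_t len;

	len = readlink("/mnt/hammer/pfs/slave1", lnkbuf, sizeof(lnkbuf) - 1);
	if (len > 0) {
		lnkbuf[len] = 0;
		printf("%s\n", lnkbuf);	/* e.g. "@@-1:00001" on a master */
	}
#endif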
1887
66325755
MD
1888/*
1889 * hammer_vop_nremove { nch, dvp, cred }
1890 */
427e5fc6
MD
1891static
1892int
66325755 1893hammer_vop_nremove(struct vop_nremove_args *ap)
427e5fc6 1894{
b84de5af 1895 struct hammer_transaction trans;
e63644f0 1896 struct hammer_inode *dip;
b0aab9b9 1897 hammer_mount_t hmp;
b84de5af
MD
1898 int error;
1899
e63644f0 1900 dip = VTOI(ap->a_dvp);
b0aab9b9 1901 hmp = dip->hmp;
e63644f0
MD
1902
1903 if (hammer_nohistory(dip) == 0 &&
b0aab9b9 1904 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
1905 return (error);
1906 }
1907
b0aab9b9
MD
1908 lwkt_gettoken(&hmp->fs_token);
1909 hammer_start_transaction(&trans, hmp);
ce0138a6 1910 ++hammer_stats_file_iopsw;
d7e278bb 1911 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
b84de5af 1912 hammer_done_transaction(&trans);
fbb84158
MD
1913 if (error == 0)
1914 hammer_knote(ap->a_dvp, NOTE_WRITE);
b0aab9b9 1915 lwkt_reltoken(&hmp->fs_token);
b84de5af 1916 return (error);
427e5fc6
MD
1917}
1918
66325755
MD
1919/*
1920 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1921 */
427e5fc6
MD
1922static
1923int
66325755 1924hammer_vop_nrename(struct vop_nrename_args *ap)
427e5fc6 1925{
8cd0a023
MD
1926 struct hammer_transaction trans;
1927 struct namecache *fncp;
1928 struct namecache *tncp;
1929 struct hammer_inode *fdip;
1930 struct hammer_inode *tdip;
1931 struct hammer_inode *ip;
b0aab9b9 1932 hammer_mount_t hmp;
8cd0a023 1933 struct hammer_cursor cursor;
8cd0a023 1934 int64_t namekey;
5e435c92 1935 u_int32_t max_iterations;
11ad5ade 1936 int nlen, error;
8cd0a023 1937
f437a2ab
MD
1938 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1939 return(EXDEV);
1940 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1941 return(EXDEV);
1942
8cd0a023
MD
1943 fdip = VTOI(ap->a_fdvp);
1944 tdip = VTOI(ap->a_tdvp);
1945 fncp = ap->a_fnch->ncp;
1946 tncp = ap->a_tnch->ncp;
b3deaf57
MD
1947 ip = VTOI(fncp->nc_vp);
1948 KKASSERT(ip != NULL);
d113fda1 1949
b0aab9b9
MD
1950 hmp = ip->hmp;
1951
f437a2ab
MD
1952 if (fdip->obj_localization != tdip->obj_localization)
1953 return(EXDEV);
1954 if (fdip->obj_localization != ip->obj_localization)
1955 return(EXDEV);
1956
d113fda1
MD
1957 if (fdip->flags & HAMMER_INODE_RO)
1958 return (EROFS);
1959 if (tdip->flags & HAMMER_INODE_RO)
1960 return (EROFS);
1961 if (ip->flags & HAMMER_INODE_RO)
1962 return (EROFS);
b0aab9b9 1963 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1964 return (error);
d113fda1 1965
b0aab9b9
MD
1966 lwkt_gettoken(&hmp->fs_token);
1967 hammer_start_transaction(&trans, hmp);
ce0138a6 1968 ++hammer_stats_file_iopsw;
8cd0a023
MD
1969
1970 /*
b3deaf57
MD
1971 * Remove tncp from the target directory and then link ip as
1972 * tncp. XXX pass trans to dounlink
42c7d26b
MD
1973 *
1974 * Force the inode sync-time to match the transaction so it is
1975 * in-sync with the creation of the target directory entry.
8cd0a023 1976 */
d7e278bb
MD
1977 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1978 ap->a_cred, 0, -1);
42c7d26b 1979 if (error == 0 || error == ENOENT) {
5a930e66
MD
1980 error = hammer_ip_add_directory(&trans, tdip,
1981 tncp->nc_name, tncp->nc_nlen,
1982 ip);
42c7d26b
MD
1983 if (error == 0) {
1984 ip->ino_data.parent_obj_id = tdip->obj_id;
cc0758d0 1985 ip->ino_data.ctime = trans.time;
e98f1b96 1986 hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
42c7d26b
MD
1987 }
1988 }
b3deaf57
MD
1989 if (error)
1990 goto failed; /* XXX */
8cd0a023
MD
1991
1992 /*
1993 * Locate the record in the originating directory and remove it.
1994 *
1995 * Calculate the namekey and set up the key range for the scan. This
1996 * works kinda like a chained hash table where the lower 32 bits
1997 * of the namekey synthesize the chain.
1998 *
1999 * The key range is inclusive of both key_beg and key_end.
2000 */
5e435c92
MD
2001 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
2002 &max_iterations);
6a37e7e4 2003retry:
bcac4bbb 2004 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
5a930e66 2005 cursor.key_beg.localization = fdip->obj_localization +
beec5dc4 2006 hammer_dir_localization(fdip);
8cd0a023
MD
2007 cursor.key_beg.obj_id = fdip->obj_id;
2008 cursor.key_beg.key = namekey;
d5530d22 2009 cursor.key_beg.create_tid = 0;
8cd0a023
MD
2010 cursor.key_beg.delete_tid = 0;
2011 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2012 cursor.key_beg.obj_type = 0;
2013
2014 cursor.key_end = cursor.key_beg;
5e435c92 2015 cursor.key_end.key += max_iterations;
d5530d22
MD
2016 cursor.asof = fdip->obj_asof;
2017 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
8cd0a023
MD
2018
2019 /*
2020 * Scan all matching records (the chain), locate the one matching
a89aec1b 2021 * the requested path component.
8cd0a023
MD
2022 *
2023 * The hammer_ip_*() functions merge in-memory records with on-disk
2024 * records for the purposes of the search.
2025 */
4e17f465 2026 error = hammer_ip_first(&cursor);
a89aec1b 2027 while (error == 0) {
8cd0a023
MD
2028 if (hammer_ip_resolve_data(&cursor) != 0)
2029 break;
11ad5ade
MD
2030 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2031 KKASSERT(nlen > 0);
2032 if (fncp->nc_nlen == nlen &&
2033 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
8cd0a023
MD
2034 break;
2035 }
a89aec1b 2036 error = hammer_ip_next(&cursor);
8cd0a023 2037 }
8cd0a023
MD
2038
2039 /*
2040 * If all is ok we have to get the inode so we can adjust nlinks.
6a37e7e4
MD
2041 *
2042 * WARNING: hammer_ip_del_directory() may have to terminate the
2043 * cursor to avoid a recursion. It's ok to call hammer_done_cursor()
2044 * twice.
8cd0a023 2045 */
9944ae54 2046 if (error == 0)
6a37e7e4 2047 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
b84de5af
MD
2048
2049 /*
2050 * XXX A deadlock here will break rename's atomicity for the purposes
2051 * of crash recovery.
2052 */
2053 if (error == EDEADLK) {
b84de5af 2054 hammer_done_cursor(&cursor);
b84de5af
MD
2055 goto retry;
2056 }
2057
2058 /*
2059 * Cleanup and tell the kernel that the rename succeeded.
036ea0c3
MD
2060 *
2061 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2062 * without formally acquiring the vp since the vp might
2063 * have zero refs on it, or be in the middle of a reclaim,
2064 * etc.
b84de5af 2065 */
c0ade690 2066 hammer_done_cursor(&cursor);
fbb84158 2067 if (error == 0) {
6a37e7e4 2068 cache_rename(ap->a_fnch, ap->a_tnch);
fbb84158
MD
2069 hammer_knote(ap->a_fdvp, NOTE_WRITE);
2070 hammer_knote(ap->a_tdvp, NOTE_WRITE);
036ea0c3
MD
2071 while (ip->vp) {
2072 struct vnode *vp;
2073
2074 error = hammer_get_vnode(ip, &vp);
2075 if (error == 0 && vp) {
2076 vn_unlock(vp);
2077 hammer_knote(ip->vp, NOTE_RENAME);
2078 vrele(vp);
2079 break;
2080 }
2081 kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2082 }
fbb84158 2083 }
b84de5af 2084
b3deaf57 2085failed:
b84de5af 2086 hammer_done_transaction(&trans);
b0aab9b9 2087 lwkt_reltoken(&hmp->fs_token);
8cd0a023 2088 return (error);
427e5fc6
MD
2089}
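/*
 * Illustrative sketch (hypothetical helper, not an existing HAMMER
 * function): the directory scans in nremove/nrename above behave like
 * probing a single bucket of a chained hash table.
 * hammer_directory_namekey() hashes the name into a 64 bit key and
 * reports how many collision slots may follow it, so the inclusive
 * B-Tree range [namekey, namekey + max_iterations] bounds the chain
 * that must be searched for an exact name match.
 */
#if 0
static __inline void
hammer_namekey_chain(hammer_inode_t dip, const char *name, int nlen,
		     int64_t *begp, int64_t *endp)
{
	u_int32_t max_iterations;
	int64_t namekey;

	namekey = hammer_directory_namekey(dip, name, nlen, &max_iterations);
	*begp = namekey;
	*endp = namekey + max_iterations;
}
#endif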
2090
66325755
MD
2091/*
2092 * hammer_vop_nrmdir { nch, dvp, cred }
2093 */
427e5fc6
MD
2094static
2095int
66325755 2096hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
427e5fc6 2097{
b84de5af 2098 struct hammer_transaction trans;
e63644f0 2099 struct hammer_inode *dip;
b0aab9b9 2100 hammer_mount_t hmp;
b84de5af
MD
2101 int error;
2102
e63644f0 2103 dip = VTOI(ap->a_dvp);
b0aab9b9 2104 hmp = dip->hmp;
e63644f0
MD
2105
2106 if (hammer_nohistory(dip) == 0 &&
b0aab9b9 2107 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
2108 return (error);
2109 }
2110
b0aab9b9
MD
2111 lwkt_gettoken(&hmp->fs_token);
2112 hammer_start_transaction(&trans, hmp);
ce0138a6 2113 ++hammer_stats_file_iopsw;
d7e278bb 2114 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
b84de5af 2115 hammer_done_transaction(&trans);
fbb84158
MD
2116 if (error == 0)
2117 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
b0aab9b9 2118 lwkt_reltoken(&hmp->fs_token);
b84de5af 2119 return (error);
427e5fc6
MD
2120}
2121
349433c9
MD
2122/*
2123 * hammer_vop_markatime { vp, cred }
2124 */
2125static
2126int
2127hammer_vop_markatime(struct vop_markatime_args *ap)
2128{
2129 struct hammer_transaction trans;
2130 struct hammer_inode *ip;
b0aab9b9 2131 hammer_mount_t hmp;
349433c9
MD
2132
2133 ip = VTOI(ap->a_vp);
2134 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2135 return (EROFS);
2136 if (ip->flags & HAMMER_INODE_RO)
2137 return (EROFS);
b0aab9b9
MD
2138 hmp = ip->hmp;
2139 if (hmp->mp->mnt_flag & MNT_NOATIME)
349433c9 2140 return (0);
b0aab9b9
MD
2141 lwkt_gettoken(&hmp->fs_token);
2142 hammer_start_transaction(&trans, hmp);
349433c9
MD
2143 ++hammer_stats_file_iopsw;
2144
2145 ip->ino_data.atime = trans.time;
e98f1b96 2146 hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
349433c9
MD
2147 hammer_done_transaction(&trans);
2148 hammer_knote(ap->a_vp, NOTE_ATTRIB);
b0aab9b9 2149 lwkt_reltoken(&hmp->fs_token);
349433c9
MD
2150 return (0);
2151}
2152
66325755
MD
2153/*
2154 * hammer_vop_setattr { vp, vap, cred }
2155 */
427e5fc6
MD
2156static
2157int
66325755 2158hammer_vop_setattr(struct vop_setattr_args *ap)
427e5fc6 2159{
8cd0a023 2160 struct hammer_transaction trans;
8cd0a023 2161 struct hammer_inode *ip;
b0aab9b9
MD
2162 struct vattr *vap;
2163 hammer_mount_t hmp;
8cd0a023
MD
2164 int modflags;
2165 int error;
d5ef456e 2166 int truncating;
4a2796f3 2167 int blksize;
fbb84158 2168 int kflags;
6362a262 2169#if 0
4a2796f3 2170 int64_t aligned_size;
6362a262 2171#endif
8cd0a023 2172 u_int32_t flags;
8cd0a023
MD
2173
2174 vap = ap->a_vap;
2175 ip = ap->a_vp->v_data;
2176 modflags = 0;
fbb84158 2177 kflags = 0;
b0aab9b9 2178 hmp = ip->hmp;
8cd0a023
MD
2179
2180 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2181 return(EROFS);
d113fda1
MD
2182 if (ip->flags & HAMMER_INODE_RO)
2183 return (EROFS);
e63644f0 2184 if (hammer_nohistory(ip) == 0 &&
b0aab9b9 2185 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
2186 return (error);
2187 }
8cd0a023 2188
b0aab9b9
MD
2189 lwkt_gettoken(&hmp->fs_token);
2190 hammer_start_transaction(&trans, hmp);
ce0138a6 2191 ++hammer_stats_file_iopsw;
8cd0a023
MD
2192 error = 0;
2193
2194 if (vap->va_flags != VNOVAL) {
2195 flags = ip->ino_data.uflags;
2196 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2197 hammer_to_unix_xid(&ip->ino_data.uid),
2198 ap->a_cred);
2199 if (error == 0) {
2200 if (ip->ino_data.uflags != flags) {
2201 ip->ino_data.uflags = flags;
cc0758d0 2202 ip->ino_data.ctime = trans.time;
8cd0a023 2203 modflags |= HAMMER_INODE_DDIRTY;
fbb84158 2204 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2205 }
2206 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2207 error = 0;
2208 goto done;
2209 }
2210 }
2211 goto done;
2212 }
2213 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2214 error = EPERM;
2215 goto done;
2216 }
7538695e
MD
2217 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2218 mode_t cur_mode = ip->ino_data.mode;
2219 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2220 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2221 uuid_t uuid_uid;
2222 uuid_t uuid_gid;
2223
2224 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2225 ap->a_cred,
2226 &cur_uid, &cur_gid, &cur_mode);
2227 if (error == 0) {
2228 hammer_guid_to_uuid(&uuid_uid, cur_uid);
2229 hammer_guid_to_uuid(&uuid_gid, cur_gid);
2230 if (bcmp(&uuid_uid, &ip->ino_data.uid,
2231 sizeof(uuid_uid)) ||
2232 bcmp(&uuid_gid, &ip->ino_data.gid,
2233 sizeof(uuid_gid)) ||
2234 ip->ino_data.mode != cur_mode
2235 ) {
2236 ip->ino_data.uid = uuid_uid;
2237 ip->ino_data.gid = uuid_gid;
2238 ip->ino_data.mode = cur_mode;
cc0758d0
MD
2239 ip->ino_data.ctime = trans.time;
2240 modflags |= HAMMER_INODE_DDIRTY;
7538695e 2241 }
fbb84158 2242 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2243 }
2244 }
11ad5ade 2245 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
8cd0a023
MD
2246 switch(ap->a_vp->v_type) {
2247 case VREG:
11ad5ade 2248 if (vap->va_size == ip->ino_data.size)
d5ef456e 2249 break;
47f363f1
MD
2250
2251 /*
c58123da
MD
2252 * Log the operation if in fast-fsync mode or if
2253 * there are unterminated redo write records present.
2254 *
2255 * The second check is needed so the recovery code
2256 * properly truncates write redos even if nominal
2257 * REDO operations is turned off due to excessive
2258 * writes, because the related records might be
2259 * destroyed and never lay down a TERM_WRITE.
47f363f1 2260 */
c58123da
MD
2261 if ((ip->flags & HAMMER_INODE_REDO) ||
2262 (ip->flags & HAMMER_INODE_RDIRTY)) {
47f363f1
MD
2263 error = hammer_generate_redo(&trans, ip,
2264 vap->va_size,
2265 HAMMER_REDO_TRUNC,
2266 NULL, 0);
2267 }
2268 blksize = hammer_blocksize(vap->va_size);
2269
b84de5af
MD
2270 /*
2271 * XXX break atomicity, we can deadlock the backend
2272 * if we do not release the lock. Probably not a
2273 * big deal here.
2274 */
11ad5ade 2275 if (vap->va_size < ip->ino_data.size) {
6362a262
MD
2276 nvtruncbuf(ap->a_vp, vap->va_size,
2277 blksize,
2278 hammer_blockoff(vap->va_size));
d5ef456e 2279 truncating = 1;
fbb84158 2280 kflags |= NOTE_WRITE;
d5ef456e 2281 } else {
6362a262
MD
2282 nvextendbuf(ap->a_vp,
2283 ip->ino_data.size,
2284 vap->va_size,
2285 hammer_blocksize(ip->ino_data.size),
2286 hammer_blocksize(vap->va_size),
2287 hammer_blockoff(ip->ino_data.size),
2288 hammer_blockoff(vap->va_size),
2289 0);
d5ef456e 2290 truncating = 0;
fbb84158 2291 kflags |= NOTE_WRITE | NOTE_EXTEND;
c0ade690 2292 }
11ad5ade 2293 ip->ino_data.size = vap->va_size;
cc0758d0 2294 ip->ino_data.mtime = trans.time;
47f363f1 2295 /* XXX safe to use SDIRTY instead of DDIRTY here? */
cc0758d0 2296 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
d5ef456e 2297
b84de5af 2298 /*
6362a262
MD
2299 * On-media truncation is cached in the inode until
2300 * the inode is synchronized. We must immediately
2301 * handle any frontend records.
b84de5af 2302 */
d5ef456e 2303 if (truncating) {
47637bff 2304 hammer_ip_frontend_trunc(ip, vap->va_size);
0832c9bb
MD
2305#ifdef DEBUG_TRUNCATE
2306 if (HammerTruncIp == NULL)
2307 HammerTruncIp = ip;
2308#endif
b84de5af
MD
2309 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2310 ip->flags |= HAMMER_INODE_TRUNCATED;
2311 ip->trunc_off = vap->va_size;
0832c9bb
MD
2312#ifdef DEBUG_TRUNCATE
2313 if (ip == HammerTruncIp)
973c11b9
MD
2314 kprintf("truncate1 %016llx\n",
2315 (long long)ip->trunc_off);
0832c9bb 2316#endif
b84de5af
MD
2317 } else if (ip->trunc_off > vap->va_size) {
2318 ip->trunc_off = vap->va_size;
0832c9bb
MD
2319#ifdef DEBUG_TRUNCATE
2320 if (ip == HammerTruncIp)
973c11b9
MD
2321 kprintf("truncate2 %016llx\n",
2322 (long long)ip->trunc_off);
0832c9bb
MD
2323#endif
2324 } else {
2325#ifdef DEBUG_TRUNCATE
2326 if (ip == HammerTruncIp)
973c11b9
MD
2327 kprintf("truncate3 %016llx (ignored)\n",
2328 (long long)vap->va_size);
0832c9bb 2329#endif
b84de5af 2330 }
d5ef456e 2331 }
b84de5af 2332
6362a262 2333#if 0
d5ef456e 2334 /*
6362a262
MD
2335 * When truncating, nvtruncbuf() may have cleaned out
2336 * a portion of the last block on-disk in the buffer
2337 * cache. We must clean out any frontend records
2338 * for blocks beyond the new last block.
d5ef456e 2339 */
4a2796f3
MD
2340 aligned_size = (vap->va_size + (blksize - 1)) &
2341 ~(int64_t)(blksize - 1);
b84de5af 2342 if (truncating && vap->va_size < aligned_size) {
4a2796f3 2343 aligned_size -= blksize;
47637bff 2344 hammer_ip_frontend_trunc(ip, aligned_size);
d5ef456e 2345 }
6362a262 2346#endif
76376933 2347 break;
8cd0a023 2348 case VDATABASE:
b84de5af
MD
2349 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2350 ip->flags |= HAMMER_INODE_TRUNCATED;
2351 ip->trunc_off = vap->va_size;
2352 } else if (ip->trunc_off > vap->va_size) {
2353 ip->trunc_off = vap->va_size;
2354 }
47637bff 2355 hammer_ip_frontend_trunc(ip, vap->va_size);
11ad5ade 2356 ip->ino_data.size = vap->va_size;
cc0758d0
MD
2357 ip->ino_data.mtime = trans.time;
2358 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
fbb84158 2359 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2360 break;
2361 default:
2362 error = EINVAL;
2363 goto done;
2364 }
d26d0ae9 2365 break;
8cd0a023
MD
2366 }
2367 if (vap->va_atime.tv_sec != VNOVAL) {
cc0758d0 2368 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
ddfdf542 2369 modflags |= HAMMER_INODE_ATIME;
fbb84158 2370 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2371 }
2372 if (vap->va_mtime.tv_sec != VNOVAL) {
cc0758d0 2373 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
ddfdf542 2374 modflags |= HAMMER_INODE_MTIME;
fbb84158 2375 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2376 }
2377 if (vap->va_mode != (mode_t)VNOVAL) {
7538695e
MD
2378 mode_t cur_mode = ip->ino_data.mode;
2379 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2380 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2381
2382 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2383 cur_uid, cur_gid, &cur_mode);
2384 if (error == 0 && ip->ino_data.mode != cur_mode) {
2385 ip->ino_data.mode = cur_mode;
cc0758d0 2386 ip->ino_data.ctime = trans.time;
8cd0a023 2387 modflags |= HAMMER_INODE_DDIRTY;
fbb84158 2388 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2389 }
2390 }
2391done:
b84de5af 2392 if (error == 0)
e98f1b96 2393 hammer_modify_inode(&trans, ip, modflags);
b84de5af 2394 hammer_done_transaction(&trans);
fbb84158 2395 hammer_knote(ap->a_vp, kflags);
b0aab9b9 2396 lwkt_reltoken(&hmp->fs_token);
8cd0a023 2397 return (error);
427e5fc6
MD
2398}
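/*
 * Illustrative sketch (hypothetical helper, not an existing HAMMER
 * function) of the cached-truncation rule used in hammer_vop_setattr():
 * the first truncation latches trunc_off, and later truncations may only
 * move it downward, so the backend always sees the lowest offset
 * requested since the last sync.  A larger size simply extends the file
 * without touching the cached truncation point.
 */
#if 0
static __inline void
hammer_cache_trunc_off(hammer_inode_t ip, off_t new_size)
{
	if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
		ip->flags |= HAMMER_INODE_TRUNCATED;
		ip->trunc_off = new_size;
	} else if (ip->trunc_off > new_size) {
		ip->trunc_off = new_size;
	}
	/* a larger new_size is ignored until the flush completes */
}
#endif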
2399
66325755
MD
2400/*
2401 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2402 */
427e5fc6
MD
2403static
2404int
66325755 2405hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
427e5fc6 2406{
7a04d74f
MD
2407 struct hammer_transaction trans;
2408 struct hammer_inode *dip;
2409 struct hammer_inode *nip;
7a04d74f 2410 hammer_record_t record;
b0aab9b9
MD
2411 struct nchandle *nch;
2412 hammer_mount_t hmp;
7a04d74f
MD
2413 int error;
2414 int bytes;
2415
2416 ap->a_vap->va_type = VLNK;
2417
2418 nch = ap->a_nch;
2419 dip = VTOI(ap->a_dvp);
b0aab9b9 2420 hmp = dip->hmp;
7a04d74f 2421
d113fda1
MD
2422 if (dip->flags & HAMMER_INODE_RO)
2423 return (EROFS);
b0aab9b9 2424 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 2425 return (error);
d113fda1 2426
7a04d74f
MD
2427 /*
2428 * Create a transaction to cover the operations we perform.
2429 */
b0aab9b9
MD
2430 lwkt_gettoken(&hmp->fs_token);
2431 hammer_start_transaction(&trans, hmp);
ce0138a6 2432 ++hammer_stats_file_iopsw;
7a04d74f
MD
2433
2434 /*
2435 * Create a new filesystem object of the requested type. The
2436 * returned inode will be referenced but not locked.
2437 */
2438
5a930e66 2439 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
5a64efa1
MD
2440 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2441 NULL, &nip);
7a04d74f 2442 if (error) {
b84de5af 2443 hammer_done_transaction(&trans);
7a04d74f 2444 *ap->a_vpp = NULL;
b0aab9b9 2445 lwkt_reltoken(&hmp->fs_token);
7a04d74f
MD
2446 return (error);
2447 }
2448
7a04d74f
MD
2449 /*
2450 * Add a record representing the symlink. symlink stores the link
2451 * as pure data, not a string, and is not \0 terminated.
2452 */
2453 if (error == 0) {
7a04d74f
MD
2454 bytes = strlen(ap->a_target);
2455
2f85fa4d
MD
2456 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2457 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2458 } else {
2459 record = hammer_alloc_mem_record(nip, bytes);
2460 record->type = HAMMER_MEM_RECORD_GENERAL;
2461
5a930e66
MD
2462 record->leaf.base.localization = nip->obj_localization +
2463 HAMMER_LOCALIZE_MISC;
2f85fa4d
MD
2464 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2465 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2466 record->leaf.data_len = bytes;
2467 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2468 bcopy(ap->a_target, record->data->symlink.name, bytes);
2469 error = hammer_ip_add_record(&trans, record);
2470 }
42c7d26b
MD
2471
2472 /*
2473 * Set the file size to the length of the link.
2474 */
2475 if (error == 0) {
11ad5ade 2476 nip->ino_data.size = bytes;
e98f1b96 2477 hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
42c7d26b 2478 }
7a04d74f 2479 }
1f07f686 2480 if (error == 0)
5a930e66
MD
2481 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2482 nch->ncp->nc_nlen, nip);
7a04d74f
MD
2483
2484 /*
2485 * Finish up.
2486 */
2487 if (error) {
2488 hammer_rel_inode(nip, 0);
7a04d74f
MD
2489 *ap->a_vpp = NULL;
2490 } else {
e8599db1 2491 error = hammer_get_vnode(nip, ap->a_vpp);
7a04d74f
MD
2492 hammer_rel_inode(nip, 0);
2493 if (error == 0) {
2494 cache_setunresolved(ap->a_nch);
2495 cache_setvp(ap->a_nch, *ap->a_vpp);
fbb84158 2496 hammer_knote(ap->a_dvp, NOTE_WRITE);
7a04d74f
MD
2497 }
2498 }
b84de5af 2499 hammer_done_transaction(&trans);
b0aab9b9 2500 lwkt_reltoken(&hmp->fs_token);
7a04d74f 2501 return (error);
427e5fc6
MD
2502}
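/*
 * Illustrative sketch (hypothetical helper, not an existing HAMMER
 * function): how hammer_vop_nsymlink() above places the target.  Short
 * targets fit in the inode's extension area; anything longer is laid
 * down as a HAMMER_RECTYPE_FIX record keyed by HAMMER_FIXKEY_SYMLINK,
 * which is exactly what the "long version" path of hammer_vop_readlink()
 * looks up.
 */
#if 0
static __inline int
hammer_symlink_is_inline(int bytes)
{
	return (bytes <= HAMMER_INODE_BASESYMLEN);
}
#endif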
2503
66325755
MD
2504/*
2505 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2506 */
427e5fc6
MD
2507static
2508int
66325755 2509hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
427e5fc6 2510{
b84de5af 2511 struct hammer_transaction trans;
e63644f0 2512 struct hammer_inode *dip;
b0aab9b9 2513 hammer_mount_t hmp;
b84de5af
MD
2514 int error;
2515
e63644f0 2516 dip = VTOI(ap->a_dvp);
b0aab9b9 2517 hmp = dip->hmp;
e63644f0
MD
2518
2519 if (hammer_nohistory(dip) == 0 &&
b0aab9b9 2520 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
e63644f0
MD
2521 return (error);
2522 }
2523
b0aab9b9
MD
2524 lwkt_gettoken(&hmp->fs_token);
2525 hammer_start_transaction(&trans, hmp);
ce0138a6 2526 ++hammer_stats_file_iopsw;
b84de5af 2527 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
d7e278bb 2528 ap->a_cred, ap->a_flags, -1);
b84de5af 2529 hammer_done_transaction(&trans);
b0aab9b9 2530 lwkt_reltoken(&hmp->fs_token);
b84de5af
MD
2531
2532 return (error);
427e5fc6
MD
2533}
2534
7dc57964
MD
2535/*
2536 * hammer_vop_ioctl { vp, command, data, fflag, cred }
2537 */
2538static
2539int
2540hammer_vop_ioctl(struct vop_ioctl_args *ap)
2541{
2542 struct hammer_inode *ip = ap->a_vp->v_data;
b0aab9b9
MD
2543 hammer_mount_t hmp = ip->hmp;
2544 int error;
7dc57964 2545
ce0138a6 2546 ++hammer_stats_file_iopsr;
b0aab9b9
MD
2547 lwkt_gettoken(&hmp->fs_token);
2548 error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2549 ap->a_fflag, ap->a_cred);
2550 lwkt_reltoken(&hmp->fs_token);
2551 return (error);
7dc57964
MD
2552}
2553
513ca7d7
MD
2554static
2555int
2556hammer_vop_mountctl(struct vop_mountctl_args *ap)
2557{
dad088a5
MD
2558 static const struct mountctl_opt extraopt[] = {
2559 { HMNT_NOHISTORY, "nohistory" },
2560 { HMNT_MASTERID, "master" },
2561 { 0, NULL}
2562
2563 };
2564 struct hammer_mount *hmp;
513ca7d7 2565 struct mount *mp;
dad088a5 2566 int usedbytes;
513ca7d7
MD
2567 int error;
2568
dad088a5
MD
2569 error = 0;
2570 usedbytes = 0;
513ca7d7 2571 mp = ap->a_head.a_ops->head.vv_mount;
dad088a5
MD
2572 KKASSERT(mp->mnt_data != NULL);
2573 hmp = (struct hammer_mount *)mp->mnt_data;
513ca7d7 2574
b0aab9b9 2575 lwkt_gettoken(&hmp->fs_token);
dad088a5 2576
b0aab9b9 2577 switch(ap->a_op) {
513ca7d7
MD
2578 case MOUNTCTL_SET_EXPORT:
2579 if (ap->a_ctllen != sizeof(struct export_args))
2580 error = EINVAL;
b424ca30
MD
2581 else
2582 error = hammer_vfs_export(mp, ap->a_op,
513ca7d7
MD
2583 (const struct export_args *)ap->a_ctl);
2584 break;
dad088a5
MD
2585 case MOUNTCTL_MOUNTFLAGS:
2586 {
2587 /*
2588 * Call standard mountctl VOP function
2589 * so we get user mount flags.
2590 */
2591 error = vop_stdmountctl(ap);
2592 if (error)
2593 break;
2594
2595 usedbytes = *ap->a_res;
2596
eac446c5 2597 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
b0aab9b9
MD
2598 usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2599 ap->a_buf,
dad088a5
MD
2600 ap->a_buflen - usedbytes,
2601 &error);
dad088a5
MD
2602 }
2603
2604 *ap->a_res += usedbytes;
2605 break;
2606 }
513ca7d7 2607 default:
726e0641 2608 error = vop_stdmountctl(ap);
513ca7d7
MD
2609 break;
2610 }
b0aab9b9 2611 lwkt_reltoken(&hmp->fs_token);
513ca7d7
MD
2612 return(error);
2613}
2614
66325755
MD
2615/*
2616 * hammer_vop_strategy { vp, bio }
8cd0a023
MD
2617 *
2618 * Strategy call, used for regular file read & write only. Note that the
2619 * bp may represent a cluster.
2620 *
2621 * To simplify operation and allow better optimizations in the future,
2622 * this code does not make any assumptions with regards to buffer alignment
2623 * or size.
66325755 2624 */
427e5fc6
MD
2625static
2626int
66325755 2627hammer_vop_strategy(struct vop_strategy_args *ap)
427e5fc6 2628{
8cd0a023
MD
2629 struct buf *bp;
2630 int error;
2631
2632 bp = ap->a_bio->bio_buf;
2633
2634 switch(bp->b_cmd) {
2635 case BUF_CMD_READ:
2636 error = hammer_vop_strategy_read(ap);
2637 break;
2638 case BUF_CMD_WRITE:
2639 error = hammer_vop_strategy_write(ap);
2640 break;
2641 default:
059819e3
MD
2642 bp->b_error = error = EINVAL;
2643 bp->b_flags |= B_ERROR;
2644 biodone(ap->a_bio);
8cd0a023
MD
2645 break;
2646 }
507df98a
ID
2647
2648 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2649
8cd0a023 2650 return (error);
427e5fc6
MD
2651}
2652
8cd0a023
MD
2653/*
2654 * Read from a regular file. Iterate the related records and fill in the
2655 * BIO/BUF. Gaps are zero-filled.
2656 *
2657 * The support code in hammer_object.c should be used to deal with mixed
2658 * in-memory and on-disk records.
2659 *
4a2796f3
MD
2660 * NOTE: Can be called from the cluster code with an oversized buf.
2661 *
8cd0a023
MD
2662 * XXX atime update
2663 */
2664static
2665int
2666hammer_vop_strategy_read(struct vop_strategy_args *ap)
2667{
36f82b23
MD
2668 struct hammer_transaction trans;
2669 struct hammer_inode *ip;
39d8fd63 2670 struct hammer_inode *dip;
b0aab9b9 2671 hammer_mount_t hmp;
8cd0a023 2672 struct hammer_cursor cursor;
8cd0a023 2673 hammer_base_elm_t base;
4a2796f3 2674 hammer_off_t disk_offset;
8cd0a023 2675 struct bio *bio;
a99b9ea2 2676 struct bio *nbio;
8cd0a023
MD
2677 struct buf *bp;
2678 int64_t rec_offset;
a89aec1b 2679 int64_t ran_end;
195c19a1 2680 int64_t tmp64;
8cd0a023
MD
2681 int error;
2682 int boff;
2683 int roff;
2684 int n;
b4f86ea3 2685 int isdedupable;
8cd0a023
MD
2686
2687 bio = ap->a_bio;
2688 bp = bio->bio_buf;
36f82b23 2689 ip = ap->a_vp->v_data;
b0aab9b9 2690 hmp = ip->hmp;
8cd0a023 2691
a99b9ea2
MD
2692 /*
2693 * The zone-2 disk offset may have been set by the cluster code via
4a2796f3 2694 * a BMAP operation, or else should be NOOFFSET.
a99b9ea2 2695 *
4a2796f3 2696 * Checking the high bits for a match against zone-2 should suffice.
b4f86ea3
MD
2697 *
2698 * In cases where a lot of data duplication is present it may be
2699 * more beneficial to drop through and double-buffer through the
2700 * device.
a99b9ea2
MD
2701 */
2702 nbio = push_bio(bio);
b4f86ea3
MD
2703 if (hammer_double_buffer == 0 &&
2704 (nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
1b0ab2c3 2705 HAMMER_ZONE_LARGE_DATA) {
b0aab9b9
MD
2706 lwkt_gettoken(&hmp->fs_token);
2707 error = hammer_io_direct_read(hmp, nbio, NULL);
2708 lwkt_reltoken(&hmp->fs_token);
a99b9ea2
MD
2709 return (error);
2710 }
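	/*
	 * Note (descriptive): when hammer_double_buffer is enabled the
	 * shortcut above is intentionally skipped, so the data is read
	 * through the block device buffer cache (where the swapcache can
	 * back it) instead of being read directly into the vnode buffer.
	 */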
2711
2712 /*
4a2796f3
MD
2713 * Well, that sucked. Do it the hard way. If all the stars are
2714 * aligned we may still be able to issue a direct-read.
a99b9ea2 2715 */
b0aab9b9
MD
2716 lwkt_gettoken(&hmp->fs_token);
2717 hammer_simple_transaction(&trans, hmp);
47637bff 2718 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
8cd0a023
MD
2719
2720 /*
2721 * Key range (begin and end inclusive) to scan. Note that the keys
c0ade690
MD
2722 * stored in the actual records represent BASE+LEN, not BASE. The
2723 * first record containing bio_offset will have a key > bio_offset.
8cd0a023 2724 */
5a930e66
MD
2725 cursor.key_beg.localization = ip->obj_localization +
2726 HAMMER_LOCALIZE_MISC;
8cd0a023 2727 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 2728 cursor.key_beg.create_tid = 0;
8cd0a023 2729 cursor.key_beg.delete_tid = 0;
8cd0a023 2730 cursor.key_beg.obj_type = 0;
c0ade690 2731 cursor.key_beg.key = bio->bio_offset + 1;
d5530d22 2732 cursor.asof = ip->obj_asof;
bf3b416b 2733 cursor.flags |= HAMMER_CURSOR_ASOF;
8cd0a023
MD
2734
2735 cursor.key_end = cursor.key_beg;
11ad5ade 2736 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
b84de5af 2737#if 0
11ad5ade 2738 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
a89aec1b
MD
2739 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2740 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2741 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
b84de5af
MD
2742 } else
2743#endif
2744 {
c0ade690 2745 ran_end = bio->bio_offset + bp->b_bufsize;
a89aec1b
MD
2746 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2747 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
195c19a1
MD
2748 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2749 if (tmp64 < ran_end)
a89aec1b
MD
2750 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2751 else
7f7c1f84 2752 cursor.key_end.key = ran_end + MAXPHYS + 1;
a89aec1b 2753 }
d26d0ae9 2754 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
8cd0a023 2755
18bee4a2
MD
2756 /*
2757 * Set NOSWAPCACHE for cursor data extraction if double buffering
2758 * is disabled, or if the file is not marked cacheable via chflags
2759 * and vm.swapcache_use_chflags is enabled.
2760 */
2761 if (hammer_double_buffer == 0 ||
2762 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
2763 vm_swapcache_use_chflags)) {
2764 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
2765 }
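	/*
	 * Note (descriptive): with double buffering enabled the data is
	 * expected to be swapcached via the block device buffers, and the
	 * vnode buffer is flagged B_NOTMETA further below; with double
	 * buffering disabled (or when chflags policy excludes the file)
	 * the device-level buffers are kept out of the swapcache here.
	 */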
2766
4e17f465 2767 error = hammer_ip_first(&cursor);
8cd0a023
MD
2768 boff = 0;
2769
a89aec1b 2770 while (error == 0) {
47637bff
MD
2771 /*
2772 * Get the base file offset of the record. The key for
2773 * data records is (base + bytes) rather than (base).
2774 */
11ad5ade 2775 base = &cursor.leaf->base;
11ad5ade 2776 rec_offset = base->key - cursor.leaf->data_len;
8cd0a023 2777
66325755 2778 /*
a89aec1b 2779 * Calculate the gap, if any, and zero-fill it.
1fef775e
MD
2780 *
2781 * n is the offset of the start of the record versus our
2782 * current seek offset in the bio.
66325755 2783 */
8cd0a023
MD
2784 n = (int)(rec_offset - (bio->bio_offset + boff));
2785 if (n > 0) {
a89aec1b
MD
2786 if (n > bp->b_bufsize - boff)
2787 n = bp->b_bufsize - boff;
8cd0a023
MD
2788 bzero((char *)bp->b_data + boff, n);
2789 boff += n;
2790 n = 0;
66325755 2791 }
8cd0a023
MD
2792
2793 /*
2794 * Calculate the data offset in the record and the number
2795 * of bytes we can copy.
a89aec1b 2796 *
1fef775e
MD
2797 * There are two degenerate cases. First, boff may already
2798 * be at bp->b_bufsize. Secondly, the data offset within
2799 * the record may exceed the record's size.
8cd0a023
MD
2800 */
2801 roff = -n;
b84de5af 2802 rec_offset += roff;
11ad5ade 2803 n = cursor.leaf->data_len - roff;
1fef775e
MD
2804 if (n <= 0) {
2805 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2806 n = 0;
2807 } else if (n > bp->b_bufsize - boff) {
8cd0a023 2808 n = bp->b_bufsize - boff;
1fef775e 2809 }
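	/*
	 * Worked example (illustrative): with bio_offset = 64K, boff = 0
	 * and a record spanning [60K, 76K) (rec_offset = 60K,
	 * data_len = 16K), n starts out at -4K so no gap is zeroed,
	 * roff becomes 4K, rec_offset advances to 64K, and n = 12K bytes
	 * are copied from the middle of the record into the buffer.
	 */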
059819e3 2810
b84de5af 2811 /*
47637bff
MD
2812 * Deal with cached truncations. This cool bit of code
2813 * allows truncate()/ftruncate() to avoid having to sync
2814 * the file.
2815 *
2816 * If the frontend is truncated then all backend records are
2817 * subject to the frontend's truncation.
2818 *
2819 * If the backend is truncated then backend records on-disk
2820 * (but not in-memory) are subject to the backend's
2821 * truncation. In-memory records owned by the backend
2822 * represent data written after the truncation point on the
2823 * backend and must not be truncated.
2824 *
2825 * Truncate operations deal with frontend buffer cache
2826 * buffers and frontend-owned in-memory records synchronously.
b84de5af 2827 */
47637bff 2828 if (ip->flags & HAMMER_INODE_TRUNCATED) {
6362a262
MD
2829 if (hammer_cursor_ondisk(&cursor)/* ||
2830 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
47637bff
MD
2831 if (ip->trunc_off <= rec_offset)
2832 n = 0;
2833 else if (ip->trunc_off < rec_offset + n)
2834 n = (int)(ip->trunc_off - rec_offset);
2835 }
2836 }
2837 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2838 if (hammer_cursor_ondisk(&cursor)) {
2839 if (ip->sync_trunc_off <= rec_offset)
2840 n = 0;
2841 else if (ip->sync_trunc_off < rec_offset + n)
2842 n = (int)(ip->sync_trunc_off - rec_offset);
2843 }
2844 }
b84de5af
MD
2845
2846 /*
47637bff
MD
2847 * Try to issue a direct read into our bio if possible,
2848 * otherwise resolve the element data into a hammer_buffer
2849 * and copy.
4a2796f3
MD
2850 *
2851 * The buffer on-disk should be zeroed past any real
2852 * truncation point, but may not be for any synthesized
2853 * truncation point from above.
b84de5af 2854 */
1b0ab2c3 2855 disk_offset = cursor.leaf->data_offset + roff;
b4f86ea3
MD
2856 isdedupable = (boff == 0 && n == bp->b_bufsize &&
2857 hammer_cursor_ondisk(&cursor) &&
2858 ((int)disk_offset & HAMMER_BUFMASK) == 0);
2859
2860 if (isdedupable && hammer_double_buffer == 0) {
1b0ab2c3
MD
2861 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2862 HAMMER_ZONE_LARGE_DATA);
4a2796f3 2863 nbio->bio_offset = disk_offset;
b0aab9b9 2864 error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
b4f86ea3 2865 if (hammer_live_dedup && error == 0)
507df98a 2866 hammer_dedup_cache_add(ip, cursor.leaf);
47637bff
MD
2867 goto done;
2868 } else if (n) {
2869 error = hammer_ip_resolve_data(&cursor);
2870 if (error == 0) {
b4f86ea3
MD
2871 if (hammer_live_dedup && isdedupable)
2872 hammer_dedup_cache_add(ip, cursor.leaf);
47637bff
MD
2873 bcopy((char *)cursor.data + roff,
2874 (char *)bp->b_data + boff, n);
2875 }
b84de5af 2876 }
47637bff
MD
2877 if (error)
2878 break;
2879
507df98a
ID
2880 /*
2881 * We have to be sure that the only elements added to the
2882 * dedup cache are those which are already on-media.
2883 */
2884 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2885 hammer_dedup_cache_add(ip, cursor.leaf);
2886
47637bff
MD
2887 /*
2888 * Iterate until we have filled the request.
2889 */
2890 boff += n;
8cd0a023 2891 if (boff == bp->b_bufsize)
66325755 2892 break;
a89aec1b 2893 error = hammer_ip_next(&cursor);
66325755
MD
2894 }
2895
2896 /*
8cd0a023 2897 * There may have been a gap after the last record
66325755 2898 */
8cd0a023
MD
2899 if (error == ENOENT)
2900 error = 0;
2901 if (error == 0 && boff != bp->b_bufsize) {
7f7c1f84 2902 KKASSERT(boff < bp->b_bufsize);
8cd0a023
MD
2903 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2904 /* boff = bp->b_bufsize; */
2905 }
18bee4a2
MD
2906
2907 /*
2908 * Disallow swapcache operation on the vnode buffer if double
2909 * buffering is enabled; the swapcache will get the data via
2910 * the block device buffer.
2911 */
2912 if (hammer_double_buffer)
2913 bp->b_flags |= B_NOTMETA;
2914
2915 /*
2916 * Cleanup
2917 */
8cd0a023 2918 bp->b_resid = 0;
059819e3
MD
2919 bp->b_error = error;
2920 if (error)
2921 bp->b_flags |= B_ERROR;
2922 biodone(ap->a_bio);
47637bff
MD
2923
2924done:
39d8fd63
MD
2925 /*
2926 * Cache the b-tree node for the last data read in cache[1].
2927 *
2928 * If we hit the file EOF then also cache the node in the
2929 * governing directory's cache[3]; it will be used to initialize
2930 * the inode's cache[1] for any inodes looked up via the directory.
2931 *
2932 * This doesn't reduce disk accesses since the B-Tree chain is
2933 * likely cached, but it does reduce cpu overhead when looking
2934 * up file offsets for cpdup/tar/cpio style iterations.
2935 */
47637bff 2936 if (cursor.node)
bcac4bbb 2937 hammer_cache_node(&ip->cache[1], cursor.node);
39d8fd63
MD
2938 if (ran_end >= ip->ino_data.size) {
2939 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2940 ip->obj_asof, ip->obj_localization);
2941 if (dip) {
2942 hammer_cache_node(&dip->cache[3], cursor.node);
2943 hammer_rel_inode(dip, 0);
2944 }
2945 }
47637bff
MD
2946 hammer_done_cursor(&cursor);
2947 hammer_done_transaction(&trans);
b0aab9b9 2948 lwkt_reltoken(&hmp->fs_token);
8cd0a023
MD
2949 return(error);
2950}
2951
a99b9ea2
MD
2952/*
2953 * BMAP operation - used to support cluster_read() only.
2954 *
2955 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2956 *
2957 * This routine may return EOPNOTSUPP if the operation is not supported for
2958 * the specified offset. The contents of the pointer arguments do not
2959 * need to be initialized in that case.
2960 *
2961 * If a disk address is available and properly aligned return 0 with
2962 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2963 * to the run-length relative to that offset. Callers may assume that
2964 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2965 * large, so return EOPNOTSUPP if it is not sufficiently large.
2966 */
2967static
2968int
2969hammer_vop_bmap(struct vop_bmap_args *ap)
2970{
2971 struct hammer_transaction trans;
2972 struct hammer_inode *ip;
b0aab9b9 2973 hammer_mount_t hmp;
a99b9ea2
MD
2974 struct hammer_cursor cursor;
2975 hammer_base_elm_t base;
2976 int64_t rec_offset;
2977 int64_t ran_end;
2978 int64_t tmp64;
2979 int64_t base_offset;
2980 int64_t base_disk_offset;
2981 int64_t last_offset;
2982 hammer_off_t last_disk_offset;
2983 hammer_off_t disk_offset;
2984 int rec_len;
2985 int error;
4a2796f3 2986 int blksize;
a99b9ea2 2987
ce0138a6 2988 ++hammer_stats_file_iopsr;
a99b9ea2 2989 ip = ap->a_vp->v_data;
b0aab9b9 2990 hmp = ip->hmp;
a99b9ea2
MD
2991
2992 /*
2993 * We can only BMAP regular files. We can't BMAP database files,
2994 * directories, etc.
2995 */
2996 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2997 return(EOPNOTSUPP);
2998
2999 /*
3000 * bmap is typically called with runp/runb both NULL when used
3001 * for writing. We do not support BMAP for writing atm.
3002 */
4a2796f3 3003 if (ap->a_cmd != BUF_CMD_READ)
a99b9ea2
MD
3004 return(EOPNOTSUPP);
3005
3006 /*
3007 * Scan the B-Tree to acquire blockmap addresses, then translate
3008 * to raw addresses.
3009 */
b0aab9b9
MD
3010 lwkt_gettoken(&hmp->fs_token);
3011 hammer_simple_transaction(&trans, hmp);
cb51be26 3012#if 0
973c11b9
MD
3013 kprintf("bmap_beg %016llx ip->cache %p\n",
3014 (long long)ap->a_loffset, ip->cache[1]);
cb51be26 3015#endif
a99b9ea2
MD
3016 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
3017
3018 /*
3019 * Key range (begin and end inclusive) to scan. Note that the keys
3020 * stored in the actual records represent BASE+LEN, not BASE. The
3021 * first record containing bio_offset will have a key > bio_offset.
3022 */
5a930e66
MD
3023 cursor.key_beg.localization = ip->obj_localization +
3024 HAMMER_LOCALIZE_MISC;
a99b9ea2
MD
3025 cursor.key_beg.obj_id = ip->obj_id;
3026 cursor.key_beg.create_tid = 0;
3027 cursor.key_beg.delete_tid = 0;
3028 cursor.key_beg.obj_type = 0;
3029 if (ap->a_runb)
3030 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
3031 else
3032 cursor.key_beg.key = ap->a_loffset + 1;
3033 if (cursor.key_beg.key < 0)
3034 cursor.key_beg.key = 0;
3035 cursor.asof = ip->obj_asof;
bf3b416b 3036 cursor.flags |= HAMMER_CURSOR_ASOF;
a99b9ea2
MD
3037
3038 cursor.key_end = cursor.key_beg;
3039 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
3040
3041 ran_end = ap->a_loffset + MAXPHYS;
3042 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
3043 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
3044 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
3045 if (tmp64 < ran_end)
3046 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
3047 else
3048 cursor.key_end.key = ran_end + MAXPHYS + 1;
3049
3050 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
3051
3052 error = hammer_ip_first(&cursor);
3053 base_offset = last_offset = 0;
3054 base_disk_offset = last_disk_offset = 0;
3055
3056 while (error == 0) {
3057 /*
3058 * Get the base file offset of the record. The key for
3059 * data records is (base + bytes) rather than (base).
4a2796f3
MD
3060 *
3061 * NOTE: rec_offset + rec_len may exceed the end-of-file.
3062 * The extra bytes should be zero on-disk and the BMAP op
3063 * should still be ok.
a99b9ea2
MD
3064 */
3065 base = &cursor.leaf->base;
3066 rec_offset = base->key - cursor.leaf->data_len;
3067 rec_len = cursor.leaf->data_len;
3068
3069 /*
4a2796f3
MD
3070 * Incorporate any cached truncation.
3071 *
3072 * NOTE: Modifications to rec_len based on synthesized
3073 * truncation points remove the guarantee that any extended
3074 * data on disk is zero (since the truncations may not have
3075 * taken place on-media yet).
a99b9ea2
MD
3076 */
3077 if (ip->flags & HAMMER_INODE_TRUNCATED) {
3078 if (hammer_cursor_ondisk(&cursor) ||
3079 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3080 if (ip->trunc_off <= rec_offset)
3081 rec_len = 0;
3082 else if (ip->trunc_off < rec_offset + rec_len)
3083 rec_len = (int)(ip->trunc_off - rec_offset);
3084 }
3085 }
3086 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3087 if (hammer_cursor_ondisk(&cursor)) {
3088 if (ip->sync_trunc_off <= rec_offset)
3089 rec_len = 0;
3090 else if (ip->sync_trunc_off < rec_offset + rec_len)
3091 rec_len = (int)(ip->sync_trunc_off - rec_offset);
3092 }
3093 }
3094
3095 /*
3096 * Accumulate information. If we have hit a discontiguous
3097 * block, reset base_offset unless we are already beyond the
3098 * requested offset. If we are, that's it, we stop.
3099 */
a99b9ea2
MD
3100 if (error)
3101 break;
1b0ab2c3
MD
3102 if (hammer_cursor_ondisk(&cursor)) {
3103 disk_offset = cursor.leaf->data_offset;
3104 if (rec_offset != last_offset ||
3105 disk_offset != last_disk_offset) {
3106 if (rec_offset > ap->a_loffset)
3107 break;
3108 base_offset = rec_offset;
3109 base_disk_offset = disk_offset;
3110 }
3111 last_offset = rec_offset + rec_len;
3112 last_disk_offset = disk_offset + rec_len;
507df98a
ID
3113
3114 if (hammer_live_dedup)
3115 hammer_dedup_cache_add(ip, cursor.leaf);
a99b9ea2 3116 }
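		/*
		 * Worked example (illustrative): three adjacent 16K
		 * records at file offsets 0K, 16K and 32K whose zone-2
		 * data addresses are also adjacent accumulate into one
		 * run (base_offset 0, last_offset 48K).  If the record
		 * at 32K is discontiguous on-disk, the loop either stops
		 * (when the requested loffset already falls inside the
		 * accumulated run) or re-latches base_offset and
		 * base_disk_offset at 32K and continues.
		 */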
507df98a 3117
a99b9ea2
MD
3118 error = hammer_ip_next(&cursor);
3119 }
3120
3121#if 0
3122 kprintf("BMAP %016llx: %016llx - %016llx\n",
973c11b9
MD
3123 (long long)ap->a_loffset,
3124 (long long)base_offset,
3125 (long long)last_offset);
3126 kprintf("BMAP %16s: %016llx - %016llx\n", "",
3127 (long long)base_disk_offset,
3128 (long long)last_disk_offset);
a99b9ea2
MD
3129#endif
3130
cb51be26 3131 if (cursor.node) {
bcac4bbb 3132 hammer_cache_node(&ip->cache[1], cursor.node);
cb51be26 3133#if 0
973c11b9
MD
3134 kprintf("bmap_end2 %016llx ip->cache %p\n",
3135 (long long)ap->a_loffset, ip->cache[1]);
cb51be26
MD
3136#endif
3137 }
a99b9ea2
MD
3138 hammer_done_cursor(&cursor);
3139 hammer_done_transaction(&trans);
b0aab9b9 3140 lwkt_reltoken(&hmp->fs_token);
a99b9ea2 3141
4a2796f3
MD
3142 /*
3143 * If we couldn't find any records or the records we did find were
3144 * all behind the requested offset, return failure. A forward
3145 * truncation can leave a hole w/ no on-disk records.
3146 */
3147 if (last_offset == 0 || last_offset < ap->a_loffset)
3148 return (EOPNOTSUPP);
3149
3150 /*
3151 * Figure out the block size at the requested offset and adjust
3152 * our limits so the cluster_read() does not create inappropriately
3153 * sized buffer cache buffers.
3154 */
3155 blksize = hammer_blocksize(ap->a_loffset);
3156 if (hammer_blocksize(base_offset) != blksize) {
3157 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
3158 }
3159 if (last_offset != ap->a_loffset &&
3160 hammer_blocksize(last_offset - 1) != blksize) {
3161 last_offset = hammer_blockdemarc(ap->a_loffset,
3162 last_offset - 1);
3163 }
3164
3165 /*
3166 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
3167 * from occurring.
3168 */
3169 disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
3170
1b0ab2c3
MD
3171 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
3172 /*
3173 * Only large-data zones can be direct-IOd
3174 */
3175 error = EOPNOTSUPP;
3176 } else if ((disk_offset & HAMMER_BUFMASK) ||
3177 (last_offset - ap->a_loffset) < blksize) {
3178 /*
3179 * doffsetp is not aligned or the forward run size does
3180 * not cover a whole buffer, disallow the direct I/O.
3181 */
a99b9ea2
MD
3182 error = EOPNOTSUPP;
3183 } else {
1b0ab2c3
MD
3184 /*
318