kernel - Adjust UFS and HAMMER to use uiomovebp()
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
CommitLineData
427e5fc6 1/*
b84de5af 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
427e5fc6
MD
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
fbb84158 34 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
427e5fc6
MD
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/fcntl.h>
41#include <sys/namecache.h>
42#include <sys/vnode.h>
43#include <sys/lockf.h>
44#include <sys/event.h>
45#include <sys/stat.h>
b3deaf57 46#include <sys/dirent.h>
fbb84158 47#include <sys/file.h>
c0ade690 48#include <vm/vm_extern.h>
18bee4a2 49#include <vm/swap_pager.h>
7a04d74f 50#include <vfs/fifofs/fifo.h>
684a93c4 51
427e5fc6
MD
52#include "hammer.h"
53
54/*
55 * USERFS VNOPS
56 */
57/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
66325755
MD
58static int hammer_vop_fsync(struct vop_fsync_args *);
59static int hammer_vop_read(struct vop_read_args *);
60static int hammer_vop_write(struct vop_write_args *);
61static int hammer_vop_access(struct vop_access_args *);
62static int hammer_vop_advlock(struct vop_advlock_args *);
63static int hammer_vop_close(struct vop_close_args *);
64static int hammer_vop_ncreate(struct vop_ncreate_args *);
65static int hammer_vop_getattr(struct vop_getattr_args *);
66static int hammer_vop_nresolve(struct vop_nresolve_args *);
67static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
68static int hammer_vop_nlink(struct vop_nlink_args *);
69static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
70static int hammer_vop_nmknod(struct vop_nmknod_args *);
71static int hammer_vop_open(struct vop_open_args *);
66325755
MD
72static int hammer_vop_print(struct vop_print_args *);
73static int hammer_vop_readdir(struct vop_readdir_args *);
74static int hammer_vop_readlink(struct vop_readlink_args *);
75static int hammer_vop_nremove(struct vop_nremove_args *);
76static int hammer_vop_nrename(struct vop_nrename_args *);
77static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
349433c9 78static int hammer_vop_markatime(struct vop_markatime_args *);
66325755
MD
79static int hammer_vop_setattr(struct vop_setattr_args *);
80static int hammer_vop_strategy(struct vop_strategy_args *);
a99b9ea2 81static int hammer_vop_bmap(struct vop_bmap_args *ap);
66325755
MD
82static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
83static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
7dc57964 84static int hammer_vop_ioctl(struct vop_ioctl_args *);
513ca7d7 85static int hammer_vop_mountctl(struct vop_mountctl_args *);
fbb84158 86static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
427e5fc6 87
7a04d74f
MD
88static int hammer_vop_fifoclose (struct vop_close_args *);
89static int hammer_vop_fiforead (struct vop_read_args *);
90static int hammer_vop_fifowrite (struct vop_write_args *);
fbb84158 91static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
7a04d74f 92
427e5fc6
MD
/*
 * Vnode operations vector for regular HAMMER files and directories.
 * Operations not explicitly listed fall through to vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime =	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};
130
7a04d74f 131struct vop_ops hammer_spec_vops = {
8be7edad 132 .vop_default = vop_defaultop,
7a04d74f 133 .vop_fsync = hammer_vop_fsync,
8be7edad
MD
134 .vop_read = vop_stdnoread,
135 .vop_write = vop_stdnowrite,
7a04d74f 136 .vop_access = hammer_vop_access,
8be7edad 137 .vop_close = hammer_vop_close,
349433c9 138 .vop_markatime = hammer_vop_markatime,
8be7edad 139 .vop_getattr = hammer_vop_getattr,
7a04d74f
MD
140 .vop_inactive = hammer_vop_inactive,
141 .vop_reclaim = hammer_vop_reclaim,
142 .vop_setattr = hammer_vop_setattr
143};
144
/*
 * Vnode operations vector for fifos residing on a HAMMER filesystem.
 * Data operations are chained through the fifofs wrappers (which in
 * turn call fifo_vnoperate); attribute operations go to HAMMER.
 */
struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};
159
fbb84158
MD
160static __inline
161void
162hammer_knote(struct vnode *vp, int flags)
163{
164 if (flags)
5b22f1a7 165 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
fbb84158
MD
166}
167
0832c9bb
MD
168#ifdef DEBUG_TRUNCATE
169struct hammer_inode *HammerTruncIp;
170#endif
171
b84de5af 172static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
d7e278bb
MD
173 struct vnode *dvp, struct ucred *cred,
174 int flags, int isdir);
8cd0a023
MD
175static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
176static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
177
427e5fc6
MD
#if 0
/*
 * Disabled generic pass-through to the HAMMER vnode ops vector.
 *
 * NOTE(review): the parameter name 'ap' is missing from the signature
 * while the body references 'ap', so this would not compile if the
 * #if 0 were removed as-is.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
186
66325755
MD
/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *	 operation.
 *
 *	 Ultimately the combination of a REDO log and use of fast storage
 *	 to front-end cluster caches will make fsync fast, but it ain't
 *	 here yet.  And, in any case, we need real transactional
 *	 all-or-nothing features which are not restricted to a single file.
 *
 * Returns ip->error (0 on success).
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);
	hammer_mount_t hmp = ip->hmp;
	int waitfor = ap->a_waitfor;
	int mode;

	/* Per-mount serializing token covers all HAMMER state below */
	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Fsync rule relaxation (default is either full synchronous flush
	 * or REDO semantics with synchronous flush).
	 *
	 * Only fsync()s originating from the fsync() system call are
	 * eligible for relaxation; internal callers always get the full
	 * flush sequence.
	 */
	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
		switch(hammer_fsync_mode) {
		case 0:
mode0:
			/* no REDO, full synchronous flush */
			goto skip;
		case 1:
mode1:
			/* no REDO, full asynchronous flush */
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			goto skip;
		case 2:
			/* REDO semantics, synchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode0;	/* REDO needs a v4+ volume */
			mode = HAMMER_FLUSH_UNDOS_AUTO;
			break;
		case 3:
			/* REDO semantics, relaxed asynchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode1;	/* REDO needs a v4+ volume */
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		case 4:
			/* ignore the fsync() system call */
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		default:
			/* we have to do something */
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		}

		/*
		 * Fast fsync only needs to flush the UNDO/REDO fifo if
		 * HAMMER_INODE_REDO is non-zero and the only modifications
		 * made to the file are write or write-extends.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) &&
		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
		) {
			++hammer_count_fsyncs;
			hammer_flusher_flush_undos(hmp, mode);
			ip->redo_count = 0;
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		}

		/*
		 * REDO is enabled by fsync(), the idea being we really only
		 * want to lay down REDO records when programs are using
		 * fsync() heavily.  The first fsync() on the file starts
		 * the gravy train going and later fsync()s keep it hot by
		 * resetting the redo_count.
		 *
		 * We weren't running REDOs before now so we have to fall
		 * through and do a full fsync of what we have.
		 */
		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
			ip->flags |= HAMMER_INODE_REDO;
			ip->redo_count = 0;
		}
	}
skip:
	/*
	 * Do a full flush sequence.
	 *
	 * Attempt to release the vnode while waiting for the inode to
	 * finish flushing.  This can really mess up inactive->reclaim
	 * sequences so only do it if the vnode is active.
	 */
	++hammer_count_fsyncs;
	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (waitfor == MNT_WAIT) {
		if ((ap->a_vp->v_flag & VINACTIVE) == 0)
			vn_unlock(ap->a_vp);
		hammer_wait_inode(ip);
		if ((ap->a_vp->v_flag & VINACTIVE) == 0)
			vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
	}
	lwkt_reltoken(&hmp->fs_token);
	return (ip->error);
}
309
66325755
MD
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (for the cache safe does not require fs_token)
 *
 * Reads file data through the buffer cache.  The per-mount fs_token is
 * acquired lazily -- only when a buffer actually has to be read in or
 * when the atime update cannot be short-cut -- so fully cached reads
 * run without taking the token at all.
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;
	int bigread;
	int got_fstoken;	/* non-zero once fs_token/transaction held */

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
	ioseqcount = (ap->a_ioflag >> 16);
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicy and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigread = (uio->uio_resid > 100 * 1024 * 1024);
	got_fstoken = 0;

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 *
	 * XXX Temporary hack, delay the start transaction while we remain
	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
	 *     locked-shared.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
			break;

		/*
		 * MPSAFE: fully cached buffer can be used directly.
		 */
		bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
		if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
			bp->b_flags &= ~B_AGE;
			error = 0;
			goto skip;
		}
		if (ap->a_ioflag & IO_NRDELAY) {
			/*
			 * Caller asked for non-blocking read; buffer not
			 * cached.
			 *
			 * NOTE(review): if got_fstoken was set by an
			 * earlier loop iteration this return leaves
			 * fs_token held and the transaction open --
			 * verify this path cannot be reached with
			 * got_fstoken != 0.
			 */
			bqrelse(bp);
			return (EWOULDBLOCK);
		}

		/*
		 * MPUNSAFE: need the token and a transaction to read in
		 * the buffer.
		 */
		if (got_fstoken == 0) {
			lwkt_gettoken(&hmp->fs_token);
			got_fstoken = 1;
			hammer_start_transaction(&trans, ip->hmp);
		}

		/*
		 * NOTE: A valid bp has already been acquired, but was not
		 *	 B_CACHE.
		 */
		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_readx(ap->a_vp,
					     file_limit, base_offset,
					     blksize, uio->uio_resid,
					     seqcount * BKVASIZE, &bp);
		} else {
			error = breadnx(ap->a_vp, base_offset, blksize,
					NULL, NULL, 0, &bp);
		}
		if (error) {
			brelse(bp);
			break;
		}
skip:
		if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
			kprintf("doff %016jx read file %016jx@%016jx\n",
				(intmax_t)bp->b_bio2.bio_offset,
				(intmax_t)ip->obj_id,
				(intmax_t)bp->b_loffset);
		}
		bp->b_flags &= ~B_IODEBUG;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		/* Clip the copy to the block, the uio, and the file EOF */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);

		/*
		 * Drop the token across the potentially-faulting copyout.
		 */
		if (got_fstoken)
			lwkt_reltoken(&hmp->fs_token);

		/*
		 * Set B_AGE, data has a lower priority than meta-data.
		 *
		 * Use a hold/unlock/drop sequence to run the uiomove
		 * with the buffer unlocked, avoiding deadlocks against
		 * read()s on mmap()'d spaces.
		 */
		bp->b_flags |= B_AGE;
		error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
		bqrelse(bp);

		if (got_fstoken)
			lwkt_gettoken(&hmp->fs_token);

		if (error)
			break;
		hammer_stats_file_read += n;
	}

	/*
	 * Try to update the atime with just the inode lock for maximum
	 * concurrency.  If we can't shortcut it we have to get the full
	 * blown transaction.
	 */
	if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) {
		lwkt_gettoken(&hmp->fs_token);
		got_fstoken = 1;
		hammer_start_transaction(&trans, ip->hmp);
	}

	if (got_fstoken) {
		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
			ip->ino_data.atime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
		}
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
	}
	return (error);
}
487
66325755
MD
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Writes file data through the buffer cache under a HAMMER transaction,
 * optionally generating REDO records so a later fsync() can avoid a full
 * meta-data flush.  The fs_token is held for the duration except across
 * the user-space copyin (uiomovebp).
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	thread_t td;
	struct uio *uio;
	int offset;		/* offset within the current block */
	off_t base_offset;	/* block-aligned file offset */
	int64_t cluster_eof;
	struct buf *bp;
	int kflags;		/* accumulated kqueue NOTE_* flags */
	int error;
	int n;
	int flags;
	int seqcount;
	int bigwrite;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_off assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
		/* offset + resid wrapped past 2^63-1 */
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
		return (EFBIG);
	}

	/*
	 * Enforce the process file-size resource limit (RLIMIT_FSIZE),
	 * delivering SIGXFSZ as POSIX requires.
	 */
	if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
	    base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
		hammer_done_transaction(&trans);
		lwkt_reltoken(&hmp->fs_token);
		lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
		return (EFBIG);
	}

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicy and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 *
	 * Preset redo_count so we stop generating REDOs earlier if the
	 * limit is exceeded.
	 */
	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
	if ((ip->flags & HAMMER_INODE_REDO) &&
	    ip->redo_count < hammer_limit_redo) {
		ip->redo_count += uio->uio_resid;
	}

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;	/* set when we grew the VM object */
		int blksize;
		int blkmask;
		int trivial;
		int endofblk;		/* write abuts the end of the block */
		off_t nsize;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;
		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Control the number of pending records associated with
		 * this inode.  If too many have accumulated start a
		 * flush.  Try to maintain a pipeline with the flusher.
		 *
		 * NOTE: It is possible for other sources to grow the
		 *	 records but not necessarily issue another flush,
		 *	 so use a timeout and ensure that a re-flush occurs.
		 */
		if (ip->rsv_recs >= hammer_limit_inode_recs) {
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
				ip->flags |= HAMMER_INODE_RECSW;
				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			}
		}

#if 0
		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.   Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster then the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 *
		 * NOTE(review): this disabled chunk uses a variable
		 * 'delta' that is not declared in this function -- it
		 * would need a declaration if ever re-enabled.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}
#endif

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid) {
			n = uio->uio_resid;
			endofblk = 0;
		} else {
			endofblk = 1;
		}
		nsize = uio->uio_offset + n;
		if (nsize > ip->ino_data.size) {
			if (uio->uio_offset > ip->ino_data.size)
				trivial = 0;	/* hole between EOF and write */
			else
				trivial = 1;
			nvextendbuf(ap->a_vp,
				    ip->ino_data.size,
				    nsize,
				    hammer_blocksize(ip->ino_data.size),
				    hammer_blocksize(nsize),
				    hammer_blockoff(ip->ino_data.size),
				    hammer_blockoff(nsize),
				    trivial);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}

		/*
		 * Drop the token across the potentially-faulting copyin.
		 */
		if (error == 0) {
			lwkt_reltoken(&hmp->fs_token);
			error = uiomovebp(bp, bp->b_data + offset, n, uio);
			lwkt_gettoken(&hmp->fs_token);
		}

		/*
		 * Generate REDO records if enabled and redo_count will not
		 * exceeded the limit.
		 *
		 * If redo_count exceeds the limit we stop generating records
		 * and clear HAMMER_INODE_REDO.  This will cause the next
		 * fsync() to do a full meta-data sync instead of just an
		 * UNDO/REDO fifo update.
		 *
		 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
		 * will still be tracked.  The tracks will be terminated
		 * when the related meta-data (including possible data
		 * modifications which are not tracked via REDO) is
		 * flushed.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
			if (ip->redo_count < hammer_limit_redo) {
				bp->b_flags |= B_VFSFLAG1;
				error = hammer_generate_redo(&trans, ip,
						     base_offset + offset,
						     HAMMER_REDO_WRITE,
						     bp->b_data + offset,
						     (size_t)n);
			} else {
				ip->flags &= ~HAMMER_INODE_REDO;
			}
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				nvtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size),
					  hammer_blockoff(ip->ino_data.size),
					  0);
			}
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_SDIRTY;
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(&trans, ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 *
		 * Because meta-data updates are deferred, HAMMER is
		 * especially sensitive to excessive bdwrite()s because
		 * the I/O stream is not broken up by disk reads.  So the
		 * buffer cache simply cannot keep up.
		 *
		 * WARNING!  blksize is variable.  cluster_write() is
		 *	     expected to not blow up if it encounters
		 *	     buffers that do not match the passed blksize.
		 *
		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
		 *	  The ip->rsv_recs check should burst-flush the data.
		 *	  If we queue it immediately the buf could be left
		 *	  locked on the device queue for a very long time.
		 *
		 *	  However, failing to flush a dirty buffer out when
		 *	  issued from the pageout daemon can result in a low
		 *	  memory deadlock against bio_page_alloc(), so we
		 *	  have to bawrite() on IO_ASYNC as well.
		 *
		 * NOTE!  To avoid degenerate stalls due to mismatched block
		 *	  sizes we only honor IO_DIRECT on the write which
		 *	  abuts the end of the buffer.  However, we must
		 *	  honor IO_SYNC in case someone is silly enough to
		 *	  configure a HAMMER file as swap, or when HAMMER
		 *	  is serving NFS (for commits).  Ick ick.
		 */
		bp->b_flags |= B_AGE;
		if (blksize == HAMMER_XBUFSIZE)
			bp->b_flags |= B_CLUSTEROK;

		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
			bawrite(bp);
		} else if (ap->a_ioflag & IO_ASYNC) {
			bawrite(bp);
		} else if (hammer_cluster_enable &&
			   !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
			if (base_offset < HAMMER_XDEMARC)
				cluster_eof = hammer_blockdemarc(base_offset,
							 ip->ino_data.size);
			else
				cluster_eof = ip->ino_data.size;
			cluster_write(bp, cluster_eof, blksize, seqcount);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
872
66325755
MD
873/*
874 * hammer_vop_access { vp, mode, cred }
b0aab9b9
MD
875 *
876 * MPSAFE - does not require fs_token
66325755 877 */
427e5fc6
MD
878static
879int
66325755 880hammer_vop_access(struct vop_access_args *ap)
427e5fc6 881{
66325755
MD
882 struct hammer_inode *ip = VTOI(ap->a_vp);
883 uid_t uid;
884 gid_t gid;
885 int error;
886
ce0138a6 887 ++hammer_stats_file_iopsr;
66325755
MD
888 uid = hammer_to_unix_xid(&ip->ino_data.uid);
889 gid = hammer_to_unix_xid(&ip->ino_data.gid);
890
891 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
892 ip->ino_data.uflags);
893 return (error);
427e5fc6
MD
894}
895
66325755
MD
896/*
897 * hammer_vop_advlock { vp, id, op, fl, flags }
b0aab9b9
MD
898 *
899 * MPSAFE - does not require fs_token
66325755 900 */
427e5fc6
MD
901static
902int
66325755 903hammer_vop_advlock(struct vop_advlock_args *ap)
427e5fc6 904{
4a2796f3 905 hammer_inode_t ip = VTOI(ap->a_vp);
66325755 906
11ad5ade 907 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
427e5fc6
MD
908}
909
66325755
MD
910/*
911 * hammer_vop_close { vp, fflag }
6f3d87c0 912 *
b0aab9b9 913 * We can only sync-on-close for normal closes. XXX disabled for now.
66325755 914 */
427e5fc6
MD
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
	/*
	 * Disabled sync-on-close support (see the function header comment).
	 * If re-enabled: on a last, exclusively-locked close of a vnode
	 * that is neither inactive nor reclaimed, flush the inode either
	 * synchronously (CLOSESYNC) or asynchronously (CLOSEASYNC) and
	 * clear both request flags.
	 */
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;
	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
		if (vn_islocked(vp) == LK_EXCLUSIVE &&
		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
			if (ip->flags & HAMMER_INODE_CLOSESYNC)
				waitfor = MNT_WAIT;
			else
				waitfor = MNT_NOWAIT;
			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
				       HAMMER_INODE_CLOSEASYNC);
			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
		}
	}
#endif
	/* Standard close handling (access time, open count, etc) */
	return (vop_stdclose(ap));
}
938
66325755
MD
939/*
940 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
941 *
942 * The operating system has already ensured that the directory entry
943 * does not exist and done all appropriate namespace locking.
944 */
427e5fc6
MD
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	/*
	 * Refuse creation in read-only (e.g. historical/as-of) directories
	 * and when the filesystem is too low on space.  Both checks occur
	 * before any transaction or token is acquired.
	 */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.  On success obtain a vnode for the new inode, resolve
	 * the namecache entry to it, and post a NOTE_WRITE kqueue event on
	 * the directory.  The inode reference is dropped in both paths.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1018
66325755
MD
1019/*
1020 * hammer_vop_getattr { vp, vap }
98f7132d
MD
1021 *
1022 * Retrieve an inode's attribute information. When accessing inodes
1023 * historically we fake the atime field to ensure consistent results.
1024 * The atime field is stored in the B-Tree element and allowed to be
1025 * updated without cycling the element.
899eb297 1026 *
b0aab9b9 1027 * MPSAFE - does not require fs_token
66325755 1028 */
427e5fc6
MD
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	hammer_lock_sh(&ip->lock);	/* shared lock: consistent snapshot */
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
	 * or for MAX_TID is "@@-1:%05d" == 10 bytes.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
			    vap->va_size = 26;
		    else
			    vap->va_size = 10;
	}

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.  Read-only (historical) inodes report
	 * ctime for both.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;

	/*
	 * va_bytes is rounded up to the block size actually used for the
	 * file: extended (64K) buffers past the demarc, regular buffers
	 * for mid-sized files, 16-byte granularity for small files.
	 */
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}

	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0;	/* XXX */
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	/* Device nodes additionally report their major/minor numbers */
	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	hammer_unlock(&ip->lock);
	return(0);
}
1126
66325755
MD
1127/*
1128 * hammer_vop_nresolve { nch, dvp, cred }
1129 *
1130 * Locate the requested directory entry.
1131 */
427e5fc6
MD
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_mount_t hmp;
	hammer_inode_t dip;		/* directory being searched */
	hammer_inode_t ip;		/* resolved inode */
	hammer_tid_t asof;		/* as-of TID for historical access */
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;
	hmp = dip->hmp;

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan for "@@<tid-or-pfs>"; when found, parse it and truncate the
	 * effective name length to the part before the extension.  Any
	 * non-current as-of forces read-only access.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* negative-cache the miss */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1327
66325755
MD
1328/*
1329 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1330 *
1331 * Locate the parent directory of a directory vnode.
1332 *
1333 * dvp is referenced but not locked. *vpp must be returned referenced and
1334 * locked. A parent_obj_id of 0 does not necessarily indicate that we are
1335 * at the root, instead it could indicate that the directory we were in was
1336 * removed.
42c7d26b
MD
1337 *
1338 * NOTE: as-of sequences are not linked into the directory structure. If
1339 * we are at the root with a different asof then the mount point, reload
1340 * the same directory with the mount point's asof. I'm not sure what this
1341 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not
1342 * get confused, but it hasn't been tested.
66325755 1343 */
427e5fc6
MD
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;
	hmp = dip->hmp;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	lwkt_gettoken(&hmp->fs_token);
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	/*
	 * parent_obj_id == 0 normally means "no parent".  The one special
	 * case is the filesystem root viewed at a non-current as-of: its
	 * ".." re-resolves to the root at the mount's as-of, with a fake
	 * "0x%016llx" name carrying the original TID (see the function
	 * header comment regarding NFS).
	 */
	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			lwkt_reltoken(&hmp->fs_token);
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1403
66325755
MD
1404/*
1405 * hammer_vop_nlink { nch, dvp, vp, cred }
1406 */
427e5fc6
MD
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* target directory */
	struct hammer_inode *ip;	/* existing inode to link */
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	/* Hard links cannot cross mounts ... */
	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);
	hmp = dip->hmp;

	/* ... nor PFS localization domains within a mount */
	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.  On success resolve the namecache entry to the
	 * (pre-existing) vnode and post kqueue events on both vnodes.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_LINK);
	hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1465
66325755
MD
1466/*
1467 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1468 *
1469 * The operating system has already ensured that the directory entry
1470 * does not exist and done all appropriate namespace locking.
1471 */
427e5fc6
MD
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* new directory inode */
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	/* No creation in read-only dirs or when space is critically low */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.  On success obtain a vnode, resolve the namecache
	 * entry, and post NOTE_WRITE|NOTE_LINK on the parent directory.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1543
66325755
MD
1544/*
1545 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1546 *
1547 * The operating system has already ensured that the directory entry
1548 * does not exist and done all appropriate namespace locking.
1549 */
427e5fc6
MD
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* new node inode */
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	/* No creation in read-only dirs or when space is critically low */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.  On success obtain a vnode, resolve the namecache
	 * entry, and post NOTE_WRITE on the parent directory.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1621
66325755
MD
1622/*
1623 * hammer_vop_open { vp, mode, cred, fp }
b0aab9b9
MD
1624 *
1625 * MPSAFE (does not require fs_token)
66325755 1626 */
427e5fc6
MD
1627static
1628int
66325755 1629hammer_vop_open(struct vop_open_args *ap)
427e5fc6 1630{
9f5097dc
MD
1631 hammer_inode_t ip;
1632
ce0138a6 1633 ++hammer_stats_file_iopsr;
9f5097dc
MD
1634 ip = VTOI(ap->a_vp);
1635
1636 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
d113fda1 1637 return (EROFS);
a89aec1b 1638 return(vop_stdopen(ap));
427e5fc6
MD
1639}
1640
66325755 1641/*
66325755
MD
1642 * hammer_vop_print { vp }
1643 */
427e5fc6
MD
1644static
1645int
66325755 1646hammer_vop_print(struct vop_print_args *ap)
427e5fc6
MD
1647{
1648 return EOPNOTSUPP;
1649}
1650
66325755 1651/*
6b4f890b 1652 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
66325755 1653 */
427e5fc6
MD
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;	/* cookies emitted so far */
	int ncookies;		/* cookie array capacity (-1 = none) */
	off_t *cookies;
	off_t saveoff;		/* current directory seek position */
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;
	hmp = ip->hmp;

	/*
	 * Allocate the seek-cookie array if the caller requested cookies,
	 * capped at 1024 entries.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);

	/*
	 * Handle artificial entries ("." at offset 0, ".." at offset 1)
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less then that to represent our 'special' key space.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/* a parent_obj_id of 0 means ".." is the directory itself */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      hammer_dir_localization(ip);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/*
	 * ENOENT from the scan simply means end-of-directory; report it
	 * via *a_eofflag and return success.  On a real error with no
	 * entries emitted, free the cookie array instead of handing it
	 * to the caller.
	 */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}
1809
66325755
MD
1810/*
1811 * hammer_vop_readlink { vp, uio, cred }
1812 */
427e5fc6
MD
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	char buf[32];		/* scratch for PFS id parse and expansion */
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, hmp);
			/* parse the 5-digit PFS id following "@@PFS" */
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* vap->va_size == 26 */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					/* vap->va_size == 10 */
					ksnprintf(buf, sizeof(buf),
						  "@@-1:%05d",
						  localization >> 16);
#if 0
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)HAMMER_MAX_TID,
						  localization >> 16);
#endif
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		lwkt_reltoken(&hmp->fs_token);
		return(error);
	}

	/*
	 * Long version: the symlink target lives in a separate
	 * HAMMER_RECTYPE_FIX record; look it up via a B-Tree cursor.
	 */
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
					HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}
1925
66325755
MD
/*
 * hammer_vop_nremove { nch, dvp, cred }
 *
 * Remove the file named by the namecache handle from directory dvp.
 * The actual work is done by hammer_dounlink() inside a transaction;
 * on success a NOTE_WRITE kqueue event is posted on the directory.
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	/*
	 * If history is retained a removal still consumes media space;
	 * refuse early if insufficient free space remains.
	 */
	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1956
66325755
MD
/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 *
 * Rename fnch (in fdvp) to tnch (in tdvp).  The target name is first
 * unlinked (if present), the inode is linked under the new name, and
 * finally the old directory entry is located via a cursor scan and
 * deleted.  Cross-mount and cross-PFS renames are rejected with EXDEV.
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;		/* source name */
	struct namecache *tncp;		/* target name */
	struct hammer_inode *fdip;	/* source directory */
	struct hammer_inode *tdip;	/* target directory */
	struct hammer_inode *ip;	/* inode being renamed */
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	hmp = ip->hmp;

	/* renames may not cross PFS (localization) boundaries */
	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			ip->ino_data.ctime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      hammer_dir_localization(fdip);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicy for the purposes
	 * of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 *
	 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
	 *	 without formally acquiring the vp since the vp might
	 *	 have zero refs on it, or in the middle of a reclaim,
	 *	 etc.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		while (ip->vp) {
			struct vnode *vp;

			/* formally acquire the vp before posting the knote */
			error = hammer_get_vnode(ip, &vp);
			if (error == 0 && vp) {
				vn_unlock(vp);
				hammer_knote(ip->vp, NOTE_RENAME);
				vrele(vp);
				break;
			}
			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
		}
	}

failed:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
2128
66325755
MD
/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 *
 * Remove a directory.  Identical structure to hammer_vop_nremove()
 * except hammer_dounlink() is told to expect a directory (last arg 1)
 * and NOTE_LINK is added to the kqueue notification.
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	int error;

	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	/*
	 * If history is retained a removal still consumes media space;
	 * refuse early if insufficient free space remains.
	 */
	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
2159
/*
 * hammer_vop_markatime { vp, cred }
 *
 * Update the inode's access time to the current transaction time and
 * post a NOTE_ATTRIB event.  No-op (returns 0) when the mount is
 * MNT_NOATIME; EROFS on read-only mounts or read-only inodes.
 */
static
int
hammer_vop_markatime(struct vop_markatime_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;

	ip = VTOI(ap->a_vp);
	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	hmp = ip->hmp;
	if (hmp->mp->mnt_flag & MNT_NOATIME)
		return (0);
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	ip->ino_data.atime = trans.time;
	hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_ATTRIB);
	lwkt_reltoken(&hmp->fs_token);
	return (0);
}
2190
/*
 * hammer_vop_setattr { vp, vap, cred }
 *
 * Apply the attribute changes requested in vap (flags, uid/gid, size,
 * atime/mtime, mode) to the inode.  Fields set to VNOVAL are ignored.
 * Accumulates inode-dirty bits in modflags and kqueue event bits in
 * kflags, flushing both once at the end.
 *
 * NOTE: a chflags request (va_flags != VNOVAL) is processed exclusively;
 * the unconditional 'goto done' means no other attribute in the same
 * call is examined.
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct vattr *vap;
	hammer_mount_t hmp;
	int modflags;		/* HAMMER_INODE_* dirty bits to apply */
	int error;
	int truncating;
	int blksize;
	int kflags;		/* NOTE_* bits for hammer_knote() */
#if 0
	int64_t aligned_size;
#endif
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;
	hmp = ip->hmp;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			/* only dirty the inode if something actually changed */
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			kflags |= NOTE_ATTRIB;
		}
	}
	/*
	 * One-shot loop: 'break' exits the size-change handling.  Only
	 * entered when a size change is actually requested.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;

			/*
			 * Log the operation if in fast-fsync mode or if
			 * there are unterminated redo write records present.
			 *
			 * The second check is needed so the recovery code
			 * properly truncates write redos even if nominal
			 * REDO operations is turned off due to excessive
			 * writes, because the related records might be
			 * destroyed and never lay down a TERM_WRITE.
			 */
			if ((ip->flags & HAMMER_INODE_REDO) ||
			    (ip->flags & HAMMER_INODE_RDIRTY)) {
				error = hammer_generate_redo(&trans, ip,
							     vap->va_size,
							     HAMMER_REDO_TRUNC,
							     NULL, 0);
			}
			blksize = hammer_blocksize(vap->va_size);

			/*
			 * XXX break atomicy, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			if (vap->va_size < ip->ino_data.size) {
				nvtruncbuf(ap->a_vp, vap->va_size,
					   blksize,
					   hammer_blockoff(vap->va_size),
					   0);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				nvextendbuf(ap->a_vp,
					    ip->ino_data.size,
					    vap->va_size,
					    hammer_blocksize(ip->ino_data.size),
					    hammer_blocksize(vap->va_size),
					    hammer_blockoff(ip->ino_data.size),
					    hammer_blockoff(vap->va_size),
					    0);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			/* XXX safe to use SDIRTY instead of DDIRTY here? */
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;

			/*
			 * On-media truncation is cached in the inode until
			 * the inode is synchronized.  We must immediately
			 * handle any frontend records.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate1 %016llx\n",
							(long long)ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate2 %016llx\n",
							(long long)ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate3 %016llx (ignored)\n",
							(long long)vap->va_size);
#endif
				}
			}

#if 0
			/*
			 * When truncating, nvtruncbuf() may have cleaned out
			 * a portion of the last block on-disk in the buffer
			 * cache.  We must clean out any frontend records
			 * for blocks beyond the new last block.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				aligned_size -= blksize;
				hammer_ip_frontend_trunc(ip, aligned_size);
			}
#endif
			break;
		case VDATABASE:
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			ip->ino_data.ctime = trans.time;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(&trans, ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
2438
66325755
MD
/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 *
 * Create a symlink.  Short targets (<= HAMMER_INODE_BASESYMLEN) are
 * embedded directly in the new inode's ino_data; longer targets are
 * stored as a separate HAMMER_RECTYPE_FIX in-memory record.  On success
 * *ap->a_vpp holds the new vnode and the namecache entry is resolved.
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* directory the link is created in */
	struct hammer_inode *nip;	/* newly created symlink inode */
	hammer_record_t record;
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;
	int bytes;			/* length of the link target */

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is no \0 terminated.
	 */
	if (error == 0) {
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER_INODE_BASESYMLEN) {
			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
		} else {
			record = hammer_alloc_mem_record(nip, bytes);
			record->type = HAMMER_MEM_RECORD_GENERAL;

			record->leaf.base.localization = nip->obj_localization +
							 HAMMER_LOCALIZE_MISC;
			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
			record->leaf.data_len = bytes;
			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
			bcopy(ap->a_target, record->data->symlink.name, bytes);
			error = hammer_ip_add_record(&trans, record);
		}

		/*
		 * Set the file size to the length of the link.
		 */
		if (error == 0) {
			nip->ino_data.size = bytes;
			hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error == 0)
		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
						nch->ncp->nc_nlen, nip);

	/*
	 * Finish up.  The new inode reference is dropped in both paths;
	 * on success hammer_get_vnode() has handed a vnode ref to *a_vpp.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
			hammer_knote(ap->a_dvp, NOTE_WRITE);
		}
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
2542
66325755
MD
2543/*
2544 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2545 */
427e5fc6
MD
2546static
2547int
66325755 2548hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
427e5fc6 2549{
b84de5af 2550 struct hammer_transaction trans;
e63644f0 2551 struct hammer_inode *dip;
b0aab9b9 2552 hammer_mount_t hmp;
b84de5af
MD
2553 int error;
2554
e63644f0 2555 dip = VTOI(ap->a_dvp);
b0aab9b9 2556 hmp = dip->hmp;
e63644f0
MD
2557
2558 if (hammer_nohistory(dip) == 0 &&
b0aab9b9 2559 (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
e63644f0
MD
2560 return (error);
2561 }
2562
b0aab9b9
MD
2563 lwkt_gettoken(&hmp->fs_token);
2564 hammer_start_transaction(&trans, hmp);
ce0138a6 2565 ++hammer_stats_file_iopsw;
b84de5af 2566 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
d7e278bb 2567 ap->a_cred, ap->a_flags, -1);
b84de5af 2568 hammer_done_transaction(&trans);
b0aab9b9 2569 lwkt_reltoken(&hmp->fs_token);
b84de5af
MD
2570
2571 return (error);
427e5fc6
MD
2572}
2573
66325755 2574/*
7dc57964
MD
2575 * hammer_vop_ioctl { vp, command, data, fflag, cred }
2576 */
2577static
2578int
2579hammer_vop_ioctl(struct vop_ioctl_args *ap)
2580{
2581 struct hammer_inode *ip = ap->a_vp->v_data;
b0aab9b9
MD
2582 hammer_mount_t hmp = ip->hmp;
2583 int error;
7dc57964 2584
ce0138a6 2585 ++hammer_stats_file_iopsr;
b0aab9b9
MD
2586 lwkt_gettoken(&hmp->fs_token);
2587 error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2588 ap->a_fflag, ap->a_cred);
2589 lwkt_reltoken(&hmp->fs_token);
2590 return (error);
7dc57964
MD
2591}
2592
513ca7d7
MD
/*
 * hammer_vop_mountctl { op, ctl, ctllen, buf, buflen, res, ... }
 *
 * Mount-control operations.  Handles NFS export configuration and
 * appends HAMMER-specific flag strings (nohistory, master) to the
 * standard mount-flag output; everything else falls through to
 * vop_stdmountctl().
 */
static
int
hammer_vop_mountctl(struct vop_mountctl_args *ap)
{
	/* HAMMER-specific flags reported in addition to the stock ones */
	static const struct mountctl_opt extraopt[] = {
		{ HMNT_NOHISTORY,	"nohistory" },
		{ HMNT_MASTERID,	"master" },
		{ 0, NULL}

	};
	struct hammer_mount *hmp;
	struct mount *mp;
	int usedbytes;		/* bytes already written into ap->a_buf */
	int error;

	error = 0;
	usedbytes = 0;
	mp = ap->a_head.a_ops->head.vv_mount;
	KKASSERT(mp->mnt_data != NULL);
	hmp = (struct hammer_mount *)mp->mnt_data;

	lwkt_gettoken(&hmp->fs_token);

	switch(ap->a_op) {
	case MOUNTCTL_SET_EXPORT:
		if (ap->a_ctllen != sizeof(struct export_args))
			error = EINVAL;
		else
			error = hammer_vfs_export(mp, ap->a_op,
				      (const struct export_args *)ap->a_ctl);
		break;
	case MOUNTCTL_MOUNTFLAGS:
	{
		/*
		 * Call standard mountctl VOP function
		 * so we get user mount flags.
		 */
		error = vop_stdmountctl(ap);
		if (error)
			break;

		usedbytes = *ap->a_res;

		/* append HAMMER flags only if room remains in the buffer */
		if (usedbytes > 0 && usedbytes < ap->a_buflen) {
			usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
						    ap->a_buf,
						    ap->a_buflen - usedbytes,
						    &error);
		}

		*ap->a_res += usedbytes;
		break;
	}
	default:
		error = vop_stdmountctl(ap);
		break;
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}
2653
7dc57964 2654/*
66325755 2655 * hammer_vop_strategy { vp, bio }
8cd0a023
MD
2656 *
2657 * Strategy call, used for regular file read & write only. Note that the
2658 * bp may represent a cluster.
2659 *
2660 * To simplify operation and allow better optimizations in the future,
2661 * this code does not make any assumptions with regards to buffer alignment
2662 * or size.
66325755 2663 */
427e5fc6
MD
2664static
2665int
66325755 2666hammer_vop_strategy(struct vop_strategy_args *ap)
427e5fc6 2667{
8cd0a023
MD
2668 struct buf *bp;
2669 int error;
2670
2671 bp = ap->a_bio->bio_buf;
2672
2673 switch(bp->b_cmd) {
2674 case BUF_CMD_READ:
2675 error = hammer_vop_strategy_read(ap);
2676 break;
2677 case BUF_CMD_WRITE:
2678 error = hammer_vop_strategy_write(ap);
2679 break;
2680 default:
059819e3
MD
2681 bp->b_error = error = EINVAL;
2682 bp->b_flags |= B_ERROR;
2683 biodone(ap->a_bio);
8cd0a023
MD
2684 break;
2685 }
507df98a
ID
2686
2687 /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2688
8cd0a023 2689 return (error);
427e5fc6
MD
2690}
2691
8cd0a023
MD
2692/*
2693 * Read from a regular file. Iterate the related records and fill in the
2694 * BIO/BUF. Gaps are zero-filled.
2695 *
2696 * The support code in hammer_object.c should be used to deal with mixed
2697 * in-memory and on-disk records.
2698 *
4a2796f3
MD
2699 * NOTE: Can be called from the cluster code with an oversized buf.
2700 *
8cd0a023
MD
2701 * XXX atime update
2702 */
2703static
2704int
2705hammer_vop_strategy_read(struct vop_strategy_args *ap)
2706{
36f82b23
MD
2707 struct hammer_transaction trans;
2708 struct hammer_inode *ip;
39d8fd63 2709 struct hammer_inode *dip;
b0aab9b9 2710 hammer_mount_t hmp;
8cd0a023 2711 struct hammer_cursor cursor;
8cd0a023 2712 hammer_base_elm_t base;
4a2796f3 2713 hammer_off_t disk_offset;
8cd0a023 2714 struct bio *bio;
a99b9ea2 2715 struct bio *nbio;
8cd0a023
MD
2716 struct buf *bp;
2717 int64_t rec_offset;
a89aec1b 2718 int64_t ran_end;
195c19a1 2719 int64_t tmp64;
8cd0a023
MD
2720 int error;
2721 int boff;
2722 int roff;
2723 int n;
b4f86ea3 2724 int isdedupable;
8cd0a023
MD
2725
2726 bio = ap->a_bio;
2727 bp = bio->bio_buf;
36f82b23 2728 ip = ap->a_vp->v_data;
b0aab9b9 2729 hmp = ip->hmp;
8cd0a023 2730
a99b9ea2
MD
2731 /*
2732 * The zone-2 disk offset may have been set by the cluster code via
4a2796f3 2733 * a BMAP operation, or else should be NOOFFSET.
a99b9ea2 2734 *
4a2796f3 2735 * Checking the high bits for a match against zone-2 should suffice.
b4f86ea3
MD
2736 *
2737 * In cases where a lot of data duplication is present it may be
2738 * more beneficial to drop through and doubule-buffer through the
2739 * device.
a99b9ea2
MD
2740 */
2741 nbio = push_bio(bio);
9a98f3cc 2742 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
1b0ab2c3 2743 HAMMER_ZONE_LARGE_DATA) {
9a98f3cc
MD
2744 if (hammer_double_buffer == 0) {
2745 lwkt_gettoken(&hmp->fs_token);
2746 error = hammer_io_direct_read(hmp, nbio, NULL);
2747 lwkt_reltoken(&hmp->fs_token);
2748 return (error);
2749 }
2750
2751 /*
2752 * Try to shortcut requests for double_buffer mode too.
2753 * Since this mode runs through the device buffer cache
2754 * only compatible buffer sizes (meaning those generated
2755 * by normal filesystem buffers) are legal.
2756 */
2757 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
2758 error = hammer_io_indirect_read(hmp, nbio, NULL);
2759 return (error);
2760 }
a99b9ea2
MD
2761 }
2762
2763 /*
4a2796f3
MD
2764 * Well, that sucked. Do it the hard way. If all the stars are
2765 * aligned we may still be able to issue a direct-read.
a99b9ea2 2766 */
b0aab9b9
MD
2767 lwkt_gettoken(&hmp->fs_token);
2768 hammer_simple_transaction(&trans, hmp);
47637bff 2769 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
8cd0a023
MD
2770
2771 /*
2772 * Key range (begin and end inclusive) to scan. Note that the key's
c0ade690
MD
2773 * stored in the actual records represent BASE+LEN, not BASE. The
2774 * first record containing bio_offset will have a key > bio_offset.
8cd0a023 2775 */
5a930e66
MD
2776 cursor.key_beg.localization = ip->obj_localization +
2777 HAMMER_LOCALIZE_MISC;
8cd0a023 2778 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 2779 cursor.key_beg.create_tid = 0;
8cd0a023 2780 cursor.key_beg.delete_tid = 0;
8cd0a023 2781 cursor.key_beg.obj_type = 0;
c0ade690 2782 cursor.key_beg.key = bio->bio_offset + 1;
d5530d22 2783 cursor.asof = ip->obj_asof;
bf3b416b 2784 cursor.flags |= HAMMER_CURSOR_ASOF;
8cd0a023
MD
2785
2786 cursor.key_end = cursor.key_beg;
11ad5ade 2787 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
b84de5af 2788#if 0
11ad5ade 2789 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
a89aec1b
MD
2790 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2791 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2792 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
b84de5af
MD
2793 } else
2794#endif
2795 {
c0ade690 2796 ran_end = bio->bio_offset + bp->b_bufsize;
a89aec1b
MD
2797 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2798 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
195c19a1
MD
2799 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2800 if (tmp64 < ran_end)
a89aec1b
MD
2801 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2802 else
7f7c1f84 2803 cursor.key_end.key = ran_end + MAXPHYS + 1;
a89aec1b 2804 }
d26d0ae9 2805 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
8cd0a023 2806
18bee4a2
MD
2807 /*
2808 * Set NOSWAPCACHE for cursor data extraction if double buffering
2809 * is disabled or (if the file is not marked cacheable via chflags
2810 * and vm.swapcache_use_chflags is enabled).
2811 */
2812 if (hammer_double_buffer == 0 ||
2813 ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
2814 vm_swapcache_use_chflags)) {
2815 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
2816 }
2817
4e17f465 2818 error = hammer_ip_first(&cursor);
8cd0a023
MD
2819 boff = 0;
2820
a89aec1b 2821 while (error == 0) {
47637bff
MD
2822 /*
2823 * Get the base file offset of the record. The key for
2824 * data records is (base + bytes) rather then (base).
2825 */
11ad5ade 2826 base = &cursor.leaf->base;
11ad5ade 2827 rec_offset = base->key - cursor.leaf->data_len;
8cd0a023 2828
66325755 2829 /*
a89aec1b 2830 * Calculate the gap, if any, and zero-fill it.
1fef775e
MD
2831 *
2832 * n is the offset of the start of the record versus our
2833 * current seek offset in the bio.
66325755 2834 */
8cd0a023
MD
2835 n = (int)(rec_offset - (bio->bio_offset + boff));
2836 if (n > 0) {
a89aec1b
MD
2837 if (n > bp->b_bufsize - boff)
2838 n = bp->b_bufsize - boff;
8cd0a023
MD
2839 bzero((char *)bp->b_data + boff, n);
2840 boff += n;
2841 n = 0;
66325755 2842 }
8cd0a023
MD
2843
2844 /*
2845 * Calculate the data offset in the record and the number
2846 * of bytes we can copy.
a89aec1b 2847 *
1fef775e
MD
2848 * There are two degenerate cases. First, boff may already
2849 * be at bp->b_bufsize. Secondly, the data offset within
2850 * the record may exceed the record's size.
8cd0a023
MD
2851 */
2852 roff = -n;
b84de5af 2853 rec_offset += roff;
11ad5ade 2854 n = cursor.leaf->data_len - roff;
1fef775e
MD
2855 if (n <= 0) {
2856 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2857 n = 0;
2858 } else if (n > bp->b_bufsize - boff) {
8cd0a023 2859 n = bp->b_bufsize - boff;
1fef775e 2860 }
059819e3 2861
b84de5af 2862 /*
47637bff
MD
2863 * Deal with cached truncations. This cool bit of code
2864 * allows truncate()/ftruncate() to avoid having to sync
2865 * the file.
2866 *
2867 * If the frontend is truncated then all backend records are
2868 * subject to the frontend's truncation.
2869 *
2870 * If the backend is truncated then backend records on-disk
2871 * (but not in-memory) are subject to the backend's
2872 * truncation. In-memory records owned by the backend
2873 * represent data written after the truncation point on the
2874 * backend and must not be truncated.
2875 *
2876 * Truncate operations deal with frontend buffer cache
2877 * buffers and frontend-owned in-memory records synchronously.
b84de5af 2878 */
47637bff 2879 if (ip->flags & HAMMER_INODE_TRUNCATED) {
6362a262
MD
2880 if (hammer_cursor_ondisk(&cursor)/* ||
2881 cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
47637bff
MD
2882 if (ip->trunc_off <= rec_offset)
2883 n = 0;
2884 else if (ip->trunc_off < rec_offset + n)
2885 n = (int)(ip->trunc_off - rec_offset);
2886 }
2887 }
2888 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2889 if (hammer_cursor_ondisk(&cursor)) {
2890 if (ip->sync_trunc_off <= rec_offset)
2891 n = 0;
2892 else if (ip->sync_trunc_off < rec_offset + n)
2893 n = (int)(ip->sync_trunc_off - rec_offset);
2894 }
2895 }
b84de5af
MD
2896
2897 /*
47637bff
MD
2898 * Try to issue a direct read into our bio if possible,
2899 * otherwise resolve the element data into a hammer_buffer
2900 * and copy.
4a2796f3
MD
2901 *
2902 * The buffer on-disk should be zeroed past any real
2903 * truncation point, but may not be for any synthesized
2904 * truncation point from above.
9a98f3cc
MD
2905 *
2906 * NOTE: disk_offset is only valid if the cursor data is
2907 * on-disk.
b84de5af 2908 */
1b0ab2c3 2909 disk_offset = cursor.leaf->data_offset + roff;
b4f86ea3
MD
2910 isdedupable = (boff == 0 && n == bp->b_bufsize &&
2911 hammer_cursor_ondisk(&cursor) &&
2912 ((int)disk_offset & HAMMER_BUFMASK) == 0);
2913
2914 if (isdedupable && hammer_double_buffer == 0) {
9a98f3cc
MD
2915 /*
2916 * Direct read case
2917 */
1b0ab2c3
MD
2918 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2919 HAMMER_ZONE_LARGE_DATA);
4a2796f3 2920 nbio->bio_offset = disk_offset;
b0aab9b9 2921 error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
b4f86ea3 2922 if (hammer_live_dedup && error == 0)
507df98a 2923 hammer_dedup_cache_add(ip, cursor.leaf);
47637bff 2924 goto done;
9a98f3cc
MD
2925 } else if (isdedupable) {
2926 /*
2927 * Async I/O case for reading from backing store
2928 * and copying the data to the filesystem buffer.
2929 * live-dedup has to verify the data anyway if it
2930 * gets a hit later so we can just add the entry
2931 * now.
2932 */
2933 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2934 HAMMER_ZONE_LARGE_DATA);
2935 nbio->bio_offset = disk_offset;
2936 if (hammer_live_dedup)
2937 hammer_dedup_cache_add(ip, cursor.leaf);
2938 error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
2939 goto done;
47637bff
MD
2940 } else if (n) {
2941 error = hammer_ip_resolve_data(&cursor);
2942 if (error == 0) {
b4f86ea3
MD
2943 if (hammer_live_dedup && isdedupable)
2944 hammer_dedup_cache_add(ip, cursor.leaf);
47637bff
MD
2945 bcopy((char *)cursor.data + roff,
2946 (char *)bp->b_data + boff, n);
2947 }
b84de5af 2948 }
47637bff
MD
2949 if (error)
2950 break;
2951
2952 /*
507df98a
ID
2953 * We have to be sure that the only elements added to the
2954 * dedup cache are those which are already on-media.
2955 */
2956 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2957 hammer_dedup_cache_add(ip, cursor.leaf);
2958
2959 /*
47637bff
MD
2960 * Iterate until we have filled the request.
2961 */
2962 boff += n;
8cd0a023 2963 if (boff == bp->b_bufsize)
66325755 2964 break;
a89aec1b 2965 error = hammer_ip_next(&cursor);
66325755
MD
2966 }
2967
2968 /*
8cd0a023 2969 * There may have been a gap after the last record
66325755 2970 */
8cd0a023
MD
2971 if (error == ENOENT)
2972 error = 0;
2973 if (error == 0 && boff != bp->b_bufsize) {
7f7c1f84 2974 KKASSERT(boff < bp->b_bufsize);
8cd0a023
MD
2975 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2976 /* boff = bp->b_bufsize; */
2977 }
18bee4a2
MD
2978
2979 /*
2980 * Disallow swapcache operation on the vnode buffer if double
2981 * buffering is enabled, the swapcache will get the data via
2982 * the block device buffer.
2983 */
2984 if (hammer_double_buffer)
2985 bp->b_flags |= B_NOTMETA;
2986
2987 /*
2988 * Cleanup
2989 */
8cd0a023 2990 bp->b_resid = 0;
059819e3
MD
2991 bp->b_error = error;
2992 if (error)
2993 bp->b_flags |= B_ERROR;
2994 biodone(ap->a_bio);
47637bff
MD
2995
2996done:
39d8fd63
MD
2997 /*
2998 * Cache the b-tree node for the last data read in cache[1].
2999 *
3000 * If we hit the file EOF then also cache the node in the
3001 * governing directory's cache[3]; it will be used to initialize
3002 * the inode's cache[1] for any inodes looked up via the directory.
3003 *
3004 * This doesn't reduce disk accesses since the B-Tree chain is
3005 * likely cached, but it does reduce cpu overhead when looking
3006 * up file offsets for cpdup/tar/cpio style iterations.
3007 */
47637bff 3008 if (cursor.node)
bcac4bbb 3009 hammer_cache_node(&ip->cache[1], cursor.node);
39d8fd63
MD
3010 if (ran_end >= ip->ino_data.size) {
3011 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
3012 ip->obj_asof, ip->obj_localization);
3013 if (dip) {
3014 hammer_cache_node(&dip->cache[3], cursor.node);
3015 hammer_rel_inode(dip, 0);
3016 }
3017 }
47637bff
MD
3018 hammer_done_cursor(&cursor);
3019 hammer_done_transaction(&trans);
b0aab9b9 3020 lwkt_reltoken(&hmp->fs_token);
8cd0a023
MD
3021 return(error);
3022}
3023
3024/*
a99b9ea2
MD
3025 * BMAP operation - used to support cluster_read() only.
3026 *
3027 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
3028 *
3029 * This routine may return EOPNOTSUPP if the operation is not supported for
3030 * the specified offset. The contents of the pointer arguments do not
3031 * need to be initialized in that case.
3032 *
3033 * If a disk address is available and properly aligned return 0 with
3034 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
3035 * to the run-length relative to that offset. Callers may assume that
3036 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
3037 * large, so return EOPNOTSUPP if it is not sufficiently large.
3038 */
3039static
3040int
3041hammer_vop_bmap(struct vop_bmap_args *ap)
3042{
3043 struct hammer_transaction trans;
3044 struct hammer_inode *ip;
b0aab9b9 3045 hammer_mount_t hmp;
a99b9ea2
MD
3046 struct hammer_cursor cursor;
3047 hammer_base_elm_t base;
3048 int64_t rec_offset;
3049 int64_t ran_end;
3050 int64_t tmp64;
3051 int64_t base_offset;
3052 int64_t base_disk_offset;
3053 int64_t last_offset;
3054 hammer_off_t last_disk_offset;
3055 hammer_off_t disk_offset;
3056 int rec_len;
3057 int error;
4a2796f3 3058 int blksize;
a99b9ea2 3059
ce0138a6 3060 ++hammer_stats_file_iopsr;
a99b9ea2 3061 ip = ap->a_vp->v_data;
b0aab9b9 3062 hmp = ip->hmp;
a99b9ea2
MD
3063
3064 /*
3065 * We can only BMAP regular files. We can't BMAP database files,
3066 * directories, etc.
3067 */
3068 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
3069 return(EOPNOTSUPP);
3070
3071 /*
3072 * bmap is typically called with runp/runb both NULL when used
3073 * for writing. We do not support BMAP for writing atm.
3074 */
4a2796f3 3075 if (ap->a_cmd != BUF_CMD_READ)
a99b9ea2
MD
3076 return(EOPNOTSUPP);
3077
3078 /*
3079 * Scan the B-Tree to acquire blockmap addresses, then translate
3080 * to raw addresses.
3081 */
b0aab9b9
MD
3082 lwkt_gettoken(&hmp->fs_token);
3083 hammer_simple_transaction(&trans, hmp);
cb51be26 3084#if 0
973c11b9
MD
3085 kprintf("bmap_beg %016llx ip->cache %p\n",
3086 (long long)ap->a_loffset, ip->cache[1]);
cb51be26 3087#endif
a99b9ea2
MD
3088 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
3089
3090 /*
3091 * Key range (begin and end inclusive) to scan. Note that the key's
3092 * stored in the actual records represent BASE+LEN, not BASE. The
3093 * first record containing bio_offset will have a key > bio_offset.
3094 */
5a930e66
MD
3095 cursor.key_beg.localization = ip->obj_localization +
3096 HAMMER_LOCALIZE_MISC;
a99b9ea2
MD
3097 cursor.key_beg.obj_id = ip->obj_id;
3098 cursor.key_beg.create_tid = 0;
3099 cursor.key_beg.delete_tid = 0;
3100 cursor.key_beg.obj_type = 0;
3101 if (ap->a_runb)
3102 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
3103 else
3104 cursor.key_beg.key = ap->a_loffset + 1;
3105 if (cursor.key_beg.key < 0)
3106 cursor.key_beg.key = 0;
3107 cursor.asof = ip->obj_asof;
bf3b416b 3108 cursor.flags |= HAMMER_CURSOR_ASOF;
a99b9ea2
MD
3109
3110 cursor.key_end = cursor.key_beg;
3111 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
3112
3113 ran_end = ap->a_loffset + MAXPHYS;
3114 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
3115 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
3116 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
3117 if (tmp64 < ran_end)
3118 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
3119 else
3120 cursor.key_end.key = ran_end + MAXPHYS + 1;
3121
3122 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
3123
3124 error = hammer_ip_first(&cursor);
3125 base_offset = last_offset = 0;
3126 base_disk_offset = last_disk_offset = 0;
3127
3128 while (error == 0) {
3129 /*
3130 * Get the base file offset of the record. The key for
3131 * data records is (base + bytes) rather than (base).
4a2796f3
MD
3132 *
3133 * NOTE: rec_offset + rec_len may exceed the end-of-file.
3134 * The extra bytes should be zero on-disk and the BMAP op
3135 * should still be ok.
a99b9ea2
MD
3136 */
3137 base = &cursor.leaf->base;
3138 rec_offset = base->key - cursor.leaf->data_len;
3139 rec_len = cursor.leaf->data_len;
3140
3141 /*
4a2796f3
MD
3142 * Incorporate any cached truncation.
3143 *
3144 * NOTE: Modifications to rec_len based on synthesized
3145 * truncation points remove the guarantee that any extended
3146 * data on disk is zero (since the truncations may not have
3147 * taken place on-media yet).
a99b9ea2
MD
3148 */
3149 if (ip->flags & HAMMER_INODE_TRUNCATED) {
3150 if (hammer_cursor_ondisk(&cursor) ||
3151 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3152 if (ip->trunc_off <= rec_offset)
3153 rec_len = 0;
3154 else if (ip->trunc_off < rec_offset + rec_len)
3155 rec_len = (int)(ip->trunc_off - rec_offset);
3156 }
3157 }
3158 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3159 if (hammer_cursor_ondisk(&cursor)) {
3160 if (ip->sync_trunc_off <= rec_offset)
3161 rec_len = 0;
3162 else if (ip->sync_trunc_off < rec_offset + rec_len)
3163 rec_len = (int)(ip->sync_trunc_off - rec_offset);
3164 }
3165 }
3166
3167 /*
3168 * Accumulate information. If we have hit a discontiguous
3169 * block reset base_offset unless we are already beyond the
3170 * requested offset. If we are, that's it, we stop.
3171 */
a99b9ea2
MD
3172 if (error)
3173 break;
1b0ab2c3
MD
3174 if (hammer_cursor_ondisk(&cursor)) {
3175 disk_offset = cursor.leaf->data_offset;
3176 if (rec_offset != last_offset ||
3177 disk_offset != last_disk_offset) {
3178 if (rec_offset > ap->a_loffset)
3179 break;
3180 base_offset = rec_offset;
3181 base_disk_offset = disk_offset;