HAMMER VFS - REDO implementation base code part 1/many
dragonfly.git: sys/vfs/hammer/hammer_vnops.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>

#include <sys/mplock2.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

struct vop_ops hammer_vnode_vops = {
	.vop_default = vop_defaultop,
	.vop_fsync = hammer_vop_fsync,
	.vop_getpages = vop_stdgetpages,
	.vop_putpages = vop_stdputpages,
	.vop_read = hammer_vop_read,
	.vop_write = hammer_vop_write,
	.vop_access = hammer_vop_access,
	.vop_advlock = hammer_vop_advlock,
	.vop_close = hammer_vop_close,
	.vop_ncreate = hammer_vop_ncreate,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_nresolve = hammer_vop_nresolve,
	.vop_nlookupdotdot = hammer_vop_nlookupdotdot,
	.vop_nlink = hammer_vop_nlink,
	.vop_nmkdir = hammer_vop_nmkdir,
	.vop_nmknod = hammer_vop_nmknod,
	.vop_open = hammer_vop_open,
	.vop_pathconf = vop_stdpathconf,
	.vop_print = hammer_vop_print,
	.vop_readdir = hammer_vop_readdir,
	.vop_readlink = hammer_vop_readlink,
	.vop_nremove = hammer_vop_nremove,
	.vop_nrename = hammer_vop_nrename,
	.vop_nrmdir = hammer_vop_nrmdir,
	.vop_markatime = hammer_vop_markatime,
	.vop_setattr = hammer_vop_setattr,
	.vop_bmap = hammer_vop_bmap,
	.vop_strategy = hammer_vop_strategy,
	.vop_nsymlink = hammer_vop_nsymlink,
	.vop_nwhiteout = hammer_vop_nwhiteout,
	.vop_ioctl = hammer_vop_ioctl,
	.vop_mountctl = hammer_vop_mountctl,
	.vop_kqfilter = hammer_vop_kqfilter
};

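/*
 * Reader's note (not part of the original comments): any vop not listed
 * in the table above falls through to vop_defaultop.  Paging uses the
 * stock vop_stdgetpages()/vop_stdputpages(), which come back through the
 * regular read/write vops; the UIO_NOCOPY case in hammer_vop_write()
 * below exists for exactly that reason.
 */
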
struct vop_ops hammer_spec_vops = {
	.vop_default = vop_defaultop,
	.vop_fsync = hammer_vop_fsync,
	.vop_read = vop_stdnoread,
	.vop_write = vop_stdnowrite,
	.vop_access = hammer_vop_access,
	.vop_close = hammer_vop_close,
	.vop_markatime = hammer_vop_markatime,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_setattr = hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
	.vop_default = fifo_vnoperate,
	.vop_fsync = hammer_vop_fsync,
	.vop_read = hammer_vop_fiforead,
	.vop_write = hammer_vop_fifowrite,
	.vop_access = hammer_vop_access,
	.vop_close = hammer_vop_fifoclose,
	.vop_markatime = hammer_vop_markatime,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_setattr = hammer_vop_setattr,
	.vop_kqfilter = hammer_vop_fifokqfilter
};

static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
	if (flags)
		KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

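/*
 * Usage sketch (mirrors what the vops below actually do): hammer_knote()
 * is the kqueue notification hook.  Operations accumulate flags in a
 * local "kflags" variable and post them in one shot, roughly:
 *
 *	kflags |= NOTE_WRITE;
 *	...
 *	hammer_knote(ap->a_vp, kflags);
 */
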
static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			struct vnode *dvp, struct ucred *cred,
			int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *	 a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *	 operation.
 *
 *	 Ultimately the combination of a REDO log and the use of fast storage
 *	 to front-end cluster caches will make fsync fast, but it isn't
 *	 here yet.  And, in any case, we need real transactional
 *	 all-or-nothing features which are not restricted to a single file.
 */
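/*
 * Quick reference for the hammer_fsync_mode values handled by the switch
 * below (summarized from the cases themselves):
 *
 *	0 - disable REDO, full synchronous flush
 *	1 - disable REDO, full asynchronous flush
 *	2 - REDO semantics, synchronous flush (requires volume version >= 4)
 *	3 - REDO semantics, relaxed asynchronous flush (version >= 4)
 *	4 - ignore the fsync() system call entirely
 */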
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);
	hammer_mount_t hmp = ip->hmp;
	int waitfor = ap->a_waitfor;
	int mode;

	/*
	 * Fsync rule relaxation (default is either full synchronous flush
	 * or REDO semantics with synchronous flush).
	 */
	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
		switch(hammer_fsync_mode) {
		case 0:
mode0:
			/* disable REDO, full synchronous flush */
			ip->redo_count = SIZE_T_MAX;
			goto skip;
		case 1:
mode1:
			/* disable REDO, full asynchronous flush */
			ip->redo_count = SIZE_T_MAX;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			goto skip;
		case 2:
			/* REDO semantics, synchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode0;
			mode = HAMMER_FLUSH_UNDOS_AUTO;
			break;
		case 3:
			/* REDO semantics, relaxed asynchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode1;
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		case 4:
			/* ignore the fsync() system call */
			return(0);
		default:
			/* we have to do something */
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		}

		/*
		 * redo_count is initialized to a maximal value and set
		 * to 0 after the first fsync() on a file, which enables
		 * REDO logging on the inode unless the number of bytes
		 * written exceeds the limit.
		 */
		if (ip->redo_count < hammer_limit_redo &&
		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
		) {
			++hammer_count_fsyncs;
			hammer_flusher_flush_undos(hmp, mode);
			ip->redo_count = 0;
			return(0);
		}
		ip->redo_count = 0;
	}
skip:

	/*
	 * Do a full flush sequence.
	 */
	++hammer_count_fsyncs;
	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (waitfor == MNT_WAIT) {
		vn_unlock(ap->a_vp);
		hammer_wait_inode(ip);
		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
	}
	return (ip->error);
}

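/*
 * Sketch of the fast path above: when one of the REDO modes applies and
 * the inode still qualifies, the fsync() only flushes the UNDO/REDO FIFO
 * via hammer_flusher_flush_undos() and returns early.  The full
 * vfsync()/hammer_flush_inode() sequence under the "skip" label is what
 * the non-REDO modes use, and what an over-limit or MODMASK_NOREDO inode
 * falls back to.
 */
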
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPALMOSTSAFE
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;
	int got_mplock;
	int bigread;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
	ioseqcount = ap->a_ioflag >> 16;
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	/*
	 * Temporary hack until more of HAMMER can be made MPSAFE.
	 */
#ifdef SMP
	if (curthread->td_mpcount) {
		got_mplock = -1;
		hammer_start_transaction(&trans, ip->hmp);
	} else {
		got_mplock = 0;
	}
#else
	hammer_start_transaction(&trans, ip->hmp);
	got_mplock = -1;
#endif

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigread = (uio->uio_resid > 100 * 1024 * 1024);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 *
	 * XXX Temporary hack, delay the start transaction while we remain
	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
	 *     locked-shared.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
			break;

		/*
		 * MPSAFE
		 */
		bp = getcacheblk(ap->a_vp, base_offset);
		if (bp) {
			error = 0;
			goto skip;
		}

		/*
		 * MPUNSAFE
		 */
		if (got_mplock == 0) {
			got_mplock = 1;
			get_mplock();
			hammer_start_transaction(&trans, ip->hmp);
		}

		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, MAXPHYS,
					     seqcount, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			brelse(bp);
			break;
		}
skip:

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);

		/* data has a lower priority than meta-data */
		bp->b_flags |= B_AGE;
		bqrelse(bp);
		if (error)
			break;
		hammer_stats_file_read += n;
	}

	/*
	 * XXX only update the atime if we had to get the MP lock.
	 * XXX hack hack hack, fixme.
	 */
	if (got_mplock) {
		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
			ip->ino_data.atime = trans.time;
			hammer_modify_inode(ip, HAMMER_INODE_ATIME);
		}
		hammer_done_transaction(&trans);
		if (got_mplock > 0)
			rel_mplock();
	}
	return (error);
}

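/*
 * Locking note on the read path above (summary, not from the original
 * commit): getcacheblk() is attempted first without the MP lock; only on
 * a buffer cache miss does the loop take the mplock, open the transaction
 * and fall back to cluster_read()/bread().  The atime update at the
 * bottom is likewise only attempted when the MP lock ended up being taken
 * (got_mplock != 0).
 */
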
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;
	off_t base_offset;
	struct buf *bp;
	int kflags;
	int error;
	int n;
	int flags;
	int seqcount;
	int bigwrite;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_off assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicity and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 *
	 * Adjust redo_count early to avoid generating unnecessary redos.
	 */
	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
	if (ip->redo_count < hammer_limit_redo)
		ip->redo_count += uio->uio_resid;

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;
		int blksize;
		int blkmask;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;
		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Control the number of pending records associated with
		 * this inode.  If too many have accumulated start a
		 * flush.  Try to maintain a pipeline with the flusher.
		 */
		if (ip->rsv_recs >= hammer_limit_inode_recs) {
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
		}
		if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
			while (ip->rsv_recs >= hammer_limit_inode_recs) {
				tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
			}
			hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
		}

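		/*
		 * In short: at hammer_limit_inode_recs pending records the
		 * frontend signals the flusher and keeps going; at twice
		 * the limit it blocks in tsleep() until the flusher drains
		 * the backlog back below the limit, then signals it again.
		 * This keeps a writer from building records faster than
		 * the backend can retire them.
		 */
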
#if 0
		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.  Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster than the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}
#endif

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_data.size) {
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0)
			error = uiomove(bp->b_data + offset, n, uio);

		/*
		 * Generate REDO records while redo_count has not exceeded
		 * the limit.  Note that redo_count is initialized to a
		 * maximal value until the first fsync(), and zeroed on every
		 * fsync().  Thus at least one fsync() is required before we
		 * start generating REDO records for the ip.
		 */
		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
		    ip->redo_count < hammer_limit_redo &&
		    error == 0) {
			hammer_sync_lock_sh(&trans);
			error = hammer_generate_redo(&trans, ip,
						     base_offset + offset,
						     bp->b_data + offset,
						     (size_t)n);
			hammer_sync_unlock(&trans);
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size));
			}
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_SDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 *
		 * Because meta-data updates are deferred, HAMMER is
		 * especially sensitive to excessive bdwrite()s because
		 * the I/O stream is not broken up by disk reads.  So the
		 * buffer cache simply cannot keep up.
		 *
		 * WARNING!  blksize is variable.  cluster_write() is
		 * expected to not blow up if it encounters buffers that
		 * do not match the passed blksize.
		 *
		 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
		 *	  The ip->rsv_recs check should burst-flush the data.
		 *	  If we queue it immediately the buf could be left
		 *	  locked on the device queue for a very long time.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
#if 0
		if (offset + n == blksize) {
			if (hammer_cluster_enable == 0 ||
			    (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
				bawrite(bp);
			} else {
				cluster_write(bp, ip->ino_data.size,
					      blksize, seqcount);
			}
		} else {
#endif
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}

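/*
 * Sketch of the REDO interplay, as it appears in this file: the write
 * path adds uio_resid to ip->redo_count up front and emits
 * hammer_generate_redo() records only while redo_count stays under
 * hammer_limit_redo.  hammer_vop_fsync() is what arms the mechanism,
 * since redo_count starts out maximal and is zeroed on each fsync(); a
 * file that blows past the limit simply falls back to a full flush on
 * its next fsync().
 */
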
/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;
	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
		if (vn_islocked(vp) == LK_EXCLUSIVE &&
		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
			if (ip->flags & HAMMER_INODE_CLOSESYNC)
				waitfor = MNT_WAIT;
			else
				waitfor = MNT_NOWAIT;
			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
				       HAMMER_INODE_CLOSEASYNC);
			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
		}
	}
#endif
	return (vop_stdclose(ap));
}

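/*
 * Note: the CLOSESYNC/CLOSEASYNC sync-on-close path above is compiled out
 * in this revision; close() is a plain vop_stdclose() and durability is
 * left to explicit fsync() (see the REDO notes in hammer_vop_fsync()).
 */
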
66325755
MD
821/*
822 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
823 *
824 * The operating system has already ensured that the directory entry
825 * does not exist and done all appropriate namespace locking.
826 */
427e5fc6
MD
827static
828int
66325755 829hammer_vop_ncreate(struct vop_ncreate_args *ap)
427e5fc6 830{
66325755
MD
831 struct hammer_transaction trans;
832 struct hammer_inode *dip;
833 struct hammer_inode *nip;
834 struct nchandle *nch;
835 int error;
836
837 nch = ap->a_nch;
838 dip = VTOI(ap->a_dvp);
839
d113fda1
MD
840 if (dip->flags & HAMMER_INODE_RO)
841 return (EROFS);
93291532 842 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 843 return (error);
d113fda1 844
66325755
MD
845 /*
846 * Create a transaction to cover the operations we perform.
847 */
8cd0a023 848 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 849 ++hammer_stats_file_iopsw;
66325755
MD
850
851 /*
852 * Create a new filesystem object of the requested type. The
b84de5af
MD
853 * returned inode will be referenced and shared-locked to prevent
854 * it from being moved to the flusher.
66325755 855 */
5a930e66 856 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
5a64efa1
MD
857 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
858 NULL, &nip);
66325755 859 if (error) {
77062c8a 860 hkprintf("hammer_create_inode error %d\n", error);
b84de5af 861 hammer_done_transaction(&trans);
66325755
MD
862 *ap->a_vpp = NULL;
863 return (error);
864 }
66325755
MD
865
866 /*
867 * Add the new filesystem object to the directory. This will also
868 * bump the inode's link count.
869 */
5a930e66
MD
870 error = hammer_ip_add_directory(&trans, dip,
871 nch->ncp->nc_name, nch->ncp->nc_nlen,
872 nip);
0b075555 873 if (error)
77062c8a 874 hkprintf("hammer_ip_add_directory error %d\n", error);
66325755
MD
875
876 /*
877 * Finish up.
878 */
879 if (error) {
a89aec1b 880 hammer_rel_inode(nip, 0);
b84de5af 881 hammer_done_transaction(&trans);
66325755
MD
882 *ap->a_vpp = NULL;
883 } else {
e8599db1 884 error = hammer_get_vnode(nip, ap->a_vpp);
b84de5af 885 hammer_done_transaction(&trans);
a89aec1b
MD
886 hammer_rel_inode(nip, 0);
887 if (error == 0) {
888 cache_setunresolved(ap->a_nch);
889 cache_setvp(ap->a_nch, *ap->a_vpp);
890 }
fbb84158 891 hammer_knote(ap->a_dvp, NOTE_WRITE);
66325755
MD
892 }
893 return (error);
427e5fc6
MD
894}
895
66325755
MD
896/*
897 * hammer_vop_getattr { vp, vap }
98f7132d
MD
898 *
899 * Retrieve an inode's attribute information. When accessing inodes
900 * historically we fake the atime field to ensure consistent results.
901 * The atime field is stored in the B-Tree element and allowed to be
902 * updated without cycling the element.
899eb297
MD
903 *
904 * MPSAFE
66325755 905 */
427e5fc6
MD
906static
907int
66325755 908hammer_vop_getattr(struct vop_getattr_args *ap)
427e5fc6 909{
66325755
MD
910 struct hammer_inode *ip = VTOI(ap->a_vp);
911 struct vattr *vap = ap->a_vap;
912
a56cb012
MD
913 /*
914 * We want the fsid to be different when accessing a filesystem
915 * with different as-of's so programs like diff don't think
916 * the files are the same.
917 *
918 * We also want the fsid to be the same when comparing snapshots,
919 * or when comparing mirrors (which might be backed by different
920 * physical devices). HAMMER fsids are based on the PFS's
921 * shared_uuid field.
922 *
923 * XXX there is a chance of collision here. The va_fsid reported
924 * by stat is different from the more involved fsid used in the
925 * mount structure.
c82af904 926 */
ce0138a6 927 ++hammer_stats_file_iopsr;
899eb297 928 hammer_lock_sh(&ip->lock);
a56cb012
MD
929 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
930 (u_int32_t)(ip->obj_asof >> 32);
931
11ad5ade 932 vap->va_fileid = ip->ino_leaf.base.obj_id;
66325755 933 vap->va_mode = ip->ino_data.mode;
11ad5ade 934 vap->va_nlink = ip->ino_data.nlinks;
66325755
MD
935 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
936 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
937 vap->va_rmajor = 0;
938 vap->va_rminor = 0;
11ad5ade 939 vap->va_size = ip->ino_data.size;
bcac4bbb
MD
940
941 /*
f437a2ab
MD
942 * Special case for @@PFS softlinks. The actual size of the
943 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
cb3c760c 944 * or for MAX_TID is "@@-1:%05d" == 10 bytes.
f437a2ab
MD
945 */
946 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
947 ip->ino_data.size == 10 &&
948 ip->obj_asof == HAMMER_MAX_TID &&
949 ip->obj_localization == 0 &&
950 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
cb3c760c
MD
951 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
952 vap->va_size = 26;
953 else
954 vap->va_size = 10;
f437a2ab
MD
955 }
956
957 /*
bcac4bbb
MD
958 * We must provide a consistent atime and mtime for snapshots
959 * so people can do a 'tar cf - ... | md5' on them and get
960 * consistent results.
961 */
962 if (ip->flags & HAMMER_INODE_RO) {
ddfdf542
MD
963 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
964 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
bcac4bbb 965 } else {
ddfdf542
MD
966 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
967 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
bcac4bbb 968 }
ddfdf542 969 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
66325755
MD
970 vap->va_flags = ip->ino_data.uflags;
971 vap->va_gen = 1; /* hammer inums are unique for all time */
bf686dbe 972 vap->va_blocksize = HAMMER_BUFSIZE;
4a2796f3
MD
973 if (ip->ino_data.size >= HAMMER_XDEMARC) {
974 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
975 ~HAMMER_XBUFMASK64;
976 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
977 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
978 ~HAMMER_BUFMASK64;
979 } else {
980 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
981 }
64950f31 982
11ad5ade 983 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
66325755 984 vap->va_filerev = 0; /* XXX */
66325755
MD
985 vap->va_uid_uuid = ip->ino_data.uid;
986 vap->va_gid_uuid = ip->ino_data.gid;
987 vap->va_fsid_uuid = ip->hmp->fsid;
988 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
989 VA_FSID_UUID_VALID;
7a04d74f 990
11ad5ade 991 switch (ip->ino_data.obj_type) {
7a04d74f
MD
992 case HAMMER_OBJTYPE_CDEV:
993 case HAMMER_OBJTYPE_BDEV:
994 vap->va_rmajor = ip->ino_data.rmajor;
995 vap->va_rminor = ip->ino_data.rminor;
996 break;
997 default:
998 break;
999 }
899eb297 1000 hammer_unlock(&ip->lock);
66325755 1001 return(0);
427e5fc6
MD
1002}
1003
66325755
MD
1004/*
1005 * hammer_vop_nresolve { nch, dvp, cred }
1006 *
1007 * Locate the requested directory entry.
1008 */
427e5fc6
MD
1009static
1010int
66325755 1011hammer_vop_nresolve(struct vop_nresolve_args *ap)
427e5fc6 1012{
36f82b23 1013 struct hammer_transaction trans;
66325755 1014 struct namecache *ncp;
7f7c1f84
MD
1015 hammer_inode_t dip;
1016 hammer_inode_t ip;
1017 hammer_tid_t asof;
8cd0a023 1018 struct hammer_cursor cursor;
66325755
MD
1019 struct vnode *vp;
1020 int64_t namekey;
1021 int error;
7f7c1f84
MD
1022 int i;
1023 int nlen;
d113fda1 1024 int flags;
a56cb012 1025 int ispfs;
adf01747 1026 int64_t obj_id;
ddfdf542 1027 u_int32_t localization;
5e435c92 1028 u_int32_t max_iterations;
7f7c1f84
MD
1029
1030 /*
1031 * Misc initialization, plus handle as-of name extensions. Look for
1032 * the '@@' extension. Note that as-of files and directories cannot
1033 * be modified.
7f7c1f84
MD
1034 */
1035 dip = VTOI(ap->a_dvp);
1036 ncp = ap->a_nch->ncp;
1037 asof = dip->obj_asof;
bc6c1f13 1038 localization = dip->obj_localization; /* for code consistency */
7f7c1f84 1039 nlen = ncp->nc_nlen;
ea434b6f 1040 flags = dip->flags & HAMMER_INODE_RO;
a56cb012 1041 ispfs = 0;
7f7c1f84 1042
36f82b23 1043 hammer_simple_transaction(&trans, dip->hmp);
ce0138a6 1044 ++hammer_stats_file_iopsr;
36f82b23 1045
7f7c1f84
MD
1046 for (i = 0; i < nlen; ++i) {
1047 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
bc6c1f13
MD
1048 error = hammer_str_to_tid(ncp->nc_name + i + 2,
1049 &ispfs, &asof, &localization);
1050 if (error != 0) {
1051 i = nlen;
1052 break;
1053 }
ea434b6f
MD
1054 if (asof != HAMMER_MAX_TID)
1055 flags |= HAMMER_INODE_RO;
7f7c1f84
MD
1056 break;
1057 }
1058 }
1059 nlen = i;
66325755 1060
8cd0a023 1061 /*
ea434b6f
MD
1062 * If this is a PFS softlink we dive into the PFS
1063 */
1064 if (ispfs && nlen == 0) {
1065 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
1066 asof, localization,
1067 flags, &error);
1068 if (error == 0) {
1069 error = hammer_get_vnode(ip, &vp);
1070 hammer_rel_inode(ip, 0);
1071 } else {
1072 vp = NULL;
1073 }
1074 if (error == 0) {
1075 vn_unlock(vp);
1076 cache_setvp(ap->a_nch, vp);
1077 vrele(vp);
1078 }
1079 goto done;
1080 }
1081
1082 /*
294aec9f
MD
1083 * If there is no path component the time extension is relative to dip.
1084 * e.g. "fubar/@@<snapshot>"
1085 *
1086 * "." is handled by the kernel, but ".@@<snapshot>" is not.
1087 * e.g. "fubar/.@@<snapshot>"
1088 *
1089 * ".." is handled by the kernel. We do not currently handle
1090 * "..@<snapshot>".
d113fda1 1091 */
294aec9f 1092 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
bcac4bbb 1093 ip = hammer_get_inode(&trans, dip, dip->obj_id,
ddfdf542
MD
1094 asof, dip->obj_localization,
1095 flags, &error);
d113fda1 1096 if (error == 0) {
e8599db1 1097 error = hammer_get_vnode(ip, &vp);
d113fda1
MD
1098 hammer_rel_inode(ip, 0);
1099 } else {
1100 vp = NULL;
1101 }
1102 if (error == 0) {
1103 vn_unlock(vp);
1104 cache_setvp(ap->a_nch, vp);
1105 vrele(vp);
1106 }
36f82b23 1107 goto done;
d113fda1
MD
1108 }
1109
1110 /*
8cd0a023
MD
1111 * Calculate the namekey and setup the key range for the scan. This
1112 * works kinda like a chained hash table where the lower 32 bits
1113 * of the namekey synthesize the chain.
1114 *
1115 * The key range is inclusive of both key_beg and key_end.
1116 */
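	/*
	 * Illustration (informal): hammer_directory_namekey() hashes the
	 * path component into the 64 bit directory key and returns, via
	 * max_iterations, how far the key may be advanced while walking
	 * hash collisions.  The cursor below is therefore bounded to
	 * [namekey, namekey + max_iterations], and every record in that
	 * range is still compared byte-for-byte against the name before
	 * being accepted.
	 */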
5e435c92
MD
1117 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
1118 &max_iterations);
66325755 1119
bcac4bbb 1120 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
5a930e66 1121 cursor.key_beg.localization = dip->obj_localization +
beec5dc4 1122 hammer_dir_localization(dip);
8cd0a023
MD
1123 cursor.key_beg.obj_id = dip->obj_id;
1124 cursor.key_beg.key = namekey;
d5530d22 1125 cursor.key_beg.create_tid = 0;
8cd0a023
MD
1126 cursor.key_beg.delete_tid = 0;
1127 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1128 cursor.key_beg.obj_type = 0;
66325755 1129
8cd0a023 1130 cursor.key_end = cursor.key_beg;
5e435c92 1131 cursor.key_end.key += max_iterations;
d5530d22
MD
1132 cursor.asof = asof;
1133 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
66325755
MD
1134
1135 /*
8cd0a023 1136 * Scan all matching records (the chain), locate the one matching
a89aec1b 1137 * the requested path component.
8cd0a023
MD
1138 *
1139 * The hammer_ip_*() functions merge in-memory records with on-disk
1140 * records for the purposes of the search.
66325755 1141 */
6a37e7e4 1142 obj_id = 0;
43c665ae 1143 localization = HAMMER_DEF_LOCALIZATION;
6a37e7e4 1144
4e17f465 1145 if (error == 0) {
4e17f465
MD
1146 error = hammer_ip_first(&cursor);
1147 while (error == 0) {
1148 error = hammer_ip_resolve_data(&cursor);
1149 if (error)
1150 break;
11ad5ade
MD
1151 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
1152 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1153 obj_id = cursor.data->entry.obj_id;
ddfdf542 1154 localization = cursor.data->entry.localization;
4e17f465
MD
1155 break;
1156 }
1157 error = hammer_ip_next(&cursor);
66325755
MD
1158 }
1159 }
6a37e7e4 1160 hammer_done_cursor(&cursor);
4c286c36
MD
1161
1162 /*
1163 * Lookup the obj_id. This should always succeed. If it does not
1164 * the filesystem may be damaged and we return a dummy inode.
1165 */
66325755 1166 if (error == 0) {
bcac4bbb 1167 ip = hammer_get_inode(&trans, dip, obj_id,
ddfdf542
MD
1168 asof, localization,
1169 flags, &error);
4c286c36
MD
1170 if (error == ENOENT) {
1171 kprintf("HAMMER: WARNING: Missing "
1172 "inode for dirent \"%s\"\n"
3d30bff3
MD
1173 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
1174 ncp->nc_name,
1175 (long long)obj_id, (long long)asof,
1176 localization);
4c286c36
MD
1177 error = 0;
1178 ip = hammer_get_dummy_inode(&trans, dip, obj_id,
1179 asof, localization,
1180 flags, &error);
1181 }
7f7c1f84 1182 if (error == 0) {
e8599db1 1183 error = hammer_get_vnode(ip, &vp);
7f7c1f84
MD
1184 hammer_rel_inode(ip, 0);
1185 } else {
1186 vp = NULL;
1187 }
66325755
MD
1188 if (error == 0) {
1189 vn_unlock(vp);
1190 cache_setvp(ap->a_nch, vp);
1191 vrele(vp);
1192 }
1193 } else if (error == ENOENT) {
1194 cache_setvp(ap->a_nch, NULL);
1195 }
36f82b23 1196done:
b84de5af 1197 hammer_done_transaction(&trans);
66325755 1198 return (error);
427e5fc6
MD
1199}
1200
66325755
MD
1201/*
1202 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1203 *
1204 * Locate the parent directory of a directory vnode.
1205 *
1206 * dvp is referenced but not locked. *vpp must be returned referenced and
1207 * locked. A parent_obj_id of 0 does not necessarily indicate that we are
1208 * at the root, instead it could indicate that the directory we were in was
1209 * removed.
42c7d26b
MD
1210 *
1211 * NOTE: as-of sequences are not linked into the directory structure. If
1212 * we are at the root with a different asof then the mount point, reload
1213 * the same directory with the mount point's asof. I'm not sure what this
1214 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not
1215 * get confused, but it hasn't been tested.
66325755 1216 */
427e5fc6
MD
1217static
1218int
66325755 1219hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
427e5fc6 1220{
36f82b23 1221 struct hammer_transaction trans;
66325755 1222 struct hammer_inode *dip;
d113fda1 1223 struct hammer_inode *ip;
42c7d26b 1224 int64_t parent_obj_id;
5a930e66 1225 u_int32_t parent_obj_localization;
42c7d26b 1226 hammer_tid_t asof;
d113fda1 1227 int error;
66325755
MD
1228
1229 dip = VTOI(ap->a_dvp);
42c7d26b 1230 asof = dip->obj_asof;
5a930e66
MD
1231
1232 /*
1233 * Who is our parent?  This could be the root of a pseudo-filesystem
1234 * whose parent is in another localization domain.
1235 */
42c7d26b 1236 parent_obj_id = dip->ino_data.parent_obj_id;
5a930e66
MD
1237 if (dip->obj_id == HAMMER_OBJID_ROOT)
1238 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1239 else
1240 parent_obj_localization = dip->obj_localization;
42c7d26b
MD
1241
1242 if (parent_obj_id == 0) {
1243 if (dip->obj_id == HAMMER_OBJID_ROOT &&
1244 asof != dip->hmp->asof) {
1245 parent_obj_id = dip->obj_id;
1246 asof = dip->hmp->asof;
1247 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1248 ksnprintf(*ap->a_fakename, 19, "0x%016llx",
973c11b9 1249 (long long)dip->obj_asof);
42c7d26b
MD
1250 } else {
1251 *ap->a_vpp = NULL;
1252 return ENOENT;
1253 }
66325755 1254 }
d113fda1 1255
36f82b23 1256 hammer_simple_transaction(&trans, dip->hmp);
ce0138a6 1257 ++hammer_stats_file_iopsr;
36f82b23 1258
bcac4bbb 1259 ip = hammer_get_inode(&trans, dip, parent_obj_id,
5a930e66 1260 asof, parent_obj_localization,
ddfdf542 1261 dip->flags, &error);
36f82b23 1262 if (ip) {
e8599db1 1263 error = hammer_get_vnode(ip, ap->a_vpp);
36f82b23
MD
1264 hammer_rel_inode(ip, 0);
1265 } else {
d113fda1 1266 *ap->a_vpp = NULL;
d113fda1 1267 }
b84de5af 1268 hammer_done_transaction(&trans);
d113fda1 1269 return (error);
427e5fc6
MD
1270}
1271
66325755
MD
1272/*
1273 * hammer_vop_nlink { nch, dvp, vp, cred }
1274 */
427e5fc6
MD
1275static
1276int
66325755 1277hammer_vop_nlink(struct vop_nlink_args *ap)
427e5fc6 1278{
66325755
MD
1279 struct hammer_transaction trans;
1280 struct hammer_inode *dip;
1281 struct hammer_inode *ip;
1282 struct nchandle *nch;
1283 int error;
1284
f437a2ab
MD
1285 if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1286 return(EXDEV);
1287
66325755
MD
1288 nch = ap->a_nch;
1289 dip = VTOI(ap->a_dvp);
1290 ip = VTOI(ap->a_vp);
1291
f437a2ab
MD
1292 if (dip->obj_localization != ip->obj_localization)
1293 return(EXDEV);
1294
d113fda1
MD
1295 if (dip->flags & HAMMER_INODE_RO)
1296 return (EROFS);
1297 if (ip->flags & HAMMER_INODE_RO)
1298 return (EROFS);
93291532 1299 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1300 return (error);
d113fda1 1301
66325755
MD
1302 /*
1303 * Create a transaction to cover the operations we perform.
1304 */
8cd0a023 1305 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1306 ++hammer_stats_file_iopsw;
66325755
MD
1307
1308 /*
1309 * Add the filesystem object to the directory. Note that neither
1310 * dip nor ip are referenced or locked, but their vnodes are
1311 * referenced. This function will bump the inode's link count.
1312 */
5a930e66
MD
1313 error = hammer_ip_add_directory(&trans, dip,
1314 nch->ncp->nc_name, nch->ncp->nc_nlen,
1315 ip);
66325755
MD
1316
1317 /*
1318 * Finish up.
1319 */
b84de5af 1320 if (error == 0) {
6b4f890b
MD
1321 cache_setunresolved(nch);
1322 cache_setvp(nch, ap->a_vp);
66325755 1323 }
b84de5af 1324 hammer_done_transaction(&trans);
fbb84158
MD
1325 hammer_knote(ap->a_vp, NOTE_LINK);
1326 hammer_knote(ap->a_dvp, NOTE_WRITE);
66325755 1327 return (error);
427e5fc6
MD
1328}
1329
66325755
MD
1330/*
1331 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1332 *
1333 * The operating system has already ensured that the directory entry
1334 * does not exist and done all appropriate namespace locking.
1335 */
427e5fc6
MD
1336static
1337int
66325755 1338hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
427e5fc6 1339{
66325755
MD
1340 struct hammer_transaction trans;
1341 struct hammer_inode *dip;
1342 struct hammer_inode *nip;
1343 struct nchandle *nch;
1344 int error;
1345
1346 nch = ap->a_nch;
1347 dip = VTOI(ap->a_dvp);
1348
d113fda1
MD
1349 if (dip->flags & HAMMER_INODE_RO)
1350 return (EROFS);
93291532 1351 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1352 return (error);
d113fda1 1353
66325755
MD
1354 /*
1355 * Create a transaction to cover the operations we perform.
1356 */
8cd0a023 1357 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1358 ++hammer_stats_file_iopsw;
66325755
MD
1359
1360 /*
1361 * Create a new filesystem object of the requested type. The
8cd0a023 1362 * returned inode will be referenced but not locked.
66325755 1363 */
5a930e66 1364 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
5a64efa1
MD
1365 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1366 NULL, &nip);
66325755 1367 if (error) {
77062c8a 1368 hkprintf("hammer_mkdir error %d\n", error);
b84de5af 1369 hammer_done_transaction(&trans);
66325755
MD
1370 *ap->a_vpp = NULL;
1371 return (error);
1372 }
66325755
MD
1373 /*
1374 * Add the new filesystem object to the directory. This will also
1375 * bump the inode's link count.
1376 */
5a930e66
MD
1377 error = hammer_ip_add_directory(&trans, dip,
1378 nch->ncp->nc_name, nch->ncp->nc_nlen,
1379 nip);
0b075555 1380 if (error)
77062c8a 1381 hkprintf("hammer_mkdir (add) error %d\n", error);
66325755
MD
1382
1383 /*
1384 * Finish up.
1385 */
1386 if (error) {
a89aec1b 1387 hammer_rel_inode(nip, 0);
66325755
MD
1388 *ap->a_vpp = NULL;
1389 } else {
e8599db1 1390 error = hammer_get_vnode(nip, ap->a_vpp);
a89aec1b
MD
1391 hammer_rel_inode(nip, 0);
1392 if (error == 0) {
1393 cache_setunresolved(ap->a_nch);
1394 cache_setvp(ap->a_nch, *ap->a_vpp);
1395 }
66325755 1396 }
b84de5af 1397 hammer_done_transaction(&trans);
fbb84158
MD
1398 if (error == 0)
1399 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
66325755 1400 return (error);
427e5fc6
MD
1401}
1402
66325755
MD
1403/*
1404 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1405 *
1406 * The operating system has already ensured that the directory entry
1407 * does not exist and done all appropriate namespace locking.
1408 */
427e5fc6
MD
1409static
1410int
66325755 1411hammer_vop_nmknod(struct vop_nmknod_args *ap)
427e5fc6 1412{
66325755
MD
1413 struct hammer_transaction trans;
1414 struct hammer_inode *dip;
1415 struct hammer_inode *nip;
1416 struct nchandle *nch;
1417 int error;
1418
1419 nch = ap->a_nch;
1420 dip = VTOI(ap->a_dvp);
1421
d113fda1
MD
1422 if (dip->flags & HAMMER_INODE_RO)
1423 return (EROFS);
93291532 1424 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1425 return (error);
d113fda1 1426
66325755
MD
1427 /*
1428 * Create a transaction to cover the operations we perform.
1429 */
8cd0a023 1430 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1431 ++hammer_stats_file_iopsw;
66325755
MD
1432
1433 /*
1434 * Create a new filesystem object of the requested type. The
8cd0a023 1435 * returned inode will be referenced but not locked.
5a930e66
MD
1436 *
1437 * If mknod specifies a directory a pseudo-fs is created.
66325755 1438 */
5a930e66 1439 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
5a64efa1
MD
1440 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1441 NULL, &nip);
66325755 1442 if (error) {
b84de5af 1443 hammer_done_transaction(&trans);
66325755
MD
1444 *ap->a_vpp = NULL;
1445 return (error);
1446 }
66325755
MD
1447
1448 /*
1449 * Add the new filesystem object to the directory. This will also
1450 * bump the inode's link count.
1451 */
5a930e66
MD
1452 error = hammer_ip_add_directory(&trans, dip,
1453 nch->ncp->nc_name, nch->ncp->nc_nlen,
1454 nip);
66325755
MD
1455
1456 /*
1457 * Finish up.
1458 */
1459 if (error) {
a89aec1b 1460 hammer_rel_inode(nip, 0);
66325755
MD
1461 *ap->a_vpp = NULL;
1462 } else {
e8599db1 1463 error = hammer_get_vnode(nip, ap->a_vpp);
a89aec1b
MD
1464 hammer_rel_inode(nip, 0);
1465 if (error == 0) {
1466 cache_setunresolved(ap->a_nch);
1467 cache_setvp(ap->a_nch, *ap->a_vpp);
1468 }
66325755 1469 }
b84de5af 1470 hammer_done_transaction(&trans);
fbb84158
MD
1471 if (error == 0)
1472 hammer_knote(ap->a_dvp, NOTE_WRITE);
66325755 1473 return (error);
427e5fc6
MD
1474}
1475
66325755
MD
1476/*
1477 * hammer_vop_open { vp, mode, cred, fp }
1478 */
427e5fc6
MD
1479static
1480int
66325755 1481hammer_vop_open(struct vop_open_args *ap)
427e5fc6 1482{
9f5097dc
MD
1483 hammer_inode_t ip;
1484
ce0138a6 1485 ++hammer_stats_file_iopsr;
9f5097dc
MD
1486 ip = VTOI(ap->a_vp);
1487
1488 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
d113fda1 1489 return (EROFS);
a89aec1b 1490 return(vop_stdopen(ap));
427e5fc6
MD
1491}
1492
66325755 1493/*
66325755
MD
1494 * hammer_vop_print { vp }
1495 */
427e5fc6
MD
1496static
1497int
66325755 1498hammer_vop_print(struct vop_print_args *ap)
427e5fc6
MD
1499{
1500 return EOPNOTSUPP;
1501}
1502
66325755 1503/*
6b4f890b 1504 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
66325755 1505 */
427e5fc6
MD
1506static
1507int
66325755 1508hammer_vop_readdir(struct vop_readdir_args *ap)
427e5fc6 1509{
36f82b23 1510 struct hammer_transaction trans;
6b4f890b
MD
1511 struct hammer_cursor cursor;
1512 struct hammer_inode *ip;
1513 struct uio *uio;
6b4f890b
MD
1514 hammer_base_elm_t base;
1515 int error;
1516 int cookie_index;
1517 int ncookies;
1518 off_t *cookies;
1519 off_t saveoff;
1520 int r;
ea434b6f 1521 int dtype;
6b4f890b 1522
ce0138a6 1523 ++hammer_stats_file_iopsr;
6b4f890b
MD
1524 ip = VTOI(ap->a_vp);
1525 uio = ap->a_uio;
b3deaf57
MD
1526 saveoff = uio->uio_offset;
1527
1528 if (ap->a_ncookies) {
1529 ncookies = uio->uio_resid / 16 + 1;
1530 if (ncookies > 1024)
1531 ncookies = 1024;
1532 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1533 cookie_index = 0;
1534 } else {
1535 ncookies = -1;
1536 cookies = NULL;
1537 cookie_index = 0;
1538 }
1539
36f82b23
MD
1540 hammer_simple_transaction(&trans, ip->hmp);
1541
b3deaf57
MD
1542 /*
1543 * Handle artificial entries
4c286c36
MD
1544 *
1545 * It should be noted that the minimum value for a directory
1546 * hash key on-media is 0x0000000100000000, so we can use anything
1547 * less then that to represent our 'special' key space.
b3deaf57
MD
1548 */
1549 error = 0;
1550 if (saveoff == 0) {
1551 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1552 if (r)
1553 goto done;
1554 if (cookies)
1555 cookies[cookie_index] = saveoff;
1556 ++saveoff;
1557 ++cookie_index;
1558 if (cookie_index == ncookies)
1559 goto done;
1560 }
1561 if (saveoff == 1) {
1562 if (ip->ino_data.parent_obj_id) {
1563 r = vop_write_dirent(&error, uio,
1564 ip->ino_data.parent_obj_id,
1565 DT_DIR, 2, "..");
1566 } else {
1567 r = vop_write_dirent(&error, uio,
1568 ip->obj_id, DT_DIR, 2, "..");
1569 }
1570 if (r)
1571 goto done;
1572 if (cookies)
1573 cookies[cookie_index] = saveoff;
1574 ++saveoff;
1575 ++cookie_index;
1576 if (cookie_index == ncookies)
1577 goto done;
1578 }
6b4f890b
MD
1579
1580 /*
1581 * Key range (begin and end inclusive) to scan. Directory keys
1582 * directly translate to a 64 bit 'seek' position.
1583 */
bcac4bbb 1584 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
5a930e66 1585 cursor.key_beg.localization = ip->obj_localization +
beec5dc4 1586 hammer_dir_localization(ip);
6b4f890b 1587 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 1588 cursor.key_beg.create_tid = 0;
6b4f890b
MD
1589 cursor.key_beg.delete_tid = 0;
1590 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1591 cursor.key_beg.obj_type = 0;
b3deaf57 1592 cursor.key_beg.key = saveoff;
6b4f890b
MD
1593
1594 cursor.key_end = cursor.key_beg;
1595 cursor.key_end.key = HAMMER_MAX_KEY;
d5530d22
MD
1596 cursor.asof = ip->obj_asof;
1597 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
6b4f890b 1598
4e17f465 1599 error = hammer_ip_first(&cursor);
6b4f890b
MD
1600
1601 while (error == 0) {
11ad5ade 1602 error = hammer_ip_resolve_data(&cursor);
6b4f890b
MD
1603 if (error)
1604 break;
11ad5ade 1605 base = &cursor.leaf->base;
6b4f890b 1606 saveoff = base->key;
11ad5ade 1607 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
6b4f890b 1608
7a04d74f
MD
1609 if (base->obj_id != ip->obj_id)
1610 panic("readdir: bad record at %p", cursor.node);
1611
ea434b6f
MD
1612 /*
1613 * Convert pseudo-filesystems into softlinks
1614 */
1615 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
6b4f890b 1616 r = vop_write_dirent(
11ad5ade 1617 &error, uio, cursor.data->entry.obj_id,
ea434b6f 1618 dtype,
11ad5ade
MD
1619 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1620 (void *)cursor.data->entry.name);
6b4f890b
MD
1621 if (r)
1622 break;
1623 ++saveoff;
1624 if (cookies)
1625 cookies[cookie_index] = base->key;
1626 ++cookie_index;
1627 if (cookie_index == ncookies)
1628 break;
1629 error = hammer_ip_next(&cursor);
1630 }
1631 hammer_done_cursor(&cursor);
1632
b3deaf57 1633done:
b84de5af 1634 hammer_done_transaction(&trans);
36f82b23 1635
6b4f890b
MD
1636 if (ap->a_eofflag)
1637 *ap->a_eofflag = (error == ENOENT);
6b4f890b
MD
1638 uio->uio_offset = saveoff;
1639 if (error && cookie_index == 0) {
b3deaf57
MD
1640 if (error == ENOENT)
1641 error = 0;
6b4f890b
MD
1642 if (cookies) {
1643 kfree(cookies, M_TEMP);
1644 *ap->a_ncookies = 0;
1645 *ap->a_cookies = NULL;
1646 }
1647 } else {
7a04d74f
MD
1648 if (error == ENOENT)
1649 error = 0;
6b4f890b
MD
1650 if (cookies) {
1651 *ap->a_ncookies = cookie_index;
1652 *ap->a_cookies = cookies;
1653 }
1654 }
1655 return(error);
427e5fc6
MD
1656}
1657
66325755
MD
1658/*
1659 * hammer_vop_readlink { vp, uio, cred }
1660 */
427e5fc6
MD
1661static
1662int
66325755 1663hammer_vop_readlink(struct vop_readlink_args *ap)
427e5fc6 1664{
36f82b23 1665 struct hammer_transaction trans;
7a04d74f
MD
1666 struct hammer_cursor cursor;
1667 struct hammer_inode *ip;
ea434b6f
MD
1668 char buf[32];
1669 u_int32_t localization;
1670 hammer_pseudofs_inmem_t pfsm;
7a04d74f
MD
1671 int error;
1672
1673 ip = VTOI(ap->a_vp);
36f82b23 1674
2f85fa4d
MD
1675 /*
1676 * Shortcut if the symlink data was stuffed into ino_data.
ea434b6f 1677 *
842e7a70
MD
1678 * Also expand special "@@PFS%05d" softlinks (expansion only
1679 * occurs for non-historical (current) accesses made from the
1680 * primary filesystem).
2f85fa4d
MD
1681 */
1682 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
ea434b6f
MD
1683 char *ptr;
1684 int bytes;
1685
1686 ptr = ip->ino_data.ext.symlink;
1687 bytes = (int)ip->ino_data.size;
842e7a70
MD
1688 if (bytes == 10 &&
1689 ip->obj_asof == HAMMER_MAX_TID &&
1690 ip->obj_localization == 0 &&
1691 strncmp(ptr, "@@PFS", 5) == 0) {
ea434b6f
MD
1692 hammer_simple_transaction(&trans, ip->hmp);
1693 bcopy(ptr + 5, buf, 5);
1694 buf[5] = 0;
1695 localization = strtoul(buf, NULL, 10) << 16;
1696 pfsm = hammer_load_pseudofs(&trans, localization,
1697 &error);
1698 if (error == 0) {
4c038e17
MD
1699 if (pfsm->pfsd.mirror_flags &
1700 HAMMER_PFSD_SLAVE) {
cb3c760c 1701 /* vap->va_size == 26 */
4c038e17
MD
1702 ksnprintf(buf, sizeof(buf),
1703 "@@0x%016llx:%05d",
973c11b9 1704 (long long)pfsm->pfsd.sync_end_tid,
4c038e17
MD
1705 localization >> 16);
1706 } else {
cb3c760c
MD
1707 /* vap->va_size == 10 */
1708 ksnprintf(buf, sizeof(buf),
1709 "@@-1:%05d",
1710 localization >> 16);
1711#if 0
4c038e17
MD
1712 ksnprintf(buf, sizeof(buf),
1713 "@@0x%016llx:%05d",
973c11b9 1714 (long long)HAMMER_MAX_TID,
4c038e17 1715 localization >> 16);
cb3c760c 1716#endif
4c038e17 1717 }
ea434b6f
MD
1718 ptr = buf;
1719 bytes = strlen(buf);
1720 }
1721 if (pfsm)
1722 hammer_rel_pseudofs(trans.hmp, pfsm);
1723 hammer_done_transaction(&trans);
1724 }
1725 error = uiomove(ptr, bytes, ap->a_uio);
2f85fa4d
MD
1726 return(error);
1727 }
36f82b23 1728
2f85fa4d
MD
1729 /*
1730 * Long version
1731 */
1732 hammer_simple_transaction(&trans, ip->hmp);
ce0138a6 1733 ++hammer_stats_file_iopsr;
bcac4bbb 1734 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
7a04d74f
MD
1735
1736 /*
1737 * Key range (begin and end inclusive) to scan. Directory keys
1738 * directly translate to a 64 bit 'seek' position.
1739 */
5a930e66
MD
1740 cursor.key_beg.localization = ip->obj_localization +
1741 HAMMER_LOCALIZE_MISC;
7a04d74f 1742 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 1743 cursor.key_beg.create_tid = 0;
7a04d74f
MD
1744 cursor.key_beg.delete_tid = 0;
1745 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1746 cursor.key_beg.obj_type = 0;
1747 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
d5530d22
MD
1748 cursor.asof = ip->obj_asof;
1749 cursor.flags |= HAMMER_CURSOR_ASOF;
7a04d74f 1750
45a014dc 1751 error = hammer_ip_lookup(&cursor);
7a04d74f
MD
1752 if (error == 0) {
1753 error = hammer_ip_resolve_data(&cursor);
1754 if (error == 0) {
11ad5ade
MD
1755 KKASSERT(cursor.leaf->data_len >=
1756 HAMMER_SYMLINK_NAME_OFF);
1757 error = uiomove(cursor.data->symlink.name,
1758 cursor.leaf->data_len -
1759 HAMMER_SYMLINK_NAME_OFF,
7a04d74f
MD
1760 ap->a_uio);
1761 }
1762 }
1763 hammer_done_cursor(&cursor);
b84de5af 1764 hammer_done_transaction(&trans);
7a04d74f 1765 return(error);
427e5fc6
MD
1766}
1767
66325755
MD
1768/*
1769 * hammer_vop_nremove { nch, dvp, cred }
1770 */
427e5fc6
MD
1771static
1772int
66325755 1773hammer_vop_nremove(struct vop_nremove_args *ap)
427e5fc6 1774{
b84de5af 1775 struct hammer_transaction trans;
e63644f0 1776 struct hammer_inode *dip;
b84de5af
MD
1777 int error;
1778
e63644f0
MD
1779 dip = VTOI(ap->a_dvp);
1780
1781 if (hammer_nohistory(dip) == 0 &&
93291532 1782 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
1783 return (error);
1784 }
1785
1786 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1787 ++hammer_stats_file_iopsw;
d7e278bb 1788 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
b84de5af 1789 hammer_done_transaction(&trans);
fbb84158
MD
1790 if (error == 0)
1791 hammer_knote(ap->a_dvp, NOTE_WRITE);
b84de5af 1792 return (error);
427e5fc6
MD
1793}
1794
66325755
MD
1795/*
1796 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1797 */
427e5fc6
MD
1798static
1799int
66325755 1800hammer_vop_nrename(struct vop_nrename_args *ap)
427e5fc6 1801{
8cd0a023
MD
1802 struct hammer_transaction trans;
1803 struct namecache *fncp;
1804 struct namecache *tncp;
1805 struct hammer_inode *fdip;
1806 struct hammer_inode *tdip;
1807 struct hammer_inode *ip;
1808 struct hammer_cursor cursor;
8cd0a023 1809 int64_t namekey;
5e435c92 1810 u_int32_t max_iterations;
11ad5ade 1811 int nlen, error;
8cd0a023 1812
f437a2ab
MD
1813 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1814 return(EXDEV);
1815 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1816 return(EXDEV);
1817
8cd0a023
MD
1818 fdip = VTOI(ap->a_fdvp);
1819 tdip = VTOI(ap->a_tdvp);
1820 fncp = ap->a_fnch->ncp;
1821 tncp = ap->a_tnch->ncp;
b3deaf57
MD
1822 ip = VTOI(fncp->nc_vp);
1823 KKASSERT(ip != NULL);
d113fda1 1824
f437a2ab
MD
1825 if (fdip->obj_localization != tdip->obj_localization)
1826 return(EXDEV);
1827 if (fdip->obj_localization != ip->obj_localization)
1828 return(EXDEV);
1829
d113fda1
MD
1830 if (fdip->flags & HAMMER_INODE_RO)
1831 return (EROFS);
1832 if (tdip->flags & HAMMER_INODE_RO)
1833 return (EROFS);
1834 if (ip->flags & HAMMER_INODE_RO)
1835 return (EROFS);
93291532 1836 if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1837 return (error);
d113fda1 1838
8cd0a023 1839 hammer_start_transaction(&trans, fdip->hmp);
ce0138a6 1840 ++hammer_stats_file_iopsw;
8cd0a023
MD
1841
1842 /*
b3deaf57
MD
1843 * Remove tncp from the target directory and then link ip as
1844 * tncp. XXX pass trans to dounlink
42c7d26b
MD
1845 *
1846 * Force the inode sync-time to match the transaction so it is
1847 * in-sync with the creation of the target directory entry.
8cd0a023 1848 */
d7e278bb
MD
1849 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1850 ap->a_cred, 0, -1);
42c7d26b 1851 if (error == 0 || error == ENOENT) {
5a930e66
MD
1852 error = hammer_ip_add_directory(&trans, tdip,
1853 tncp->nc_name, tncp->nc_nlen,
1854 ip);
42c7d26b
MD
1855 if (error == 0) {
1856 ip->ino_data.parent_obj_id = tdip->obj_id;
cc0758d0 1857 ip->ino_data.ctime = trans.time;
47637bff 1858 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
42c7d26b
MD
1859 }
1860 }
b3deaf57
MD
1861 if (error)
1862 goto failed; /* XXX */
8cd0a023
MD
1863
1864 /*
1865 * Locate the record in the originating directory and remove it.
1866 *
1867 * Calculate the namekey and setup the key range for the scan. This
1868 * works kinda like a chained hash table where the lower 32 bits
1869 * of the namekey synthesize the chain.
1870 *
1871 * The key range is inclusive of both key_beg and key_end.
1872 */
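 /*
 * For example (illustrative): if hammer_directory_namekey() returns
 * namekey N and sets max_iterations to M, the cursor below scans the
 * inclusive key range [N, N + M]; hash collisions within that range
 * are resolved by the explicit bcmp() against the component name.
 */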
5e435c92
MD
1873 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1874 &max_iterations);
6a37e7e4 1875retry:
bcac4bbb 1876 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
5a930e66 1877 cursor.key_beg.localization = fdip->obj_localization +
beec5dc4 1878 hammer_dir_localization(fdip);
8cd0a023
MD
1879 cursor.key_beg.obj_id = fdip->obj_id;
1880 cursor.key_beg.key = namekey;
d5530d22 1881 cursor.key_beg.create_tid = 0;
8cd0a023
MD
1882 cursor.key_beg.delete_tid = 0;
1883 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1884 cursor.key_beg.obj_type = 0;
1885
1886 cursor.key_end = cursor.key_beg;
5e435c92 1887 cursor.key_end.key += max_iterations;
d5530d22
MD
1888 cursor.asof = fdip->obj_asof;
1889 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
8cd0a023
MD
1890
1891 /*
1892 * Scan all matching records (the chain), locate the one matching
a89aec1b 1893 * the requested path component.
8cd0a023
MD
1894 *
1895 * The hammer_ip_*() functions merge in-memory records with on-disk
1896 * records for the purposes of the search.
1897 */
4e17f465 1898 error = hammer_ip_first(&cursor);
a89aec1b 1899 while (error == 0) {
8cd0a023
MD
1900 if (hammer_ip_resolve_data(&cursor) != 0)
1901 break;
11ad5ade
MD
1902 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1903 KKASSERT(nlen > 0);
1904 if (fncp->nc_nlen == nlen &&
1905 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
8cd0a023
MD
1906 break;
1907 }
a89aec1b 1908 error = hammer_ip_next(&cursor);
8cd0a023 1909 }
8cd0a023
MD
1910
1911 /*
1912 * If all is ok we have to get the inode so we can adjust nlinks.
6a37e7e4
MD
1913 *
1914 * WARNING: hammer_ip_del_directory() may have to terminate the
1915 * cursor to avoid a recursion. It's ok to call hammer_done_cursor()
1916 * twice.
8cd0a023 1917 */
9944ae54 1918 if (error == 0)
6a37e7e4 1919 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
b84de5af
MD
1920
1921 /*
 1922 * XXX A deadlock here will break rename's atomicity for the purposes
1923 * of crash recovery.
1924 */
1925 if (error == EDEADLK) {
b84de5af 1926 hammer_done_cursor(&cursor);
b84de5af
MD
1927 goto retry;
1928 }
1929
1930 /*
1931 * Cleanup and tell the kernel that the rename succeeded.
1932 */
c0ade690 1933 hammer_done_cursor(&cursor);
fbb84158 1934 if (error == 0) {
6a37e7e4 1935 cache_rename(ap->a_fnch, ap->a_tnch);
fbb84158
MD
1936 hammer_knote(ap->a_fdvp, NOTE_WRITE);
1937 hammer_knote(ap->a_tdvp, NOTE_WRITE);
1938 if (ip->vp)
1939 hammer_knote(ip->vp, NOTE_RENAME);
1940 }
b84de5af 1941
b3deaf57 1942failed:
b84de5af 1943 hammer_done_transaction(&trans);
8cd0a023 1944 return (error);
427e5fc6
MD
1945}
1946
66325755
MD
1947/*
1948 * hammer_vop_nrmdir { nch, dvp, cred }
1949 */
427e5fc6
MD
1950static
1951int
66325755 1952hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
427e5fc6 1953{
b84de5af 1954 struct hammer_transaction trans;
e63644f0 1955 struct hammer_inode *dip;
b84de5af
MD
1956 int error;
1957
e63644f0
MD
1958 dip = VTOI(ap->a_dvp);
1959
1960 if (hammer_nohistory(dip) == 0 &&
93291532 1961 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
1962 return (error);
1963 }
1964
1965 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1966 ++hammer_stats_file_iopsw;
d7e278bb 1967 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
b84de5af 1968 hammer_done_transaction(&trans);
fbb84158
MD
1969 if (error == 0)
1970 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
b84de5af 1971 return (error);
427e5fc6
MD
1972}
1973
66325755 1974/*
349433c9
MD
1975 * hammer_vop_markatime { vp, cred }
1976 */
1977static
1978int
1979hammer_vop_markatime(struct vop_markatime_args *ap)
1980{
1981 struct hammer_transaction trans;
1982 struct hammer_inode *ip;
1983
1984 ip = VTOI(ap->a_vp);
1985 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1986 return (EROFS);
1987 if (ip->flags & HAMMER_INODE_RO)
1988 return (EROFS);
1989 if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
1990 return (0);
1991 hammer_start_transaction(&trans, ip->hmp);
1992 ++hammer_stats_file_iopsw;
1993
1994 ip->ino_data.atime = trans.time;
1995 hammer_modify_inode(ip, HAMMER_INODE_ATIME);
1996 hammer_done_transaction(&trans);
1997 hammer_knote(ap->a_vp, NOTE_ATTRIB);
1998 return (0);
1999}
2000
2001/*
66325755
MD
2002 * hammer_vop_setattr { vp, vap, cred }
2003 */
427e5fc6
MD
2004static
2005int
66325755 2006hammer_vop_setattr(struct vop_setattr_args *ap)
427e5fc6 2007{
8cd0a023
MD
2008 struct hammer_transaction trans;
2009 struct vattr *vap;
2010 struct hammer_inode *ip;
2011 int modflags;
2012 int error;
d5ef456e 2013 int truncating;
4a2796f3 2014 int blksize;
fbb84158 2015 int kflags;
4a2796f3 2016 int64_t aligned_size;
8cd0a023 2017 u_int32_t flags;
8cd0a023
MD
2018
2019 vap = ap->a_vap;
2020 ip = ap->a_vp->v_data;
2021 modflags = 0;
fbb84158 2022 kflags = 0;
8cd0a023
MD
2023
2024 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2025 return(EROFS);
d113fda1
MD
2026 if (ip->flags & HAMMER_INODE_RO)
2027 return (EROFS);
e63644f0 2028 if (hammer_nohistory(ip) == 0 &&
93291532 2029 (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
2030 return (error);
2031 }
8cd0a023
MD
2032
2033 hammer_start_transaction(&trans, ip->hmp);
ce0138a6 2034 ++hammer_stats_file_iopsw;
8cd0a023
MD
2035 error = 0;
2036
2037 if (vap->va_flags != VNOVAL) {
2038 flags = ip->ino_data.uflags;
2039 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2040 hammer_to_unix_xid(&ip->ino_data.uid),
2041 ap->a_cred);
2042 if (error == 0) {
2043 if (ip->ino_data.uflags != flags) {
2044 ip->ino_data.uflags = flags;
cc0758d0 2045 ip->ino_data.ctime = trans.time;
8cd0a023 2046 modflags |= HAMMER_INODE_DDIRTY;
fbb84158 2047 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2048 }
2049 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2050 error = 0;
2051 goto done;
2052 }
2053 }
2054 goto done;
2055 }
2056 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2057 error = EPERM;
2058 goto done;
2059 }
7538695e
MD
2060 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2061 mode_t cur_mode = ip->ino_data.mode;
2062 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2063 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2064 uuid_t uuid_uid;
2065 uuid_t uuid_gid;
2066
2067 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2068 ap->a_cred,
2069 &cur_uid, &cur_gid, &cur_mode);
2070 if (error == 0) {
2071 hammer_guid_to_uuid(&uuid_uid, cur_uid);
2072 hammer_guid_to_uuid(&uuid_gid, cur_gid);
2073 if (bcmp(&uuid_uid, &ip->ino_data.uid,
2074 sizeof(uuid_uid)) ||
2075 bcmp(&uuid_gid, &ip->ino_data.gid,
2076 sizeof(uuid_gid)) ||
2077 ip->ino_data.mode != cur_mode
2078 ) {
2079 ip->ino_data.uid = uuid_uid;
2080 ip->ino_data.gid = uuid_gid;
2081 ip->ino_data.mode = cur_mode;
cc0758d0
MD
2082 ip->ino_data.ctime = trans.time;
2083 modflags |= HAMMER_INODE_DDIRTY;
7538695e 2084 }
fbb84158 2085 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2086 }
2087 }
11ad5ade 2088 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
8cd0a023
MD
2089 switch(ap->a_vp->v_type) {
2090 case VREG:
11ad5ade 2091 if (vap->va_size == ip->ino_data.size)
d5ef456e 2092 break;
b84de5af
MD
2093 /*
 2094 * XXX break atomicity, we can deadlock the backend
2095 * if we do not release the lock. Probably not a
2096 * big deal here.
2097 */
4a2796f3 2098 blksize = hammer_blocksize(vap->va_size);
11ad5ade 2099 if (vap->va_size < ip->ino_data.size) {
4a2796f3 2100 vtruncbuf(ap->a_vp, vap->va_size, blksize);
d5ef456e 2101 truncating = 1;
fbb84158 2102 kflags |= NOTE_WRITE;
d5ef456e 2103 } else {
c0ade690 2104 vnode_pager_setsize(ap->a_vp, vap->va_size);
d5ef456e 2105 truncating = 0;
fbb84158 2106 kflags |= NOTE_WRITE | NOTE_EXTEND;
c0ade690 2107 }
11ad5ade 2108 ip->ino_data.size = vap->va_size;
cc0758d0
MD
2109 ip->ino_data.mtime = trans.time;
2110 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
d5ef456e 2111
b84de5af
MD
2112 /*
2113 * on-media truncation is cached in the inode until
2114 * the inode is synchronized.
2115 */
d5ef456e 2116 if (truncating) {
47637bff 2117 hammer_ip_frontend_trunc(ip, vap->va_size);
0832c9bb
MD
2118#ifdef DEBUG_TRUNCATE
2119 if (HammerTruncIp == NULL)
2120 HammerTruncIp = ip;
2121#endif
b84de5af
MD
2122 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2123 ip->flags |= HAMMER_INODE_TRUNCATED;
2124 ip->trunc_off = vap->va_size;
0832c9bb
MD
2125#ifdef DEBUG_TRUNCATE
2126 if (ip == HammerTruncIp)
973c11b9
MD
2127 kprintf("truncate1 %016llx\n",
2128 (long long)ip->trunc_off);
0832c9bb 2129#endif
b84de5af
MD
2130 } else if (ip->trunc_off > vap->va_size) {
2131 ip->trunc_off = vap->va_size;
0832c9bb
MD
2132#ifdef DEBUG_TRUNCATE
2133 if (ip == HammerTruncIp)
973c11b9
MD
2134 kprintf("truncate2 %016llx\n",
2135 (long long)ip->trunc_off);
0832c9bb
MD
2136#endif
2137 } else {
2138#ifdef DEBUG_TRUNCATE
2139 if (ip == HammerTruncIp)
973c11b9
MD
2140 kprintf("truncate3 %016llx (ignored)\n",
2141 (long long)vap->va_size);
0832c9bb 2142#endif
b84de5af 2143 }
d5ef456e 2144 }
b84de5af 2145
d5ef456e
MD
2146 /*
2147 * If truncating we have to clean out a portion of
b84de5af
MD
2148 * the last block on-disk. We do this in the
2149 * front-end buffer cache.
d5ef456e 2150 */
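 /*
 * Worked example with assumed sizes: truncating to va_size = 10000
 * with blksize = 16384 yields aligned_size = 16384; since va_size is
 * below that, the code steps back one block, breads it, and bzero()s
 * bytes 10000..16383 so stale data cannot reappear if the file is
 * later extended.
 */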
4a2796f3
MD
2151 aligned_size = (vap->va_size + (blksize - 1)) &
2152 ~(int64_t)(blksize - 1);
b84de5af 2153 if (truncating && vap->va_size < aligned_size) {
d5ef456e
MD
2154 struct buf *bp;
2155 int offset;
2156
4a2796f3 2157 aligned_size -= blksize;
47637bff 2158
4a2796f3 2159 offset = (int)vap->va_size & (blksize - 1);
47637bff 2160 error = bread(ap->a_vp, aligned_size,
4a2796f3 2161 blksize, &bp);
47637bff 2162 hammer_ip_frontend_trunc(ip, aligned_size);
d5ef456e
MD
2163 if (error == 0) {
2164 bzero(bp->b_data + offset,
4a2796f3 2165 blksize - offset);
1b0ab2c3
MD
2166 /* must de-cache direct-io offset */
2167 bp->b_bio2.bio_offset = NOOFFSET;
d5ef456e
MD
2168 bdwrite(bp);
2169 } else {
47637bff 2170 kprintf("ERROR %d\n", error);
d5ef456e
MD
2171 brelse(bp);
2172 }
2173 }
76376933 2174 break;
8cd0a023 2175 case VDATABASE:
b84de5af
MD
2176 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2177 ip->flags |= HAMMER_INODE_TRUNCATED;
2178 ip->trunc_off = vap->va_size;
2179 } else if (ip->trunc_off > vap->va_size) {
2180 ip->trunc_off = vap->va_size;
2181 }
47637bff 2182 hammer_ip_frontend_trunc(ip, vap->va_size);
11ad5ade 2183 ip->ino_data.size = vap->va_size;
cc0758d0
MD
2184 ip->ino_data.mtime = trans.time;
2185 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
fbb84158 2186 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2187 break;
2188 default:
2189 error = EINVAL;
2190 goto done;
2191 }
d26d0ae9 2192 break;
8cd0a023
MD
2193 }
2194 if (vap->va_atime.tv_sec != VNOVAL) {
cc0758d0 2195 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
ddfdf542 2196 modflags |= HAMMER_INODE_ATIME;
fbb84158 2197 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2198 }
2199 if (vap->va_mtime.tv_sec != VNOVAL) {
cc0758d0 2200 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
ddfdf542 2201 modflags |= HAMMER_INODE_MTIME;
fbb84158 2202 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2203 }
2204 if (vap->va_mode != (mode_t)VNOVAL) {
7538695e
MD
2205 mode_t cur_mode = ip->ino_data.mode;
2206 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2207 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2208
2209 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2210 cur_uid, cur_gid, &cur_mode);
2211 if (error == 0 && ip->ino_data.mode != cur_mode) {
2212 ip->ino_data.mode = cur_mode;
cc0758d0 2213 ip->ino_data.ctime = trans.time;
8cd0a023 2214 modflags |= HAMMER_INODE_DDIRTY;
fbb84158 2215 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2216 }
2217 }
2218done:
b84de5af 2219 if (error == 0)
47637bff 2220 hammer_modify_inode(ip, modflags);
b84de5af 2221 hammer_done_transaction(&trans);
fbb84158 2222 hammer_knote(ap->a_vp, kflags);
8cd0a023 2223 return (error);
427e5fc6
MD
2224}
2225
66325755
MD
2226/*
2227 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2228 */
427e5fc6
MD
2229static
2230int
66325755 2231hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
427e5fc6 2232{
7a04d74f
MD
2233 struct hammer_transaction trans;
2234 struct hammer_inode *dip;
2235 struct hammer_inode *nip;
2236 struct nchandle *nch;
2237 hammer_record_t record;
2238 int error;
2239 int bytes;
2240
2241 ap->a_vap->va_type = VLNK;
2242
2243 nch = ap->a_nch;
2244 dip = VTOI(ap->a_dvp);
2245
d113fda1
MD
2246 if (dip->flags & HAMMER_INODE_RO)
2247 return (EROFS);
93291532 2248 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 2249 return (error);
d113fda1 2250
7a04d74f
MD
2251 /*
2252 * Create a transaction to cover the operations we perform.
2253 */
2254 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 2255 ++hammer_stats_file_iopsw;
7a04d74f
MD
2256
2257 /*
2258 * Create a new filesystem object of the requested type. The
2259 * returned inode will be referenced but not locked.
2260 */
2261
5a930e66 2262 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
5a64efa1
MD
2263 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2264 NULL, &nip);
7a04d74f 2265 if (error) {
b84de5af 2266 hammer_done_transaction(&trans);
7a04d74f
MD
2267 *ap->a_vpp = NULL;
2268 return (error);
2269 }
2270
2271 /*
7a04d74f
MD
2272 * Add a record representing the symlink. symlink stores the link
 2273 * as pure data, not a string, and is not \0 terminated.
2274 */
2275 if (error == 0) {
7a04d74f
MD
2276 bytes = strlen(ap->a_target);
2277
2f85fa4d
MD
2278 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2279 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2280 } else {
2281 record = hammer_alloc_mem_record(nip, bytes);
2282 record->type = HAMMER_MEM_RECORD_GENERAL;
2283
5a930e66
MD
2284 record->leaf.base.localization = nip->obj_localization +
2285 HAMMER_LOCALIZE_MISC;
2f85fa4d
MD
2286 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2287 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2288 record->leaf.data_len = bytes;
2289 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2290 bcopy(ap->a_target, record->data->symlink.name, bytes);
2291 error = hammer_ip_add_record(&trans, record);
2292 }
42c7d26b
MD
2293
2294 /*
2295 * Set the file size to the length of the link.
2296 */
2297 if (error == 0) {
11ad5ade 2298 nip->ino_data.size = bytes;
47637bff 2299 hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
42c7d26b 2300 }
7a04d74f 2301 }
1f07f686 2302 if (error == 0)
5a930e66
MD
2303 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2304 nch->ncp->nc_nlen, nip);
7a04d74f
MD
2305
2306 /*
2307 * Finish up.
2308 */
2309 if (error) {
2310 hammer_rel_inode(nip, 0);
7a04d74f
MD
2311 *ap->a_vpp = NULL;
2312 } else {
e8599db1 2313 error = hammer_get_vnode(nip, ap->a_vpp);
7a04d74f
MD
2314 hammer_rel_inode(nip, 0);
2315 if (error == 0) {
2316 cache_setunresolved(ap->a_nch);
2317 cache_setvp(ap->a_nch, *ap->a_vpp);
fbb84158 2318 hammer_knote(ap->a_dvp, NOTE_WRITE);
7a04d74f
MD
2319 }
2320 }
b84de5af 2321 hammer_done_transaction(&trans);
7a04d74f 2322 return (error);
427e5fc6
MD
2323}
2324
66325755
MD
2325/*
2326 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2327 */
427e5fc6
MD
2328static
2329int
66325755 2330hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
427e5fc6 2331{
b84de5af 2332 struct hammer_transaction trans;
e63644f0 2333 struct hammer_inode *dip;
b84de5af
MD
2334 int error;
2335
e63644f0
MD
2336 dip = VTOI(ap->a_dvp);
2337
2338 if (hammer_nohistory(dip) == 0 &&
93291532 2339 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
e63644f0
MD
2340 return (error);
2341 }
2342
2343 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 2344 ++hammer_stats_file_iopsw;
b84de5af 2345 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
d7e278bb 2346 ap->a_cred, ap->a_flags, -1);
b84de5af
MD
2347 hammer_done_transaction(&trans);
2348
2349 return (error);
427e5fc6
MD
2350}
2351
66325755 2352/*
7dc57964
MD
2353 * hammer_vop_ioctl { vp, command, data, fflag, cred }
2354 */
2355static
2356int
2357hammer_vop_ioctl(struct vop_ioctl_args *ap)
2358{
2359 struct hammer_inode *ip = ap->a_vp->v_data;
2360
ce0138a6 2361 ++hammer_stats_file_iopsr;
7dc57964
MD
2362 return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2363 ap->a_fflag, ap->a_cred));
2364}
2365
513ca7d7
MD
2366static
2367int
2368hammer_vop_mountctl(struct vop_mountctl_args *ap)
2369{
dad088a5
MD
2370 static const struct mountctl_opt extraopt[] = {
2371 { HMNT_NOHISTORY, "nohistory" },
2372 { HMNT_MASTERID, "master" },
2373 { 0, NULL}
2374
2375 };
2376 struct hammer_mount *hmp;
513ca7d7 2377 struct mount *mp;
dad088a5 2378 int usedbytes;
513ca7d7
MD
2379 int error;
2380
dad088a5
MD
2381 error = 0;
2382 usedbytes = 0;
513ca7d7 2383 mp = ap->a_head.a_ops->head.vv_mount;
dad088a5
MD
2384 KKASSERT(mp->mnt_data != NULL);
2385 hmp = (struct hammer_mount *)mp->mnt_data;
513ca7d7
MD
2386
2387 switch(ap->a_op) {
dad088a5 2388
513ca7d7
MD
2389 case MOUNTCTL_SET_EXPORT:
2390 if (ap->a_ctllen != sizeof(struct export_args))
2391 error = EINVAL;
b424ca30
MD
2392 else
2393 error = hammer_vfs_export(mp, ap->a_op,
513ca7d7
MD
2394 (const struct export_args *)ap->a_ctl);
2395 break;
dad088a5
MD
2396 case MOUNTCTL_MOUNTFLAGS:
2397 {
2398 /*
2399 * Call standard mountctl VOP function
2400 * so we get user mount flags.
2401 */
2402 error = vop_stdmountctl(ap);
2403 if (error)
2404 break;
2405
2406 usedbytes = *ap->a_res;
2407
eac446c5 2408 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
dad088a5
MD
2409 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf,
2410 ap->a_buflen - usedbytes,
2411 &error);
dad088a5
MD
2412 }
2413
2414 *ap->a_res += usedbytes;
2415 break;
2416 }
513ca7d7 2417 default:
726e0641 2418 error = vop_stdmountctl(ap);
513ca7d7
MD
2419 break;
2420 }
2421 return(error);
2422}
2423
7dc57964 2424/*
66325755 2425 * hammer_vop_strategy { vp, bio }
8cd0a023
MD
2426 *
2427 * Strategy call, used for regular file read & write only. Note that the
2428 * bp may represent a cluster.
2429 *
2430 * To simplify operation and allow better optimizations in the future,
2431 * this code does not make any assumptions with regards to buffer alignment
2432 * or size.
66325755 2433 */
427e5fc6
MD
2434static
2435int
66325755 2436hammer_vop_strategy(struct vop_strategy_args *ap)
427e5fc6 2437{
8cd0a023
MD
2438 struct buf *bp;
2439 int error;
2440
2441 bp = ap->a_bio->bio_buf;
2442
2443 switch(bp->b_cmd) {
2444 case BUF_CMD_READ:
2445 error = hammer_vop_strategy_read(ap);
2446 break;
2447 case BUF_CMD_WRITE:
2448 error = hammer_vop_strategy_write(ap);
2449 break;
2450 default:
059819e3
MD
2451 bp->b_error = error = EINVAL;
2452 bp->b_flags |= B_ERROR;
2453 biodone(ap->a_bio);
8cd0a023
MD
2454 break;
2455 }
8cd0a023 2456 return (error);
427e5fc6
MD
2457}
2458
8cd0a023
MD
2459/*
2460 * Read from a regular file. Iterate the related records and fill in the
2461 * BIO/BUF. Gaps are zero-filled.
2462 *
2463 * The support code in hammer_object.c should be used to deal with mixed
2464 * in-memory and on-disk records.
2465 *
4a2796f3
MD
2466 * NOTE: Can be called from the cluster code with an oversized buf.
2467 *
8cd0a023
MD
2468 * XXX atime update
2469 */
2470static
2471int
2472hammer_vop_strategy_read(struct vop_strategy_args *ap)
2473{
36f82b23
MD
2474 struct hammer_transaction trans;
2475 struct hammer_inode *ip;
39d8fd63 2476 struct hammer_inode *dip;
8cd0a023 2477 struct hammer_cursor cursor;
8cd0a023 2478 hammer_base_elm_t base;
4a2796f3 2479 hammer_off_t disk_offset;
8cd0a023 2480 struct bio *bio;
a99b9ea2 2481 struct bio *nbio;
8cd0a023
MD
2482 struct buf *bp;
2483 int64_t rec_offset;
a89aec1b 2484 int64_t ran_end;
195c19a1 2485 int64_t tmp64;
8cd0a023
MD
2486 int error;
2487 int boff;
2488 int roff;
2489 int n;
2490
2491 bio = ap->a_bio;
2492 bp = bio->bio_buf;
36f82b23 2493 ip = ap->a_vp->v_data;
8cd0a023 2494
a99b9ea2
MD
2495 /*
2496 * The zone-2 disk offset may have been set by the cluster code via
4a2796f3 2497 * a BMAP operation, or else should be NOOFFSET.
a99b9ea2 2498 *
4a2796f3 2499 * Checking the high bits for a match against zone-2 should suffice.
a99b9ea2
MD
2500 */
2501 nbio = push_bio(bio);
6aeaa7bd 2502 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
1b0ab2c3
MD
2503 HAMMER_ZONE_LARGE_DATA) {
2504 error = hammer_io_direct_read(ip->hmp, nbio, NULL);
a99b9ea2
MD
2505 return (error);
2506 }
2507
2508 /*
4a2796f3
MD
2509 * Well, that sucked. Do it the hard way. If all the stars are
2510 * aligned we may still be able to issue a direct-read.
a99b9ea2 2511 */
36f82b23 2512 hammer_simple_transaction(&trans, ip->hmp);
47637bff 2513 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
8cd0a023
MD
2514
2515 /*
 2516 * Key range (begin and end inclusive) to scan. Note that the keys
c0ade690
MD
2517 * stored in the actual records represent BASE+LEN, not BASE. The
2518 * first record containing bio_offset will have a key > bio_offset.
8cd0a023 2519 */
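 /*
 * Example with assumed sizes: a 16K data record covering file offsets
 * [0, 16384) carries key 16384 (BASE+LEN), which is why the scan below
 * starts at bio_offset + 1 and still finds the record containing
 * bio_offset.
 */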
5a930e66
MD
2520 cursor.key_beg.localization = ip->obj_localization +
2521 HAMMER_LOCALIZE_MISC;
8cd0a023 2522 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 2523 cursor.key_beg.create_tid = 0;
8cd0a023 2524 cursor.key_beg.delete_tid = 0;
8cd0a023 2525 cursor.key_beg.obj_type = 0;
c0ade690 2526 cursor.key_beg.key = bio->bio_offset + 1;
d5530d22 2527 cursor.asof = ip->obj_asof;
bf3b416b 2528 cursor.flags |= HAMMER_CURSOR_ASOF;
8cd0a023
MD
2529
2530 cursor.key_end = cursor.key_beg;
11ad5ade 2531 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
b84de5af 2532#if 0
11ad5ade 2533 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
a89aec1b
MD
2534 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2535 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2536 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
b84de5af
MD
2537 } else
2538#endif
2539 {
c0ade690 2540 ran_end = bio->bio_offset + bp->b_bufsize;
a89aec1b
MD
2541 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2542 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
195c19a1
MD
2543 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2544 if (tmp64 < ran_end)
a89aec1b
MD
2545 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2546 else
7f7c1f84 2547 cursor.key_end.key = ran_end + MAXPHYS + 1;
a89aec1b 2548 }
d26d0ae9 2549 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
8cd0a023 2550
4e17f465 2551 error = hammer_ip_first(&cursor);
8cd0a023
MD
2552 boff = 0;
2553
a89aec1b 2554 while (error == 0) {
47637bff
MD
2555 /*
2556 * Get the base file offset of the record. The key for
 2557 * data records is (base + bytes) rather than (base).
2558 */
11ad5ade 2559 base = &cursor.leaf->base;
11ad5ade 2560 rec_offset = base->key - cursor.leaf->data_len;
8cd0a023 2561
66325755 2562 /*
a89aec1b 2563 * Calculate the gap, if any, and zero-fill it.
1fef775e
MD
2564 *
 2565 * n is the offset of the start of the record versus our
2566 * current seek offset in the bio.
66325755 2567 */
8cd0a023
MD
2568 n = (int)(rec_offset - (bio->bio_offset + boff));
2569 if (n > 0) {
a89aec1b
MD
2570 if (n > bp->b_bufsize - boff)
2571 n = bp->b_bufsize - boff;
8cd0a023
MD
2572 bzero((char *)bp->b_data + boff, n);
2573 boff += n;
2574 n = 0;
66325755 2575 }
8cd0a023
MD
2576
2577 /*
2578 * Calculate the data offset in the record and the number
2579 * of bytes we can copy.
a89aec1b 2580 *
1fef775e
MD
2581 * There are two degenerate cases. First, boff may already
2582 * be at bp->b_bufsize. Secondly, the data offset within
2583 * the record may exceed the record's size.
8cd0a023
MD
2584 */
2585 roff = -n;
b84de5af 2586 rec_offset += roff;
11ad5ade 2587 n = cursor.leaf->data_len - roff;
1fef775e
MD
2588 if (n <= 0) {
2589 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2590 n = 0;
2591 } else if (n > bp->b_bufsize - boff) {
8cd0a023 2592 n = bp->b_bufsize - boff;
1fef775e 2593 }
059819e3 2594
b84de5af 2595 /*
47637bff
MD
2596 * Deal with cached truncations. This cool bit of code
2597 * allows truncate()/ftruncate() to avoid having to sync
2598 * the file.
2599 *
2600 * If the frontend is truncated then all backend records are
2601 * subject to the frontend's truncation.
2602 *
2603 * If the backend is truncated then backend records on-disk
2604 * (but not in-memory) are subject to the backend's
2605 * truncation. In-memory records owned by the backend
2606 * represent data written after the truncation point on the
2607 * backend and must not be truncated.
2608 *
2609 * Truncate operations deal with frontend buffer cache
2610 * buffers and frontend-owned in-memory records synchronously.
b84de5af 2611 */
47637bff
MD
2612 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2613 if (hammer_cursor_ondisk(&cursor) ||
2614 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2615 if (ip->trunc_off <= rec_offset)
2616 n = 0;
2617 else if (ip->trunc_off < rec_offset + n)
2618 n = (int)(ip->trunc_off - rec_offset);
2619 }
2620 }
2621 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2622 if (hammer_cursor_ondisk(&cursor)) {
2623 if (ip->sync_trunc_off <= rec_offset)
2624 n = 0;
2625 else if (ip->sync_trunc_off < rec_offset + n)
2626 n = (int)(ip->sync_trunc_off - rec_offset);
2627 }
2628 }
b84de5af
MD
2629
2630 /*
47637bff
MD
2631 * Try to issue a direct read into our bio if possible,
2632 * otherwise resolve the element data into a hammer_buffer
2633 * and copy.
4a2796f3
MD
2634 *
 2635 * The buffer on-disk should be zeroed past any real
2636 * truncation point, but may not be for any synthesized
2637 * truncation point from above.
b84de5af 2638 */
1b0ab2c3 2639 disk_offset = cursor.leaf->data_offset + roff;
4a2796f3 2640 if (boff == 0 && n == bp->b_bufsize &&
1b0ab2c3
MD
2641 hammer_cursor_ondisk(&cursor) &&
2642 (disk_offset & HAMMER_BUFMASK) == 0) {
2643 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2644 HAMMER_ZONE_LARGE_DATA);
4a2796f3 2645 nbio->bio_offset = disk_offset;
1b0ab2c3
MD
2646 error = hammer_io_direct_read(trans.hmp, nbio,
2647 cursor.leaf);
47637bff
MD
2648 goto done;
2649 } else if (n) {
2650 error = hammer_ip_resolve_data(&cursor);
2651 if (error == 0) {
2652 bcopy((char *)cursor.data + roff,
2653 (char *)bp->b_data + boff, n);
2654 }
b84de5af 2655 }
47637bff
MD
2656 if (error)
2657 break;
2658
2659 /*
2660 * Iterate until we have filled the request.
2661 */
2662 boff += n;
8cd0a023 2663 if (boff == bp->b_bufsize)
66325755 2664 break;
a89aec1b 2665 error = hammer_ip_next(&cursor);
66325755
MD
2666 }
2667
2668 /*
8cd0a023 2669 * There may have been a gap after the last record
66325755 2670 */
8cd0a023
MD
2671 if (error == ENOENT)
2672 error = 0;
2673 if (error == 0 && boff != bp->b_bufsize) {
7f7c1f84 2674 KKASSERT(boff < bp->b_bufsize);
8cd0a023
MD
2675 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2676 /* boff = bp->b_bufsize; */
2677 }
2678 bp->b_resid = 0;
059819e3
MD
2679 bp->b_error = error;
2680 if (error)
2681 bp->b_flags |= B_ERROR;
2682 biodone(ap->a_bio);
47637bff
MD
2683
2684done:
39d8fd63
MD
2685 /*
2686 * Cache the b-tree node for the last data read in cache[1].
2687 *
2688 * If we hit the file EOF then also cache the node in the
 2689 * governing directory's cache[3], it will be used to initialize
2690 * the inode's cache[1] for any inodes looked up via the directory.
2691 *
2692 * This doesn't reduce disk accesses since the B-Tree chain is
2693 * likely cached, but it does reduce cpu overhead when looking
2694 * up file offsets for cpdup/tar/cpio style iterations.
2695 */
47637bff 2696 if (cursor.node)
bcac4bbb 2697 hammer_cache_node(&ip->cache[1], cursor.node);
39d8fd63
MD
2698 if (ran_end >= ip->ino_data.size) {
2699 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2700 ip->obj_asof, ip->obj_localization);
2701 if (dip) {
2702 hammer_cache_node(&dip->cache[3], cursor.node);
2703 hammer_rel_inode(dip, 0);
2704 }
2705 }
47637bff
MD
2706 hammer_done_cursor(&cursor);
2707 hammer_done_transaction(&trans);
8cd0a023
MD
2708 return(error);
2709}
2710
2711/*
a99b9ea2
MD
2712 * BMAP operation - used to support cluster_read() only.
2713 *
2714 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2715 *
 2716 * This routine may return EOPNOTSUPP if the operation is not supported for
2717 * the specified offset. The contents of the pointer arguments do not
2718 * need to be initialized in that case.
2719 *
2720 * If a disk address is available and properly aligned return 0 with
2721 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2722 * to the run-length relative to that offset. Callers may assume that
2723 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2724 * large, so return EOPNOTSUPP if it is not sufficiently large.
2725 */
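 /*
 * Worked example (illustrative): if this routine returns 0 for
 * loffset L with *doffsetp = D, *runp = P and *runb = B, then file
 * offset L + delta maps to media offset D + delta for any delta in
 * [-B, P), which is the contiguity guarantee cluster_read() relies on.
 */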
2726static
2727int
2728hammer_vop_bmap(struct vop_bmap_args *ap)
2729{
2730 struct hammer_transaction trans;
2731 struct hammer_inode *ip;
2732 struct hammer_cursor cursor;
2733 hammer_base_elm_t base;
2734 int64_t rec_offset;
2735 int64_t ran_end;
2736 int64_t tmp64;
2737 int64_t base_offset;
2738 int64_t base_disk_offset;
2739 int64_t last_offset;
2740 hammer_off_t last_disk_offset;
2741 hammer_off_t disk_offset;
2742 int rec_len;
2743 int error;
4a2796f3 2744 int blksize;
a99b9ea2 2745
ce0138a6 2746 ++hammer_stats_file_iopsr;
a99b9ea2
MD
2747 ip = ap->a_vp->v_data;
2748
2749 /*
2750 * We can only BMAP regular files. We can't BMAP database files,
2751 * directories, etc.
2752 */
2753 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2754 return(EOPNOTSUPP);
2755
2756 /*
2757 * bmap is typically called with runp/runb both NULL when used
2758 * for writing. We do not support BMAP for writing atm.
2759 */
4a2796f3 2760 if (ap->a_cmd != BUF_CMD_READ)
a99b9ea2
MD
2761 return(EOPNOTSUPP);
2762
2763 /*
2764 * Scan the B-Tree to acquire blockmap addresses, then translate
2765 * to raw addresses.
2766 */
2767 hammer_simple_transaction(&trans, ip->hmp);
cb51be26 2768#if 0
973c11b9
MD
2769 kprintf("bmap_beg %016llx ip->cache %p\n",
2770 (long long)ap->a_loffset, ip->cache[1]);
cb51be26 2771#endif
a99b9ea2
MD
2772 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2773
2774 /*
 2775 * Key range (begin and end inclusive) to scan. Note that the keys
2776 * stored in the actual records represent BASE+LEN, not BASE. The
2777 * first record containing bio_offset will have a key > bio_offset.
2778 */
5a930e66
MD
2779 cursor.key_beg.localization = ip->obj_localization +
2780 HAMMER_LOCALIZE_MISC;
a99b9ea2
MD
2781 cursor.key_beg.obj_id = ip->obj_id;
2782 cursor.key_beg.create_tid = 0;
2783 cursor.key_beg.delete_tid = 0;
2784 cursor.key_beg.obj_type = 0;
2785 if (ap->a_runb)
2786 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2787 else
2788 cursor.key_beg.key = ap->a_loffset + 1;
2789 if (cursor.key_beg.key < 0)
2790 cursor.key_beg.key = 0;
2791 cursor.asof = ip->obj_asof;
bf3b416b 2792 cursor.flags |= HAMMER_CURSOR_ASOF;
a99b9ea2
MD
2793
2794 cursor.key_end = cursor.key_beg;
2795 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2796
2797 ran_end = ap->a_loffset + MAXPHYS;
2798 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2799 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2800 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2801 if (tmp64 < ran_end)
2802 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2803 else
2804 cursor.key_end.key = ran_end + MAXPHYS + 1;
2805
2806 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2807
2808 error = hammer_ip_first(&cursor);
2809 base_offset = last_offset = 0;
2810 base_disk_offset = last_disk_offset = 0;
2811
2812 while (error == 0) {
2813 /*
2814 * Get the base file offset of the record. The key for
 2815 * data records is (base + bytes) rather than (base).
4a2796f3
MD
2816 *
2817 * NOTE: rec_offset + rec_len may exceed the end-of-file.
2818 * The extra bytes should be zero on-disk and the BMAP op
2819 * should still be ok.
a99b9ea2
MD
2820 */
2821 base = &cursor.leaf->base;
2822 rec_offset = base->key - cursor.leaf->data_len;
2823 rec_len = cursor.leaf->data_len;
2824
2825 /*
4a2796f3
MD
2826 * Incorporate any cached truncation.
2827 *
2828 * NOTE: Modifications to rec_len based on synthesized
2829 * truncation points remove the guarantee that any extended
2830 * data on disk is zero (since the truncations may not have
2831 * taken place on-media yet).
a99b9ea2
MD
2832 */
2833 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2834 if (hammer_cursor_ondisk(&cursor) ||
2835 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2836 if (ip->trunc_off <= rec_offset)
2837 rec_len = 0;
2838 else if (ip->trunc_off < rec_offset + rec_len)
2839 rec_len = (int)(ip->trunc_off - rec_offset);
2840 }
2841 }
2842 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2843 if (hammer_cursor_ondisk(&cursor)) {
2844 if (ip->sync_trunc_off <= rec_offset)
2845 rec_len = 0;
2846 else if (ip->sync_trunc_off < rec_offset + rec_len)
2847 rec_len = (int)(ip->sync_trunc_off - rec_offset);
2848 }
2849 }
2850
2851 /*
2852 * Accumulate information. If we have hit a discontiguous
2853 * block reset base_offset unless we are already beyond the
2854 * requested offset. If we are, that's it, we stop.
2855 */
a99b9ea2
MD
2856 if (error)
2857 break;
1b0ab2c3
MD
2858 if (hammer_cursor_ondisk(&cursor)) {
2859 disk_offset = cursor.leaf->data_offset;
2860 if (rec_offset != last_offset ||
2861 disk_offset != last_disk_offset) {
2862 if (rec_offset > ap->a_loffset)
2863 break;
2864 base_offset = rec_offset;
2865 base_disk_offset = disk_offset;
2866 }
2867 last_offset = rec_offset + rec_len;
2868 last_disk_offset = disk_offset + rec_len;
a99b9ea2 2869 }
a99b9ea2
MD
2870 error = hammer_ip_next(&cursor);
2871 }
2872
2873#if 0
2874 kprintf("BMAP %016llx: %016llx - %016llx\n",
973c11b9
MD
2875 (long long)ap->a_loffset,
2876 (long long)base_offset,
2877 (long long)last_offset);
2878 kprintf("BMAP %16s: %016llx - %016llx\n", "",
2879 (long long)base_disk_offset,
2880 (long long)last_disk_offset);
a99b9ea2
MD
2881#endif
2882
cb51be26 2883 if (cursor.node) {
bcac4bbb 2884 hammer_cache_node(&ip->cache[1], cursor.node);
cb51be26 2885#if 0
973c11b9
MD
2886 kprintf("bmap_end2 %016llx ip->cache %p\n",
2887 (long long)ap->a_loffset, ip->cache[1]);
cb51be26
MD
2888#endif
2889 }
a99b9ea2
MD
2890 hammer_done_cursor(&cursor);
2891 hammer_done_transaction(&trans);
2892
4a2796f3
MD
2893 /*
2894 * If we couldn't find any records or the records we did find were
2895 * all behind the requested offset, return failure. A forward
2896 * truncation can leave a hole w/ no on-disk records.
2897 */
2898 if (last_offset == 0 || last_offset < ap->a_loffset)
2899 return (EOPNOTSUPP);
2900
2901 /*
2902 * Figure out the block size at the requested offset and adjust
2903 * our limits so the cluster_read() does not create inappropriately
2904 * sized buffer cache buffers.
2905 */
2906 blksize = hammer_blocksize(ap->a_loffset);
2907 if (hammer_blocksize(base_offset) != blksize) {
2908 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2909 }
2910 if (last_offset != ap->a_loffset &&
2911 hammer_blocksize(last_offset - 1) != blksize) {
2912 last_offset = hammer_blockdemarc(ap->a_loffset,
2913 last_offset - 1);
2914 }
2915
2916 /*
2917 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
 2918 * from occurring.
2919 */
2920 disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2921
1b0ab2c3
MD
2922 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2923 /*
2924 * Only large-data zones can be direct-IOd
2925 */
2926 error = EOPNOTSUPP;
2927 } else if ((disk_offset & HAMMER_BUFMASK) ||
2928 (last_offset - ap->a_loffset) < blksize) {
2929 /*
2930 * doffsetp is not aligned or the forward run size does
2931 * not cover a whole buffer, disallow the direct I/O.
2932 */
a99b9ea2
MD
2933 error = EOPNOTSUPP;
2934 } else {
1b0ab2c3
MD
2935 /*
2936 * We're good.
2937 */
4a2796f3
MD
2938 *ap->a_doffsetp = disk_offset;
2939 if (ap->a_runb) {
2940 *ap->a_runb = ap->a_loffset - base_offset;
2941 KKASSERT(*ap->a_runb >= 0);
a99b9ea2 2942 }
4a2796f3
MD
2943 if (ap->a_runp) {
2944 *ap->a_runp = last_offset - ap->a_loffset;
2945 KKASSERT(*ap->a_runp >= 0);
2946 }
2947 error = 0;
a99b9ea2
MD
2948 }
2949 return(error);
2950}
2951
2952/*
059819e3 2953 * Write to a regular file. Because this is a strategy call the OS is
bcac4bbb 2954 * trying to actually get data onto the media.
8cd0a023
MD
2955 */
2956static
2957int
2958hammer_vop_strategy_write(struct vop_strategy_args *ap)
2959{
47637bff 2960 hammer_record_t record;
af209b0f 2961 hammer_mount_t hmp;
8cd0a023
MD
2962 hammer_inode_t ip;
2963 struct bio *bio;
2964 struct buf *bp;
a7e9bef1 2965 int blksize;
0832c9bb
MD
2966 int bytes;
2967 int error;
8cd0a023
MD
2968
2969 bio = ap->a_bio;
2970 bp = bio->bio_buf;
2971 ip = ap->a_vp->v_data;
af209b0f 2972 hmp = ip->hmp;
d113fda1 2973
a7e9bef1
MD
2974 blksize = hammer_blocksize(bio->bio_offset);
2975 KKASSERT(bp->b_bufsize == blksize);
4a2796f3 2976
059819e3
MD
2977 if (ip->flags & HAMMER_INODE_RO) {
2978 bp->b_error = EROFS;
2979 bp->b_flags |= B_ERROR;
2980 biodone(ap->a_bio);
2981 return(EROFS);
2982 }
b84de5af
MD
2983
2984 /*
29ce0677
MD
2985 * Interlock with inode destruction (no in-kernel or directory
2986 * topology visibility). If we queue new IO while trying to
2987 * destroy the inode we can deadlock the vtrunc call in
2988 * hammer_inode_unloadable_check().
35a49944
MD
2989 *
2990 * Besides, there's no point flushing a bp associated with an
2991 * inode that is being destroyed on-media and has no kernel
2992 * references.
29ce0677 2993 */
35a49944
MD
2994 if ((ip->flags | ip->sync_flags) &
2995 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
29ce0677
MD
2996 bp->b_resid = 0;
2997 biodone(ap->a_bio);
2998 return(0);
2999 }
3000
3001 /*
a99b9ea2
MD
3002 * Reserve space and issue a direct-write from the front-end.
3003 * NOTE: The direct_io code will hammer_bread/bcopy smaller
3004 * allocations.
47637bff 3005 *
a99b9ea2
MD
3006 * An in-memory record will be installed to reference the storage
3007 * until the flusher can get to it.
47637bff
MD
3008 *
3009 * Since we own the high level bio the front-end will not try to
0832c9bb 3010 * do a direct-read until the write completes.
a99b9ea2
MD
3011 *
 3012 * NOTE: The only time we do not reserve a full-sized buffer's
3013 * worth of data is if the file is small. We do not try to
3014 * allocate a fragment (from the small-data zone) at the end of
3015 * an otherwise large file as this can lead to wildly separated
3016 * data.
47637bff 3017 */
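 /*
 * Example with assumed sizes: flushing a brand new 100 byte file at
 * bio_offset 0 reserves ((100 + 15) & ~15) = 112 bytes below instead
 * of a full buffer, letting tiny files live in the small-data zone.
 */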
0832c9bb
MD
3018 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3019 KKASSERT(bio->bio_offset < ip->ino_data.size);
a99b9ea2 3020 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
4a2796f3 3021 bytes = bp->b_bufsize;
b84de5af 3022 else
a99b9ea2 3023 bytes = ((int)ip->ino_data.size + 15) & ~15;
0832c9bb
MD
3024
3025 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3026 bytes, &error);
3027 if (record) {
1b0ab2c3 3028 hammer_io_direct_write(hmp, record, bio);
4a2796f3
MD
3029 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3030 hammer_flush_inode(ip, 0);
0832c9bb 3031 } else {
a99b9ea2 3032 bp->b_bio2.bio_offset = NOOFFSET;
0832c9bb
MD
3033 bp->b_error = error;
3034 bp->b_flags |= B_ERROR;
3035 biodone(ap->a_bio);
3036 }
0832c9bb 3037 return(error);
059819e3
MD
3038}
3039
3040/*
8cd0a023
MD
3041 * dounlink - disconnect a directory entry
3042 *
3043 * XXX whiteout support not really in yet
3044 */
3045static int
b84de5af 3046hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
d7e278bb
MD
3047 struct vnode *dvp, struct ucred *cred,
3048 int flags, int isdir)
8cd0a023 3049{
8cd0a023
MD
3050 struct namecache *ncp;
3051 hammer_inode_t dip;
3052 hammer_inode_t ip;
8cd0a023 3053 struct hammer_cursor cursor;
8cd0a023 3054 int64_t namekey;
5e435c92 3055 u_int32_t max_iterations;
11ad5ade 3056 int nlen, error;
8cd0a023
MD
3057
3058 /*
3059 * Calculate the namekey and setup the key range for the scan. This
3060 * works kinda like a chained hash table where the lower 32 bits
3061 * of the namekey synthesize the chain.
3062 *
3063 * The key range is inclusive of both key_beg and key_end.
3064 */
3065 dip = VTOI(dvp);
3066 ncp = nch->ncp;
d113fda1
MD
3067
3068 if (dip->flags & HAMMER_INODE_RO)
3069 return (EROFS);
3070
5e435c92
MD
3071 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3072 &max_iterations);
6a37e7e4 3073retry:
bcac4bbb 3074 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
5a930e66 3075 cursor.key_beg.localization = dip->obj_localization +
beec5dc4 3076 hammer_dir_localization(dip);
8cd0a023
MD
3077 cursor.key_beg.obj_id = dip->obj_id;
3078 cursor.key_beg.key = namekey;
d5530d22 3079 cursor.key_beg.create_tid = 0;
8cd0a023
MD
3080 cursor.key_beg.delete_tid = 0;
3081 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3082 cursor.key_beg.obj_type = 0;
3083
3084 cursor.key_end = cursor.key_beg;
5e435c92 3085 cursor.key_end.key += max_iterations;
d5530d22
MD
3086 cursor.asof = dip->obj_asof;
3087 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
8cd0a023 3088
8cd0a023
MD
3089 /*
3090 * Scan all matching records (the chain), locate the one matching
 3091 * the requested path component. The error variable holds the
3092 * error code on search termination and could be 0, ENOENT, or
3093 * something else.
3094 *
3095 * The hammer_ip_*() functions merge in-memory records with on-disk
3096 * records for the purposes of the search.
3097 */
4e17f465
MD
3098 error = hammer_ip_first(&cursor);
3099
a89aec1b
MD
3100 while (error == 0) {
3101 error = hammer_ip_resolve_data(&cursor);
3102 if (error)
66325755 3103 break;
11ad5ade
MD
3104 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3105 KKASSERT(nlen > 0);
3106 if (ncp->nc_nlen == nlen &&
3107 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
66325755
MD
3108 break;
3109 }
a89aec1b 3110 error = hammer_ip_next(&cursor);
66325755 3111 }
8cd0a023
MD
3112
3113 /*
3114 * If all is ok we have to get the inode so we can adjust nlinks.
269c5eab
MD
3115 * To avoid a deadlock with the flusher we must release the inode
3116 * lock on the directory when acquiring the inode for the entry.
b3deaf57
MD
3117 *
3118 * If the target is a directory, it must be empty.
8cd0a023 3119 */
66325755 3120 if (error == 0) {
269c5eab 3121 hammer_unlock(&cursor.ip->lock);
bcac4bbb 3122 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
ddfdf542
MD
3123 dip->hmp->asof,
3124 cursor.data->entry.localization,
3125 0, &error);
269c5eab 3126 hammer_lock_sh(&cursor.ip->lock);
46fe7ae1 3127 if (error == ENOENT) {
4c286c36
MD
3128 kprintf("HAMMER: WARNING: Removing "
3129 "dirent w/missing inode \"%s\"\n"
3130 "\tobj_id = %016llx\n",
3131 ncp->nc_name,
3132 (long long)cursor.data->entry.obj_id);
3133 error = 0;
46fe7ae1 3134 }
1f07f686
MD
3135
3136 /*
d7e278bb
MD
3137 * If isdir >= 0 we validate that the entry is or is not a
3138 * directory. If isdir < 0 we don't care.
3139 */
4c286c36 3140 if (error == 0 && isdir >= 0 && ip) {
d7e278bb
MD
3141 if (isdir &&
3142 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3143 error = ENOTDIR;
3144 } else if (isdir == 0 &&
3145 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3146 error = EISDIR;
3147 }
3148 }
3149
3150 /*
1f07f686
MD
3151 * If we are trying to remove a directory the directory must
3152 * be empty.
3153 *
3f9b4cfa
MD
3154 * The check directory code can loop and deadlock/retry. Our
3155 * own cursor's node locks must be released to avoid a 3-way
3156 * deadlock with the flusher if the check directory code
3157 * blocks.
3158 *
3159 * If any changes whatsoever have been made to the cursor
3160 * set EDEADLK and retry.
c9ce54d6
MD
3161 *
3162 * WARNING: See warnings in hammer_unlock_cursor()
3163 * function.
1f07f686 3164 */
4c286c36
MD
3165 if (error == 0 && ip && ip->ino_data.obj_type ==
3166 HAMMER_OBJTYPE_DIRECTORY) {
3f9b4cfa 3167 hammer_unlock_cursor(&cursor);
98f7132d 3168 error = hammer_ip_check_directory_empty(trans, ip);
3f9b4cfa
MD
3169 hammer_lock_cursor(&cursor);
3170 if (cursor.flags & HAMMER_CURSOR_RETEST) {
3171 kprintf("HAMMER: Warning: avoided deadlock "
3172 "on rmdir '%s'\n",
3173 ncp->nc_name);
3174 error = EDEADLK;
3175 }
b3deaf57 3176 }
1f07f686 3177
6a37e7e4 3178 /*
1f07f686
MD
3179 * Delete the directory entry.
3180 *
6a37e7e4 3181 * WARNING: hammer_ip_del_directory() may have to terminate
1f07f686 3182 * the cursor to avoid a deadlock. It is ok to call
6a37e7e4
MD
3183 * hammer_done_cursor() twice.
3184 */
b84de5af 3185 if (error == 0) {
b84de5af
MD
3186 error = hammer_ip_del_directory(trans, &cursor,
3187 dip, ip);
b84de5af 3188 }
269c5eab 3189 hammer_done_cursor(&cursor);
8cd0a023
MD
3190 if (error == 0) {
3191 cache_setunresolved(nch);
3192 cache_setvp(nch, NULL);
3193 /* XXX locking */
4c286c36 3194 if (ip && ip->vp) {
fbb84158 3195 hammer_knote(ip->vp, NOTE_DELETE);
8cd0a023 3196 cache_inval_vp(ip->vp, CINV_DESTROY);
fbb84158 3197 }
8cd0a023 3198 }
af209b0f
MD
3199 if (ip)
3200 hammer_rel_inode(ip, 0);
269c5eab
MD
3201 } else {
3202 hammer_done_cursor(&cursor);
66325755 3203 }
6a37e7e4
MD
3204 if (error == EDEADLK)
3205 goto retry;
9c448776 3206
66325755 3207 return (error);
66325755
MD
3208}
3209
7a04d74f
MD
3210/************************************************************************
3211 * FIFO AND SPECFS OPS *
3212 ************************************************************************
3213 *
3214 */
3215
3216static int
3217hammer_vop_fifoclose (struct vop_close_args *ap)
3218{
3219 /* XXX update itimes */
3220 return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3221}
3222
3223static int
3224hammer_vop_fiforead (struct vop_read_args *ap)
3225{
3226 int error;
3227
3228 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3229 /* XXX update access time */
3230 return (error);
3231}
3232
3233static int
3234hammer_vop_fifowrite (struct vop_write_args *ap)
3235{
3236 int error;
3237
3238 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3239 /* XXX update access time */
3240 return (error);
3241}
3242
fbb84158
MD
3243static
3244int
3245hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3246{
3247 int error;
3248
3249 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3250 if (error)
3251 error = hammer_vop_kqfilter(ap);
3252 return(error);
3253}
3254
fbb84158
MD
3255/************************************************************************
3256 * KQFILTER OPS *
3257 ************************************************************************
3258 *
3259 */
3260static void filt_hammerdetach(struct knote *kn);
3261static int filt_hammerread(struct knote *kn, long hint);
3262static int filt_hammerwrite(struct knote *kn, long hint);