HAMMER VFS - Add debugging for write I/O
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
CommitLineData
427e5fc6 1/*
b84de5af 2 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
427e5fc6
MD
3 *
4 * This code is derived from software contributed to The DragonFly Project
5 * by Matthew Dillon <dillon@backplane.com>
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 *
11 * 1. Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 * 3. Neither the name of The DragonFly Project nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific, prior written permission.
20 *
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
25 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 *
fbb84158 34 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
427e5fc6
MD
35 */
36
37#include <sys/param.h>
38#include <sys/systm.h>
39#include <sys/kernel.h>
40#include <sys/fcntl.h>
41#include <sys/namecache.h>
42#include <sys/vnode.h>
43#include <sys/lockf.h>
44#include <sys/event.h>
45#include <sys/stat.h>
b3deaf57 46#include <sys/dirent.h>
fbb84158 47#include <sys/file.h>
c0ade690 48#include <vm/vm_extern.h>
7a04d74f 49#include <vfs/fifofs/fifo.h>
684a93c4
MD
50
51#include <sys/mplock2.h>
52
427e5fc6
MD
53#include "hammer.h"
54
55/*
56 * USERFS VNOPS
57 */
58/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
66325755
MD
/* Operations shared by the general-purpose vnode vector. */
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_markatime(struct vop_markatime_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);
static int hammer_vop_kqfilter(struct vop_kqfilter_args *ap);

/* Operations installed only in the fifo vector. */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);
static int hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap);
7a04d74f 93
427e5fc6
MD
94struct vop_ops hammer_vnode_vops = {
95 .vop_default = vop_defaultop,
96 .vop_fsync = hammer_vop_fsync,
c0ade690
MD
97 .vop_getpages = vop_stdgetpages,
98 .vop_putpages = vop_stdputpages,
427e5fc6
MD
99 .vop_read = hammer_vop_read,
100 .vop_write = hammer_vop_write,
101 .vop_access = hammer_vop_access,
102 .vop_advlock = hammer_vop_advlock,
103 .vop_close = hammer_vop_close,
104 .vop_ncreate = hammer_vop_ncreate,
105 .vop_getattr = hammer_vop_getattr,
106 .vop_inactive = hammer_vop_inactive,
107 .vop_reclaim = hammer_vop_reclaim,
108 .vop_nresolve = hammer_vop_nresolve,
109 .vop_nlookupdotdot = hammer_vop_nlookupdotdot,
110 .vop_nlink = hammer_vop_nlink,
111 .vop_nmkdir = hammer_vop_nmkdir,
112 .vop_nmknod = hammer_vop_nmknod,
113 .vop_open = hammer_vop_open,
64950f31 114 .vop_pathconf = vop_stdpathconf,
427e5fc6
MD
115 .vop_print = hammer_vop_print,
116 .vop_readdir = hammer_vop_readdir,
117 .vop_readlink = hammer_vop_readlink,
118 .vop_nremove = hammer_vop_nremove,
119 .vop_nrename = hammer_vop_nrename,
120 .vop_nrmdir = hammer_vop_nrmdir,
349433c9 121 .vop_markatime = hammer_vop_markatime,
427e5fc6 122 .vop_setattr = hammer_vop_setattr,
a99b9ea2 123 .vop_bmap = hammer_vop_bmap,
427e5fc6
MD
124 .vop_strategy = hammer_vop_strategy,
125 .vop_nsymlink = hammer_vop_nsymlink,
7dc57964 126 .vop_nwhiteout = hammer_vop_nwhiteout,
513ca7d7 127 .vop_ioctl = hammer_vop_ioctl,
fbb84158
MD
128 .vop_mountctl = hammer_vop_mountctl,
129 .vop_kqfilter = hammer_vop_kqfilter
427e5fc6
MD
130};
131
7a04d74f 132struct vop_ops hammer_spec_vops = {
8be7edad 133 .vop_default = vop_defaultop,
7a04d74f 134 .vop_fsync = hammer_vop_fsync,
8be7edad
MD
135 .vop_read = vop_stdnoread,
136 .vop_write = vop_stdnowrite,
7a04d74f 137 .vop_access = hammer_vop_access,
8be7edad 138 .vop_close = hammer_vop_close,
349433c9 139 .vop_markatime = hammer_vop_markatime,
8be7edad 140 .vop_getattr = hammer_vop_getattr,
7a04d74f
MD
141 .vop_inactive = hammer_vop_inactive,
142 .vop_reclaim = hammer_vop_reclaim,
143 .vop_setattr = hammer_vop_setattr
144};
145
146struct vop_ops hammer_fifo_vops = {
147 .vop_default = fifo_vnoperate,
148 .vop_fsync = hammer_vop_fsync,
149 .vop_read = hammer_vop_fiforead,
150 .vop_write = hammer_vop_fifowrite,
151 .vop_access = hammer_vop_access,
152 .vop_close = hammer_vop_fifoclose,
349433c9 153 .vop_markatime = hammer_vop_markatime,
7a04d74f
MD
154 .vop_getattr = hammer_vop_getattr,
155 .vop_inactive = hammer_vop_inactive,
156 .vop_reclaim = hammer_vop_reclaim,
fbb84158
MD
157 .vop_setattr = hammer_vop_setattr,
158 .vop_kqfilter = hammer_vop_fifokqfilter
7a04d74f
MD
159};
160
fbb84158
MD
161static __inline
162void
163hammer_knote(struct vnode *vp, int flags)
164{
165 if (flags)
166 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
167}
168
0832c9bb
MD
169#ifdef DEBUG_TRUNCATE
170struct hammer_inode *HammerTruncIp;
171#endif
172
b84de5af 173static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
d7e278bb
MD
174 struct vnode *dvp, struct ucred *cred,
175 int flags, int isdir);
8cd0a023
MD
176static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
177static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
178
427e5fc6
MD
#if 0
/*
 * Generic dispatcher through hammer_vnode_vops.  Currently compiled
 * out; the parameter is named so the body builds if it is re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
187
66325755
MD
188/*
189 * hammer_vop_fsync { vp, waitfor }
ddfdf542
MD
190 *
191 * fsync() an inode to disk and wait for it to be completely committed
192 * such that the information would not be undone if a crash occured after
193 * return.
6f3d87c0
MD
194 *
195 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
196 * a REDO log. A sysctl is provided to relax HAMMER's fsync()
197 * operation.
198 *
199 * Ultimately the combination of a REDO log and use of fast storage
200 * to front-end cluster caches will make fsync fast, but it aint
201 * here yet. And, in anycase, we need real transactional
202 * all-or-nothing features which are not restricted to a single file.
66325755 203 */
427e5fc6
MD
204static
205int
66325755 206hammer_vop_fsync(struct vop_fsync_args *ap)
427e5fc6 207{
b84de5af 208 hammer_inode_t ip = VTOI(ap->a_vp);
6f3d87c0
MD
209 int waitfor = ap->a_waitfor;
210
211 /*
212 * Fsync rule relaxation (default disabled)
213 */
214 if (ap->a_flags & VOP_FSYNC_SYSCALL) {
215 switch(hammer_fsync_mode) {
216 case 0:
217 /* full semantics */
218 break;
219 case 1:
220 /* asynchronous */
221 if (waitfor == MNT_WAIT)
222 waitfor = MNT_NOWAIT;
223 break;
224 case 2:
225 /* synchronous fsync on close */
226 ip->flags |= HAMMER_INODE_CLOSESYNC;
227 return(0);
228 case 3:
229 /* asynchronous fsync on close */
230 ip->flags |= HAMMER_INODE_CLOSEASYNC;
231 return(0);
232 default:
233 /* ignore the fsync() system call */
234 return(0);
235 }
236 }
c0ade690 237
6f3d87c0
MD
238 /*
239 * Go do it
240 */
7a61b85d 241 ++hammer_count_fsyncs;
6f3d87c0 242 vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
af209b0f 243 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
6f3d87c0 244 if (waitfor == MNT_WAIT) {
b424ca30 245 vn_unlock(ap->a_vp);
b84de5af 246 hammer_wait_inode(ip);
b424ca30
MD
247 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
248 }
059819e3 249 return (ip->error);
427e5fc6
MD
250}
251
66325755
MD
252/*
253 * hammer_vop_read { vp, uio, ioflag, cred }
42cd5131
MD
254 *
255 * MPALMOSTSAFE
66325755 256 */
427e5fc6
MD
257static
258int
66325755 259hammer_vop_read(struct vop_read_args *ap)
427e5fc6 260{
66325755 261 struct hammer_transaction trans;
c0ade690 262 hammer_inode_t ip;
66325755
MD
263 off_t offset;
264 struct buf *bp;
265 struct uio *uio;
266 int error;
267 int n;
8cd0a023 268 int seqcount;
4a2796f3
MD
269 int ioseqcount;
270 int blksize;
899eb297 271 int got_mplock;
f864373f 272 int bigread;
66325755
MD
273
274 if (ap->a_vp->v_type != VREG)
275 return (EINVAL);
276 ip = VTOI(ap->a_vp);
277 error = 0;
4a2796f3
MD
278 uio = ap->a_uio;
279
280 /*
281 * Allow the UIO's size to override the sequential heuristic.
282 */
283 blksize = hammer_blocksize(uio->uio_offset);
284 seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
285 ioseqcount = ap->a_ioflag >> 16;
286 if (seqcount < ioseqcount)
287 seqcount = ioseqcount;
66325755 288
7ff770b4
MD
289 /*
290 * Temporary hack until more of HAMMER can be made MPSAFE.
291 */
292#ifdef SMP
899eb297
MD
293 if (curthread->td_mpcount) {
294 got_mplock = -1;
295 hammer_start_transaction(&trans, ip->hmp);
296 } else {
297 got_mplock = 0;
298 }
7ff770b4
MD
299#else
300 hammer_start_transaction(&trans, ip->hmp);
301 got_mplock = -1;
302#endif
899eb297 303
f864373f
MD
304 /*
305 * If reading or writing a huge amount of data we have to break
306 * atomicy and allow the operation to be interrupted by a signal
307 * or it can DOS the machine.
308 */
309 bigread = (uio->uio_resid > 100 * 1024 * 1024);
310
66325755 311 /*
4a2796f3
MD
312 * Access the data typically in HAMMER_BUFSIZE blocks via the
313 * buffer cache, but HAMMER may use a variable block size based
314 * on the offset.
42cd5131
MD
315 *
316 * XXX Temporary hack, delay the start transaction while we remain
317 * MPSAFE. NOTE: ino_data.size cannot change while vnode is
318 * locked-shared.
66325755 319 */
11ad5ade 320 while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
4a2796f3
MD
321 int64_t base_offset;
322 int64_t file_limit;
323
324 blksize = hammer_blocksize(uio->uio_offset);
325 offset = (int)uio->uio_offset & (blksize - 1);
326 base_offset = uio->uio_offset - offset;
327
f864373f
MD
328 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
329 break;
330
42cd5131
MD
331 /*
332 * MPSAFE
333 */
334 bp = getcacheblk(ap->a_vp, base_offset);
335 if (bp) {
336 error = 0;
337 goto skip;
338 }
339
340 /*
341 * MPUNSAFE
342 */
343 if (got_mplock == 0) {
344 got_mplock = 1;
345 get_mplock();
346 hammer_start_transaction(&trans, ip->hmp);
347 }
348
1b0ab2c3 349 if (hammer_cluster_enable) {
4a2796f3
MD
350 /*
351 * Use file_limit to prevent cluster_read() from
352 * creating buffers of the wrong block size past
353 * the demarc.
354 */
355 file_limit = ip->ino_data.size;
356 if (base_offset < HAMMER_XDEMARC &&
357 file_limit > HAMMER_XDEMARC) {
358 file_limit = HAMMER_XDEMARC;
359 }
360 error = cluster_read(ap->a_vp,
361 file_limit, base_offset,
362 blksize, MAXPHYS,
363 seqcount, &bp);
a99b9ea2 364 } else {
4a2796f3 365 error = bread(ap->a_vp, base_offset, blksize, &bp);
a99b9ea2 366 }
66325755
MD
367 if (error) {
368 brelse(bp);
369 break;
370 }
42cd5131 371skip:
7bc5b8c2 372
c0ade690 373 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
4a2796f3 374 n = blksize - offset;
66325755
MD
375 if (n > uio->uio_resid)
376 n = uio->uio_resid;
11ad5ade
MD
377 if (n > ip->ino_data.size - uio->uio_offset)
378 n = (int)(ip->ino_data.size - uio->uio_offset);
66325755 379 error = uiomove((char *)bp->b_data + offset, n, uio);
7bc5b8c2
MD
380
381 /* data has a lower priority then meta-data */
382 bp->b_flags |= B_AGE;
66325755 383 bqrelse(bp);
af209b0f
MD
384 if (error)
385 break;
ce0138a6 386 hammer_stats_file_read += n;
66325755 387 }
42cd5131
MD
388
389 /*
390 * XXX only update the atime if we had to get the MP lock.
391 * XXX hack hack hack, fixme.
392 */
393 if (got_mplock) {
394 if ((ip->flags & HAMMER_INODE_RO) == 0 &&
395 (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
396 ip->ino_data.atime = trans.time;
397 hammer_modify_inode(ip, HAMMER_INODE_ATIME);
398 }
399 hammer_done_transaction(&trans);
899eb297
MD
400 if (got_mplock > 0)
401 rel_mplock();
b84de5af 402 }
66325755 403 return (error);
427e5fc6
MD
404}
405
66325755
MD
406/*
407 * hammer_vop_write { vp, uio, ioflag, cred }
408 */
427e5fc6
MD
409static
410int
66325755 411hammer_vop_write(struct vop_write_args *ap)
427e5fc6 412{
66325755
MD
413 struct hammer_transaction trans;
414 struct hammer_inode *ip;
4a2796f3 415 hammer_mount_t hmp;
66325755 416 struct uio *uio;
4a2796f3 417 int offset;
47637bff 418 off_t base_offset;
66325755 419 struct buf *bp;
fbb84158 420 int kflags;
66325755
MD
421 int error;
422 int n;
c0ade690 423 int flags;
cb51be26 424 int seqcount;
f864373f 425 int bigwrite;
66325755
MD
426
427 if (ap->a_vp->v_type != VREG)
428 return (EINVAL);
429 ip = VTOI(ap->a_vp);
4a2796f3 430 hmp = ip->hmp;
66325755 431 error = 0;
fbb84158 432 kflags = 0;
cb51be26 433 seqcount = ap->a_ioflag >> 16;
66325755 434
d113fda1
MD
435 if (ip->flags & HAMMER_INODE_RO)
436 return (EROFS);
437
66325755
MD
438 /*
439 * Create a transaction to cover the operations we perform.
440 */
4a2796f3 441 hammer_start_transaction(&trans, hmp);
66325755
MD
442 uio = ap->a_uio;
443
444 /*
445 * Check append mode
446 */
447 if (ap->a_ioflag & IO_APPEND)
11ad5ade 448 uio->uio_offset = ip->ino_data.size;
66325755
MD
449
450 /*
af209b0f
MD
451 * Check for illegal write offsets. Valid range is 0...2^63-1.
452 *
453 * NOTE: the base_off assignment is required to work around what
454 * I consider to be a GCC-4 optimization bug.
66325755 455 */
af209b0f
MD
456 if (uio->uio_offset < 0) {
457 hammer_done_transaction(&trans);
458 return (EFBIG);
459 }
460 base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
e54488bb 461 if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
b84de5af 462 hammer_done_transaction(&trans);
66325755 463 return (EFBIG);
9c448776 464 }
66325755 465
f864373f
MD
466 /*
467 * If reading or writing a huge amount of data we have to break
468 * atomicy and allow the operation to be interrupted by a signal
469 * or it can DOS the machine.
470 */
471 bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
472
66325755 473 /*
4a2796f3
MD
474 * Access the data typically in HAMMER_BUFSIZE blocks via the
475 * buffer cache, but HAMMER may use a variable block size based
476 * on the offset.
66325755
MD
477 */
478 while (uio->uio_resid > 0) {
d5ef456e 479 int fixsize = 0;
4a2796f3
MD
480 int blksize;
481 int blkmask;
d5ef456e 482
93291532 483 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
e63644f0 484 break;
f864373f
MD
485 if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
486 break;
e63644f0 487
a9d52b76
MD
488 blksize = hammer_blocksize(uio->uio_offset);
489
059819e3 490 /*
4a2796f3
MD
491 * Do not allow HAMMER to blow out the buffer cache. Very
492 * large UIOs can lockout other processes due to bwillwrite()
493 * mechanics.
47637bff 494 *
df301614
MD
495 * The hammer inode is not locked during these operations.
496 * The vnode is locked which can interfere with the pageout
497 * daemon for non-UIO_NOCOPY writes but should not interfere
498 * with the buffer cache. Even so, we cannot afford to
499 * allow the pageout daemon to build up too many dirty buffer
500 * cache buffers.
cb63d1bc
MD
501 *
502 * Only call this if we aren't being recursively called from
503 * a virtual disk device (vn), else we may deadlock.
df301614 504 */
cb63d1bc
MD
505 if ((ap->a_ioflag & IO_RECURSE) == 0)
506 bwillwrite(blksize);
df301614 507
de996e86
MD
508 /*
509 * Control the number of pending records associated with
510 * this inode. If too many have accumulated start a
511 * flush. Try to maintain a pipeline with the flusher.
512 */
513 if (ip->rsv_recs >= hammer_limit_inode_recs) {
514 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
515 }
516 if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
517 while (ip->rsv_recs >= hammer_limit_inode_recs) {
518 tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
519 }
520 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
521 }
522
523#if 0
df301614 524 /*
e4a5ff06
MD
525 * Do not allow HAMMER to blow out system memory by
526 * accumulating too many records. Records are so well
527 * decoupled from the buffer cache that it is possible
528 * for userland to push data out to the media via
529 * direct-write, but build up the records queued to the
530 * backend faster then the backend can flush them out.
531 * HAMMER has hit its write limit but the frontend has
532 * no pushback to slow it down.
059819e3 533 */
df301614 534 if (hmp->rsv_recs > hammer_limit_recs / 2) {
4a2796f3 535 /*
df301614 536 * Get the inode on the flush list
4a2796f3 537 */
df301614
MD
538 if (ip->rsv_recs >= 64)
539 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
540 else if (ip->rsv_recs >= 16)
541 hammer_flush_inode(ip, 0);
4a2796f3
MD
542
543 /*
df301614
MD
544 * Keep the flusher going if the system keeps
545 * queueing records.
4a2796f3 546 */
df301614
MD
547 delta = hmp->count_newrecords -
548 hmp->last_newrecords;
549 if (delta < 0 || delta > hammer_limit_recs / 2) {
550 hmp->last_newrecords = hmp->count_newrecords;
551 hammer_sync_hmp(hmp, MNT_NOWAIT);
4a2796f3
MD
552 }
553
df301614
MD
554 /*
555 * If we have gotten behind start slowing
556 * down the writers.
557 */
558 delta = (hmp->rsv_recs - hammer_limit_recs) *
559 hz / hammer_limit_recs;
560 if (delta > 0)
561 tsleep(&trans, 0, "hmrslo", delta);
059819e3 562 }
de996e86 563#endif
059819e3 564
4a2796f3
MD
565 /*
566 * Calculate the blocksize at the current offset and figure
567 * out how much we can actually write.
568 */
4a2796f3
MD
569 blkmask = blksize - 1;
570 offset = (int)uio->uio_offset & blkmask;
571 base_offset = uio->uio_offset & ~(int64_t)blkmask;
572 n = blksize - offset;
d5ef456e
MD
573 if (n > uio->uio_resid)
574 n = uio->uio_resid;
11ad5ade 575 if (uio->uio_offset + n > ip->ino_data.size) {
d5ef456e
MD
576 vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
577 fixsize = 1;
fbb84158 578 kflags |= NOTE_EXTEND;
d5ef456e
MD
579 }
580
c0ade690
MD
581 if (uio->uio_segflg == UIO_NOCOPY) {
582 /*
583 * Issuing a write with the same data backing the
584 * buffer. Instantiate the buffer to collect the
585 * backing vm pages, then read-in any missing bits.
586 *
587 * This case is used by vop_stdputpages().
588 */
47637bff 589 bp = getblk(ap->a_vp, base_offset,
4a2796f3 590 blksize, GETBLK_BHEAVY, 0);
c0ade690
MD
591 if ((bp->b_flags & B_CACHE) == 0) {
592 bqrelse(bp);
47637bff 593 error = bread(ap->a_vp, base_offset,
4a2796f3 594 blksize, &bp);
c0ade690 595 }
4a2796f3 596 } else if (offset == 0 && uio->uio_resid >= blksize) {
c0ade690 597 /*
a5fddc16
MD
598 * Even though we are entirely overwriting the buffer
599 * we may still have to zero it out to avoid a
600 * mmap/write visibility issue.
c0ade690 601 */
4a2796f3 602 bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
a5fddc16
MD
603 if ((bp->b_flags & B_CACHE) == 0)
604 vfs_bio_clrbuf(bp);
47637bff 605 } else if (base_offset >= ip->ino_data.size) {
c0ade690 606 /*
a5fddc16
MD
607 * If the base offset of the buffer is beyond the
608 * file EOF, we don't have to issue a read.
c0ade690 609 */
47637bff 610 bp = getblk(ap->a_vp, base_offset,
4a2796f3 611 blksize, GETBLK_BHEAVY, 0);
66325755
MD
612 vfs_bio_clrbuf(bp);
613 } else {
c0ade690
MD
614 /*
615 * Partial overwrite, read in any missing bits then
616 * replace the portion being written.
617 */
4a2796f3 618 error = bread(ap->a_vp, base_offset, blksize, &bp);
d5ef456e
MD
619 if (error == 0)
620 bheavy(bp);
66325755 621 }
47637bff 622 if (error == 0) {
4a2796f3 623 error = uiomove((char *)bp->b_data + offset,
47637bff
MD
624 n, uio);
625 }
d5ef456e
MD
626
627 /*
628 * If we screwed up we have to undo any VM size changes we
629 * made.
630 */
66325755
MD
631 if (error) {
632 brelse(bp);
d5ef456e 633 if (fixsize) {
11ad5ade 634 vtruncbuf(ap->a_vp, ip->ino_data.size,
4a2796f3 635 hammer_blocksize(ip->ino_data.size));
d5ef456e 636 }
66325755
MD
637 break;
638 }
fbb84158 639 kflags |= NOTE_WRITE;
ce0138a6 640 hammer_stats_file_write += n;
c0ade690 641 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
11ad5ade
MD
642 if (ip->ino_data.size < uio->uio_offset) {
643 ip->ino_data.size = uio->uio_offset;
644 flags = HAMMER_INODE_DDIRTY;
645 vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
c0ade690 646 } else {
d113fda1 647 flags = 0;
66325755 648 }
11ad5ade 649 ip->ino_data.mtime = trans.time;
ddfdf542 650 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
47637bff 651 hammer_modify_inode(ip, flags);
32c90105 652
1b0ab2c3
MD
653 /*
654 * Once we dirty the buffer any cached zone-X offset
655 * becomes invalid. HAMMER NOTE: no-history mode cannot
656 * allow overwriting over the same data sector unless
657 * we provide UNDOs for the old data, which we don't.
658 */
659 bp->b_bio2.bio_offset = NOOFFSET;
660
47637bff
MD
661 /*
662 * Final buffer disposition.
de996e86
MD
663 *
664 * Because meta-data updates are deferred, HAMMER is
665 * especially sensitive to excessive bdwrite()s because
666 * the I/O stream is not broken up by disk reads. So the
667 * buffer cache simply cannot keep up.
668 *
669 * WARNING! blksize is variable. cluster_write() is
670 * expected to not blow up if it encounters buffers that
671 * do not match the passed blksize.
710733a6
MD
672 *
673 * NOTE! Hammer shouldn't need to bawrite()/cluster_write().
674 * The ip->rsv_recs check should burst-flush the data.
675 * If we queue it immediately the buf could be left
676 * locked on the device queue for a very long time.
47637bff 677 */
cb51be26 678 bp->b_flags |= B_AGE;
66325755
MD
679 if (ap->a_ioflag & IO_SYNC) {
680 bwrite(bp);
681 } else if (ap->a_ioflag & IO_DIRECT) {
66325755 682 bawrite(bp);
710733a6
MD
683 } else {
684#if 0
685 if (offset + n == blksize) {
de996e86
MD
686 if (hammer_cluster_enable == 0 ||
687 (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
688 bawrite(bp);
689 } else {
690 cluster_write(bp, ip->ino_data.size,
691 blksize, seqcount);
692 }
4a2796f3 693 } else {
710733a6 694#endif
4a2796f3
MD
695 bdwrite(bp);
696 }
66325755 697 }
b84de5af 698 hammer_done_transaction(&trans);
fbb84158 699 hammer_knote(ap->a_vp, kflags);
66325755 700 return (error);
427e5fc6
MD
701}
702
66325755
MD
703/*
704 * hammer_vop_access { vp, mode, cred }
705 */
427e5fc6
MD
706static
707int
66325755 708hammer_vop_access(struct vop_access_args *ap)
427e5fc6 709{
66325755
MD
710 struct hammer_inode *ip = VTOI(ap->a_vp);
711 uid_t uid;
712 gid_t gid;
713 int error;
714
ce0138a6 715 ++hammer_stats_file_iopsr;
66325755
MD
716 uid = hammer_to_unix_xid(&ip->ino_data.uid);
717 gid = hammer_to_unix_xid(&ip->ino_data.gid);
718
719 error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
720 ip->ino_data.uflags);
721 return (error);
427e5fc6
MD
722}
723
66325755
MD
724/*
725 * hammer_vop_advlock { vp, id, op, fl, flags }
726 */
427e5fc6
MD
727static
728int
66325755 729hammer_vop_advlock(struct vop_advlock_args *ap)
427e5fc6 730{
4a2796f3 731 hammer_inode_t ip = VTOI(ap->a_vp);
66325755 732
11ad5ade 733 return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
427e5fc6
MD
734}
735
66325755
MD
736/*
737 * hammer_vop_close { vp, fflag }
6f3d87c0
MD
738 *
739 * We can only sync-on-close for normal closes.
66325755 740 */
427e5fc6
MD
741static
742int
66325755 743hammer_vop_close(struct vop_close_args *ap)
427e5fc6 744{
6f3d87c0
MD
745 struct vnode *vp = ap->a_vp;
746 hammer_inode_t ip = VTOI(vp);
747 int waitfor;
748
749 if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
750 if (vn_islocked(vp) == LK_EXCLUSIVE &&
751 (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
752 if (ip->flags & HAMMER_INODE_CLOSESYNC)
753 waitfor = MNT_WAIT;
754 else
755 waitfor = MNT_NOWAIT;
756 ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
757 HAMMER_INODE_CLOSEASYNC);
758 VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
759 }
760 }
a89aec1b 761 return (vop_stdclose(ap));
427e5fc6
MD
762}
763
66325755
MD
764/*
765 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
766 *
767 * The operating system has already ensured that the directory entry
768 * does not exist and done all appropriate namespace locking.
769 */
427e5fc6
MD
770static
771int
66325755 772hammer_vop_ncreate(struct vop_ncreate_args *ap)
427e5fc6 773{
66325755
MD
774 struct hammer_transaction trans;
775 struct hammer_inode *dip;
776 struct hammer_inode *nip;
777 struct nchandle *nch;
778 int error;
779
780 nch = ap->a_nch;
781 dip = VTOI(ap->a_dvp);
782
d113fda1
MD
783 if (dip->flags & HAMMER_INODE_RO)
784 return (EROFS);
93291532 785 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 786 return (error);
d113fda1 787
66325755
MD
788 /*
789 * Create a transaction to cover the operations we perform.
790 */
8cd0a023 791 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 792 ++hammer_stats_file_iopsw;
66325755
MD
793
794 /*
795 * Create a new filesystem object of the requested type. The
b84de5af
MD
796 * returned inode will be referenced and shared-locked to prevent
797 * it from being moved to the flusher.
66325755 798 */
5a930e66 799 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
5a64efa1
MD
800 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
801 NULL, &nip);
66325755 802 if (error) {
77062c8a 803 hkprintf("hammer_create_inode error %d\n", error);
b84de5af 804 hammer_done_transaction(&trans);
66325755
MD
805 *ap->a_vpp = NULL;
806 return (error);
807 }
66325755
MD
808
809 /*
810 * Add the new filesystem object to the directory. This will also
811 * bump the inode's link count.
812 */
5a930e66
MD
813 error = hammer_ip_add_directory(&trans, dip,
814 nch->ncp->nc_name, nch->ncp->nc_nlen,
815 nip);
0b075555 816 if (error)
77062c8a 817 hkprintf("hammer_ip_add_directory error %d\n", error);
66325755
MD
818
819 /*
820 * Finish up.
821 */
822 if (error) {
a89aec1b 823 hammer_rel_inode(nip, 0);
b84de5af 824 hammer_done_transaction(&trans);
66325755
MD
825 *ap->a_vpp = NULL;
826 } else {
e8599db1 827 error = hammer_get_vnode(nip, ap->a_vpp);
b84de5af 828 hammer_done_transaction(&trans);
a89aec1b
MD
829 hammer_rel_inode(nip, 0);
830 if (error == 0) {
831 cache_setunresolved(ap->a_nch);
832 cache_setvp(ap->a_nch, *ap->a_vpp);
833 }
fbb84158 834 hammer_knote(ap->a_dvp, NOTE_WRITE);
66325755
MD
835 }
836 return (error);
427e5fc6
MD
837}
838
66325755
MD
839/*
840 * hammer_vop_getattr { vp, vap }
98f7132d
MD
841 *
842 * Retrieve an inode's attribute information. When accessing inodes
843 * historically we fake the atime field to ensure consistent results.
844 * The atime field is stored in the B-Tree element and allowed to be
845 * updated without cycling the element.
899eb297
MD
846 *
847 * MPSAFE
66325755 848 */
427e5fc6
MD
849static
850int
66325755 851hammer_vop_getattr(struct vop_getattr_args *ap)
427e5fc6 852{
66325755
MD
853 struct hammer_inode *ip = VTOI(ap->a_vp);
854 struct vattr *vap = ap->a_vap;
855
a56cb012
MD
856 /*
857 * We want the fsid to be different when accessing a filesystem
858 * with different as-of's so programs like diff don't think
859 * the files are the same.
860 *
861 * We also want the fsid to be the same when comparing snapshots,
862 * or when comparing mirrors (which might be backed by different
863 * physical devices). HAMMER fsids are based on the PFS's
864 * shared_uuid field.
865 *
866 * XXX there is a chance of collision here. The va_fsid reported
867 * by stat is different from the more involved fsid used in the
868 * mount structure.
c82af904 869 */
ce0138a6 870 ++hammer_stats_file_iopsr;
899eb297 871 hammer_lock_sh(&ip->lock);
a56cb012
MD
872 vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
873 (u_int32_t)(ip->obj_asof >> 32);
874
11ad5ade 875 vap->va_fileid = ip->ino_leaf.base.obj_id;
66325755 876 vap->va_mode = ip->ino_data.mode;
11ad5ade 877 vap->va_nlink = ip->ino_data.nlinks;
66325755
MD
878 vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
879 vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
880 vap->va_rmajor = 0;
881 vap->va_rminor = 0;
11ad5ade 882 vap->va_size = ip->ino_data.size;
bcac4bbb 883
f437a2ab
MD
884 /*
885 * Special case for @@PFS softlinks. The actual size of the
886 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
cb3c760c 887 * or for MAX_TID is "@@-1:%05d" == 10 bytes.
f437a2ab
MD
888 */
889 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
890 ip->ino_data.size == 10 &&
891 ip->obj_asof == HAMMER_MAX_TID &&
892 ip->obj_localization == 0 &&
893 strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
cb3c760c
MD
894 if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
895 vap->va_size = 26;
896 else
897 vap->va_size = 10;
f437a2ab
MD
898 }
899
bcac4bbb
MD
900 /*
901 * We must provide a consistent atime and mtime for snapshots
902 * so people can do a 'tar cf - ... | md5' on them and get
903 * consistent results.
904 */
905 if (ip->flags & HAMMER_INODE_RO) {
ddfdf542
MD
906 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
907 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
bcac4bbb 908 } else {
ddfdf542
MD
909 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
910 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
bcac4bbb 911 }
ddfdf542 912 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
66325755
MD
913 vap->va_flags = ip->ino_data.uflags;
914 vap->va_gen = 1; /* hammer inums are unique for all time */
bf686dbe 915 vap->va_blocksize = HAMMER_BUFSIZE;
4a2796f3
MD
916 if (ip->ino_data.size >= HAMMER_XDEMARC) {
917 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
918 ~HAMMER_XBUFMASK64;
919 } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
920 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
921 ~HAMMER_BUFMASK64;
922 } else {
923 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
924 }
64950f31 925
11ad5ade 926 vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
66325755 927 vap->va_filerev = 0; /* XXX */
66325755
MD
928 vap->va_uid_uuid = ip->ino_data.uid;
929 vap->va_gid_uuid = ip->ino_data.gid;
930 vap->va_fsid_uuid = ip->hmp->fsid;
931 vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
932 VA_FSID_UUID_VALID;
7a04d74f 933
11ad5ade 934 switch (ip->ino_data.obj_type) {
7a04d74f
MD
935 case HAMMER_OBJTYPE_CDEV:
936 case HAMMER_OBJTYPE_BDEV:
937 vap->va_rmajor = ip->ino_data.rmajor;
938 vap->va_rminor = ip->ino_data.rminor;
939 break;
940 default:
941 break;
942 }
899eb297 943 hammer_unlock(&ip->lock);
66325755 944 return(0);
427e5fc6
MD
945}
946
66325755
MD
947/*
948 * hammer_vop_nresolve { nch, dvp, cred }
949 *
950 * Locate the requested directory entry.
951 */
427e5fc6
MD
952static
953int
66325755 954hammer_vop_nresolve(struct vop_nresolve_args *ap)
427e5fc6 955{
36f82b23 956 struct hammer_transaction trans;
66325755 957 struct namecache *ncp;
7f7c1f84
MD
958 hammer_inode_t dip;
959 hammer_inode_t ip;
960 hammer_tid_t asof;
8cd0a023 961 struct hammer_cursor cursor;
66325755
MD
962 struct vnode *vp;
963 int64_t namekey;
964 int error;
7f7c1f84
MD
965 int i;
966 int nlen;
d113fda1 967 int flags;
a56cb012 968 int ispfs;
adf01747 969 int64_t obj_id;
ddfdf542 970 u_int32_t localization;
5e435c92 971 u_int32_t max_iterations;
7f7c1f84
MD
972
973 /*
974 * Misc initialization, plus handle as-of name extensions. Look for
975 * the '@@' extension. Note that as-of files and directories cannot
976 * be modified.
7f7c1f84
MD
977 */
978 dip = VTOI(ap->a_dvp);
979 ncp = ap->a_nch->ncp;
980 asof = dip->obj_asof;
bc6c1f13 981 localization = dip->obj_localization; /* for code consistency */
7f7c1f84 982 nlen = ncp->nc_nlen;
ea434b6f 983 flags = dip->flags & HAMMER_INODE_RO;
a56cb012 984 ispfs = 0;
7f7c1f84 985
36f82b23 986 hammer_simple_transaction(&trans, dip->hmp);
ce0138a6 987 ++hammer_stats_file_iopsr;
36f82b23 988
7f7c1f84
MD
989 for (i = 0; i < nlen; ++i) {
990 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
bc6c1f13
MD
991 error = hammer_str_to_tid(ncp->nc_name + i + 2,
992 &ispfs, &asof, &localization);
993 if (error != 0) {
994 i = nlen;
995 break;
996 }
ea434b6f
MD
997 if (asof != HAMMER_MAX_TID)
998 flags |= HAMMER_INODE_RO;
7f7c1f84
MD
999 break;
1000 }
1001 }
1002 nlen = i;
66325755 1003
ea434b6f
MD
1004 /*
1005 * If this is a PFS softlink we dive into the PFS
1006 */
1007 if (ispfs && nlen == 0) {
1008 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
1009 asof, localization,
1010 flags, &error);
1011 if (error == 0) {
1012 error = hammer_get_vnode(ip, &vp);
1013 hammer_rel_inode(ip, 0);
1014 } else {
1015 vp = NULL;
1016 }
1017 if (error == 0) {
1018 vn_unlock(vp);
1019 cache_setvp(ap->a_nch, vp);
1020 vrele(vp);
1021 }
1022 goto done;
1023 }
1024
d113fda1 1025 /*
294aec9f
MD
1026 * If there is no path component the time extension is relative to dip.
1027 * e.g. "fubar/@@<snapshot>"
1028 *
1029 * "." is handled by the kernel, but ".@@<snapshot>" is not.
1030 * e.g. "fubar/.@@<snapshot>"
1031 *
1032 * ".." is handled by the kernel. We do not currently handle
1033 * "..@<snapshot>".
d113fda1 1034 */
294aec9f 1035 if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
bcac4bbb 1036 ip = hammer_get_inode(&trans, dip, dip->obj_id,
ddfdf542
MD
1037 asof, dip->obj_localization,
1038 flags, &error);
d113fda1 1039 if (error == 0) {
e8599db1 1040 error = hammer_get_vnode(ip, &vp);
d113fda1
MD
1041 hammer_rel_inode(ip, 0);
1042 } else {
1043 vp = NULL;
1044 }
1045 if (error == 0) {
1046 vn_unlock(vp);
1047 cache_setvp(ap->a_nch, vp);
1048 vrele(vp);
1049 }
36f82b23 1050 goto done;
d113fda1
MD
1051 }
1052
8cd0a023
MD
1053 /*
1054 * Calculate the namekey and setup the key range for the scan. This
1055 * works kinda like a chained hash table where the lower 32 bits
1056 * of the namekey synthesize the chain.
1057 *
1058 * The key range is inclusive of both key_beg and key_end.
1059 */
5e435c92
MD
1060 namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
1061 &max_iterations);
66325755 1062
bcac4bbb 1063 error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
5a930e66 1064 cursor.key_beg.localization = dip->obj_localization +
beec5dc4 1065 hammer_dir_localization(dip);
8cd0a023
MD
1066 cursor.key_beg.obj_id = dip->obj_id;
1067 cursor.key_beg.key = namekey;
d5530d22 1068 cursor.key_beg.create_tid = 0;
8cd0a023
MD
1069 cursor.key_beg.delete_tid = 0;
1070 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1071 cursor.key_beg.obj_type = 0;
66325755 1072
8cd0a023 1073 cursor.key_end = cursor.key_beg;
5e435c92 1074 cursor.key_end.key += max_iterations;
d5530d22
MD
1075 cursor.asof = asof;
1076 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
66325755
MD
1077
1078 /*
8cd0a023 1079 * Scan all matching records (the chain), locate the one matching
a89aec1b 1080 * the requested path component.
8cd0a023
MD
1081 *
1082 * The hammer_ip_*() functions merge in-memory records with on-disk
1083 * records for the purposes of the search.
66325755 1084 */
6a37e7e4 1085 obj_id = 0;
43c665ae 1086 localization = HAMMER_DEF_LOCALIZATION;
6a37e7e4 1087
4e17f465 1088 if (error == 0) {
4e17f465
MD
1089 error = hammer_ip_first(&cursor);
1090 while (error == 0) {
1091 error = hammer_ip_resolve_data(&cursor);
1092 if (error)
1093 break;
11ad5ade
MD
1094 if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
1095 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1096 obj_id = cursor.data->entry.obj_id;
ddfdf542 1097 localization = cursor.data->entry.localization;
4e17f465
MD
1098 break;
1099 }
1100 error = hammer_ip_next(&cursor);
66325755
MD
1101 }
1102 }
6a37e7e4 1103 hammer_done_cursor(&cursor);
4c286c36
MD
1104
1105 /*
1106 * Lookup the obj_id. This should always succeed. If it does not
1107 * the filesystem may be damaged and we return a dummy inode.
1108 */
66325755 1109 if (error == 0) {
bcac4bbb 1110 ip = hammer_get_inode(&trans, dip, obj_id,
ddfdf542
MD
1111 asof, localization,
1112 flags, &error);
4c286c36
MD
1113 if (error == ENOENT) {
1114 kprintf("HAMMER: WARNING: Missing "
1115 "inode for dirent \"%s\"\n"
3d30bff3
MD
1116 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
1117 ncp->nc_name,
1118 (long long)obj_id, (long long)asof,
1119 localization);
4c286c36
MD
1120 error = 0;
1121 ip = hammer_get_dummy_inode(&trans, dip, obj_id,
1122 asof, localization,
1123 flags, &error);
1124 }
7f7c1f84 1125 if (error == 0) {
e8599db1 1126 error = hammer_get_vnode(ip, &vp);
7f7c1f84
MD
1127 hammer_rel_inode(ip, 0);
1128 } else {
1129 vp = NULL;
1130 }
66325755
MD
1131 if (error == 0) {
1132 vn_unlock(vp);
1133 cache_setvp(ap->a_nch, vp);
1134 vrele(vp);
1135 }
1136 } else if (error == ENOENT) {
1137 cache_setvp(ap->a_nch, NULL);
1138 }
36f82b23 1139done:
b84de5af 1140 hammer_done_transaction(&trans);
66325755 1141 return (error);
427e5fc6
MD
1142}
1143
66325755
MD
1144/*
1145 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1146 *
1147 * Locate the parent directory of a directory vnode.
1148 *
1149 * dvp is referenced but not locked. *vpp must be returned referenced and
1150 * locked. A parent_obj_id of 0 does not necessarily indicate that we are
1151 * at the root, instead it could indicate that the directory we were in was
1152 * removed.
42c7d26b
MD
1153 *
1154 * NOTE: as-of sequences are not linked into the directory structure. If
1155 * we are at the root with a different asof then the mount point, reload
1156 * the same directory with the mount point's asof. I'm not sure what this
1157 * will do to NFS. We encode ASOF stamps in NFS file handles so it might not
1158 * get confused, but it hasn't been tested.
66325755 1159 */
427e5fc6
MD
1160static
1161int
66325755 1162hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
427e5fc6 1163{
36f82b23 1164 struct hammer_transaction trans;
66325755 1165 struct hammer_inode *dip;
d113fda1 1166 struct hammer_inode *ip;
42c7d26b 1167 int64_t parent_obj_id;
5a930e66 1168 u_int32_t parent_obj_localization;
42c7d26b 1169 hammer_tid_t asof;
d113fda1 1170 int error;
66325755
MD
1171
1172 dip = VTOI(ap->a_dvp);
42c7d26b 1173 asof = dip->obj_asof;
5a930e66
MD
1174
1175 /*
1176 * Whos are parent? This could be the root of a pseudo-filesystem
1177 * whos parent is in another localization domain.
1178 */
42c7d26b 1179 parent_obj_id = dip->ino_data.parent_obj_id;
5a930e66
MD
1180 if (dip->obj_id == HAMMER_OBJID_ROOT)
1181 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1182 else
1183 parent_obj_localization = dip->obj_localization;
42c7d26b
MD
1184
1185 if (parent_obj_id == 0) {
1186 if (dip->obj_id == HAMMER_OBJID_ROOT &&
1187 asof != dip->hmp->asof) {
1188 parent_obj_id = dip->obj_id;
1189 asof = dip->hmp->asof;
1190 *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1191 ksnprintf(*ap->a_fakename, 19, "0x%016llx",
973c11b9 1192 (long long)dip->obj_asof);
42c7d26b
MD
1193 } else {
1194 *ap->a_vpp = NULL;
1195 return ENOENT;
1196 }
66325755 1197 }
d113fda1 1198
36f82b23 1199 hammer_simple_transaction(&trans, dip->hmp);
ce0138a6 1200 ++hammer_stats_file_iopsr;
36f82b23 1201
bcac4bbb 1202 ip = hammer_get_inode(&trans, dip, parent_obj_id,
5a930e66 1203 asof, parent_obj_localization,
ddfdf542 1204 dip->flags, &error);
36f82b23 1205 if (ip) {
e8599db1 1206 error = hammer_get_vnode(ip, ap->a_vpp);
36f82b23
MD
1207 hammer_rel_inode(ip, 0);
1208 } else {
d113fda1 1209 *ap->a_vpp = NULL;
d113fda1 1210 }
b84de5af 1211 hammer_done_transaction(&trans);
d113fda1 1212 return (error);
427e5fc6
MD
1213}
1214
66325755
MD
1215/*
1216 * hammer_vop_nlink { nch, dvp, vp, cred }
1217 */
427e5fc6
MD
1218static
1219int
66325755 1220hammer_vop_nlink(struct vop_nlink_args *ap)
427e5fc6 1221{
66325755
MD
1222 struct hammer_transaction trans;
1223 struct hammer_inode *dip;
1224 struct hammer_inode *ip;
1225 struct nchandle *nch;
1226 int error;
1227
f437a2ab
MD
1228 if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1229 return(EXDEV);
1230
66325755
MD
1231 nch = ap->a_nch;
1232 dip = VTOI(ap->a_dvp);
1233 ip = VTOI(ap->a_vp);
1234
f437a2ab
MD
1235 if (dip->obj_localization != ip->obj_localization)
1236 return(EXDEV);
1237
d113fda1
MD
1238 if (dip->flags & HAMMER_INODE_RO)
1239 return (EROFS);
1240 if (ip->flags & HAMMER_INODE_RO)
1241 return (EROFS);
93291532 1242 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1243 return (error);
d113fda1 1244
66325755
MD
1245 /*
1246 * Create a transaction to cover the operations we perform.
1247 */
8cd0a023 1248 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1249 ++hammer_stats_file_iopsw;
66325755
MD
1250
1251 /*
1252 * Add the filesystem object to the directory. Note that neither
1253 * dip nor ip are referenced or locked, but their vnodes are
1254 * referenced. This function will bump the inode's link count.
1255 */
5a930e66
MD
1256 error = hammer_ip_add_directory(&trans, dip,
1257 nch->ncp->nc_name, nch->ncp->nc_nlen,
1258 ip);
66325755
MD
1259
1260 /*
1261 * Finish up.
1262 */
b84de5af 1263 if (error == 0) {
6b4f890b
MD
1264 cache_setunresolved(nch);
1265 cache_setvp(nch, ap->a_vp);
66325755 1266 }
b84de5af 1267 hammer_done_transaction(&trans);
fbb84158
MD
1268 hammer_knote(ap->a_vp, NOTE_LINK);
1269 hammer_knote(ap->a_dvp, NOTE_WRITE);
66325755 1270 return (error);
427e5fc6
MD
1271}
1272
66325755
MD
1273/*
1274 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1275 *
1276 * The operating system has already ensured that the directory entry
1277 * does not exist and done all appropriate namespace locking.
1278 */
427e5fc6
MD
1279static
1280int
66325755 1281hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
427e5fc6 1282{
66325755
MD
1283 struct hammer_transaction trans;
1284 struct hammer_inode *dip;
1285 struct hammer_inode *nip;
1286 struct nchandle *nch;
1287 int error;
1288
1289 nch = ap->a_nch;
1290 dip = VTOI(ap->a_dvp);
1291
d113fda1
MD
1292 if (dip->flags & HAMMER_INODE_RO)
1293 return (EROFS);
93291532 1294 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1295 return (error);
d113fda1 1296
66325755
MD
1297 /*
1298 * Create a transaction to cover the operations we perform.
1299 */
8cd0a023 1300 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1301 ++hammer_stats_file_iopsw;
66325755
MD
1302
1303 /*
1304 * Create a new filesystem object of the requested type. The
8cd0a023 1305 * returned inode will be referenced but not locked.
66325755 1306 */
5a930e66 1307 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
5a64efa1
MD
1308 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1309 NULL, &nip);
66325755 1310 if (error) {
77062c8a 1311 hkprintf("hammer_mkdir error %d\n", error);
b84de5af 1312 hammer_done_transaction(&trans);
66325755
MD
1313 *ap->a_vpp = NULL;
1314 return (error);
1315 }
66325755
MD
1316 /*
1317 * Add the new filesystem object to the directory. This will also
1318 * bump the inode's link count.
1319 */
5a930e66
MD
1320 error = hammer_ip_add_directory(&trans, dip,
1321 nch->ncp->nc_name, nch->ncp->nc_nlen,
1322 nip);
0b075555 1323 if (error)
77062c8a 1324 hkprintf("hammer_mkdir (add) error %d\n", error);
66325755
MD
1325
1326 /*
1327 * Finish up.
1328 */
1329 if (error) {
a89aec1b 1330 hammer_rel_inode(nip, 0);
66325755
MD
1331 *ap->a_vpp = NULL;
1332 } else {
e8599db1 1333 error = hammer_get_vnode(nip, ap->a_vpp);
a89aec1b
MD
1334 hammer_rel_inode(nip, 0);
1335 if (error == 0) {
1336 cache_setunresolved(ap->a_nch);
1337 cache_setvp(ap->a_nch, *ap->a_vpp);
1338 }
66325755 1339 }
b84de5af 1340 hammer_done_transaction(&trans);
fbb84158
MD
1341 if (error == 0)
1342 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
66325755 1343 return (error);
427e5fc6
MD
1344}
1345
66325755
MD
1346/*
1347 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1348 *
1349 * The operating system has already ensured that the directory entry
1350 * does not exist and done all appropriate namespace locking.
1351 */
427e5fc6
MD
1352static
1353int
66325755 1354hammer_vop_nmknod(struct vop_nmknod_args *ap)
427e5fc6 1355{
66325755
MD
1356 struct hammer_transaction trans;
1357 struct hammer_inode *dip;
1358 struct hammer_inode *nip;
1359 struct nchandle *nch;
1360 int error;
1361
1362 nch = ap->a_nch;
1363 dip = VTOI(ap->a_dvp);
1364
d113fda1
MD
1365 if (dip->flags & HAMMER_INODE_RO)
1366 return (EROFS);
93291532 1367 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1368 return (error);
d113fda1 1369
66325755
MD
1370 /*
1371 * Create a transaction to cover the operations we perform.
1372 */
8cd0a023 1373 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1374 ++hammer_stats_file_iopsw;
66325755
MD
1375
1376 /*
1377 * Create a new filesystem object of the requested type. The
8cd0a023 1378 * returned inode will be referenced but not locked.
5a930e66
MD
1379 *
1380 * If mknod specifies a directory a pseudo-fs is created.
66325755 1381 */
5a930e66 1382 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
5a64efa1
MD
1383 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1384 NULL, &nip);
66325755 1385 if (error) {
b84de5af 1386 hammer_done_transaction(&trans);
66325755
MD
1387 *ap->a_vpp = NULL;
1388 return (error);
1389 }
66325755
MD
1390
1391 /*
1392 * Add the new filesystem object to the directory. This will also
1393 * bump the inode's link count.
1394 */
5a930e66
MD
1395 error = hammer_ip_add_directory(&trans, dip,
1396 nch->ncp->nc_name, nch->ncp->nc_nlen,
1397 nip);
66325755
MD
1398
1399 /*
1400 * Finish up.
1401 */
1402 if (error) {
a89aec1b 1403 hammer_rel_inode(nip, 0);
66325755
MD
1404 *ap->a_vpp = NULL;
1405 } else {
e8599db1 1406 error = hammer_get_vnode(nip, ap->a_vpp);
a89aec1b
MD
1407 hammer_rel_inode(nip, 0);
1408 if (error == 0) {
1409 cache_setunresolved(ap->a_nch);
1410 cache_setvp(ap->a_nch, *ap->a_vpp);
1411 }
66325755 1412 }
b84de5af 1413 hammer_done_transaction(&trans);
fbb84158
MD
1414 if (error == 0)
1415 hammer_knote(ap->a_dvp, NOTE_WRITE);
66325755 1416 return (error);
427e5fc6
MD
1417}
1418
66325755
MD
1419/*
1420 * hammer_vop_open { vp, mode, cred, fp }
1421 */
427e5fc6
MD
1422static
1423int
66325755 1424hammer_vop_open(struct vop_open_args *ap)
427e5fc6 1425{
9f5097dc
MD
1426 hammer_inode_t ip;
1427
ce0138a6 1428 ++hammer_stats_file_iopsr;
9f5097dc
MD
1429 ip = VTOI(ap->a_vp);
1430
1431 if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
d113fda1 1432 return (EROFS);
a89aec1b 1433 return(vop_stdopen(ap));
427e5fc6
MD
1434}
1435
66325755
MD
1436/*
1437 * hammer_vop_print { vp }
1438 */
427e5fc6
MD
1439static
1440int
66325755 1441hammer_vop_print(struct vop_print_args *ap)
427e5fc6
MD
1442{
1443 return EOPNOTSUPP;
1444}
1445
66325755 1446/*
6b4f890b 1447 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
66325755 1448 */
427e5fc6
MD
1449static
1450int
66325755 1451hammer_vop_readdir(struct vop_readdir_args *ap)
427e5fc6 1452{
36f82b23 1453 struct hammer_transaction trans;
6b4f890b
MD
1454 struct hammer_cursor cursor;
1455 struct hammer_inode *ip;
1456 struct uio *uio;
6b4f890b
MD
1457 hammer_base_elm_t base;
1458 int error;
1459 int cookie_index;
1460 int ncookies;
1461 off_t *cookies;
1462 off_t saveoff;
1463 int r;
ea434b6f 1464 int dtype;
6b4f890b 1465
ce0138a6 1466 ++hammer_stats_file_iopsr;
6b4f890b
MD
1467 ip = VTOI(ap->a_vp);
1468 uio = ap->a_uio;
b3deaf57
MD
1469 saveoff = uio->uio_offset;
1470
1471 if (ap->a_ncookies) {
1472 ncookies = uio->uio_resid / 16 + 1;
1473 if (ncookies > 1024)
1474 ncookies = 1024;
1475 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1476 cookie_index = 0;
1477 } else {
1478 ncookies = -1;
1479 cookies = NULL;
1480 cookie_index = 0;
1481 }
1482
36f82b23
MD
1483 hammer_simple_transaction(&trans, ip->hmp);
1484
b3deaf57
MD
1485 /*
1486 * Handle artificial entries
4c286c36
MD
1487 *
1488 * It should be noted that the minimum value for a directory
1489 * hash key on-media is 0x0000000100000000, so we can use anything
1490 * less then that to represent our 'special' key space.
b3deaf57
MD
1491 */
1492 error = 0;
1493 if (saveoff == 0) {
1494 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1495 if (r)
1496 goto done;
1497 if (cookies)
1498 cookies[cookie_index] = saveoff;
1499 ++saveoff;
1500 ++cookie_index;
1501 if (cookie_index == ncookies)
1502 goto done;
1503 }
1504 if (saveoff == 1) {
1505 if (ip->ino_data.parent_obj_id) {
1506 r = vop_write_dirent(&error, uio,
1507 ip->ino_data.parent_obj_id,
1508 DT_DIR, 2, "..");
1509 } else {
1510 r = vop_write_dirent(&error, uio,
1511 ip->obj_id, DT_DIR, 2, "..");
1512 }
1513 if (r)
1514 goto done;
1515 if (cookies)
1516 cookies[cookie_index] = saveoff;
1517 ++saveoff;
1518 ++cookie_index;
1519 if (cookie_index == ncookies)
1520 goto done;
1521 }
6b4f890b
MD
1522
1523 /*
1524 * Key range (begin and end inclusive) to scan. Directory keys
1525 * directly translate to a 64 bit 'seek' position.
1526 */
bcac4bbb 1527 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
5a930e66 1528 cursor.key_beg.localization = ip->obj_localization +
beec5dc4 1529 hammer_dir_localization(ip);
6b4f890b 1530 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 1531 cursor.key_beg.create_tid = 0;
6b4f890b
MD
1532 cursor.key_beg.delete_tid = 0;
1533 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1534 cursor.key_beg.obj_type = 0;
b3deaf57 1535 cursor.key_beg.key = saveoff;
6b4f890b
MD
1536
1537 cursor.key_end = cursor.key_beg;
1538 cursor.key_end.key = HAMMER_MAX_KEY;
d5530d22
MD
1539 cursor.asof = ip->obj_asof;
1540 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
6b4f890b 1541
4e17f465 1542 error = hammer_ip_first(&cursor);
6b4f890b
MD
1543
1544 while (error == 0) {
11ad5ade 1545 error = hammer_ip_resolve_data(&cursor);
6b4f890b
MD
1546 if (error)
1547 break;
11ad5ade 1548 base = &cursor.leaf->base;
6b4f890b 1549 saveoff = base->key;
11ad5ade 1550 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
6b4f890b 1551
7a04d74f
MD
1552 if (base->obj_id != ip->obj_id)
1553 panic("readdir: bad record at %p", cursor.node);
1554
ea434b6f
MD
1555 /*
1556 * Convert pseudo-filesystems into softlinks
1557 */
1558 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
6b4f890b 1559 r = vop_write_dirent(
11ad5ade 1560 &error, uio, cursor.data->entry.obj_id,
ea434b6f 1561 dtype,
11ad5ade
MD
1562 cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1563 (void *)cursor.data->entry.name);
6b4f890b
MD
1564 if (r)
1565 break;
1566 ++saveoff;
1567 if (cookies)
1568 cookies[cookie_index] = base->key;
1569 ++cookie_index;
1570 if (cookie_index == ncookies)
1571 break;
1572 error = hammer_ip_next(&cursor);
1573 }
1574 hammer_done_cursor(&cursor);
1575
b3deaf57 1576done:
b84de5af 1577 hammer_done_transaction(&trans);
36f82b23 1578
6b4f890b
MD
1579 if (ap->a_eofflag)
1580 *ap->a_eofflag = (error == ENOENT);
6b4f890b
MD
1581 uio->uio_offset = saveoff;
1582 if (error && cookie_index == 0) {
b3deaf57
MD
1583 if (error == ENOENT)
1584 error = 0;
6b4f890b
MD
1585 if (cookies) {
1586 kfree(cookies, M_TEMP);
1587 *ap->a_ncookies = 0;
1588 *ap->a_cookies = NULL;
1589 }
1590 } else {
7a04d74f
MD
1591 if (error == ENOENT)
1592 error = 0;
6b4f890b
MD
1593 if (cookies) {
1594 *ap->a_ncookies = cookie_index;
1595 *ap->a_cookies = cookies;
1596 }
1597 }
1598 return(error);
427e5fc6
MD
1599}
1600
66325755
MD
1601/*
1602 * hammer_vop_readlink { vp, uio, cred }
1603 */
427e5fc6
MD
1604static
1605int
66325755 1606hammer_vop_readlink(struct vop_readlink_args *ap)
427e5fc6 1607{
36f82b23 1608 struct hammer_transaction trans;
7a04d74f
MD
1609 struct hammer_cursor cursor;
1610 struct hammer_inode *ip;
ea434b6f
MD
1611 char buf[32];
1612 u_int32_t localization;
1613 hammer_pseudofs_inmem_t pfsm;
7a04d74f
MD
1614 int error;
1615
1616 ip = VTOI(ap->a_vp);
36f82b23 1617
2f85fa4d
MD
1618 /*
1619 * Shortcut if the symlink data was stuffed into ino_data.
ea434b6f 1620 *
842e7a70
MD
1621 * Also expand special "@@PFS%05d" softlinks (expansion only
1622 * occurs for non-historical (current) accesses made from the
1623 * primary filesystem).
2f85fa4d
MD
1624 */
1625 if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
ea434b6f
MD
1626 char *ptr;
1627 int bytes;
1628
1629 ptr = ip->ino_data.ext.symlink;
1630 bytes = (int)ip->ino_data.size;
842e7a70
MD
1631 if (bytes == 10 &&
1632 ip->obj_asof == HAMMER_MAX_TID &&
1633 ip->obj_localization == 0 &&
1634 strncmp(ptr, "@@PFS", 5) == 0) {
ea434b6f
MD
1635 hammer_simple_transaction(&trans, ip->hmp);
1636 bcopy(ptr + 5, buf, 5);
1637 buf[5] = 0;
1638 localization = strtoul(buf, NULL, 10) << 16;
1639 pfsm = hammer_load_pseudofs(&trans, localization,
1640 &error);
1641 if (error == 0) {
4c038e17
MD
1642 if (pfsm->pfsd.mirror_flags &
1643 HAMMER_PFSD_SLAVE) {
cb3c760c 1644 /* vap->va_size == 26 */
4c038e17
MD
1645 ksnprintf(buf, sizeof(buf),
1646 "@@0x%016llx:%05d",
973c11b9 1647 (long long)pfsm->pfsd.sync_end_tid,
4c038e17
MD
1648 localization >> 16);
1649 } else {
cb3c760c
MD
1650 /* vap->va_size == 10 */
1651 ksnprintf(buf, sizeof(buf),
1652 "@@-1:%05d",
1653 localization >> 16);
1654#if 0
4c038e17
MD
1655 ksnprintf(buf, sizeof(buf),
1656 "@@0x%016llx:%05d",
973c11b9 1657 (long long)HAMMER_MAX_TID,
4c038e17 1658 localization >> 16);
cb3c760c 1659#endif
4c038e17 1660 }
ea434b6f
MD
1661 ptr = buf;
1662 bytes = strlen(buf);
1663 }
1664 if (pfsm)
1665 hammer_rel_pseudofs(trans.hmp, pfsm);
1666 hammer_done_transaction(&trans);
1667 }
1668 error = uiomove(ptr, bytes, ap->a_uio);
2f85fa4d
MD
1669 return(error);
1670 }
36f82b23 1671
2f85fa4d
MD
1672 /*
1673 * Long version
1674 */
1675 hammer_simple_transaction(&trans, ip->hmp);
ce0138a6 1676 ++hammer_stats_file_iopsr;
bcac4bbb 1677 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
7a04d74f
MD
1678
1679 /*
1680 * Key range (begin and end inclusive) to scan. Directory keys
1681 * directly translate to a 64 bit 'seek' position.
1682 */
5a930e66
MD
1683 cursor.key_beg.localization = ip->obj_localization +
1684 HAMMER_LOCALIZE_MISC;
7a04d74f 1685 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 1686 cursor.key_beg.create_tid = 0;
7a04d74f
MD
1687 cursor.key_beg.delete_tid = 0;
1688 cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1689 cursor.key_beg.obj_type = 0;
1690 cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
d5530d22
MD
1691 cursor.asof = ip->obj_asof;
1692 cursor.flags |= HAMMER_CURSOR_ASOF;
7a04d74f 1693
45a014dc 1694 error = hammer_ip_lookup(&cursor);
7a04d74f
MD
1695 if (error == 0) {
1696 error = hammer_ip_resolve_data(&cursor);
1697 if (error == 0) {
11ad5ade
MD
1698 KKASSERT(cursor.leaf->data_len >=
1699 HAMMER_SYMLINK_NAME_OFF);
1700 error = uiomove(cursor.data->symlink.name,
1701 cursor.leaf->data_len -
1702 HAMMER_SYMLINK_NAME_OFF,
7a04d74f
MD
1703 ap->a_uio);
1704 }
1705 }
1706 hammer_done_cursor(&cursor);
b84de5af 1707 hammer_done_transaction(&trans);
7a04d74f 1708 return(error);
427e5fc6
MD
1709}
1710
66325755
MD
1711/*
1712 * hammer_vop_nremove { nch, dvp, cred }
1713 */
427e5fc6
MD
1714static
1715int
66325755 1716hammer_vop_nremove(struct vop_nremove_args *ap)
427e5fc6 1717{
b84de5af 1718 struct hammer_transaction trans;
e63644f0 1719 struct hammer_inode *dip;
b84de5af
MD
1720 int error;
1721
e63644f0
MD
1722 dip = VTOI(ap->a_dvp);
1723
1724 if (hammer_nohistory(dip) == 0 &&
93291532 1725 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
1726 return (error);
1727 }
1728
1729 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1730 ++hammer_stats_file_iopsw;
d7e278bb 1731 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
b84de5af 1732 hammer_done_transaction(&trans);
fbb84158
MD
1733 if (error == 0)
1734 hammer_knote(ap->a_dvp, NOTE_WRITE);
b84de5af 1735 return (error);
427e5fc6
MD
1736}
1737
66325755
MD
1738/*
1739 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1740 */
427e5fc6
MD
1741static
1742int
66325755 1743hammer_vop_nrename(struct vop_nrename_args *ap)
427e5fc6 1744{
8cd0a023
MD
1745 struct hammer_transaction trans;
1746 struct namecache *fncp;
1747 struct namecache *tncp;
1748 struct hammer_inode *fdip;
1749 struct hammer_inode *tdip;
1750 struct hammer_inode *ip;
1751 struct hammer_cursor cursor;
8cd0a023 1752 int64_t namekey;
5e435c92 1753 u_int32_t max_iterations;
11ad5ade 1754 int nlen, error;
8cd0a023 1755
f437a2ab
MD
1756 if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1757 return(EXDEV);
1758 if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1759 return(EXDEV);
1760
8cd0a023
MD
1761 fdip = VTOI(ap->a_fdvp);
1762 tdip = VTOI(ap->a_tdvp);
1763 fncp = ap->a_fnch->ncp;
1764 tncp = ap->a_tnch->ncp;
b3deaf57
MD
1765 ip = VTOI(fncp->nc_vp);
1766 KKASSERT(ip != NULL);
d113fda1 1767
f437a2ab
MD
1768 if (fdip->obj_localization != tdip->obj_localization)
1769 return(EXDEV);
1770 if (fdip->obj_localization != ip->obj_localization)
1771 return(EXDEV);
1772
d113fda1
MD
1773 if (fdip->flags & HAMMER_INODE_RO)
1774 return (EROFS);
1775 if (tdip->flags & HAMMER_INODE_RO)
1776 return (EROFS);
1777 if (ip->flags & HAMMER_INODE_RO)
1778 return (EROFS);
93291532 1779 if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1780 return (error);
d113fda1 1781
8cd0a023 1782 hammer_start_transaction(&trans, fdip->hmp);
ce0138a6 1783 ++hammer_stats_file_iopsw;
8cd0a023
MD
1784
1785 /*
b3deaf57
MD
1786 * Remove tncp from the target directory and then link ip as
1787 * tncp. XXX pass trans to dounlink
42c7d26b
MD
1788 *
1789 * Force the inode sync-time to match the transaction so it is
1790 * in-sync with the creation of the target directory entry.
8cd0a023 1791 */
d7e278bb
MD
1792 error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1793 ap->a_cred, 0, -1);
42c7d26b 1794 if (error == 0 || error == ENOENT) {
5a930e66
MD
1795 error = hammer_ip_add_directory(&trans, tdip,
1796 tncp->nc_name, tncp->nc_nlen,
1797 ip);
42c7d26b
MD
1798 if (error == 0) {
1799 ip->ino_data.parent_obj_id = tdip->obj_id;
cc0758d0 1800 ip->ino_data.ctime = trans.time;
47637bff 1801 hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
42c7d26b
MD
1802 }
1803 }
b3deaf57
MD
1804 if (error)
1805 goto failed; /* XXX */
8cd0a023
MD
1806
1807 /*
1808 * Locate the record in the originating directory and remove it.
1809 *
1810 * Calculate the namekey and setup the key range for the scan. This
1811 * works kinda like a chained hash table where the lower 32 bits
1812 * of the namekey synthesize the chain.
1813 *
1814 * The key range is inclusive of both key_beg and key_end.
1815 */
5e435c92
MD
1816 namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1817 &max_iterations);
6a37e7e4 1818retry:
bcac4bbb 1819 hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
5a930e66 1820 cursor.key_beg.localization = fdip->obj_localization +
beec5dc4 1821 hammer_dir_localization(fdip);
8cd0a023
MD
1822 cursor.key_beg.obj_id = fdip->obj_id;
1823 cursor.key_beg.key = namekey;
d5530d22 1824 cursor.key_beg.create_tid = 0;
8cd0a023
MD
1825 cursor.key_beg.delete_tid = 0;
1826 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1827 cursor.key_beg.obj_type = 0;
1828
1829 cursor.key_end = cursor.key_beg;
5e435c92 1830 cursor.key_end.key += max_iterations;
d5530d22
MD
1831 cursor.asof = fdip->obj_asof;
1832 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
8cd0a023
MD
1833
1834 /*
1835 * Scan all matching records (the chain), locate the one matching
a89aec1b 1836 * the requested path component.
8cd0a023
MD
1837 *
1838 * The hammer_ip_*() functions merge in-memory records with on-disk
1839 * records for the purposes of the search.
1840 */
4e17f465 1841 error = hammer_ip_first(&cursor);
a89aec1b 1842 while (error == 0) {
8cd0a023
MD
1843 if (hammer_ip_resolve_data(&cursor) != 0)
1844 break;
11ad5ade
MD
1845 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1846 KKASSERT(nlen > 0);
1847 if (fncp->nc_nlen == nlen &&
1848 bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
8cd0a023
MD
1849 break;
1850 }
a89aec1b 1851 error = hammer_ip_next(&cursor);
8cd0a023 1852 }
8cd0a023
MD
1853
1854 /*
1855 * If all is ok we have to get the inode so we can adjust nlinks.
6a37e7e4
MD
1856 *
1857 * WARNING: hammer_ip_del_directory() may have to terminate the
1858 * cursor to avoid a recursion. It's ok to call hammer_done_cursor()
1859 * twice.
8cd0a023 1860 */
9944ae54 1861 if (error == 0)
6a37e7e4 1862 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
b84de5af
MD
1863
1864 /*
1865 * XXX A deadlock here will break rename's atomicy for the purposes
1866 * of crash recovery.
1867 */
1868 if (error == EDEADLK) {
b84de5af 1869 hammer_done_cursor(&cursor);
b84de5af
MD
1870 goto retry;
1871 }
1872
1873 /*
1874 * Cleanup and tell the kernel that the rename succeeded.
1875 */
c0ade690 1876 hammer_done_cursor(&cursor);
fbb84158 1877 if (error == 0) {
6a37e7e4 1878 cache_rename(ap->a_fnch, ap->a_tnch);
fbb84158
MD
1879 hammer_knote(ap->a_fdvp, NOTE_WRITE);
1880 hammer_knote(ap->a_tdvp, NOTE_WRITE);
1881 if (ip->vp)
1882 hammer_knote(ip->vp, NOTE_RENAME);
1883 }
b84de5af 1884
b3deaf57 1885failed:
b84de5af 1886 hammer_done_transaction(&trans);
8cd0a023 1887 return (error);
427e5fc6
MD
1888}
1889
66325755
MD
1890/*
1891 * hammer_vop_nrmdir { nch, dvp, cred }
1892 */
427e5fc6
MD
1893static
1894int
66325755 1895hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
427e5fc6 1896{
b84de5af 1897 struct hammer_transaction trans;
e63644f0 1898 struct hammer_inode *dip;
b84de5af
MD
1899 int error;
1900
e63644f0
MD
1901 dip = VTOI(ap->a_dvp);
1902
1903 if (hammer_nohistory(dip) == 0 &&
93291532 1904 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
1905 return (error);
1906 }
1907
1908 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1909 ++hammer_stats_file_iopsw;
d7e278bb 1910 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
b84de5af 1911 hammer_done_transaction(&trans);
fbb84158
MD
1912 if (error == 0)
1913 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
b84de5af 1914 return (error);
427e5fc6
MD
1915}
1916
349433c9
MD
1917/*
1918 * hammer_vop_markatime { vp, cred }
1919 */
1920static
1921int
1922hammer_vop_markatime(struct vop_markatime_args *ap)
1923{
1924 struct hammer_transaction trans;
1925 struct hammer_inode *ip;
1926
1927 ip = VTOI(ap->a_vp);
1928 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1929 return (EROFS);
1930 if (ip->flags & HAMMER_INODE_RO)
1931 return (EROFS);
1932 if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
1933 return (0);
1934 hammer_start_transaction(&trans, ip->hmp);
1935 ++hammer_stats_file_iopsw;
1936
1937 ip->ino_data.atime = trans.time;
1938 hammer_modify_inode(ip, HAMMER_INODE_ATIME);
1939 hammer_done_transaction(&trans);
1940 hammer_knote(ap->a_vp, NOTE_ATTRIB);
1941 return (0);
1942}
1943
66325755
MD
1944/*
1945 * hammer_vop_setattr { vp, vap, cred }
1946 */
427e5fc6
MD
1947static
1948int
66325755 1949hammer_vop_setattr(struct vop_setattr_args *ap)
427e5fc6 1950{
8cd0a023
MD
1951 struct hammer_transaction trans;
1952 struct vattr *vap;
1953 struct hammer_inode *ip;
1954 int modflags;
1955 int error;
d5ef456e 1956 int truncating;
4a2796f3 1957 int blksize;
fbb84158 1958 int kflags;
4a2796f3 1959 int64_t aligned_size;
8cd0a023 1960 u_int32_t flags;
8cd0a023
MD
1961
1962 vap = ap->a_vap;
1963 ip = ap->a_vp->v_data;
1964 modflags = 0;
fbb84158 1965 kflags = 0;
8cd0a023
MD
1966
1967 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1968 return(EROFS);
d113fda1
MD
1969 if (ip->flags & HAMMER_INODE_RO)
1970 return (EROFS);
e63644f0 1971 if (hammer_nohistory(ip) == 0 &&
93291532 1972 (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
1973 return (error);
1974 }
8cd0a023
MD
1975
1976 hammer_start_transaction(&trans, ip->hmp);
ce0138a6 1977 ++hammer_stats_file_iopsw;
8cd0a023
MD
1978 error = 0;
1979
1980 if (vap->va_flags != VNOVAL) {
1981 flags = ip->ino_data.uflags;
1982 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1983 hammer_to_unix_xid(&ip->ino_data.uid),
1984 ap->a_cred);
1985 if (error == 0) {
1986 if (ip->ino_data.uflags != flags) {
1987 ip->ino_data.uflags = flags;
cc0758d0 1988 ip->ino_data.ctime = trans.time;
8cd0a023 1989 modflags |= HAMMER_INODE_DDIRTY;
fbb84158 1990 kflags |= NOTE_ATTRIB;
8cd0a023
MD
1991 }
1992 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1993 error = 0;
1994 goto done;
1995 }
1996 }
1997 goto done;
1998 }
1999 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2000 error = EPERM;
2001 goto done;
2002 }
7538695e
MD
2003 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2004 mode_t cur_mode = ip->ino_data.mode;
2005 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2006 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2007 uuid_t uuid_uid;
2008 uuid_t uuid_gid;
2009
2010 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2011 ap->a_cred,
2012 &cur_uid, &cur_gid, &cur_mode);
2013 if (error == 0) {
2014 hammer_guid_to_uuid(&uuid_uid, cur_uid);
2015 hammer_guid_to_uuid(&uuid_gid, cur_gid);
2016 if (bcmp(&uuid_uid, &ip->ino_data.uid,
2017 sizeof(uuid_uid)) ||
2018 bcmp(&uuid_gid, &ip->ino_data.gid,
2019 sizeof(uuid_gid)) ||
2020 ip->ino_data.mode != cur_mode
2021 ) {
2022 ip->ino_data.uid = uuid_uid;
2023 ip->ino_data.gid = uuid_gid;
2024 ip->ino_data.mode = cur_mode;
cc0758d0
MD
2025 ip->ino_data.ctime = trans.time;
2026 modflags |= HAMMER_INODE_DDIRTY;
7538695e 2027 }
fbb84158 2028 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2029 }
2030 }
11ad5ade 2031 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
8cd0a023
MD
2032 switch(ap->a_vp->v_type) {
2033 case VREG:
11ad5ade 2034 if (vap->va_size == ip->ino_data.size)
d5ef456e 2035 break;
b84de5af
MD
2036 /*
2037 * XXX break atomicy, we can deadlock the backend
2038 * if we do not release the lock. Probably not a
2039 * big deal here.
2040 */
4a2796f3 2041 blksize = hammer_blocksize(vap->va_size);
11ad5ade 2042 if (vap->va_size < ip->ino_data.size) {
4a2796f3 2043 vtruncbuf(ap->a_vp, vap->va_size, blksize);
d5ef456e 2044 truncating = 1;
fbb84158 2045 kflags |= NOTE_WRITE;
d5ef456e 2046 } else {
c0ade690 2047 vnode_pager_setsize(ap->a_vp, vap->va_size);
d5ef456e 2048 truncating = 0;
fbb84158 2049 kflags |= NOTE_WRITE | NOTE_EXTEND;
c0ade690 2050 }
11ad5ade 2051 ip->ino_data.size = vap->va_size;
cc0758d0
MD
2052 ip->ino_data.mtime = trans.time;
2053 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
d5ef456e 2054
b84de5af
MD
2055 /*
2056 * on-media truncation is cached in the inode until
2057 * the inode is synchronized.
2058 */
d5ef456e 2059 if (truncating) {
47637bff 2060 hammer_ip_frontend_trunc(ip, vap->va_size);
0832c9bb
MD
2061#ifdef DEBUG_TRUNCATE
2062 if (HammerTruncIp == NULL)
2063 HammerTruncIp = ip;
2064#endif
b84de5af
MD
2065 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2066 ip->flags |= HAMMER_INODE_TRUNCATED;
2067 ip->trunc_off = vap->va_size;
0832c9bb
MD
2068#ifdef DEBUG_TRUNCATE
2069 if (ip == HammerTruncIp)
973c11b9
MD
2070 kprintf("truncate1 %016llx\n",
2071 (long long)ip->trunc_off);
0832c9bb 2072#endif
b84de5af
MD
2073 } else if (ip->trunc_off > vap->va_size) {
2074 ip->trunc_off = vap->va_size;
0832c9bb
MD
2075#ifdef DEBUG_TRUNCATE
2076 if (ip == HammerTruncIp)
973c11b9
MD
2077 kprintf("truncate2 %016llx\n",
2078 (long long)ip->trunc_off);
0832c9bb
MD
2079#endif
2080 } else {
2081#ifdef DEBUG_TRUNCATE
2082 if (ip == HammerTruncIp)
973c11b9
MD
2083 kprintf("truncate3 %016llx (ignored)\n",
2084 (long long)vap->va_size);
0832c9bb 2085#endif
b84de5af 2086 }
d5ef456e 2087 }
b84de5af 2088
d5ef456e
MD
2089 /*
2090 * If truncating we have to clean out a portion of
b84de5af
MD
2091 * the last block on-disk. We do this in the
2092 * front-end buffer cache.
d5ef456e 2093 */
4a2796f3
MD
2094 aligned_size = (vap->va_size + (blksize - 1)) &
2095 ~(int64_t)(blksize - 1);
b84de5af 2096 if (truncating && vap->va_size < aligned_size) {
d5ef456e
MD
2097 struct buf *bp;
2098 int offset;
2099
4a2796f3 2100 aligned_size -= blksize;
47637bff 2101
4a2796f3 2102 offset = (int)vap->va_size & (blksize - 1);
47637bff 2103 error = bread(ap->a_vp, aligned_size,
4a2796f3 2104 blksize, &bp);
47637bff 2105 hammer_ip_frontend_trunc(ip, aligned_size);
d5ef456e
MD
2106 if (error == 0) {
2107 bzero(bp->b_data + offset,
4a2796f3 2108 blksize - offset);
1b0ab2c3
MD
2109 /* must de-cache direct-io offset */
2110 bp->b_bio2.bio_offset = NOOFFSET;
d5ef456e
MD
2111 bdwrite(bp);
2112 } else {
47637bff 2113 kprintf("ERROR %d\n", error);
d5ef456e
MD
2114 brelse(bp);
2115 }
2116 }
76376933 2117 break;
8cd0a023 2118 case VDATABASE:
b84de5af
MD
2119 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2120 ip->flags |= HAMMER_INODE_TRUNCATED;
2121 ip->trunc_off = vap->va_size;
2122 } else if (ip->trunc_off > vap->va_size) {
2123 ip->trunc_off = vap->va_size;
2124 }
47637bff 2125 hammer_ip_frontend_trunc(ip, vap->va_size);
11ad5ade 2126 ip->ino_data.size = vap->va_size;
cc0758d0
MD
2127 ip->ino_data.mtime = trans.time;
2128 modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
fbb84158 2129 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2130 break;
2131 default:
2132 error = EINVAL;
2133 goto done;
2134 }
d26d0ae9 2135 break;
8cd0a023
MD
2136 }
2137 if (vap->va_atime.tv_sec != VNOVAL) {
cc0758d0 2138 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
ddfdf542 2139 modflags |= HAMMER_INODE_ATIME;
fbb84158 2140 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2141 }
2142 if (vap->va_mtime.tv_sec != VNOVAL) {
cc0758d0 2143 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
ddfdf542 2144 modflags |= HAMMER_INODE_MTIME;
fbb84158 2145 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2146 }
2147 if (vap->va_mode != (mode_t)VNOVAL) {
7538695e
MD
2148 mode_t cur_mode = ip->ino_data.mode;
2149 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2150 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2151
2152 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2153 cur_uid, cur_gid, &cur_mode);
2154 if (error == 0 && ip->ino_data.mode != cur_mode) {
2155 ip->ino_data.mode = cur_mode;
cc0758d0 2156 ip->ino_data.ctime = trans.time;
8cd0a023 2157 modflags |= HAMMER_INODE_DDIRTY;
fbb84158 2158 kflags |= NOTE_ATTRIB;
8cd0a023
MD
2159 }
2160 }
2161done:
b84de5af 2162 if (error == 0)
47637bff 2163 hammer_modify_inode(ip, modflags);
b84de5af 2164 hammer_done_transaction(&trans);
fbb84158 2165 hammer_knote(ap->a_vp, kflags);
8cd0a023 2166 return (error);
427e5fc6
MD
2167}
2168
66325755
MD
2169/*
2170 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2171 */
427e5fc6
MD
2172static
2173int
66325755 2174hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
427e5fc6 2175{
7a04d74f
MD
2176 struct hammer_transaction trans;
2177 struct hammer_inode *dip;
2178 struct hammer_inode *nip;
2179 struct nchandle *nch;
2180 hammer_record_t record;
2181 int error;
2182 int bytes;
2183
2184 ap->a_vap->va_type = VLNK;
2185
2186 nch = ap->a_nch;
2187 dip = VTOI(ap->a_dvp);
2188
d113fda1
MD
2189 if (dip->flags & HAMMER_INODE_RO)
2190 return (EROFS);
93291532 2191 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 2192 return (error);
d113fda1 2193
7a04d74f
MD
2194 /*
2195 * Create a transaction to cover the operations we perform.
2196 */
2197 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 2198 ++hammer_stats_file_iopsw;
7a04d74f
MD
2199
2200 /*
2201 * Create a new filesystem object of the requested type. The
2202 * returned inode will be referenced but not locked.
2203 */
2204
5a930e66 2205 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
5a64efa1
MD
2206 dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2207 NULL, &nip);
7a04d74f 2208 if (error) {
b84de5af 2209 hammer_done_transaction(&trans);
7a04d74f
MD
2210 *ap->a_vpp = NULL;
2211 return (error);
2212 }
2213
7a04d74f
MD
2214 /*
2215 * Add a record representing the symlink. symlink stores the link
2216 * as pure data, not a string, and is no \0 terminated.
2217 */
2218 if (error == 0) {
7a04d74f
MD
2219 bytes = strlen(ap->a_target);
2220
2f85fa4d
MD
2221 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2222 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2223 } else {
2224 record = hammer_alloc_mem_record(nip, bytes);
2225 record->type = HAMMER_MEM_RECORD_GENERAL;
2226
5a930e66
MD
2227 record->leaf.base.localization = nip->obj_localization +
2228 HAMMER_LOCALIZE_MISC;
2f85fa4d
MD
2229 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2230 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2231 record->leaf.data_len = bytes;
2232 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2233 bcopy(ap->a_target, record->data->symlink.name, bytes);
2234 error = hammer_ip_add_record(&trans, record);
2235 }
42c7d26b
MD
2236
2237 /*
2238 * Set the file size to the length of the link.
2239 */
2240 if (error == 0) {
11ad5ade 2241 nip->ino_data.size = bytes;
47637bff 2242 hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
42c7d26b 2243 }
7a04d74f 2244 }
1f07f686 2245 if (error == 0)
5a930e66
MD
2246 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2247 nch->ncp->nc_nlen, nip);
7a04d74f
MD
2248
2249 /*
2250 * Finish up.
2251 */
2252 if (error) {
2253 hammer_rel_inode(nip, 0);
7a04d74f
MD
2254 *ap->a_vpp = NULL;
2255 } else {
e8599db1 2256 error = hammer_get_vnode(nip, ap->a_vpp);
7a04d74f
MD
2257 hammer_rel_inode(nip, 0);
2258 if (error == 0) {
2259 cache_setunresolved(ap->a_nch);
2260 cache_setvp(ap->a_nch, *ap->a_vpp);
fbb84158 2261 hammer_knote(ap->a_dvp, NOTE_WRITE);
7a04d74f
MD
2262 }
2263 }
b84de5af 2264 hammer_done_transaction(&trans);
7a04d74f 2265 return (error);
427e5fc6
MD
2266}
2267
66325755
MD
2268/*
2269 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2270 */
427e5fc6
MD
2271static
2272int
66325755 2273hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
427e5fc6 2274{
b84de5af 2275 struct hammer_transaction trans;
e63644f0 2276 struct hammer_inode *dip;
b84de5af
MD
2277 int error;
2278
e63644f0
MD
2279 dip = VTOI(ap->a_dvp);
2280
2281 if (hammer_nohistory(dip) == 0 &&
93291532 2282 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
e63644f0
MD
2283 return (error);
2284 }
2285
2286 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 2287 ++hammer_stats_file_iopsw;
b84de5af 2288 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
d7e278bb 2289 ap->a_cred, ap->a_flags, -1);
b84de5af
MD
2290 hammer_done_transaction(&trans);
2291
2292 return (error);
427e5fc6
MD
2293}
2294
7dc57964
MD
2295/*
2296 * hammer_vop_ioctl { vp, command, data, fflag, cred }
2297 */
2298static
2299int
2300hammer_vop_ioctl(struct vop_ioctl_args *ap)
2301{
2302 struct hammer_inode *ip = ap->a_vp->v_data;
2303
ce0138a6 2304 ++hammer_stats_file_iopsr;
7dc57964
MD
2305 return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2306 ap->a_fflag, ap->a_cred));
2307}
2308
513ca7d7
MD
2309static
2310int
2311hammer_vop_mountctl(struct vop_mountctl_args *ap)
2312{
dad088a5
MD
2313 static const struct mountctl_opt extraopt[] = {
2314 { HMNT_NOHISTORY, "nohistory" },
2315 { HMNT_MASTERID, "master" },
2316 { 0, NULL}
2317
2318 };
2319 struct hammer_mount *hmp;
513ca7d7 2320 struct mount *mp;
dad088a5 2321 int usedbytes;
513ca7d7
MD
2322 int error;
2323
dad088a5
MD
2324 error = 0;
2325 usedbytes = 0;
513ca7d7 2326 mp = ap->a_head.a_ops->head.vv_mount;
dad088a5
MD
2327 KKASSERT(mp->mnt_data != NULL);
2328 hmp = (struct hammer_mount *)mp->mnt_data;
513ca7d7
MD
2329
2330 switch(ap->a_op) {
dad088a5 2331
513ca7d7
MD
2332 case MOUNTCTL_SET_EXPORT:
2333 if (ap->a_ctllen != sizeof(struct export_args))
2334 error = EINVAL;
b424ca30
MD
2335 else
2336 error = hammer_vfs_export(mp, ap->a_op,
513ca7d7
MD
2337 (const struct export_args *)ap->a_ctl);
2338 break;
dad088a5
MD
2339 case MOUNTCTL_MOUNTFLAGS:
2340 {
2341 /*
2342 * Call standard mountctl VOP function
2343 * so we get user mount flags.
2344 */
2345 error = vop_stdmountctl(ap);
2346 if (error)
2347 break;
2348
2349 usedbytes = *ap->a_res;
2350
eac446c5 2351 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
dad088a5
MD
2352 usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf,
2353 ap->a_buflen - usedbytes,
2354 &error);
dad088a5
MD
2355 }
2356
2357 *ap->a_res += usedbytes;
2358 break;
2359 }
513ca7d7 2360 default:
726e0641 2361 error = vop_stdmountctl(ap);
513ca7d7
MD
2362 break;
2363 }
2364 return(error);
2365}
2366
66325755
MD
2367/*
2368 * hammer_vop_strategy { vp, bio }
8cd0a023
MD
2369 *
2370 * Strategy call, used for regular file read & write only. Note that the
2371 * bp may represent a cluster.
2372 *
2373 * To simplify operation and allow better optimizations in the future,
2374 * this code does not make any assumptions with regards to buffer alignment
2375 * or size.
66325755 2376 */
427e5fc6
MD
2377static
2378int
66325755 2379hammer_vop_strategy(struct vop_strategy_args *ap)
427e5fc6 2380{
8cd0a023
MD
2381 struct buf *bp;
2382 int error;
2383
2384 bp = ap->a_bio->bio_buf;
2385
2386 switch(bp->b_cmd) {
2387 case BUF_CMD_READ:
2388 error = hammer_vop_strategy_read(ap);
2389 break;
2390 case BUF_CMD_WRITE:
2391 error = hammer_vop_strategy_write(ap);
2392 break;
2393 default:
059819e3
MD
2394 bp->b_error = error = EINVAL;
2395 bp->b_flags |= B_ERROR;
2396 biodone(ap->a_bio);
8cd0a023
MD
2397 break;
2398 }
8cd0a023 2399 return (error);
427e5fc6
MD
2400}
2401
8cd0a023
MD
2402/*
2403 * Read from a regular file. Iterate the related records and fill in the
2404 * BIO/BUF. Gaps are zero-filled.
2405 *
2406 * The support code in hammer_object.c should be used to deal with mixed
2407 * in-memory and on-disk records.
2408 *
4a2796f3
MD
2409 * NOTE: Can be called from the cluster code with an oversized buf.
2410 *
8cd0a023
MD
2411 * XXX atime update
2412 */
2413static
2414int
2415hammer_vop_strategy_read(struct vop_strategy_args *ap)
2416{
36f82b23
MD
2417 struct hammer_transaction trans;
2418 struct hammer_inode *ip;
39d8fd63 2419 struct hammer_inode *dip;
8cd0a023 2420 struct hammer_cursor cursor;
8cd0a023 2421 hammer_base_elm_t base;
4a2796f3 2422 hammer_off_t disk_offset;
8cd0a023 2423 struct bio *bio;
a99b9ea2 2424 struct bio *nbio;
8cd0a023
MD
2425 struct buf *bp;
2426 int64_t rec_offset;
a89aec1b 2427 int64_t ran_end;
195c19a1 2428 int64_t tmp64;
8cd0a023
MD
2429 int error;
2430 int boff;
2431 int roff;
2432 int n;
2433
2434 bio = ap->a_bio;
2435 bp = bio->bio_buf;
36f82b23 2436 ip = ap->a_vp->v_data;
8cd0a023 2437
a99b9ea2
MD
2438 /*
2439 * The zone-2 disk offset may have been set by the cluster code via
4a2796f3 2440 * a BMAP operation, or else should be NOOFFSET.
a99b9ea2 2441 *
4a2796f3 2442 * Checking the high bits for a match against zone-2 should suffice.
a99b9ea2
MD
2443 */
2444 nbio = push_bio(bio);
6aeaa7bd 2445 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
1b0ab2c3
MD
2446 HAMMER_ZONE_LARGE_DATA) {
2447 error = hammer_io_direct_read(ip->hmp, nbio, NULL);
a99b9ea2
MD
2448 return (error);
2449 }
2450
2451 /*
4a2796f3
MD
2452 * Well, that sucked. Do it the hard way. If all the stars are
2453 * aligned we may still be able to issue a direct-read.
a99b9ea2 2454 */
36f82b23 2455 hammer_simple_transaction(&trans, ip->hmp);
47637bff 2456 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
8cd0a023
MD
2457
2458 /*
2459 * Key range (begin and end inclusive) to scan. Note that the key's
c0ade690
MD
2460 * stored in the actual records represent BASE+LEN, not BASE. The
2461 * first record containing bio_offset will have a key > bio_offset.
8cd0a023 2462 */
5a930e66
MD
2463 cursor.key_beg.localization = ip->obj_localization +
2464 HAMMER_LOCALIZE_MISC;
8cd0a023 2465 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 2466 cursor.key_beg.create_tid = 0;
8cd0a023 2467 cursor.key_beg.delete_tid = 0;
8cd0a023 2468 cursor.key_beg.obj_type = 0;
c0ade690 2469 cursor.key_beg.key = bio->bio_offset + 1;
d5530d22 2470 cursor.asof = ip->obj_asof;
bf3b416b 2471 cursor.flags |= HAMMER_CURSOR_ASOF;
8cd0a023
MD
2472
2473 cursor.key_end = cursor.key_beg;
11ad5ade 2474 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
b84de5af 2475#if 0
11ad5ade 2476 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
a89aec1b
MD
2477 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2478 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2479 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
b84de5af
MD
2480 } else
2481#endif
2482 {
c0ade690 2483 ran_end = bio->bio_offset + bp->b_bufsize;
a89aec1b
MD
2484 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2485 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
195c19a1
MD
2486 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2487 if (tmp64 < ran_end)
a89aec1b
MD
2488 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2489 else
7f7c1f84 2490 cursor.key_end.key = ran_end + MAXPHYS + 1;
a89aec1b 2491 }
d26d0ae9 2492 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
8cd0a023 2493
4e17f465 2494 error = hammer_ip_first(&cursor);
8cd0a023
MD
2495 boff = 0;
2496
a89aec1b 2497 while (error == 0) {
47637bff
MD
2498 /*
2499 * Get the base file offset of the record. The key for
2500 * data records is (base + bytes) rather then (base).
2501 */
11ad5ade 2502 base = &cursor.leaf->base;
11ad5ade 2503 rec_offset = base->key - cursor.leaf->data_len;
8cd0a023 2504
66325755 2505 /*
a89aec1b 2506 * Calculate the gap, if any, and zero-fill it.
1fef775e
MD
2507 *
2508 * n is the offset of the start of the record verses our
2509 * current seek offset in the bio.
66325755 2510 */
8cd0a023
MD
2511 n = (int)(rec_offset - (bio->bio_offset + boff));
2512 if (n > 0) {
a89aec1b
MD
2513 if (n > bp->b_bufsize - boff)
2514 n = bp->b_bufsize - boff;
8cd0a023
MD
2515 bzero((char *)bp->b_data + boff, n);
2516 boff += n;
2517 n = 0;
66325755 2518 }
8cd0a023
MD
2519
2520 /*
2521 * Calculate the data offset in the record and the number
2522 * of bytes we can copy.
a89aec1b 2523 *
1fef775e
MD
2524 * There are two degenerate cases. First, boff may already
2525 * be at bp->b_bufsize. Secondly, the data offset within
2526 * the record may exceed the record's size.
8cd0a023
MD
2527 */
2528 roff = -n;
b84de5af 2529 rec_offset += roff;
11ad5ade 2530 n = cursor.leaf->data_len - roff;
1fef775e
MD
2531 if (n <= 0) {
2532 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2533 n = 0;
2534 } else if (n > bp->b_bufsize - boff) {
8cd0a023 2535 n = bp->b_bufsize - boff;
1fef775e 2536 }
059819e3 2537
b84de5af 2538 /*
47637bff
MD
2539 * Deal with cached truncations. This cool bit of code
2540 * allows truncate()/ftruncate() to avoid having to sync
2541 * the file.
2542 *
2543 * If the frontend is truncated then all backend records are
2544 * subject to the frontend's truncation.
2545 *
2546 * If the backend is truncated then backend records on-disk
2547 * (but not in-memory) are subject to the backend's
2548 * truncation. In-memory records owned by the backend
2549 * represent data written after the truncation point on the
2550 * backend and must not be truncated.
2551 *
2552 * Truncate operations deal with frontend buffer cache
2553 * buffers and frontend-owned in-memory records synchronously.
b84de5af 2554 */
47637bff
MD
2555 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2556 if (hammer_cursor_ondisk(&cursor) ||
2557 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2558 if (ip->trunc_off <= rec_offset)
2559 n = 0;
2560 else if (ip->trunc_off < rec_offset + n)
2561 n = (int)(ip->trunc_off - rec_offset);
2562 }
2563 }
2564 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2565 if (hammer_cursor_ondisk(&cursor)) {
2566 if (ip->sync_trunc_off <= rec_offset)
2567 n = 0;
2568 else if (ip->sync_trunc_off < rec_offset + n)
2569 n = (int)(ip->sync_trunc_off - rec_offset);
2570 }
2571 }
b84de5af
MD
2572
2573 /*
47637bff
MD
2574 * Try to issue a direct read into our bio if possible,
2575 * otherwise resolve the element data into a hammer_buffer
2576 * and copy.
4a2796f3
MD
2577 *
2578 * The buffer on-disk should be zerod past any real
2579 * truncation point, but may not be for any synthesized
2580 * truncation point from above.
b84de5af 2581 */
1b0ab2c3 2582 disk_offset = cursor.leaf->data_offset + roff;
4a2796f3 2583 if (boff == 0 && n == bp->b_bufsize &&
1b0ab2c3
MD
2584 hammer_cursor_ondisk(&cursor) &&
2585 (disk_offset & HAMMER_BUFMASK) == 0) {
2586 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2587 HAMMER_ZONE_LARGE_DATA);
4a2796f3 2588 nbio->bio_offset = disk_offset;
1b0ab2c3
MD
2589 error = hammer_io_direct_read(trans.hmp, nbio,
2590 cursor.leaf);
47637bff
MD
2591 goto done;
2592 } else if (n) {
2593 error = hammer_ip_resolve_data(&cursor);
2594 if (error == 0) {
2595 bcopy((char *)cursor.data + roff,
2596 (char *)bp->b_data + boff, n);
2597 }
b84de5af 2598 }
47637bff
MD
2599 if (error)
2600 break;
2601
2602 /*
2603 * Iterate until we have filled the request.
2604 */
2605 boff += n;
8cd0a023 2606 if (boff == bp->b_bufsize)
66325755 2607 break;
a89aec1b 2608 error = hammer_ip_next(&cursor);
66325755
MD
2609 }
2610
2611 /*
8cd0a023 2612 * There may have been a gap after the last record
66325755 2613 */
8cd0a023
MD
2614 if (error == ENOENT)
2615 error = 0;
2616 if (error == 0 && boff != bp->b_bufsize) {
7f7c1f84 2617 KKASSERT(boff < bp->b_bufsize);
8cd0a023
MD
2618 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2619 /* boff = bp->b_bufsize; */
2620 }
2621 bp->b_resid = 0;
059819e3
MD
2622 bp->b_error = error;
2623 if (error)
2624 bp->b_flags |= B_ERROR;
2625 biodone(ap->a_bio);
47637bff
MD
2626
2627done:
39d8fd63
MD
2628 /*
2629 * Cache the b-tree node for the last data read in cache[1].
2630 *
2631 * If we hit the file EOF then also cache the node in the
2632 * governing director's cache[3], it will be used to initialize
2633 * the inode's cache[1] for any inodes looked up via the directory.
2634 *
2635 * This doesn't reduce disk accesses since the B-Tree chain is
2636 * likely cached, but it does reduce cpu overhead when looking
2637 * up file offsets for cpdup/tar/cpio style iterations.
2638 */
47637bff 2639 if (cursor.node)
bcac4bbb 2640 hammer_cache_node(&ip->cache[1], cursor.node);
39d8fd63
MD
2641 if (ran_end >= ip->ino_data.size) {
2642 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2643 ip->obj_asof, ip->obj_localization);
2644 if (dip) {
2645 hammer_cache_node(&dip->cache[3], cursor.node);
2646 hammer_rel_inode(dip, 0);
2647 }
2648 }
47637bff
MD
2649 hammer_done_cursor(&cursor);
2650 hammer_done_transaction(&trans);
8cd0a023
MD
2651 return(error);
2652}
2653
a99b9ea2
MD
2654/*
2655 * BMAP operation - used to support cluster_read() only.
2656 *
2657 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2658 *
2659 * This routine may return EOPNOTSUPP if the opration is not supported for
2660 * the specified offset. The contents of the pointer arguments do not
2661 * need to be initialized in that case.
2662 *
2663 * If a disk address is available and properly aligned return 0 with
2664 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2665 * to the run-length relative to that offset. Callers may assume that
2666 * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2667 * large, so return EOPNOTSUPP if it is not sufficiently large.
2668 */
2669static
2670int
2671hammer_vop_bmap(struct vop_bmap_args *ap)
2672{
2673 struct hammer_transaction trans;
2674 struct hammer_inode *ip;
2675 struct hammer_cursor cursor;
2676 hammer_base_elm_t base;
2677 int64_t rec_offset;
2678 int64_t ran_end;
2679 int64_t tmp64;
2680 int64_t base_offset;
2681 int64_t base_disk_offset;
2682 int64_t last_offset;
2683 hammer_off_t last_disk_offset;
2684 hammer_off_t disk_offset;
2685 int rec_len;
2686 int error;
4a2796f3 2687 int blksize;
a99b9ea2 2688
ce0138a6 2689 ++hammer_stats_file_iopsr;
a99b9ea2
MD
2690 ip = ap->a_vp->v_data;
2691
2692 /*
2693 * We can only BMAP regular files. We can't BMAP database files,
2694 * directories, etc.
2695 */
2696 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2697 return(EOPNOTSUPP);
2698
2699 /*
2700 * bmap is typically called with runp/runb both NULL when used
2701 * for writing. We do not support BMAP for writing atm.
2702 */
4a2796f3 2703 if (ap->a_cmd != BUF_CMD_READ)
a99b9ea2
MD
2704 return(EOPNOTSUPP);
2705
2706 /*
2707 * Scan the B-Tree to acquire blockmap addresses, then translate
2708 * to raw addresses.
2709 */
2710 hammer_simple_transaction(&trans, ip->hmp);
cb51be26 2711#if 0
973c11b9
MD
2712 kprintf("bmap_beg %016llx ip->cache %p\n",
2713 (long long)ap->a_loffset, ip->cache[1]);
cb51be26 2714#endif
a99b9ea2
MD
2715 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2716
2717 /*
2718 * Key range (begin and end inclusive) to scan. Note that the key's
2719 * stored in the actual records represent BASE+LEN, not BASE. The
2720 * first record containing bio_offset will have a key > bio_offset.
2721 */
5a930e66
MD
2722 cursor.key_beg.localization = ip->obj_localization +
2723 HAMMER_LOCALIZE_MISC;
a99b9ea2
MD
2724 cursor.key_beg.obj_id = ip->obj_id;
2725 cursor.key_beg.create_tid = 0;
2726 cursor.key_beg.delete_tid = 0;
2727 cursor.key_beg.obj_type = 0;
2728 if (ap->a_runb)
2729 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2730 else
2731 cursor.key_beg.key = ap->a_loffset + 1;
2732 if (cursor.key_beg.key < 0)
2733 cursor.key_beg.key = 0;
2734 cursor.asof = ip->obj_asof;
bf3b416b 2735 cursor.flags |= HAMMER_CURSOR_ASOF;
a99b9ea2
MD
2736
2737 cursor.key_end = cursor.key_beg;
2738 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2739
2740 ran_end = ap->a_loffset + MAXPHYS;
2741 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2742 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2743 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2744 if (tmp64 < ran_end)
2745 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2746 else
2747 cursor.key_end.key = ran_end + MAXPHYS + 1;
2748
2749 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2750
2751 error = hammer_ip_first(&cursor);
2752 base_offset = last_offset = 0;
2753 base_disk_offset = last_disk_offset = 0;
2754
2755 while (error == 0) {
2756 /*
2757 * Get the base file offset of the record. The key for
2758 * data records is (base + bytes) rather then (base).
4a2796f3
MD
2759 *
2760 * NOTE: rec_offset + rec_len may exceed the end-of-file.
2761 * The extra bytes should be zero on-disk and the BMAP op
2762 * should still be ok.
a99b9ea2
MD
2763 */
2764 base = &cursor.leaf->base;
2765 rec_offset = base->key - cursor.leaf->data_len;
2766 rec_len = cursor.leaf->data_len;
2767
2768 /*
4a2796f3
MD
2769 * Incorporate any cached truncation.
2770 *
2771 * NOTE: Modifications to rec_len based on synthesized
2772 * truncation points remove the guarantee that any extended
2773 * data on disk is zero (since the truncations may not have
2774 * taken place on-media yet).
a99b9ea2
MD
2775 */
2776 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2777 if (hammer_cursor_ondisk(&cursor) ||
2778 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2779 if (ip->trunc_off <= rec_offset)
2780 rec_len = 0;
2781 else if (ip->trunc_off < rec_offset + rec_len)
2782 rec_len = (int)(ip->trunc_off - rec_offset);
2783 }
2784 }
2785 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2786 if (hammer_cursor_ondisk(&cursor)) {
2787 if (ip->sync_trunc_off <= rec_offset)
2788 rec_len = 0;
2789 else if (ip->sync_trunc_off < rec_offset + rec_len)
2790 rec_len = (int)(ip->sync_trunc_off - rec_offset);
2791 }
2792 }
2793
2794 /*
2795 * Accumulate information. If we have hit a discontiguous
2796 * block reset base_offset unless we are already beyond the
2797 * requested offset. If we are, that's it, we stop.
2798 */
a99b9ea2
MD
2799 if (error)
2800 break;
1b0ab2c3
MD
2801 if (hammer_cursor_ondisk(&cursor)) {
2802 disk_offset = cursor.leaf->data_offset;
2803 if (rec_offset != last_offset ||
2804 disk_offset != last_disk_offset) {
2805 if (rec_offset > ap->a_loffset)
2806 break;
2807 base_offset = rec_offset;
2808 base_disk_offset = disk_offset;
2809 }
2810 last_offset = rec_offset + rec_len;
2811 last_disk_offset = disk_offset + rec_len;
a99b9ea2 2812 }
a99b9ea2
MD
2813 error = hammer_ip_next(&cursor);
2814 }
2815
2816#if 0
2817 kprintf("BMAP %016llx: %016llx - %016llx\n",
973c11b9
MD
2818 (long long)ap->a_loffset,
2819 (long long)base_offset,
2820 (long long)last_offset);
2821 kprintf("BMAP %16s: %016llx - %016llx\n", "",
2822 (long long)base_disk_offset,
2823 (long long)last_disk_offset);
a99b9ea2
MD
2824#endif
2825
cb51be26 2826 if (cursor.node) {
bcac4bbb 2827 hammer_cache_node(&ip->cache[1], cursor.node);
cb51be26 2828#if 0
973c11b9
MD
2829 kprintf("bmap_end2 %016llx ip->cache %p\n",
2830 (long long)ap->a_loffset, ip->cache[1]);
cb51be26
MD
2831#endif
2832 }
a99b9ea2
MD
2833 hammer_done_cursor(&cursor);
2834 hammer_done_transaction(&trans);
2835
4a2796f3
MD
2836 /*
2837 * If we couldn't find any records or the records we did find were
2838 * all behind the requested offset, return failure. A forward
2839 * truncation can leave a hole w/ no on-disk records.
2840 */
2841 if (last_offset == 0 || last_offset < ap->a_loffset)
2842 return (EOPNOTSUPP);
2843
2844 /*
2845 * Figure out the block size at the requested offset and adjust
2846 * our limits so the cluster_read() does not create inappropriately
2847 * sized buffer cache buffers.
2848 */
2849 blksize = hammer_blocksize(ap->a_loffset);
2850 if (hammer_blocksize(base_offset) != blksize) {
2851 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2852 }
2853 if (last_offset != ap->a_loffset &&
2854 hammer_blocksize(last_offset - 1) != blksize) {
2855 last_offset = hammer_blockdemarc(ap->a_loffset,
2856 last_offset - 1);
2857 }
2858
2859 /*
2860 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2861 * from occuring.
2862 */
2863 disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2864
1b0ab2c3
MD
2865 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2866 /*
2867 * Only large-data zones can be direct-IOd
2868 */
2869 error = EOPNOTSUPP;
2870 } else if ((disk_offset & HAMMER_BUFMASK) ||
2871 (last_offset - ap->a_loffset) < blksize) {
2872 /*
2873 * doffsetp is not aligned or the forward run size does
2874 * not cover a whole buffer, disallow the direct I/O.
2875 */
a99b9ea2
MD
2876 error = EOPNOTSUPP;
2877 } else {
1b0ab2c3
MD
2878 /*
2879 * We're good.
2880 */
4a2796f3
MD
2881 *ap->a_doffsetp = disk_offset;
2882 if (ap->a_runb) {
2883 *ap->a_runb = ap->a_loffset - base_offset;
2884 KKASSERT(*ap->a_runb >= 0);
a99b9ea2 2885 }
4a2796f3
MD
2886 if (ap->a_runp) {
2887 *ap->a_runp = last_offset - ap->a_loffset;
2888 KKASSERT(*ap->a_runp >= 0);
2889 }
2890 error = 0;
a99b9ea2
MD
2891 }
2892 return(error);
2893}
2894
8cd0a023 2895/*
059819e3 2896 * Write to a regular file. Because this is a strategy call the OS is
bcac4bbb 2897 * trying to actually get data onto the media.
8cd0a023
MD
2898 */
2899static
2900int
2901hammer_vop_strategy_write(struct vop_strategy_args *ap)
2902{
47637bff 2903 hammer_record_t record;
af209b0f 2904 hammer_mount_t hmp;
8cd0a023
MD
2905 hammer_inode_t ip;
2906 struct bio *bio;
2907 struct buf *bp;
a7e9bef1 2908 int blksize;
0832c9bb
MD
2909 int bytes;
2910 int error;
8cd0a023
MD
2911
2912 bio = ap->a_bio;
2913 bp = bio->bio_buf;
2914 ip = ap->a_vp->v_data;
af209b0f 2915 hmp = ip->hmp;
d113fda1 2916
a7e9bef1
MD
2917 blksize = hammer_blocksize(bio->bio_offset);
2918 KKASSERT(bp->b_bufsize == blksize);
4a2796f3 2919
059819e3
MD
2920 if (ip->flags & HAMMER_INODE_RO) {
2921 bp->b_error = EROFS;
2922 bp->b_flags |= B_ERROR;
2923 biodone(ap->a_bio);
2924 return(EROFS);
2925 }
b84de5af 2926
29ce0677
MD
2927 /*
2928 * Interlock with inode destruction (no in-kernel or directory
2929 * topology visibility). If we queue new IO while trying to
2930 * destroy the inode we can deadlock the vtrunc call in
2931 * hammer_inode_unloadable_check().
35a49944
MD
2932 *
2933 * Besides, there's no point flushing a bp associated with an
2934 * inode that is being destroyed on-media and has no kernel
2935 * references.
29ce0677 2936 */
35a49944
MD
2937 if ((ip->flags | ip->sync_flags) &
2938 (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
29ce0677
MD
2939 bp->b_resid = 0;
2940 biodone(ap->a_bio);
2941 return(0);
2942 }
2943
b84de5af 2944 /*
a99b9ea2
MD
2945 * Reserve space and issue a direct-write from the front-end.
2946 * NOTE: The direct_io code will hammer_bread/bcopy smaller
2947 * allocations.
47637bff 2948 *
a99b9ea2
MD
2949 * An in-memory record will be installed to reference the storage
2950 * until the flusher can get to it.
47637bff
MD
2951 *
2952 * Since we own the high level bio the front-end will not try to
0832c9bb 2953 * do a direct-read until the write completes.
a99b9ea2
MD
2954 *
2955 * NOTE: The only time we do not reserve a full-sized buffers
2956 * worth of data is if the file is small. We do not try to
2957 * allocate a fragment (from the small-data zone) at the end of
2958 * an otherwise large file as this can lead to wildly separated
2959 * data.
47637bff 2960 */
0832c9bb
MD
2961 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2962 KKASSERT(bio->bio_offset < ip->ino_data.size);
a99b9ea2 2963 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
4a2796f3 2964 bytes = bp->b_bufsize;
b84de5af 2965 else
a99b9ea2 2966 bytes = ((int)ip->ino_data.size + 15) & ~15;
0832c9bb
MD
2967
2968 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2969 bytes, &error);
2970 if (record) {
1b0ab2c3 2971 hammer_io_direct_write(hmp, record, bio);
4a2796f3
MD
2972 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2973 hammer_flush_inode(ip, 0);
0832c9bb 2974 } else {
a99b9ea2 2975 bp->b_bio2.bio_offset = NOOFFSET;
0832c9bb
MD
2976 bp->b_error = error;
2977 bp->b_flags |= B_ERROR;
2978 biodone(ap->a_bio);
2979 }
0832c9bb 2980 return(error);
059819e3
MD
2981}
2982
8cd0a023
MD
2983/*
2984 * dounlink - disconnect a directory entry
2985 *
2986 * XXX whiteout support not really in yet
2987 */
2988static int
b84de5af 2989hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
d7e278bb
MD
2990 struct vnode *dvp, struct ucred *cred,
2991 int flags, int isdir)
8cd0a023 2992{
8cd0a023
MD
2993 struct namecache *ncp;
2994 hammer_inode_t dip;
2995 hammer_inode_t ip;
8cd0a023 2996 struct hammer_cursor cursor;
8cd0a023 2997 int64_t namekey;
5e435c92 2998 u_int32_t max_iterations;
11ad5ade 2999 int nlen, error;
8cd0a023
MD
3000
3001 /*
3002 * Calculate the namekey and setup the key range for the scan. This
3003 * works kinda like a chained hash table where the lower 32 bits
3004 * of the namekey synthesize the chain.
3005 *
3006 * The key range is inclusive of both key_beg and key_end.
3007 */
3008 dip = VTOI(dvp);
3009 ncp = nch->ncp;
d113fda1
MD
3010
3011 if (dip->flags & HAMMER_INODE_RO)
3012 return (EROFS);
3013
5e435c92
MD
3014 namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3015 &max_iterations);
6a37e7e4 3016retry:
bcac4bbb 3017 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
5a930e66 3018 cursor.key_beg.localization = dip->obj_localization +
beec5dc4 3019 hammer_dir_localization(dip);
8cd0a023
MD
3020 cursor.key_beg.obj_id = dip->obj_id;
3021 cursor.key_beg.key = namekey;
d5530d22 3022 cursor.key_beg.create_tid = 0;
8cd0a023
MD
3023 cursor.key_beg.delete_tid = 0;
3024 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3025 cursor.key_beg.obj_type = 0;
3026
3027 cursor.key_end = cursor.key_beg;
5e435c92 3028 cursor.key_end.key += max_iterations;
d5530d22
MD
3029 cursor.asof = dip->obj_asof;
3030 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
8cd0a023 3031
8cd0a023
MD
3032 /*
3033 * Scan all matching records (the chain), locate the one matching
3034 * the requested path component. info->last_error contains the
3035 * error code on search termination and could be 0, ENOENT, or
3036 * something else.
3037 *
3038 * The hammer_ip_*() functions merge in-memory records with on-disk
3039 * records for the purposes of the search.
3040 */
4e17f465
MD
3041 error = hammer_ip_first(&cursor);
3042
a89aec1b
MD
3043 while (error == 0) {
3044 error = hammer_ip_resolve_data(&cursor);
3045 if (error)
66325755 3046 break;
11ad5ade
MD
3047 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3048 KKASSERT(nlen > 0);
3049 if (ncp->nc_nlen == nlen &&
3050 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
66325755
MD
3051 break;
3052 }
a89aec1b 3053 error = hammer_ip_next(&cursor);
66325755 3054 }
8cd0a023
MD
3055
3056 /*
3057 * If all is ok we have to get the inode so we can adjust nlinks.
269c5eab
MD
3058 * To avoid a deadlock with the flusher we must release the inode
3059 * lock on the directory when acquiring the inode for the entry.
b3deaf57
MD
3060 *
3061 * If the target is a directory, it must be empty.
8cd0a023 3062 */
66325755 3063 if (error == 0) {
269c5eab 3064 hammer_unlock(&cursor.ip->lock);
bcac4bbb 3065 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
ddfdf542
MD
3066 dip->hmp->asof,
3067 cursor.data->entry.localization,
3068 0, &error);
269c5eab 3069 hammer_lock_sh(&cursor.ip->lock);
46fe7ae1 3070 if (error == ENOENT) {
4c286c36
MD
3071 kprintf("HAMMER: WARNING: Removing "
3072 "dirent w/missing inode \"%s\"\n"
3073 "\tobj_id = %016llx\n",
3074 ncp->nc_name,
3075 (long long)cursor.data->entry.obj_id);
3076 error = 0;
46fe7ae1 3077 }
1f07f686 3078
d7e278bb
MD
3079 /*
3080 * If isdir >= 0 we validate that the entry is or is not a
3081 * directory. If isdir < 0 we don't care.
3082 */
4c286c36 3083 if (error == 0 && isdir >= 0 && ip) {
d7e278bb
MD
3084 if (isdir &&
3085 ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3086 error = ENOTDIR;
3087 } else if (isdir == 0 &&
3088 ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3089 error = EISDIR;
3090 }
3091 }
3092
1f07f686
MD
3093 /*
3094 * If we are trying to remove a directory the directory must
3095 * be empty.
3096 *
3f9b4cfa
MD
3097 * The check directory code can loop and deadlock/retry. Our
3098 * own cursor's node locks must be released to avoid a 3-way
3099 * deadlock with the flusher if the check directory code
3100 * blocks.
3101 *
3102 * If any changes whatsoever have been made to the cursor
3103 * set EDEADLK and retry.
c9ce54d6
MD
3104 *
3105 * WARNING: See warnings in hammer_unlock_cursor()
3106 * function.
1f07f686 3107 */
4c286c36
MD
3108 if (error == 0 && ip && ip->ino_data.obj_type ==
3109 HAMMER_OBJTYPE_DIRECTORY) {
3f9b4cfa 3110 hammer_unlock_cursor(&cursor);
98f7132d 3111 error = hammer_ip_check_directory_empty(trans, ip);
3f9b4cfa
MD
3112 hammer_lock_cursor(&cursor);
3113 if (cursor.flags & HAMMER_CURSOR_RETEST) {
3114 kprintf("HAMMER: Warning: avoided deadlock "
3115 "on rmdir '%s'\n",
3116 ncp->nc_name);
3117 error = EDEADLK;
3118 }
b3deaf57 3119 }
1f07f686 3120
6a37e7e4 3121 /*
1f07f686
MD
3122 * Delete the directory entry.
3123 *
6a37e7e4 3124 * WARNING: hammer_ip_del_directory() may have to terminate
1f07f686 3125 * the cursor to avoid a deadlock. It is ok to call
6a37e7e4
MD
3126 * hammer_done_cursor() twice.
3127 */
b84de5af 3128 if (error == 0) {
b84de5af
MD
3129 error = hammer_ip_del_directory(trans, &cursor,
3130 dip, ip);
b84de5af 3131 }
269c5eab 3132 hammer_done_cursor(&cursor);
8cd0a023
MD
3133 if (error == 0) {
3134 cache_setunresolved(nch);
3135 cache_setvp(nch, NULL);
3136 /* XXX locking */
4c286c36 3137 if (ip && ip->vp) {
fbb84158 3138 hammer_knote(ip->vp, NOTE_DELETE);
8cd0a023 3139 cache_inval_vp(ip->vp, CINV_DESTROY);
fbb84158 3140 }
8cd0a023 3141 }
af209b0f
MD
3142 if (ip)
3143 hammer_rel_inode(ip, 0);
269c5eab
MD
3144 } else {
3145 hammer_done_cursor(&cursor);
66325755 3146 }
6a37e7e4
MD
3147 if (error == EDEADLK)
3148 goto retry;
9c448776 3149
66325755 3150 return (error);
66325755
MD
3151}
3152
7a04d74f
MD
3153/************************************************************************
3154 * FIFO AND SPECFS OPS *
3155 ************************************************************************
3156 *
3157 */
3158
3159static int
3160hammer_vop_fifoclose (struct vop_close_args *ap)
3161{
3162 /* XXX update itimes */
3163 return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3164}
3165
3166static int
3167hammer_vop_fiforead (struct vop_read_args *ap)
3168{
3169 int error;
3170
3171 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3172 /* XXX update access time */
3173 return (error);
3174}
3175
3176static int
3177hammer_vop_fifowrite (struct vop_write_args *ap)
3178{
3179 int error;
3180
3181 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3182 /* XXX update access time */
3183 return (error);
3184}
3185
fbb84158
MD
3186static
3187int
3188hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3189{
3190 int error;
3191
3192 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3193 if (error)
3194 error = hammer_vop_kqfilter(ap);
3195 return(error);
3196}
3197
fbb84158
MD
3198/************************************************************************
3199 * KQFILTER OPS *
3200 ************************************************************************
3201 *
3202 */
3203static void filt_hammerdetach(struct knote *kn);
3204static int filt_hammerread(struct knote *kn, long hint);
3205static int filt_hammerwrite(struct knote *kn, long hint);
3206static int filt_hammervnode(struct knote *kn, long hint);
3207
3208static struct filterops hammerread_filtops =
3209 { 1, NULL, filt_hammerdetach, filt_hammerread };
3210static struct filterops hammerwrite_filtops =
3211 { 1, NULL, filt_hammerdetach, filt_hammerwrite };
3212static struct filterops hammervnode_filtops =
3213 { 1, NULL, filt_hammerdetach, filt_hammervnode };
3214
3215static
3216int
3217hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3218{
3219 struct vnode *vp = ap->a_vp;
3220 struct knote *kn = ap->a_kn;
0202303b 3221 lwkt_tokref vlock;
fbb84158
MD
3222
3223 switch (kn->kn_filter) {
3224 case EVFILT_READ:
3225 kn->kn_fop = &hammerread_filtops;
3226 break;
3227 case EVFILT_WRITE:
3228 kn->kn_fop = &hammerwrite_filtops;
3229 break;
3230 case EVFILT_VNODE:
3231 kn->kn_fop = &hammervnode_filtops;
3232 break;
3233 default:
3234 return (1);
3235 }
3236
3237 kn->kn_hook = (caddr_t)vp;
3238
0202303b 3239 lwkt_gettoken(&vlock, &vp->v_token);
fbb84158 3240 SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
0202303b 3241 lwkt_reltoken(&vlock);
fbb84158
MD
3242
3243 return(0);
3244}
3245
3246static void
3247filt_hammerdetach(struct knote *kn)
3248{
3249 struct vnode *vp = (void *)kn->kn_hook;
0202303b 3250 lwkt_tokref vlock;
fbb84158 3251
0202303b 3252 lwkt_gettoken(&vlock, &vp->v_token);
fbb84158
MD
3253 SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
3254 kn, knote, kn_selnext);
0202303b 3255 lwkt_reltoken(&vlock);
fbb84158
MD
3256}
3257
3258static int
3259filt_hammerread(struct knote *kn, long hint)
3260{
3261 struct vnode *vp = (void *)kn->kn_hook;
3262 hammer_inode_t ip = VTOI(vp);
3263
3264 if (hint == NOTE_REVOKE) {
3265 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3266 return(1);
3267 }
3268 kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
3269 return (kn->kn_data != 0);
3270}
3271
3272static int
3273filt_hammerwrite(struct knote *kn, long hint)
3274{
3275 if (hint == NOTE_REVOKE)
3276 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3277 kn->kn_data = 0;
3278 return (1);
3279}
3280