O_CREAT was being allowed to leak through a read-only NFS export.
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project. All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.92 2008/07/14 20:27:54 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/

static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

struct vop_ops hammer_vnode_vops = {
	.vop_default = vop_defaultop,
	.vop_fsync = hammer_vop_fsync,
	.vop_getpages = vop_stdgetpages,
	.vop_putpages = vop_stdputpages,
	.vop_read = hammer_vop_read,
	.vop_write = hammer_vop_write,
	.vop_access = hammer_vop_access,
	.vop_advlock = hammer_vop_advlock,
	.vop_close = hammer_vop_close,
	.vop_ncreate = hammer_vop_ncreate,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_nresolve = hammer_vop_nresolve,
	.vop_nlookupdotdot = hammer_vop_nlookupdotdot,
	.vop_nlink = hammer_vop_nlink,
	.vop_nmkdir = hammer_vop_nmkdir,
	.vop_nmknod = hammer_vop_nmknod,
	.vop_open = hammer_vop_open,
	.vop_pathconf = hammer_vop_pathconf,
	.vop_print = hammer_vop_print,
	.vop_readdir = hammer_vop_readdir,
	.vop_readlink = hammer_vop_readlink,
	.vop_nremove = hammer_vop_nremove,
	.vop_nrename = hammer_vop_nrename,
	.vop_nrmdir = hammer_vop_nrmdir,
	.vop_setattr = hammer_vop_setattr,
	.vop_bmap = hammer_vop_bmap,
	.vop_strategy = hammer_vop_strategy,
	.vop_nsymlink = hammer_vop_nsymlink,
	.vop_nwhiteout = hammer_vop_nwhiteout,
	.vop_ioctl = hammer_vop_ioctl,
	.vop_mountctl = hammer_vop_mountctl
};

struct vop_ops hammer_spec_vops = {
	.vop_default = spec_vnoperate,
	.vop_fsync = hammer_vop_fsync,
	.vop_read = hammer_vop_specread,
	.vop_write = hammer_vop_specwrite,
	.vop_access = hammer_vop_access,
	.vop_close = hammer_vop_specclose,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_setattr = hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
	.vop_default = fifo_vnoperate,
	.vop_fsync = hammer_vop_fsync,
	.vop_read = hammer_vop_fiforead,
	.vop_write = hammer_vop_fifowrite,
	.vop_access = hammer_vop_access,
	.vop_close = hammer_vop_fifoclose,
	.vop_getattr = hammer_vop_getattr,
	.vop_inactive = hammer_vop_inactive,
	.vop_reclaim = hammer_vop_reclaim,
	.vop_setattr = hammer_vop_setattr
};

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
			   struct vnode *dvp, struct ucred *cred, int flags);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

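/*
 * Dispatch summary (for reference, restating the tables above):
 * hammer_vnode_vops covers regular files and directories,
 * hammer_spec_vops covers block/character device nodes, and
 * hammer_fifo_vops covers fifos.  Operations not listed in a table
 * fall through to its .vop_default entry (vop_defaultop,
 * spec_vnoperate or fifo_vnoperate respectively).
 */
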
#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	++hammer_count_fsyncs;
	vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (ap->a_waitfor == MNT_WAIT)
		hammer_wait_inode(ip);
	return (ip->error);
}

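/*
 * Behavior sketch of the code above: with MNT_WAIT the caller blocks in
 * hammer_wait_inode() until the flusher has committed the inode; any
 * other waitfor value only signals the flusher (HAMMER_FLUSH_SIGNAL)
 * and returns immediately with whatever error is already latched in
 * ip->error.
 */
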
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;
	int seqcount;
	int ioseqcount;
	int blksize;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
	ioseqcount = ap->a_ioflag >> 16;
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	hammer_start_transaction(&trans, ip->hmp);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, MAXPHYS,
					     seqcount, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			kprintf("error %d\n", error);
			brelse(bp);
			break;
		}

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);

		/* data has a lower priority than meta-data */
		bp->b_flags |= B_AGE;
		bqrelse(bp);
		if (error)
			break;
		hammer_stats_file_read += n;
	}
	if ((ip->flags & HAMMER_INODE_RO) == 0 &&
	    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
		ip->ino_data.atime = trans.time;
		hammer_modify_inode(ip, HAMMER_INODE_ATIME);
	}
	hammer_done_transaction(&trans);
	return (error);
}

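/*
 * Example of the sequential heuristic above (illustrative numbers,
 * assuming the small 16KB HAMMER buffer size): a single 1MB read yields
 * seqcount 64, which overrides a smaller read-ahead hint passed in the
 * upper 16 bits of a_ioflag; a 4KB read yields seqcount 1 and the
 * caller's hint wins instead.
 */
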
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;
	off_t base_offset;
	struct buf *bp;
	int error;
	int n;
	int flags;
	int delta;
	int seqcount;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_off assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;
		int blksize;
		int blkmask;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lock out other processes due to
		 * bwillwrite() mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 */
		/*if (((int)uio->uio_offset & (blksize - 1)) == 0)*/
		bwillwrite(blksize);

		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.  Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster than the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind, start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_data.size) {
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			error = uiomove((char *)bp->b_data + offset,
					n, uio);
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size));
			}
			break;
		}
		hammer_stats_file_write += n;
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_DDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

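/*
 * Back-pressure sketch for the record-accumulation check above: once the
 * mount's reserved record count exceeds half of hammer_limit_recs the
 * frontend starts flushing the inode and kicking the flusher, and once
 * it exceeds the full limit each writer sleeps for
 *	(rsv_recs - hammer_limit_recs) * hz / hammer_limit_recs
 * ticks, so the stall grows linearly with how far behind the backend is.
 */
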
/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	uid_t uid;
	gid_t gid;
	int error;

	++hammer_stats_file_iopsr;
	uid = hammer_to_unix_xid(&ip->ino_data.uid);
	gid = hammer_to_unix_xid(&ip->ino_data.gid);

	error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
				  ip->ino_data.uflags);
	return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);

	if ((ip->flags | ip->sync_flags) & HAMMER_INODE_MODMASK)
		hammer_inode_waitreclaims(ip->hmp);
	return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}
	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0;	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file XXX */
	vap->va_fsmid = ip->ino_data.mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	return(0);
}

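/*
 * va_bytes examples for the rounding above (illustrative sizes): a file
 * at or past the large-block demarc is rounded up to the extended buffer
 * boundary (HAMMER_XBUFMASK64), a mid-sized file is rounded up to the
 * regular HAMMER_BUFSIZE boundary, and a tiny file (not larger than half
 * a buffer) is simply rounded up to a 16 byte boundary.
 */
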
/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			asof = hammer_str_to_tid(ncp->nc_name + i + 2,
						 &ispfs, &localization);
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(ncp->nc_name, nlen);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}

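/*
 * Example of the '@@' as-of extension parsed above (the TID shown is
 * illustrative only): resolving "README@@0x00000001061a8ba6" strips the
 * extension, looks up "README" as of that transaction id and marks the
 * result read-only; a bare "@@..." component with no name (nlen == 0)
 * instead re-resolves the directory itself, or dives into a PFS when the
 * TID string names one.
 */
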
/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		    asof != dip->hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
	hammer_inode_t ip;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);

	if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
		return (EROFS);
	return(vop_stdopen(ap));
}

/*
 * hammer_vop_pathconf { vp, name, retval }
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}

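/*
 * Cookie/offset note for the loop above: offsets 0 and 1 are the
 * synthetic "." and ".." entries; every later offset is the
 * directory-entry B-Tree key itself, which is why saveoff is reloaded
 * from base->key and handed back to userland as the seek cookie.
 */
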
/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  HAMMER_MAX_TID,
						  localization >> 16);
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
					 HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}

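/*
 * PFS softlink expansion example for the shortcut above (the PFS number
 * is illustrative): a stored "@@PFS00003" body is rewritten on the fly
 * to "@@0x<sync_end_tid>:00003" for a slave PFS, or to the equivalent
 * string built from HAMMER_MAX_TID for a master, so the link always
 * resolves to a usable as-of style path.
 */
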
/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
	hammer_done_transaction(&trans);

	return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	int nlen, error;

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp.  XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicity for the purposes
	 * of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0)
		cache_rename(ap->a_fnch, ap->a_tnch);

failed:
	hammer_done_transaction(&trans);
	return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	int error;

	dip = VTOI(ap->a_dvp);

	if (hammer_nohistory(dip) == 0 &&
	    (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;
	error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
	hammer_done_transaction(&trans);

	return (error);
}

66325755
MD
1641/*
1642 * hammer_vop_setattr { vp, vap, cred }
1643 */
427e5fc6
MD
1644static
1645int
66325755 1646hammer_vop_setattr(struct vop_setattr_args *ap)
427e5fc6 1647{
8cd0a023
MD
1648 struct hammer_transaction trans;
1649 struct vattr *vap;
1650 struct hammer_inode *ip;
1651 int modflags;
1652 int error;
d5ef456e 1653 int truncating;
4a2796f3
MD
1654 int blksize;
1655 int64_t aligned_size;
8cd0a023 1656 u_int32_t flags;
8cd0a023
MD
1657
1658 vap = ap->a_vap;
1659 ip = ap->a_vp->v_data;
1660 modflags = 0;
1661
1662 if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1663 return(EROFS);
d113fda1
MD
1664 if (ip->flags & HAMMER_INODE_RO)
1665 return (EROFS);
e63644f0 1666 if (hammer_nohistory(ip) == 0 &&
93291532 1667 (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
e63644f0
MD
1668 return (error);
1669 }
8cd0a023
MD
1670
1671 hammer_start_transaction(&trans, ip->hmp);
ce0138a6 1672 ++hammer_stats_file_iopsw;
8cd0a023
MD
1673 error = 0;
1674
1675 if (vap->va_flags != VNOVAL) {
1676 flags = ip->ino_data.uflags;
1677 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1678 hammer_to_unix_xid(&ip->ino_data.uid),
1679 ap->a_cred);
1680 if (error == 0) {
1681 if (ip->ino_data.uflags != flags) {
1682 ip->ino_data.uflags = flags;
1683 modflags |= HAMMER_INODE_DDIRTY;
1684 }
1685 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1686 error = 0;
1687 goto done;
1688 }
1689 }
1690 goto done;
1691 }
1692 if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1693 error = EPERM;
1694 goto done;
1695 }
7538695e
MD
1696 if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
1697 mode_t cur_mode = ip->ino_data.mode;
1698 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1699 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1700 uuid_t uuid_uid;
1701 uuid_t uuid_gid;
1702
1703 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
1704 ap->a_cred,
1705 &cur_uid, &cur_gid, &cur_mode);
1706 if (error == 0) {
1707 hammer_guid_to_uuid(&uuid_uid, cur_uid);
1708 hammer_guid_to_uuid(&uuid_gid, cur_gid);
1709 if (bcmp(&uuid_uid, &ip->ino_data.uid,
1710 sizeof(uuid_uid)) ||
1711 bcmp(&uuid_gid, &ip->ino_data.gid,
1712 sizeof(uuid_gid)) ||
1713 ip->ino_data.mode != cur_mode
1714 ) {
1715 ip->ino_data.uid = uuid_uid;
1716 ip->ino_data.gid = uuid_gid;
1717 ip->ino_data.mode = cur_mode;
1718 }
8cd0a023
MD
1719 modflags |= HAMMER_INODE_DDIRTY;
1720 }
1721 }
11ad5ade 1722 while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
8cd0a023
MD
1723 switch(ap->a_vp->v_type) {
1724 case VREG:
11ad5ade 1725 if (vap->va_size == ip->ino_data.size)
d5ef456e 1726 break;
b84de5af
MD
1727 /*
1728 * XXX this breaks atomicity, but we can deadlock the
1729 * backend if we do not release the lock. Probably not a
1730 * big deal here.
1731 */
4a2796f3 1732 blksize = hammer_blocksize(vap->va_size);
11ad5ade 1733 if (vap->va_size < ip->ino_data.size) {
4a2796f3 1734 vtruncbuf(ap->a_vp, vap->va_size, blksize);
d5ef456e
MD
1735 truncating = 1;
1736 } else {
c0ade690 1737 vnode_pager_setsize(ap->a_vp, vap->va_size);
d5ef456e 1738 truncating = 0;
c0ade690 1739 }
11ad5ade
MD
1740 ip->ino_data.size = vap->va_size;
1741 modflags |= HAMMER_INODE_DDIRTY;
d5ef456e 1742
b84de5af
MD
1743 /*
1744 * on-media truncation is cached in the inode until
1745 * the inode is synchronized.
1746 */
d5ef456e 1747 if (truncating) {
47637bff 1748 hammer_ip_frontend_trunc(ip, vap->va_size);
0832c9bb
MD
1749#ifdef DEBUG_TRUNCATE
1750 if (HammerTruncIp == NULL)
1751 HammerTruncIp = ip;
1752#endif
b84de5af
MD
1753 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1754 ip->flags |= HAMMER_INODE_TRUNCATED;
1755 ip->trunc_off = vap->va_size;
0832c9bb
MD
1756#ifdef DEBUG_TRUNCATE
1757 if (ip == HammerTruncIp)
1758 kprintf("truncate1 %016llx\n", ip->trunc_off);
1759#endif
b84de5af
MD
1760 } else if (ip->trunc_off > vap->va_size) {
1761 ip->trunc_off = vap->va_size;
0832c9bb
MD
1762#ifdef DEBUG_TRUNCATE
1763 if (ip == HammerTruncIp)
1764 kprintf("truncate2 %016llx\n", ip->trunc_off);
1765#endif
1766 } else {
1767#ifdef DEBUG_TRUNCATE
1768 if (ip == HammerTruncIp)
1769 kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
1770#endif
b84de5af 1771 }
d5ef456e 1772 }
b84de5af 1773
d5ef456e
MD
1774 /*
1775 * If truncating we have to clean out a portion of
b84de5af
MD
1776 * the last block on-disk. We do this in the
1777 * front-end buffer cache.
d5ef456e 1778 */
4a2796f3
MD
1779 aligned_size = (vap->va_size + (blksize - 1)) &
1780 ~(int64_t)(blksize - 1);
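			/*
			 * Worked example (illustrative numbers): truncating
			 * to va_size 1000 with blksize 16384 rounds
			 * aligned_size up to 16384; the code below then
			 * reads that last block and zeros bytes 1000..16383
			 * so no stale data remains past the new end of file.
			 */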
b84de5af 1781 if (truncating && vap->va_size < aligned_size) {
d5ef456e
MD
1782 struct buf *bp;
1783 int offset;
1784
4a2796f3 1785 aligned_size -= blksize;
47637bff 1786
4a2796f3 1787 offset = (int)vap->va_size & (blksize - 1);
47637bff 1788 error = bread(ap->a_vp, aligned_size,
4a2796f3 1789 blksize, &bp);
47637bff 1790 hammer_ip_frontend_trunc(ip, aligned_size);
d5ef456e
MD
1791 if (error == 0) {
1792 bzero(bp->b_data + offset,
4a2796f3 1793 blksize - offset);
1b0ab2c3
MD
1794 /* must de-cache direct-io offset */
1795 bp->b_bio2.bio_offset = NOOFFSET;
d5ef456e
MD
1796 bdwrite(bp);
1797 } else {
47637bff 1798 kprintf("ERROR %d\n", error);
d5ef456e
MD
1799 brelse(bp);
1800 }
1801 }
76376933 1802 break;
8cd0a023 1803 case VDATABASE:
b84de5af
MD
1804 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1805 ip->flags |= HAMMER_INODE_TRUNCATED;
1806 ip->trunc_off = vap->va_size;
1807 } else if (ip->trunc_off > vap->va_size) {
1808 ip->trunc_off = vap->va_size;
1809 }
47637bff 1810 hammer_ip_frontend_trunc(ip, vap->va_size);
11ad5ade
MD
1811 ip->ino_data.size = vap->va_size;
1812 modflags |= HAMMER_INODE_DDIRTY;
8cd0a023
MD
1813 break;
1814 default:
1815 error = EINVAL;
1816 goto done;
1817 }
d26d0ae9 1818 break;
8cd0a023
MD
1819 }
1820 if (vap->va_atime.tv_sec != VNOVAL) {
bcac4bbb 1821 ip->ino_data.atime =
ddfdf542
MD
1822 hammer_timespec_to_time(&vap->va_atime);
1823 modflags |= HAMMER_INODE_ATIME;
8cd0a023
MD
1824 }
1825 if (vap->va_mtime.tv_sec != VNOVAL) {
11ad5ade 1826 ip->ino_data.mtime =
ddfdf542
MD
1827 hammer_timespec_to_time(&vap->va_mtime);
1828 modflags |= HAMMER_INODE_MTIME;
8cd0a023
MD
1829 }
1830 if (vap->va_mode != (mode_t)VNOVAL) {
7538695e
MD
1831 mode_t cur_mode = ip->ino_data.mode;
1832 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1833 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1834
1835 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1836 cur_uid, cur_gid, &cur_mode);
1837 if (error == 0 && ip->ino_data.mode != cur_mode) {
1838 ip->ino_data.mode = cur_mode;
8cd0a023
MD
1839 modflags |= HAMMER_INODE_DDIRTY;
1840 }
1841 }
1842done:
b84de5af 1843 if (error == 0)
47637bff 1844 hammer_modify_inode(ip, modflags);
b84de5af 1845 hammer_done_transaction(&trans);
8cd0a023 1846 return (error);
427e5fc6
MD
1847}
1848
66325755
MD
1849/*
1850 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1851 */
427e5fc6
MD
1852static
1853int
66325755 1854hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
427e5fc6 1855{
7a04d74f
MD
1856 struct hammer_transaction trans;
1857 struct hammer_inode *dip;
1858 struct hammer_inode *nip;
1859 struct nchandle *nch;
1860 hammer_record_t record;
1861 int error;
1862 int bytes;
1863
1864 ap->a_vap->va_type = VLNK;
1865
1866 nch = ap->a_nch;
1867 dip = VTOI(ap->a_dvp);
1868
d113fda1
MD
1869 if (dip->flags & HAMMER_INODE_RO)
1870 return (EROFS);
93291532 1871 if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
e63644f0 1872 return (error);
d113fda1 1873
7a04d74f
MD
1874 /*
1875 * Create a transaction to cover the operations we perform.
1876 */
1877 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1878 ++hammer_stats_file_iopsw;
7a04d74f
MD
1879
1880 /*
1881 * Create a new filesystem object of the requested type. The
1882 * returned inode will be referenced but not locked.
1883 */
1884
5a930e66 1885 error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
ea434b6f 1886 dip, NULL, &nip);
7a04d74f 1887 if (error) {
b84de5af 1888 hammer_done_transaction(&trans);
7a04d74f
MD
1889 *ap->a_vpp = NULL;
1890 return (error);
1891 }
1892
7a04d74f
MD
1893 /*
1894 * Add a record representing the symlink. The symlink target is
1895 * stored as pure data, not a string, and is not \0-terminated.
1896 */
1897 if (error == 0) {
7a04d74f
MD
1898 bytes = strlen(ap->a_target);
1899
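		/*
		 * Short targets (up to HAMMER_INODE_BASESYMLEN bytes) are
		 * stored inline in the inode's extended data area
		 * (ino_data.ext.symlink); longer targets get a separate
		 * HAMMER_RECTYPE_FIX record keyed by HAMMER_FIXKEY_SYMLINK.
		 */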
2f85fa4d
MD
1900 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1901 bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1902 } else {
1903 record = hammer_alloc_mem_record(nip, bytes);
1904 record->type = HAMMER_MEM_RECORD_GENERAL;
1905
5a930e66
MD
1906 record->leaf.base.localization = nip->obj_localization +
1907 HAMMER_LOCALIZE_MISC;
2f85fa4d
MD
1908 record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1909 record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1910 record->leaf.data_len = bytes;
1911 KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1912 bcopy(ap->a_target, record->data->symlink.name, bytes);
1913 error = hammer_ip_add_record(&trans, record);
1914 }
42c7d26b
MD
1915
1916 /*
1917 * Set the file size to the length of the link.
1918 */
1919 if (error == 0) {
11ad5ade 1920 nip->ino_data.size = bytes;
47637bff 1921 hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
42c7d26b 1922 }
7a04d74f 1923 }
1f07f686 1924 if (error == 0)
5a930e66
MD
1925 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
1926 nch->ncp->nc_nlen, nip);
7a04d74f
MD
1927
1928 /*
1929 * Finish up.
1930 */
1931 if (error) {
1932 hammer_rel_inode(nip, 0);
7a04d74f
MD
1933 *ap->a_vpp = NULL;
1934 } else {
e8599db1 1935 error = hammer_get_vnode(nip, ap->a_vpp);
7a04d74f
MD
1936 hammer_rel_inode(nip, 0);
1937 if (error == 0) {
1938 cache_setunresolved(ap->a_nch);
1939 cache_setvp(ap->a_nch, *ap->a_vpp);
1940 }
1941 }
b84de5af 1942 hammer_done_transaction(&trans);
7a04d74f 1943 return (error);
427e5fc6
MD
1944}
1945
66325755
MD
1946/*
1947 * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1948 */
427e5fc6
MD
1949static
1950int
66325755 1951hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
427e5fc6 1952{
b84de5af 1953 struct hammer_transaction trans;
e63644f0 1954 struct hammer_inode *dip;
b84de5af
MD
1955 int error;
1956
e63644f0
MD
1957 dip = VTOI(ap->a_dvp);
1958
1959 if (hammer_nohistory(dip) == 0 &&
93291532 1960 (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
e63644f0
MD
1961 return (error);
1962 }
1963
1964 hammer_start_transaction(&trans, dip->hmp);
ce0138a6 1965 ++hammer_stats_file_iopsw;
b84de5af
MD
1966 error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1967 ap->a_cred, ap->a_flags);
1968 hammer_done_transaction(&trans);
1969
1970 return (error);
427e5fc6
MD
1971}
1972
7dc57964
MD
1973/*
1974 * hammer_vop_ioctl { vp, command, data, fflag, cred }
1975 */
1976static
1977int
1978hammer_vop_ioctl(struct vop_ioctl_args *ap)
1979{
1980 struct hammer_inode *ip = ap->a_vp->v_data;
1981
ce0138a6 1982 ++hammer_stats_file_iopsr;
7dc57964
MD
1983 return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1984 ap->a_fflag, ap->a_cred));
1985}
1986
513ca7d7
MD
1987static
1988int
1989hammer_vop_mountctl(struct vop_mountctl_args *ap)
1990{
1991 struct mount *mp;
1992 int error;
1993
1994 mp = ap->a_head.a_ops->head.vv_mount;
1995
1996 switch(ap->a_op) {
1997 case MOUNTCTL_SET_EXPORT:
1998 		if (ap->a_ctllen != sizeof(struct export_args))
1999 			error = EINVAL;
2000 		else error = hammer_vfs_export(mp, ap->a_op,
2001 			(const struct export_args *)ap->a_ctl);
2002 break;
2003 default:
2004 error = journal_mountctl(ap);
2005 break;
2006 }
2007 return(error);
2008}
2009
66325755
MD
2010/*
2011 * hammer_vop_strategy { vp, bio }
8cd0a023
MD
2012 *
2013 * Strategy call, used for regular file read & write only. Note that the
2014 * bp may represent a cluster.
2015 *
2016 * To simplify operation and allow better optimizations in the future,
2017 * this code does not make any assumptions with regard to buffer alignment
2018 * or size.
66325755 2019 */
427e5fc6
MD
2020static
2021int
66325755 2022hammer_vop_strategy(struct vop_strategy_args *ap)
427e5fc6 2023{
8cd0a023
MD
2024 struct buf *bp;
2025 int error;
2026
2027 bp = ap->a_bio->bio_buf;
2028
2029 switch(bp->b_cmd) {
2030 case BUF_CMD_READ:
2031 error = hammer_vop_strategy_read(ap);
2032 break;
2033 case BUF_CMD_WRITE:
2034 error = hammer_vop_strategy_write(ap);
2035 break;
2036 default:
059819e3
MD
2037 bp->b_error = error = EINVAL;
2038 bp->b_flags |= B_ERROR;
2039 biodone(ap->a_bio);
8cd0a023
MD
2040 break;
2041 }
8cd0a023 2042 return (error);
427e5fc6
MD
2043}
2044
8cd0a023
MD
2045/*
2046 * Read from a regular file. Iterate the related records and fill in the
2047 * BIO/BUF. Gaps are zero-filled.
2048 *
2049 * The support code in hammer_object.c should be used to deal with mixed
2050 * in-memory and on-disk records.
2051 *
4a2796f3
MD
2052 * NOTE: Can be called from the cluster code with an oversized buf.
2053 *
8cd0a023
MD
2054 * XXX atime update
2055 */
2056static
2057int
2058hammer_vop_strategy_read(struct vop_strategy_args *ap)
2059{
36f82b23
MD
2060 struct hammer_transaction trans;
2061 struct hammer_inode *ip;
8cd0a023 2062 struct hammer_cursor cursor;
8cd0a023 2063 hammer_base_elm_t base;
4a2796f3 2064 hammer_off_t disk_offset;
8cd0a023 2065 struct bio *bio;
a99b9ea2 2066 struct bio *nbio;
8cd0a023
MD
2067 struct buf *bp;
2068 int64_t rec_offset;
a89aec1b 2069 int64_t ran_end;
195c19a1 2070 int64_t tmp64;
8cd0a023
MD
2071 int error;
2072 int boff;
2073 int roff;
2074 int n;
2075
2076 bio = ap->a_bio;
2077 bp = bio->bio_buf;
36f82b23 2078 ip = ap->a_vp->v_data;
8cd0a023 2079
a99b9ea2
MD
2080 /*
2081 * The zone-2 disk offset may have been set by the cluster code via
4a2796f3 2082 * a BMAP operation, or else should be NOOFFSET.
a99b9ea2 2083 *
4a2796f3 2084 * Checking the high bits for a match against zone-2 should suffice.
a99b9ea2
MD
2085 */
2086 nbio = push_bio(bio);
6aeaa7bd 2087 if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
1b0ab2c3
MD
2088 HAMMER_ZONE_LARGE_DATA) {
2089 error = hammer_io_direct_read(ip->hmp, nbio, NULL);
a99b9ea2
MD
2090 return (error);
2091 }
2092
2093 /*
4a2796f3
MD
2094 * Well, that sucked. Do it the hard way. If all the stars are
2095 * aligned we may still be able to issue a direct-read.
a99b9ea2 2096 */
36f82b23 2097 hammer_simple_transaction(&trans, ip->hmp);
47637bff 2098 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
8cd0a023
MD
2099
2100 /*
2101 * Key range (begin and end inclusive) to scan. Note that the keys
c0ade690
MD
2102 * stored in the actual records represent BASE+LEN, not BASE. The
2103 * first record containing bio_offset will have a key > bio_offset.
8cd0a023 2104 */
5a930e66
MD
2105 cursor.key_beg.localization = ip->obj_localization +
2106 HAMMER_LOCALIZE_MISC;
8cd0a023 2107 cursor.key_beg.obj_id = ip->obj_id;
d5530d22 2108 cursor.key_beg.create_tid = 0;
8cd0a023 2109 cursor.key_beg.delete_tid = 0;
8cd0a023 2110 cursor.key_beg.obj_type = 0;
c0ade690 2111 cursor.key_beg.key = bio->bio_offset + 1;
d5530d22 2112 cursor.asof = ip->obj_asof;
bf3b416b 2113 cursor.flags |= HAMMER_CURSOR_ASOF;
8cd0a023
MD
2114
2115 cursor.key_end = cursor.key_beg;
11ad5ade 2116 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
b84de5af 2117#if 0
11ad5ade 2118 if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
a89aec1b
MD
2119 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2120 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2121 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
b84de5af
MD
2122 } else
2123#endif
2124 {
c0ade690 2125 ran_end = bio->bio_offset + bp->b_bufsize;
a89aec1b
MD
2126 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2127 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
195c19a1
MD
2128 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2129 if (tmp64 < ran_end)
a89aec1b
MD
2130 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2131 else
7f7c1f84 2132 cursor.key_end.key = ran_end + MAXPHYS + 1;
a89aec1b 2133 }
d26d0ae9 2134 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
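	/*
	 * Illustrative numbers: a 16384-byte buffer at bio_offset 32768
	 * scans keys in (32768, 32768 + 16384 + MAXPHYS + 1], since a
	 * data record's key is its base offset plus its length and the
	 * last record overlapping the buffer may end well past it.
	 */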
8cd0a023 2135
4e17f465 2136 error = hammer_ip_first(&cursor);
8cd0a023
MD
2137 boff = 0;
2138
a89aec1b 2139 while (error == 0) {
47637bff
MD
2140 /*
2141 * Get the base file offset of the record. The key for
2142 * data records is (base + bytes) rather than (base).
2143 */
11ad5ade 2144 base = &cursor.leaf->base;
11ad5ade 2145 rec_offset = base->key - cursor.leaf->data_len;
8cd0a023 2146
66325755 2147 /*
a89aec1b 2148 * Calculate the gap, if any, and zero-fill it.
1fef775e
MD
2149 *
2150 * n is the offset of the start of the record versus our
2151 * current seek offset in the bio.
66325755 2152 */
8cd0a023
MD
2153 n = (int)(rec_offset - (bio->bio_offset + boff));
2154 if (n > 0) {
a89aec1b
MD
2155 if (n > bp->b_bufsize - boff)
2156 n = bp->b_bufsize - boff;
8cd0a023
MD
2157 bzero((char *)bp->b_data + boff, n);
2158 boff += n;
2159 n = 0;
66325755 2160 }
8cd0a023
MD
2161
2162 /*
2163 * Calculate the data offset in the record and the number
2164 * of bytes we can copy.
a89aec1b 2165 *
1fef775e
MD
2166 * There are two degenerate cases. First, boff may already
2167 * be at bp->b_bufsize. Secondly, the data offset within
2168 * the record may exceed the record's size.
8cd0a023
MD
2169 */
2170 roff = -n;
b84de5af 2171 rec_offset += roff;
11ad5ade 2172 n = cursor.leaf->data_len - roff;
1fef775e
MD
2173 if (n <= 0) {
2174 kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2175 n = 0;
2176 } else if (n > bp->b_bufsize - boff) {
8cd0a023 2177 n = bp->b_bufsize - boff;
1fef775e 2178 }
059819e3 2179
b84de5af 2180 /*
47637bff
MD
2181 * Deal with cached truncations. This cool bit of code
2182 * allows truncate()/ftruncate() to avoid having to sync
2183 * the file.
2184 *
2185 * If the frontend is truncated then all backend records are
2186 * subject to the frontend's truncation.
2187 *
2188 * If the backend is truncated then backend records on-disk
2189 * (but not in-memory) are subject to the backend's
2190 * truncation. In-memory records owned by the backend
2191 * represent data written after the truncation point on the
2192 * backend and must not be truncated.
2193 *
2194 * Truncate operations deal with frontend buffer cache
2195 * buffers and frontend-owned in-memory records synchronously.
b84de5af 2196 */
47637bff
MD
2197 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2198 if (hammer_cursor_ondisk(&cursor) ||
2199 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2200 if (ip->trunc_off <= rec_offset)
2201 n = 0;
2202 else if (ip->trunc_off < rec_offset + n)
2203 n = (int)(ip->trunc_off - rec_offset);
2204 }
2205 }
2206 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2207 if (hammer_cursor_ondisk(&cursor)) {
2208 if (ip->sync_trunc_off <= rec_offset)
2209 n = 0;
2210 else if (ip->sync_trunc_off < rec_offset + n)
2211 n = (int)(ip->sync_trunc_off - rec_offset);
2212 }
2213 }
b84de5af
MD
2214
2215 /*
47637bff
MD
2216 * Try to issue a direct read into our bio if possible,
2217 * otherwise resolve the element data into a hammer_buffer
2218 * and copy.
4a2796f3
MD
2219 *
2220 * The buffer on-disk should be zeroed past any real
2221 * truncation point, but may not be for any synthesized
2222 * truncation point from above.
b84de5af 2223 */
1b0ab2c3 2224 disk_offset = cursor.leaf->data_offset + roff;
4a2796f3 2225 if (boff == 0 && n == bp->b_bufsize &&
1b0ab2c3
MD
2226 hammer_cursor_ondisk(&cursor) &&
2227 (disk_offset & HAMMER_BUFMASK) == 0) {
2228 KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2229 HAMMER_ZONE_LARGE_DATA);
4a2796f3 2230 nbio->bio_offset = disk_offset;
1b0ab2c3
MD
2231 error = hammer_io_direct_read(trans.hmp, nbio,
2232 cursor.leaf);
47637bff
MD
2233 goto done;
2234 } else if (n) {
2235 error = hammer_ip_resolve_data(&cursor);
2236 if (error == 0) {
2237 bcopy((char *)cursor.data + roff,
2238 (char *)bp->b_data + boff, n);
2239 }
b84de5af 2240 }
47637bff
MD
2241 if (error)
2242 break;
2243
2244 /*
2245 * Iterate until we have filled the request.
2246 */
2247 boff += n;
8cd0a023 2248 if (boff == bp->b_bufsize)
66325755 2249 break;
a89aec1b 2250 error = hammer_ip_next(&cursor);
66325755
MD
2251 }
2252
2253 /*
8cd0a023 2254 * There may have been a gap after the last record
66325755 2255 */
8cd0a023
MD
2256 if (error == ENOENT)
2257 error = 0;
2258 if (error == 0 && boff != bp->b_bufsize) {
7f7c1f84 2259 KKASSERT(boff < bp->b_bufsize);
8cd0a023
MD
2260 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2261 /* boff = bp->b_bufsize; */
2262 }
2263 bp->b_resid = 0;
059819e3
MD
2264 bp->b_error = error;
2265 if (error)
2266 bp->b_flags |= B_ERROR;
2267 biodone(ap->a_bio);
47637bff
MD
2268
2269done:
2270 if (cursor.node)
bcac4bbb 2271 hammer_cache_node(&ip->cache[1], cursor.node);
47637bff
MD
2272 hammer_done_cursor(&cursor);
2273 hammer_done_transaction(&trans);
8cd0a023
MD
2274 return(error);
2275}
2276
a99b9ea2
MD
2277/*
2278 * BMAP operation - used to support cluster_read() only.
2279 *
2280 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2281 *
2282 * This routine may return EOPNOTSUPP if the operation is not supported for
2283 * the specified offset. The contents of the pointer arguments do not
2284 * need to be initialized in that case.
2285 *
2286 * If a disk address is available and properly aligned return 0 with
2287 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2288 * to the run-length relative to that offset. Callers treat *doffsetp
2289 * as valid whenever 0 is returned, even when *runp is small, so this
2290 * routine returns EOPNOTSUPP instead when the run is not sufficiently large.
2291 */
2292static
2293int
2294hammer_vop_bmap(struct vop_bmap_args *ap)
2295{
2296 struct hammer_transaction trans;
2297 struct hammer_inode *ip;
2298 struct hammer_cursor cursor;
2299 hammer_base_elm_t base;
2300 int64_t rec_offset;
2301 int64_t ran_end;
2302 int64_t tmp64;
2303 int64_t base_offset;
2304 int64_t base_disk_offset;
2305 int64_t last_offset;
2306 hammer_off_t last_disk_offset;
2307 hammer_off_t disk_offset;
2308 int rec_len;
2309 int error;
4a2796f3 2310 int blksize;
a99b9ea2 2311
ce0138a6 2312 ++hammer_stats_file_iopsr;
a99b9ea2
MD
2313 ip = ap->a_vp->v_data;
2314
2315 /*
2316 * We can only BMAP regular files. We can't BMAP database files,
2317 * directories, etc.
2318 */
2319 if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2320 return(EOPNOTSUPP);
2321
2322 /*
2323 * bmap is typically called with runp/runb both NULL when used
2324 * for writing. We do not support BMAP for writing at this time.
2325 */
4a2796f3 2326 if (ap->a_cmd != BUF_CMD_READ)
a99b9ea2
MD
2327 return(EOPNOTSUPP);
2328
2329 /*
2330 * Scan the B-Tree to acquire blockmap addresses, then translate
2331 * to raw addresses.
2332 */
2333 hammer_simple_transaction(&trans, ip->hmp);
cb51be26
MD
2334#if 0
2335 kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2336#endif
a99b9ea2
MD
2337 hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2338
2339 /*
2340 * Key range (begin and end inclusive) to scan. Note that the keys
2341 * stored in the actual records represent BASE+LEN, not BASE. The
2342 * first record containing the requested offset will have a key > that offset.
2343 */
5a930e66
MD
2344 cursor.key_beg.localization = ip->obj_localization +
2345 HAMMER_LOCALIZE_MISC;
a99b9ea2
MD
2346 cursor.key_beg.obj_id = ip->obj_id;
2347 cursor.key_beg.create_tid = 0;
2348 cursor.key_beg.delete_tid = 0;
2349 cursor.key_beg.obj_type = 0;
2350 if (ap->a_runb)
2351 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2352 else
2353 cursor.key_beg.key = ap->a_loffset + 1;
2354 if (cursor.key_beg.key < 0)
2355 cursor.key_beg.key = 0;
2356 cursor.asof = ip->obj_asof;
bf3b416b 2357 cursor.flags |= HAMMER_CURSOR_ASOF;
a99b9ea2
MD
2358
2359 cursor.key_end = cursor.key_beg;
2360 KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2361
2362 ran_end = ap->a_loffset + MAXPHYS;
2363 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2364 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2365 tmp64 = ran_end + MAXPHYS + 1; /* work-around GCC-4 bug */
2366 if (tmp64 < ran_end)
2367 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2368 else
2369 cursor.key_end.key = ran_end + MAXPHYS + 1;
2370
2371 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2372
2373 error = hammer_ip_first(&cursor);
2374 base_offset = last_offset = 0;
2375 base_disk_offset = last_disk_offset = 0;
2376
2377 while (error == 0) {
2378 /*
2379 * Get the base file offset of the record. The key for
2380 * data records is (base + bytes) rather than (base).
4a2796f3
MD
2381 *
2382 * NOTE: rec_offset + rec_len may exceed the end-of-file.
2383 * The extra bytes should be zero on-disk and the BMAP op
2384 * should still be ok.
a99b9ea2
MD
2385 */
2386 base = &cursor.leaf->base;
2387 rec_offset = base->key - cursor.leaf->data_len;
2388 rec_len = cursor.leaf->data_len;
2389
2390 /*
4a2796f3
MD
2391 * Incorporate any cached truncation.
2392 *
2393 * NOTE: Modifications to rec_len based on synthesized
2394 * truncation points remove the guarantee that any extended
2395 * data on disk is zero (since the truncations may not have
2396 * taken place on-media yet).
a99b9ea2
MD
2397 */
2398 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2399 if (hammer_cursor_ondisk(&cursor) ||
2400 cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2401 if (ip->trunc_off <= rec_offset)
2402 rec_len = 0;
2403 else if (ip->trunc_off < rec_offset + rec_len)
2404 rec_len = (int)(ip->trunc_off - rec_offset);
2405 }
2406 }
2407 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2408 if (hammer_cursor_ondisk(&cursor)) {
2409 if (ip->sync_trunc_off <= rec_offset)
2410 rec_len = 0;
2411 else if (ip->sync_trunc_off < rec_offset + rec_len)
2412 rec_len = (int)(ip->sync_trunc_off - rec_offset);
2413 }
2414 }
2415
2416 /*
2417 * Accumulate information. If we have hit a discontiguous
2418 * block reset base_offset unless we are already beyond the
2419 * requested offset. If we are, that's it, we stop.
2420 */
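		/*
		 * (Contiguity here means both the file offset and the
		 * zone-2 disk offset continue exactly where the previous
		 * record left off, i.e. they match last_offset and
		 * last_disk_offset.)
		 */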
a99b9ea2
MD
2421 if (error)
2422 break;
1b0ab2c3
MD
2423 if (hammer_cursor_ondisk(&cursor)) {
2424 disk_offset = cursor.leaf->data_offset;
2425 if (rec_offset != last_offset ||
2426 disk_offset != last_disk_offset) {
2427 if (rec_offset > ap->a_loffset)
2428 break;
2429 base_offset = rec_offset;
2430 base_disk_offset = disk_offset;
2431 }
2432 last_offset = rec_offset + rec_len;
2433 last_disk_offset = disk_offset + rec_len;
a99b9ea2 2434 }
a99b9ea2
MD
2435 error = hammer_ip_next(&cursor);
2436 }
2437
2438#if 0
2439 kprintf("BMAP %016llx: %016llx - %016llx\n",
2440 ap->a_loffset, base_offset, last_offset);
2441 kprintf("BMAP %16s: %016llx - %016llx\n",
2442 "", base_disk_offset, last_disk_offset);
2443#endif
2444
cb51be26 2445 if (cursor.node) {
bcac4bbb 2446 hammer_cache_node(&ip->cache[1], cursor.node);
cb51be26
MD
2447#if 0
2448 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2449#endif
2450 }
a99b9ea2
MD
2451 hammer_done_cursor(&cursor);
2452 hammer_done_transaction(&trans);
2453
4a2796f3
MD
2454 /*
2455 * If we couldn't find any records or the records we did find were
2456 * all behind the requested offset, return failure. A forward
2457 * truncation can leave a hole with no on-disk records.
2458 */
2459 if (last_offset == 0 || last_offset < ap->a_loffset)
2460 return (EOPNOTSUPP);
2461
2462 /*
2463 * Figure out the block size at the requested offset and adjust
2464 * our limits so the cluster_read() does not create inappropriately
2465 * sized buffer cache buffers.
2466 */
2467 blksize = hammer_blocksize(ap->a_loffset);
2468 if (hammer_blocksize(base_offset) != blksize) {
2469 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2470 }
2471 if (last_offset != ap->a_loffset &&
2472 hammer_blocksize(last_offset - 1) != blksize) {
2473 last_offset = hammer_blockdemarc(ap->a_loffset,
2474 last_offset - 1);
2475 }
2476
2477 /*
2478 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2479 * from occuring.
2480 */
2481 disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2482
1b0ab2c3
MD
2483 if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2484 /*
2485 * Only large-data zones can be direct-IOd
2486 */
2487 error = EOPNOTSUPP;
2488 } else if ((disk_offset & HAMMER_BUFMASK) ||
2489 (last_offset - ap->a_loffset) < blksize) {
2490 /*
2491 * doffsetp is not aligned or the forward run size does
2492 * not cover a whole buffer, disallow the direct I/O.
2493 */
a99b9ea2
MD
2494 error = EOPNOTSUPP;
2495 } else {
1b0ab2c3
MD
2496 /*
2497 * We're good.
2498 */
4a2796f3
MD
2499 *ap->a_doffsetp = disk_offset;
2500 if (ap->a_runb) {
2501 *ap->a_runb = ap->a_loffset - base_offset;
2502 KKASSERT(*ap->a_runb >= 0);
a99b9ea2 2503 }
4a2796f3
MD
2504 if (ap->a_runp) {
2505 *ap->a_runp = last_offset - ap->a_loffset;
2506 KKASSERT(*ap->a_runp >= 0);
2507 }
2508 error = 0;
a99b9ea2
MD
2509 }
2510 return(error);
2511}
2512
8cd0a023 2513/*
059819e3 2514 * Write to a regular file. Because this is a strategy call the OS is
bcac4bbb 2515 * trying to actually get data onto the media.
8cd0a023
MD
2516 */
2517static
2518int
2519hammer_vop_strategy_write(struct vop_strategy_args *ap)
2520{
47637bff 2521 hammer_record_t record;
af209b0f 2522 hammer_mount_t hmp;
8cd0a023
MD
2523 hammer_inode_t ip;
2524 struct bio *bio;
2525 struct buf *bp;
a7e9bef1 2526 int blksize;
0832c9bb
MD
2527 int bytes;
2528 int error;
8cd0a023
MD
2529
2530 bio = ap->a_bio;
2531 bp = bio->bio_buf;
2532 ip = ap->a_vp->v_data;
af209b0f 2533 hmp = ip->hmp;
d113fda1 2534
a7e9bef1
MD
2535 blksize = hammer_blocksize(bio->bio_offset);
2536 KKASSERT(bp->b_bufsize == blksize);
4a2796f3 2537
059819e3
MD
2538 if (ip->flags & HAMMER_INODE_RO) {
2539 bp->b_error = EROFS;
2540 bp->b_flags |= B_ERROR;
2541 biodone(ap->a_bio);
2542 return(EROFS);
2543 }
b84de5af 2544
29ce0677
MD
2545 /*
2546 * Interlock with inode destruction (no in-kernel or directory
2547 * topology visibility). If we queue new IO while trying to
2548 * destroy the inode we can deadlock the vtrunc call in
2549 * hammer_inode_unloadable_check().
2550 */
2551 if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2552 bp->b_resid = 0;
2553 biodone(ap->a_bio);
2554 return(0);
2555 }
2556
b84de5af 2557 /*
a99b9ea2
MD
2558 * Reserve space and issue a direct-write from the front-end.
2559 * NOTE: The direct_io code will hammer_bread/bcopy smaller
2560 * allocations.
47637bff 2561 *
a99b9ea2
MD
2562 * An in-memory record will be installed to reference the storage
2563 * until the flusher can get to it.
47637bff
MD
2564 *
2565 * Since we own the high level bio the front-end will not try to
0832c9bb 2566 * do a direct-read until the write completes.
a99b9ea2
MD
2567 *
2568 * NOTE: The only time we do not reserve a full-sized buffer's
2569 * worth of data is if the file is small. We do not try to
2570 * allocate a fragment (from the small-data zone) at the end of
2571 * an otherwise large file as this can lead to wildly separated
2572 * data.
47637bff 2573 */
0832c9bb
MD
2574 KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2575 KKASSERT(bio->bio_offset < ip->ino_data.size);
a99b9ea2 2576 if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
4a2796f3 2577 bytes = bp->b_bufsize;
b84de5af 2578 else
a99b9ea2 2579 bytes = ((int)ip->ino_data.size + 15) & ~15;
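	/*
	 * Illustrative numbers: a 100-byte file written at offset 0
	 * reserves only 112 bytes (size rounded up to a 16-byte
	 * boundary) rather than a full buffer, so small files can be
	 * stored as fragments instead of consuming a big data block.
	 */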
0832c9bb
MD
2580
2581 record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2582 bytes, &error);
2583 if (record) {
1b0ab2c3 2584 hammer_io_direct_write(hmp, record, bio);
0832c9bb 2585 hammer_rel_mem_record(record);
4a2796f3
MD
2586 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2587 hammer_flush_inode(ip, 0);
0832c9bb 2588 } else {
a99b9ea2 2589 bp->b_bio2.bio_offset = NOOFFSET;
0832c9bb
MD
2590 bp->b_error = error;
2591 bp->b_flags |= B_ERROR;
2592 biodone(ap->a_bio);
2593 }
0832c9bb 2594 return(error);
059819e3
MD
2595}
2596
8cd0a023
MD
2597/*
2598 * dounlink - disconnect a directory entry
2599 *
2600 * XXX whiteout support not really in yet
2601 */
2602static int
b84de5af
MD
2603hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2604 struct vnode *dvp, struct ucred *cred, int flags)
8cd0a023 2605{
8cd0a023
MD
2606 struct namecache *ncp;
2607 hammer_inode_t dip;
2608 hammer_inode_t ip;
8cd0a023 2609 struct hammer_cursor cursor;
8cd0a023 2610 int64_t namekey;
11ad5ade 2611 int nlen, error;
8cd0a023
MD
2612
2613 /*
2614 * Calculate the namekey and set up the key range for the scan. This
2615 * works kinda like a chained hash table where the lower 32 bits
2616 * of the namekey synthesize the chain.
2617 *
2618 * The key range is inclusive of both key_beg and key_end.
2619 */
2620 dip = VTOI(dvp);
2621 ncp = nch->ncp;
d113fda1
MD
2622
2623 if (dip->flags & HAMMER_INODE_RO)
2624 return (EROFS);
2625
6a37e7e4
MD
2626 namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2627retry:
bcac4bbb 2628 hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
5a930e66
MD
2629 cursor.key_beg.localization = dip->obj_localization +
2630 HAMMER_LOCALIZE_MISC;
8cd0a023
MD
2631 cursor.key_beg.obj_id = dip->obj_id;
2632 cursor.key_beg.key = namekey;
d5530d22 2633 cursor.key_beg.create_tid = 0;
8cd0a023
MD
2634 cursor.key_beg.delete_tid = 0;
2635 cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2636 cursor.key_beg.obj_type = 0;
2637
2638 cursor.key_end = cursor.key_beg;
2639 cursor.key_end.key |= 0xFFFFFFFFULL;
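	/*
	 * key_beg/key_end now bracket every directory entry whose namekey
	 * falls in this 32-bit hash chain; the scan below compares the
	 * actual names to resolve collisions.
	 */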
d5530d22
MD
2640 cursor.asof = dip->obj_asof;
2641 cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
8cd0a023 2642
8cd0a023
MD
2643 /*
2644 * Scan all matching records (the chain) and locate the one matching
2645 * the requested path component. The local 'error' contains the
2646 * error code on search termination and could be 0, ENOENT, or
2647 * something else.
2648 *
2649 * The hammer_ip_*() functions merge in-memory records with on-disk
2650 * records for the purposes of the search.
2651 */
4e17f465
MD
2652 error = hammer_ip_first(&cursor);
2653
a89aec1b
MD
2654 while (error == 0) {
2655 error = hammer_ip_resolve_data(&cursor);
2656 if (error)
66325755 2657 break;
11ad5ade
MD
2658 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2659 KKASSERT(nlen > 0);
2660 if (ncp->nc_nlen == nlen &&
2661 bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
66325755
MD
2662 break;
2663 }
a89aec1b 2664 error = hammer_ip_next(&cursor);
66325755 2665 }
8cd0a023
MD
2666
2667 /*
2668 * If all is ok we have to get the inode so we can adjust nlinks.
269c5eab
MD
2669 * To avoid a deadlock with the flusher we must release the inode
2670 * lock on the directory when acquiring the inode for the entry.
b3deaf57
MD
2671 *
2672 * If the target is a directory, it must be empty.
8cd0a023 2673 */
66325755 2674 if (error == 0) {
269c5eab 2675 hammer_unlock(&cursor.ip->lock);
bcac4bbb 2676 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
ddfdf542
MD
2677 dip->hmp->asof,
2678 cursor.data->entry.localization,
2679 0, &error);
269c5eab 2680 hammer_lock_sh(&cursor.ip->lock);
46fe7ae1 2681 if (error == ENOENT) {
11ad5ade 2682 kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
10a5d1ba 2683 Debugger("ENOENT unlinking object that should exist");
46fe7ae1 2684 }
1f07f686
MD
2685
2686 /*
2687 * If we are trying to remove a directory the directory must
2688 * be empty.
2689 *
2690 * WARNING: hammer_ip_check_directory_empty() may have to
2691 * terminate the cursor to avoid a deadlock. It is ok to
2692 * call hammer_done_cursor() twice.
2693 */
11ad5ade 2694 if (error == 0 && ip->ino_data.obj_type ==
b3deaf57 2695 HAMMER_OBJTYPE_DIRECTORY) {
98f7132d 2696 error = hammer_ip_check_directory_empty(trans, ip);
b3deaf57 2697 }
1f07f686 2698
6a37e7e4 2699 /*
1f07f686
MD
2700 * Delete the directory entry.
2701 *
6a37e7e4 2702 * WARNING: hammer_ip_del_directory() may have to terminate
1f07f686 2703 * the cursor to avoid a deadlock. It is ok to call
6a37e7e4
MD
2704 * hammer_done_cursor() twice.
2705 */
b84de5af 2706 if (error == 0) {
b84de5af
MD
2707 error = hammer_ip_del_directory(trans, &cursor,
2708 dip, ip);
b84de5af 2709 }
269c5eab 2710 hammer_done_cursor(&cursor);
8cd0a023
MD
2711 if (error == 0) {
2712 cache_setunresolved(nch);
2713 cache_setvp(nch, NULL);
2714 /* XXX locking */
2715 if (ip->vp)
2716 cache_inval_vp(ip->vp, CINV_DESTROY);
2717 }
af209b0f
MD
2718 if (ip)
2719 hammer_rel_inode(ip, 0);
269c5eab
MD
2720 } else {
2721 hammer_done_cursor(&cursor);
66325755 2722 }
ba46eaeb 2723 hammer_inode_waitreclaims(dip->hmp);
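	/*
	 * hammer_inode_waitreclaims() stalls here when too many inodes
	 * are pending reclamation (inferred from its name and usage);
	 * the EDEADLK check below retries the whole lookup if the
	 * cursor had to be torn down to break a deadlock.
	 */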
6a37e7e4
MD
2724 if (error == EDEADLK)
2725 goto retry;
9c448776 2726
66325755 2727 return (error);
66325755
MD
2728}
2729
7a04d74f
MD
2730/************************************************************************
2731 * FIFO AND SPECFS OPS *
2732 ************************************************************************
2733 *
2734 */
2735
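/*
 * These vops pass the call straight through to the generic fifo/specfs
 * vnode operations via VOCALL(); HAMMER wraps them only so it can
 * eventually update the inode's access/change times (see the XXX notes).
 */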
2736static int
2737hammer_vop_fifoclose (struct vop_close_args *ap)
2738{
2739 /* XXX update itimes */
2740 return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2741}
2742
2743static int
2744hammer_vop_fiforead (struct vop_read_args *ap)
2745{
2746 int error;
2747
2748 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2749 /* XXX update access time */
2750 return (error);
2751}
2752
2753static int
2754hammer_vop_fifowrite (struct vop_write_args *ap)
2755{
2756 int error;
2757
2758 error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2759 /* XXX update access time */
2760 return (error);
2761}
2762
2763static int
2764hammer_vop_specclose (struct vop_close_args *ap)
2765{
2766 /* XXX update itimes */
2767 return (VOCALL(&spec_vnode_vops, &ap->a_head));
2768}
2769
2770static int
2771hammer_vop_specread (struct vop_read_args *ap)
2772{
2773 /* XXX update access time */
2774 return (VOCALL(&spec_vnode_vops, &ap->a_head));
2775}
2776
2777static int
2778hammer_vop_specwrite (struct vop_write_args *ap)
2779{
2780 /* XXX update last change time */
2781 return (VOCALL(&spec_vnode_vops, &ap->a_head));
2782}
2783