Merge branch 'master' of ssh://crater.dragonflybsd.org/repository/git/dragonfly
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vfs/fifofs/fifo.h>
50 #include "hammer.h"
51
52 /*
53  * USERFS VNOPS
54  */
55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
56 static int hammer_vop_fsync(struct vop_fsync_args *);
57 static int hammer_vop_read(struct vop_read_args *);
58 static int hammer_vop_write(struct vop_write_args *);
59 static int hammer_vop_access(struct vop_access_args *);
60 static int hammer_vop_advlock(struct vop_advlock_args *);
61 static int hammer_vop_close(struct vop_close_args *);
62 static int hammer_vop_ncreate(struct vop_ncreate_args *);
63 static int hammer_vop_getattr(struct vop_getattr_args *);
64 static int hammer_vop_nresolve(struct vop_nresolve_args *);
65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
66 static int hammer_vop_nlink(struct vop_nlink_args *);
67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
68 static int hammer_vop_nmknod(struct vop_nmknod_args *);
69 static int hammer_vop_open(struct vop_open_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_markatime(struct vop_markatime_args *);
77 static int hammer_vop_setattr(struct vop_setattr_args *);
78 static int hammer_vop_strategy(struct vop_strategy_args *);
79 static int hammer_vop_bmap(struct vop_bmap_args *ap);
80 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
82 static int hammer_vop_ioctl(struct vop_ioctl_args *);
83 static int hammer_vop_mountctl(struct vop_mountctl_args *);
84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
85
86 static int hammer_vop_fifoclose (struct vop_close_args *);
87 static int hammer_vop_fiforead (struct vop_read_args *);
88 static int hammer_vop_fifowrite (struct vop_write_args *);
89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
90
/*
 * Vnode operations vector for regular HAMMER vnodes (regular files,
 * directories, symlinks).  Operations not listed here fall through to
 * vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime =	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};
128
/*
 * Vnode operations vector for device special files.  read/write are
 * explicitly rejected via vop_stdnoread/vop_stdnowrite; attribute and
 * lifecycle operations still go through HAMMER.
 */
struct vop_ops hammer_spec_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		vop_stdnoread,
	.vop_write =		vop_stdnowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_close,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};
142
/*
 * Vnode operations vector for fifos.  Unlisted operations default to
 * fifo_vnoperate; read/write/close/kqfilter wrap the fifofs handlers
 * (see the hammer_vop_fifo* wrappers below).
 */
struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};
157
158 static __inline
159 void
160 hammer_knote(struct vnode *vp, int flags)
161 {
162         if (flags)
163                 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
164 }
165
166 #ifdef DEBUG_TRUNCATE
167 struct hammer_inode *HammerTruncIp;
168 #endif
169
170 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
171                            struct vnode *dvp, struct ucred *cred,
172                            int flags, int isdir);
173 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
174 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
175
#if 0
/*
 * Generic fallthrough dispatcher (currently disabled; vop_defaultop is
 * used instead via the vops tables above).
 *
 * NOTE: the parameter was previously unnamed (`struct vop_generic_args *`)
 * while the body referenced `ap`, which would not compile if this block
 * were ever re-enabled.  Name the parameter so the code is valid.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
184
185 /*
186  * hammer_vop_fsync { vp, waitfor }
187  *
188  * fsync() an inode to disk and wait for it to be completely committed
189  * such that the information would not be undone if a crash occured after
190  * return.
191  */
192 static
193 int
194 hammer_vop_fsync(struct vop_fsync_args *ap)
195 {
196         hammer_inode_t ip = VTOI(ap->a_vp);
197
198         ++hammer_count_fsyncs;
199         vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
200         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
201         if (ap->a_waitfor == MNT_WAIT) {
202                 vn_unlock(ap->a_vp);
203                 hammer_wait_inode(ip);
204                 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
205         }
206         return (ip->error);
207 }
208
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read from a regular file via the buffer cache.  HAMMER uses a
 * variable block size based on the file offset, so the block size is
 * recomputed every loop iteration.
 *
 * MPALMOSTSAFE - the cached-buffer fast path runs without the MP lock;
 * the MP lock (and a HAMMER transaction) is only acquired on a buffer
 * cache miss, or up-front if the caller already holds the MP lock.
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;		/* byte offset within the current block */
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;			/* bytes to copy from the current block */
	int seqcount;
	int ioseqcount;
	int blksize;
	int got_mplock;		/* -1: MP lock pre-held, 0: none, 1: taken here */
	int bigread;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	uio = ap->a_uio;

	/*
	 * Allow the UIO's size to override the sequential heuristic.
	 */
	blksize = hammer_blocksize(uio->uio_offset);
	seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
	ioseqcount = ap->a_ioflag >> 16;
	if (seqcount < ioseqcount)
		seqcount = ioseqcount;

	/*
	 * Temporary hack until more of HAMMER can be made MPSAFE.
	 *
	 * If the thread already holds the MP lock the transaction is
	 * started immediately (got_mplock = -1: no rel_mplock() on the
	 * way out).  Otherwise starting the transaction is deferred
	 * until we actually miss the buffer cache below.
	 */
#ifdef SMP
	if (curthread->td_mpcount) {
		got_mplock = -1;
		hammer_start_transaction(&trans, ip->hmp);
	} else {
		got_mplock = 0;
	}
#else
	hammer_start_transaction(&trans, ip->hmp);
	got_mplock = -1;
#endif

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicy and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigread = (uio->uio_resid > 100 * 1024 * 1024);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 *
	 * XXX Temporary hack, delay the start transaction while we remain
	 *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
	 *     locked-shared.
	 */
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
		int64_t base_offset;
		int64_t file_limit;

		blksize = hammer_blocksize(uio->uio_offset);
		offset = (int)uio->uio_offset & (blksize - 1);
		base_offset = uio->uio_offset - offset;

		/* big reads are interruptible by signals (see bigread above) */
		if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
			break;

		/*
		 * MPSAFE - try for a cached buffer first, no MP lock or
		 * transaction required.
		 */
		bp = getcacheblk(ap->a_vp, base_offset);
		if (bp) {
			error = 0;
			goto skip;
		}

		/*
		 * MPUNSAFE - buffer cache miss: acquire the MP lock and
		 * start the deferred transaction before issuing I/O.
		 */
		if (got_mplock == 0) {
			got_mplock = 1;
			get_mplock();
			hammer_start_transaction(&trans, ip->hmp);
		}

		if (hammer_cluster_enable) {
			/*
			 * Use file_limit to prevent cluster_read() from
			 * creating buffers of the wrong block size past
			 * the demarc.
			 */
			file_limit = ip->ino_data.size;
			if (base_offset < HAMMER_XDEMARC &&
			    file_limit > HAMMER_XDEMARC) {
				file_limit = HAMMER_XDEMARC;
			}
			error = cluster_read(ap->a_vp,
					     file_limit, base_offset,
					     blksize, MAXPHYS,
					     seqcount, &bp);
		} else {
			error = bread(ap->a_vp, base_offset, blksize, &bp);
		}
		if (error) {
			kprintf("error %d\n", error);
			brelse(bp);
			break;
		}
skip:

		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		/*
		 * Clip the copy to the block remainder, the uio residual,
		 * and the file EOF.
		 */
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_data.size - uio->uio_offset)
			n = (int)(ip->ino_data.size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);

		/* data has a lower priority then meta-data */
		bp->b_flags |= B_AGE;
		bqrelse(bp);
		if (error)
			break;
		hammer_stats_file_read += n;
	}

	/*
	 * XXX only update the atime if we had to get the MP lock.
	 * XXX hack hack hack, fixme.
	 */
	if (got_mplock) {
		if ((ip->flags & HAMMER_INODE_RO) == 0 &&
		    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
			ip->ino_data.atime = trans.time;
			hammer_modify_inode(ip, HAMMER_INODE_ATIME);
		}
		hammer_done_transaction(&trans);
		if (got_mplock > 0)
			rel_mplock();
	}
	return (error);
}
363
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Write to a regular file via the buffer cache.  Handles append mode,
 * offset-range validation, flow control against the record backlog,
 * and four buffer instantiation strategies: UIO_NOCOPY (pageout path),
 * full-block overwrite, write entirely beyond EOF, and partial
 * overwrite (read-modify-write).  Collects kqueue note flags in
 * kflags and delivers them once at the end.
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	int offset;		/* byte offset within the current block */
	off_t base_offset;	/* block-aligned offset of the current block */
	struct buf *bp;
	int kflags;		/* accumulated kqueue NOTE_* flags */
	int error;
	int n;			/* bytes to copy into the current block */
	int flags;		/* hammer_modify_inode() flags */
	int delta;
	int seqcount;
	int bigwrite;

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;
	error = 0;
	kflags = 0;
	seqcount = ap->a_ioflag >> 16;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_data.size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 *
	 * NOTE: the base_off assignment is required to work around what
	 * I consider to be a GCC-4 optimization bug.
	 */
	if (uio->uio_offset < 0) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}
	base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
	if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
		hammer_done_transaction(&trans);
		return (EFBIG);
	}

	/*
	 * If reading or writing a huge amount of data we have to break
	 * atomicy and allow the operation to be interrupted by a signal
	 * or it can DOS the machine.
	 */
	bigwrite = (uio->uio_resid > 100 * 1024 * 1024);

	/*
	 * Access the data typically in HAMMER_BUFSIZE blocks via the
	 * buffer cache, but HAMMER may use a variable block size based
	 * on the offset.
	 */
	while (uio->uio_resid > 0) {
		int fixsize = 0;	/* VM size was extended this pass */
		int blksize;
		int blkmask;

		if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
			break;
		if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
			break;

		blksize = hammer_blocksize(uio->uio_offset);

		/*
		 * Do not allow HAMMER to blow out the buffer cache.  Very
		 * large UIOs can lockout other processes due to bwillwrite()
		 * mechanics.
		 *
		 * The hammer inode is not locked during these operations.
		 * The vnode is locked which can interfere with the pageout
		 * daemon for non-UIO_NOCOPY writes but should not interfere
		 * with the buffer cache.  Even so, we cannot afford to
		 * allow the pageout daemon to build up too many dirty buffer
		 * cache buffers.
		 *
		 * Only call this if we aren't being recursively called from
		 * a virtual disk device (vn), else we may deadlock.
		 */
		if ((ap->a_ioflag & IO_RECURSE) == 0)
			bwillwrite(blksize);

		/*
		 * Do not allow HAMMER to blow out system memory by
		 * accumulating too many records.   Records are so well
		 * decoupled from the buffer cache that it is possible
		 * for userland to push data out to the media via
		 * direct-write, but build up the records queued to the
		 * backend faster then the backend can flush them out.
		 * HAMMER has hit its write limit but the frontend has
		 * no pushback to slow it down.
		 */
		if (hmp->rsv_recs > hammer_limit_recs / 2) {
			/*
			 * Get the inode on the flush list
			 */
			if (ip->rsv_recs >= 64)
				hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
			else if (ip->rsv_recs >= 16)
				hammer_flush_inode(ip, 0);

			/*
			 * Keep the flusher going if the system keeps
			 * queueing records.
			 */
			delta = hmp->count_newrecords -
				hmp->last_newrecords;
			if (delta < 0 || delta > hammer_limit_recs / 2) {
				hmp->last_newrecords = hmp->count_newrecords;
				hammer_sync_hmp(hmp, MNT_NOWAIT);
			}

			/*
			 * If we have gotten behind start slowing
			 * down the writers.
			 */
			delta = (hmp->rsv_recs - hammer_limit_recs) *
				hz / hammer_limit_recs;
			if (delta > 0)
				tsleep(&trans, 0, "hmrslo", delta);
		}

		/*
		 * Calculate the blocksize at the current offset and figure
		 * out how much we can actually write.
		 */
		blkmask = blksize - 1;
		offset = (int)uio->uio_offset & blkmask;
		base_offset = uio->uio_offset & ~(int64_t)blkmask;
		n = blksize - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (uio->uio_offset + n > ip->ino_data.size) {
			/* extending the file: grow the VM object first */
			vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
			fixsize = 1;
			kflags |= NOTE_EXTEND;
		}

		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				bqrelse(bp);
				error = bread(ap->a_vp, base_offset,
					      blksize, &bp);
			}
		} else if (offset == 0 && uio->uio_resid >= blksize) {
			/*
			 * Even though we are entirely overwriting the buffer
			 * we may still have to zero it out to avoid a
			 * mmap/write visibility issue.
			 */
			bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0)
				vfs_bio_clrbuf(bp);
		} else if (base_offset >= ip->ino_data.size) {
			/*
			 * If the base offset of the buffer is beyond the
			 * file EOF, we don't have to issue a read.
			 */
			bp = getblk(ap->a_vp, base_offset,
				    blksize, GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, base_offset, blksize, &bp);
			if (error == 0)
				bheavy(bp);
		}
		if (error == 0) {
			error = uiomove((char *)bp->b_data + offset,
					n, uio);
		}

		/*
		 * If we screwed up we have to undo any VM size changes we
		 * made.
		 */
		if (error) {
			brelse(bp);
			if (fixsize) {
				vtruncbuf(ap->a_vp, ip->ino_data.size,
					  hammer_blocksize(ip->ino_data.size));
			}
			break;
		}
		kflags |= NOTE_WRITE;
		hammer_stats_file_write += n;
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_data.size < uio->uio_offset) {
			ip->ino_data.size = uio->uio_offset;
			flags = HAMMER_INODE_DDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
		} else {
			flags = 0;
		}
		ip->ino_data.mtime = trans.time;
		flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
		hammer_modify_inode(ip, flags);

		/*
		 * Once we dirty the buffer any cached zone-X offset
		 * becomes invalid.  HAMMER NOTE: no-history mode cannot
		 * allow overwriting over the same data sector unless
		 * we provide UNDOs for the old data, which we don't.
		 */
		bp->b_bio2.bio_offset = NOOFFSET;

		/*
		 * Final buffer disposition.
		 */
		bp->b_flags |= B_AGE;
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
			bdwrite(bp);
		}
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}
619
620 /*
621  * hammer_vop_access { vp, mode, cred }
622  */
623 static
624 int
625 hammer_vop_access(struct vop_access_args *ap)
626 {
627         struct hammer_inode *ip = VTOI(ap->a_vp);
628         uid_t uid;
629         gid_t gid;
630         int error;
631
632         ++hammer_stats_file_iopsr;
633         uid = hammer_to_unix_xid(&ip->ino_data.uid);
634         gid = hammer_to_unix_xid(&ip->ino_data.gid);
635
636         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
637                                   ip->ino_data.uflags);
638         return (error);
639 }
640
641 /*
642  * hammer_vop_advlock { vp, id, op, fl, flags }
643  */
644 static
645 int
646 hammer_vop_advlock(struct vop_advlock_args *ap)
647 {
648         hammer_inode_t ip = VTOI(ap->a_vp);
649
650         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
651 }
652
/*
 * hammer_vop_close { vp, fflag }
 *
 * HAMMER requires no filesystem-specific close processing; use the
 * standard handler.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	return (vop_stdclose(ap));
}
663
/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 *
 * Creates a new inode, links it into the directory, and returns the
 * new vnode in *ap->a_vpp (set to NULL on error).
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* directory inode */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	/* cannot create in a read-only directory (e.g. a snapshot) */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.  On success: get the vnode before dropping the inode
	 * reference, resolve the namecache entry, and post NOTE_WRITE on
	 * the directory.  On failure: release the inode reference and
	 * report no vnode.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	return (error);
}
738
/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - acquires ip->lock shared for the duration of the copy-out.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;
	hammer_lock_sh(&ip->lock);
	/* fold both 32-bit halves of the 64-bit as-of TID into the fsid */
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	/* defaults; overridden below for device nodes */
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
	 * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
	 *
	 * The expansion only applies to current (non-historical) accesses
	 * from the primary filesystem (asof == MAX_TID, localization 0);
	 * readlink performs the matching expansion.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
			    vap->va_size = 26;
		    else
			    vap->va_size = 10;
	}

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.  Read-only (historical) inodes report
	 * ctime for both.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	/*
	 * Report allocated bytes rounded up to the blocking boundary
	 * appropriate for the file size: large-buffer granularity for
	 * big files, buffer granularity for mid-sized files, and a
	 * 16-byte granularity for small files.
	 */
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}

	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0;	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file XXX */
	vap->va_fsmid = ip->ino_data.mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	/* device nodes carry their major/minor numbers in the inode data */
	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	hammer_unlock(&ip->lock);
	return(0);
}
848
/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.  Supports HAMMER's "@@<tid>"
 * as-of name extensions and "@@PFS" pseudo-filesystem softlinks; names
 * resolved with a historical as-of are forced read-only.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;	/* inherit RO from parent */
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan for an embedded "@@" marker.  hammer_str_to_tid() parses
	 * the extension into an as-of TID (or PFS id) and may adjust
	 * localization.  A parse failure forces i = nlen so the whole
	 * name is treated as a literal component below.
	 *
	 * NOTE(review): reading nc_name[i+1] at i == nlen-1 assumes the
	 * namecache string is NUL-terminated — appears to hold for
	 * DragonFly namecache entries; confirm.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			/* historical accesses are read-only */
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;	/* nlen is now the component length sans extension */

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			/* match requires equal length AND equal bytes */
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* negative-cache the miss */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}
1045
/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof then the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != dip->hmp->asof) {
			/*
			 * Root of an as-of view: re-enter the same directory
			 * with the mount's asof and fake the ".." name with
			 * the asof stamp ("0x%016llx" is 18 chars + NUL).
			 */
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			/* parent unknown (possibly removed directory) */
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}
1116
1117 /*
1118  * hammer_vop_nlink { nch, dvp, vp, cred }
1119  */
1120 static
1121 int
1122 hammer_vop_nlink(struct vop_nlink_args *ap)
1123 {
1124         struct hammer_transaction trans;
1125         struct hammer_inode *dip;
1126         struct hammer_inode *ip;
1127         struct nchandle *nch;
1128         int error;
1129
1130         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
1131                 return(EXDEV);
1132
1133         nch = ap->a_nch;
1134         dip = VTOI(ap->a_dvp);
1135         ip = VTOI(ap->a_vp);
1136
1137         if (dip->obj_localization != ip->obj_localization)
1138                 return(EXDEV);
1139
1140         if (dip->flags & HAMMER_INODE_RO)
1141                 return (EROFS);
1142         if (ip->flags & HAMMER_INODE_RO)
1143                 return (EROFS);
1144         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1145                 return (error);
1146
1147         /*
1148          * Create a transaction to cover the operations we perform.
1149          */
1150         hammer_start_transaction(&trans, dip->hmp);
1151         ++hammer_stats_file_iopsw;
1152
1153         /*
1154          * Add the filesystem object to the directory.  Note that neither
1155          * dip nor ip are referenced or locked, but their vnodes are
1156          * referenced.  This function will bump the inode's link count.
1157          */
1158         error = hammer_ip_add_directory(&trans, dip,
1159                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1160                                         ip);
1161
1162         /*
1163          * Finish up.
1164          */
1165         if (error == 0) {
1166                 cache_setunresolved(nch);
1167                 cache_setvp(nch, ap->a_vp);
1168         }
1169         hammer_done_transaction(&trans);
1170         hammer_knote(ap->a_vp, NOTE_LINK);
1171         hammer_knote(ap->a_dvp, NOTE_WRITE);
1172         return (error);
1173 }
1174
1175 /*
1176  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1177  *
1178  * The operating system has already ensured that the directory entry
1179  * does not exist and done all appropriate namespace locking.
1180  */
1181 static
1182 int
1183 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1184 {
1185         struct hammer_transaction trans;
1186         struct hammer_inode *dip;
1187         struct hammer_inode *nip;
1188         struct nchandle *nch;
1189         int error;
1190
1191         nch = ap->a_nch;
1192         dip = VTOI(ap->a_dvp);
1193
1194         if (dip->flags & HAMMER_INODE_RO)
1195                 return (EROFS);
1196         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1197                 return (error);
1198
1199         /*
1200          * Create a transaction to cover the operations we perform.
1201          */
1202         hammer_start_transaction(&trans, dip->hmp);
1203         ++hammer_stats_file_iopsw;
1204
1205         /*
1206          * Create a new filesystem object of the requested type.  The
1207          * returned inode will be referenced but not locked.
1208          */
1209         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1210                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1211                                     NULL, &nip);
1212         if (error) {
1213                 hkprintf("hammer_mkdir error %d\n", error);
1214                 hammer_done_transaction(&trans);
1215                 *ap->a_vpp = NULL;
1216                 return (error);
1217         }
1218         /*
1219          * Add the new filesystem object to the directory.  This will also
1220          * bump the inode's link count.
1221          */
1222         error = hammer_ip_add_directory(&trans, dip,
1223                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1224                                         nip);
1225         if (error)
1226                 hkprintf("hammer_mkdir (add) error %d\n", error);
1227
1228         /*
1229          * Finish up.
1230          */
1231         if (error) {
1232                 hammer_rel_inode(nip, 0);
1233                 *ap->a_vpp = NULL;
1234         } else {
1235                 error = hammer_get_vnode(nip, ap->a_vpp);
1236                 hammer_rel_inode(nip, 0);
1237                 if (error == 0) {
1238                         cache_setunresolved(ap->a_nch);
1239                         cache_setvp(ap->a_nch, *ap->a_vpp);
1240                 }
1241         }
1242         hammer_done_transaction(&trans);
1243         if (error == 0)
1244                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1245         return (error);
1246 }
1247
1248 /*
1249  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1250  *
1251  * The operating system has already ensured that the directory entry
1252  * does not exist and done all appropriate namespace locking.
1253  */
1254 static
1255 int
1256 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1257 {
1258         struct hammer_transaction trans;
1259         struct hammer_inode *dip;
1260         struct hammer_inode *nip;
1261         struct nchandle *nch;
1262         int error;
1263
1264         nch = ap->a_nch;
1265         dip = VTOI(ap->a_dvp);
1266
1267         if (dip->flags & HAMMER_INODE_RO)
1268                 return (EROFS);
1269         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1270                 return (error);
1271
1272         /*
1273          * Create a transaction to cover the operations we perform.
1274          */
1275         hammer_start_transaction(&trans, dip->hmp);
1276         ++hammer_stats_file_iopsw;
1277
1278         /*
1279          * Create a new filesystem object of the requested type.  The
1280          * returned inode will be referenced but not locked.
1281          *
1282          * If mknod specifies a directory a pseudo-fs is created.
1283          */
1284         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1285                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1286                                     NULL, &nip);
1287         if (error) {
1288                 hammer_done_transaction(&trans);
1289                 *ap->a_vpp = NULL;
1290                 return (error);
1291         }
1292
1293         /*
1294          * Add the new filesystem object to the directory.  This will also
1295          * bump the inode's link count.
1296          */
1297         error = hammer_ip_add_directory(&trans, dip,
1298                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1299                                         nip);
1300
1301         /*
1302          * Finish up.
1303          */
1304         if (error) {
1305                 hammer_rel_inode(nip, 0);
1306                 *ap->a_vpp = NULL;
1307         } else {
1308                 error = hammer_get_vnode(nip, ap->a_vpp);
1309                 hammer_rel_inode(nip, 0);
1310                 if (error == 0) {
1311                         cache_setunresolved(ap->a_nch);
1312                         cache_setvp(ap->a_nch, *ap->a_vpp);
1313                 }
1314         }
1315         hammer_done_transaction(&trans);
1316         if (error == 0)
1317                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1318         return (error);
1319 }
1320
1321 /*
1322  * hammer_vop_open { vp, mode, cred, fp }
1323  */
1324 static
1325 int
1326 hammer_vop_open(struct vop_open_args *ap)
1327 {
1328         hammer_inode_t ip;
1329
1330         ++hammer_stats_file_iopsr;
1331         ip = VTOI(ap->a_vp);
1332
1333         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1334                 return (EROFS);
1335         return(vop_stdopen(ap));
1336 }
1337
1338 /*
1339  * hammer_vop_print { vp }
1340  */
1341 static
1342 int
1343 hammer_vop_print(struct vop_print_args *ap)
1344 {
1345         return EOPNOTSUPP;
1346 }
1347
/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 *
 * Emit "." and ".." as artificial entries, then scan the directory's
 * B-Tree records.  Directory hash keys double as the 64-bit seek
 * offsets reported through uio_offset and the cookie array.
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * Size the cookie array from the caller's buffer, capped at 1024
	 * entries.  cookie_index counts cookies actually emitted.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less then that to represent our 'special' key space.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/* a zero parent_obj_id means ".." points at ourselves */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      hammer_dir_localization(ip);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;	/* resume where we left off */

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/* ENOENT from the cursor simply means end-of-directory */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			/* caller takes ownership of the cookie array */
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
1502
/*
 * hammer_vop_readlink { vp, uio, cred }
 *
 * Copy a symlink target to the caller.  Short targets are stored
 * inline in the inode data; long targets live in a separate
 * HAMMER_RECTYPE_FIX record keyed by HAMMER_FIXKEY_SYMLINK.
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			/* extract the 5-digit PFS id following "@@PFS" */
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				/*
				 * Slaves expand to their last synced TID,
				 * masters to "-1" (most recent).  The sizes
				 * must match what getattr reports.
				 */
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* vap->va_size == 26 */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					/* vap->va_size == 10 */
					ksnprintf(buf, sizeof(buf),
						  "@@-1:%05d",
						  localization >> 16);
#if 0
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)HAMMER_MAX_TID,
						  localization >> 16);
#endif
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version: look up the out-of-band symlink record.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
1612
1613 /*
1614  * hammer_vop_nremove { nch, dvp, cred }
1615  */
1616 static
1617 int
1618 hammer_vop_nremove(struct vop_nremove_args *ap)
1619 {
1620         struct hammer_transaction trans;
1621         struct hammer_inode *dip;
1622         int error;
1623
1624         dip = VTOI(ap->a_dvp);
1625
1626         if (hammer_nohistory(dip) == 0 &&
1627             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1628                 return (error);
1629         }
1630
1631         hammer_start_transaction(&trans, dip->hmp);
1632         ++hammer_stats_file_iopsw;
1633         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1634         hammer_done_transaction(&trans);
1635         if (error == 0)
1636                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1637         return (error);
1638 }
1639
/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 *
 * Rename the object named by fnch in directory fdvp to the name tnch
 * in directory tdvp, replacing any existing target entry.  All objects
 * involved must be on the same mount and within the same PFS
 * (localization), otherwise EXDEV is returned so upper layers can fall
 * back to a copy.
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;	/* source directory */
	struct hammer_inode *tdip;	/* target directory */
	struct hammer_inode *ip;	/* object being renamed */
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * Cross-mount renames are not supported.
	 */
	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	/*
	 * Renames crossing pseudo-filesystem boundaries (different
	 * localizations) are treated like cross-device renames.
	 */
	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	/*
	 * Neither directory nor the object itself may be read-only,
	 * and creating the new directory entry requires free space.
	 */
	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 *
	 * ENOENT from the unlink simply means the target name did not
	 * exist, which is fine for a rename.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			ip->ino_data.ctime = trans.time;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      hammer_dir_localization(fdip);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicy for the purposes
	 * of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		if (ip->vp)
			hammer_knote(ip->vp, NOTE_RENAME);
	}

failed:
	hammer_done_transaction(&trans);
	return (error);
}
1791
1792 /*
1793  * hammer_vop_nrmdir { nch, dvp, cred }
1794  */
1795 static
1796 int
1797 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1798 {
1799         struct hammer_transaction trans;
1800         struct hammer_inode *dip;
1801         int error;
1802
1803         dip = VTOI(ap->a_dvp);
1804
1805         if (hammer_nohistory(dip) == 0 &&
1806             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1807                 return (error);
1808         }
1809
1810         hammer_start_transaction(&trans, dip->hmp);
1811         ++hammer_stats_file_iopsw;
1812         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
1813         hammer_done_transaction(&trans);
1814         if (error == 0)
1815                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1816         return (error);
1817 }
1818
1819 /*
1820  * hammer_vop_markatime { vp, cred }
1821  */
1822 static
1823 int
1824 hammer_vop_markatime(struct vop_markatime_args *ap)
1825 {
1826         struct hammer_transaction trans;
1827         struct hammer_inode *ip;
1828
1829         ip = VTOI(ap->a_vp);
1830         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1831                 return (EROFS);
1832         if (ip->flags & HAMMER_INODE_RO)
1833                 return (EROFS);
1834         if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
1835                 return (0);
1836         hammer_start_transaction(&trans, ip->hmp);
1837         ++hammer_stats_file_iopsw;
1838
1839         ip->ino_data.atime = trans.time;
1840         hammer_modify_inode(ip, HAMMER_INODE_ATIME);
1841         hammer_done_transaction(&trans);
1842         hammer_knote(ap->a_vp, NOTE_ATTRIB);
1843         return (0);
1844 }
1845
/*
 * hammer_vop_setattr { vp, vap, cred }
 *
 * Apply the attributes in *vap which are not VNOVAL to the inode:
 * file flags (uflags), uid/gid, size (truncate/extend), atime/mtime
 * and mode.  Changes accumulate in modflags and are committed to the
 * in-memory inode via hammer_modify_inode() at 'done'; kqueue events
 * accumulate in kflags and are posted on the way out.
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;		/* HAMMER_INODE_* dirty bits to apply */
	int error;
	int truncating;		/* 1 if shrinking a VREG file */
	int blksize;
	int kflags;		/* kqueue NOTE_* events to post */
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	/*
	 * With history retention the attribute change may consume media
	 * space, so verify free space before starting.
	 */
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	/*
	 * A flags change is processed exclusively: after handling it we
	 * jump to 'done' without looking at the other attributes.
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	/* Immutable/append-only files reject all other attribute changes */
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		/*
		 * vop_helper_chown() handles permission checks and may
		 * also clear setuid/setgid bits via cur_mode.
		 */
		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			kflags |= NOTE_ATTRIB;
		}
	}
	/*
	 * Size change.  The 'while' is a one-shot breakable block (note
	 * the unconditional break at the bottom), not a real loop.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX break atomicy, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			blksize = hammer_blocksize(vap->va_size);
			if (vap->va_size < ip->ino_data.size) {
				vtruncbuf(ap->a_vp, vap->va_size, blksize);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.  trunc_off only ever
			 * moves downward; a larger pending offset is
			 * superseded, a smaller one is kept.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate1 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate2 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate3 %016llx (ignored)\n",
						(long long)vap->va_size);
#endif
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				aligned_size -= blksize;

				offset = (int)vap->va_size & (blksize - 1);
				error = bread(ap->a_vp, aligned_size,
					      blksize, &bp);
				hammer_ip_frontend_trunc(ip, aligned_size);
				if (error == 0) {
					bzero(bp->b_data + offset,
					      blksize - offset);
					/* must de-cache direct-io offset */
					bp->b_bio2.bio_offset = NOOFFSET;
					bdwrite(bp);
				} else {
					kprintf("ERROR %d\n", error);
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			/* Same trunc_off ratcheting as the VREG case */
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t   cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			ip->ino_data.ctime = trans.time;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}
2070
2071 /*
2072  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2073  */
2074 static
2075 int
2076 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2077 {
2078         struct hammer_transaction trans;
2079         struct hammer_inode *dip;
2080         struct hammer_inode *nip;
2081         struct nchandle *nch;
2082         hammer_record_t record;
2083         int error;
2084         int bytes;
2085
2086         ap->a_vap->va_type = VLNK;
2087
2088         nch = ap->a_nch;
2089         dip = VTOI(ap->a_dvp);
2090
2091         if (dip->flags & HAMMER_INODE_RO)
2092                 return (EROFS);
2093         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
2094                 return (error);
2095
2096         /*
2097          * Create a transaction to cover the operations we perform.
2098          */
2099         hammer_start_transaction(&trans, dip->hmp);
2100         ++hammer_stats_file_iopsw;
2101
2102         /*
2103          * Create a new filesystem object of the requested type.  The
2104          * returned inode will be referenced but not locked.
2105          */
2106
2107         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2108                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2109                                     NULL, &nip);
2110         if (error) {
2111                 hammer_done_transaction(&trans);
2112                 *ap->a_vpp = NULL;
2113                 return (error);
2114         }
2115
2116         /*
2117          * Add a record representing the symlink.  symlink stores the link
2118          * as pure data, not a string, and is no \0 terminated.
2119          */
2120         if (error == 0) {
2121                 bytes = strlen(ap->a_target);
2122
2123                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2124                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2125                 } else {
2126                         record = hammer_alloc_mem_record(nip, bytes);
2127                         record->type = HAMMER_MEM_RECORD_GENERAL;
2128
2129                         record->leaf.base.localization = nip->obj_localization +
2130                                                          HAMMER_LOCALIZE_MISC;
2131                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2132                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2133                         record->leaf.data_len = bytes;
2134                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2135                         bcopy(ap->a_target, record->data->symlink.name, bytes);
2136                         error = hammer_ip_add_record(&trans, record);
2137                 }
2138
2139                 /*
2140                  * Set the file size to the length of the link.
2141                  */
2142                 if (error == 0) {
2143                         nip->ino_data.size = bytes;
2144                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
2145                 }
2146         }
2147         if (error == 0)
2148                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2149                                                 nch->ncp->nc_nlen, nip);
2150
2151         /*
2152          * Finish up.
2153          */
2154         if (error) {
2155                 hammer_rel_inode(nip, 0);
2156                 *ap->a_vpp = NULL;
2157         } else {
2158                 error = hammer_get_vnode(nip, ap->a_vpp);
2159                 hammer_rel_inode(nip, 0);
2160                 if (error == 0) {
2161                         cache_setunresolved(ap->a_nch);
2162                         cache_setvp(ap->a_nch, *ap->a_vpp);
2163                         hammer_knote(ap->a_dvp, NOTE_WRITE);
2164                 }
2165         }
2166         hammer_done_transaction(&trans);
2167         return (error);
2168 }
2169
2170 /*
2171  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2172  */
2173 static
2174 int
2175 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2176 {
2177         struct hammer_transaction trans;
2178         struct hammer_inode *dip;
2179         int error;
2180
2181         dip = VTOI(ap->a_dvp);
2182
2183         if (hammer_nohistory(dip) == 0 &&
2184             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2185                 return (error);
2186         }
2187
2188         hammer_start_transaction(&trans, dip->hmp);
2189         ++hammer_stats_file_iopsw;
2190         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2191                                 ap->a_cred, ap->a_flags, -1);
2192         hammer_done_transaction(&trans);
2193
2194         return (error);
2195 }
2196
2197 /*
2198  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2199  */
2200 static
2201 int
2202 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2203 {
2204         struct hammer_inode *ip = ap->a_vp->v_data;
2205
2206         ++hammer_stats_file_iopsr;
2207         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2208                             ap->a_fflag, ap->a_cred));
2209 }
2210
/*
 * hammer_vop_mountctl { op, ctl, ctllen, buf, buflen, res }
 *
 * Mount-level control operations: NFS export configuration and
 * reporting of mount flags, including the HAMMER-specific ones
 * (nohistory, master).  Everything else falls through to the
 * standard handler.
 */
static
int
hammer_vop_mountctl(struct vop_mountctl_args *ap)
{
	/* Table mapping HAMMER-private hflags to their option names */
	static const struct mountctl_opt extraopt[] = {
		{ HMNT_NOHISTORY,       "nohistory" },
		{ HMNT_MASTERID,        "master" },
		{ 0, NULL}

	};
	struct hammer_mount *hmp;
	struct mount *mp;
	int usedbytes;
	int error;
	char *pos;

	error = 0;
	usedbytes = 0;
	mp = ap->a_head.a_ops->head.vv_mount;
	KKASSERT(mp->mnt_data != NULL);
	hmp = (struct hammer_mount *)mp->mnt_data;

	switch(ap->a_op) {

	case MOUNTCTL_SET_EXPORT:
		if (ap->a_ctllen != sizeof(struct export_args))
			error = EINVAL;
		else
			error = hammer_vfs_export(mp, ap->a_op,
				      (const struct export_args *)ap->a_ctl);
		break;
	case MOUNTCTL_MOUNTFLAGS:
	{
		/*
		 * Call standard mountctl VOP function
		 * so we get user mount flags.
		 */
		error = vop_stdmountctl(ap);
		if (error)
			break;

		usedbytes = *ap->a_res;

		/*
		 * Append the HAMMER-specific flags after the generic
		 * ones, separated by a comma, when the generic pass
		 * produced output and there is room left in the buffer.
		 */
		if (usedbytes && usedbytes < ap->a_buflen) {
			pos = (char *)ap->a_buf + usedbytes;
			*pos++ = ','; /* Overwrite trailing \0 */
			usedbytes++;

			/*
			 * NOTE(review): vfs_flagstostr() is handed
			 * ap->a_buf (the start of the buffer), not pos --
			 * confirm it appends rather than overwriting the
			 * generic flag string.
			 */
			usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf,
						    ap->a_buflen - usedbytes,
						    &error);

			/* Remove trailing comma if no HAMMER flags returned */
			if (usedbytes == *ap->a_res) {
				*pos-- = 0;
				usedbytes--;
			}

		}

		/*
		 * NOTE(review): usedbytes was seeded with *ap->a_res above,
		 * so "+=" appears to count the generic portion twice --
		 * verify against vop_stdmountctl()'s *a_res semantics.
		 */
		*ap->a_res += usedbytes;
		break;
	}
	default:
		error = vop_stdmountctl(ap);
		break;
	}
	return(error);
}
2280
2281 /*
2282  * hammer_vop_strategy { vp, bio }
2283  *
2284  * Strategy call, used for regular file read & write only.  Note that the
2285  * bp may represent a cluster.
2286  *
2287  * To simplify operation and allow better optimizations in the future,
2288  * this code does not make any assumptions with regards to buffer alignment
2289  * or size.
2290  */
2291 static
2292 int
2293 hammer_vop_strategy(struct vop_strategy_args *ap)
2294 {
2295         struct buf *bp;
2296         int error;
2297
2298         bp = ap->a_bio->bio_buf;
2299
2300         switch(bp->b_cmd) {
2301         case BUF_CMD_READ:
2302                 error = hammer_vop_strategy_read(ap);
2303                 break;
2304         case BUF_CMD_WRITE:
2305                 error = hammer_vop_strategy_write(ap);
2306                 break;
2307         default:
2308                 bp->b_error = error = EINVAL;
2309                 bp->b_flags |= B_ERROR;
2310                 biodone(ap->a_bio);
2311                 break;
2312         }
2313         return (error);
2314 }
2315
2316 /*
2317  * Read from a regular file.  Iterate the related records and fill in the
2318  * BIO/BUF.  Gaps are zero-filled.
2319  *
2320  * The support code in hammer_object.c should be used to deal with mixed
2321  * in-memory and on-disk records.
2322  *
2323  * NOTE: Can be called from the cluster code with an oversized buf.
2324  *
2325  * XXX atime update
2326  */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_inode *dip;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;	/* base file offset of the current record */
	int64_t ran_end;	/* end of the logical range being read */
	int64_t tmp64;
	int error;
	int boff;		/* current fill offset within bp->b_data */
	int roff;		/* offset into the current record's data */
	int n;			/* byte count for the current zero/copy */

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 * If it matches we can issue the device-level read immediately and
	 * skip the B-Tree scan entirely.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		error = hammer_io_direct_read(ip->hmp, nbio, NULL);
		return (error);
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record versus our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 *
		 * NOTE: n <= 0 here means the record lies entirely before
		 * our current position; the kprintf flags the unexpected
		 * case where the computed payload size is non-positive.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zerod past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 *
		 * The direct-read path is only taken when the single record
		 * covers the entire buffer and is properly aligned; it hands
		 * the bio off and jumps to done (biodone is not called here).
		 */
		disk_offset = cursor.leaf->data_offset + roff;
		if (boff == 0 && n == bp->b_bufsize &&
		    hammer_cursor_ondisk(&cursor) &&
		    (disk_offset & HAMMER_BUFMASK) == 0) {
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(trans.hmp, nbio,
						      cursor.leaf);
			goto done;
		} else if (n) {
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record; zero-fill the
	 * remainder of the buffer and complete the bio here.
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	/*
	 * Reached by fall-through after biodone() above, or via goto from
	 * the direct-read path (in which case the bio has been handed off).
	 *
	 * Cache the b-tree node for the last data read in cache[1].
	 *
	 * If we hit the file EOF then also cache the node in the
	 * governing directory's cache[3], it will be used to initialize
	 * the inode's cache[1] for any inodes looked up via the directory.
	 *
	 * This doesn't reduce disk accesses since the B-Tree chain is
	 * likely cached, but it does reduce cpu overhead when looking
	 * up file offsets for cpdup/tar/cpio style iterations.
	 */
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	if (ran_end >= ip->ino_data.size) {
		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
					ip->obj_asof, ip->obj_localization);
		if (dip) {
			hammer_cache_node(&dip->cache[3], cursor.node);
			hammer_rel_inode(dip, 0);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
2567
2568 /*
2569  * BMAP operation - used to support cluster_read() only.
2570  *
2571  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2572  *
 * This routine may return EOPNOTSUPP if the operation is not supported for
2574  * the specified offset.  The contents of the pointer arguments do not
2575  * need to be initialized in that case. 
2576  *
2577  * If a disk address is available and properly aligned return 0 with 
2578  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2579  * to the run-length relative to that offset.  Callers may assume that
2580  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2581  * large, so return EOPNOTSUPP if it is not sufficiently large.
2582  */
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;		/* logical offset of current record */
	int64_t ran_end;		/* end of logical scan range */
	int64_t tmp64;
	int64_t base_offset;		/* logical start of contiguous run */
	int64_t base_disk_offset;	/* disk start of contiguous run */
	int64_t last_offset;		/* logical end of contiguous run */
	hammer_off_t last_disk_offset;	/* disk end of contiguous run */
	hammer_off_t disk_offset;
	int	rec_len;
	int	error;
	int	blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n",
		(long long)ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 *
	 * When a backwards run (a_runb) is requested the scan starts up to
	 * MAXPHYS before the requested offset so the run-behind length can
	 * be computed.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len    = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;
		}
		error = hammer_ip_next(&cursor);
	}

#if 0
	kprintf("BMAP %016llx:  %016llx - %016llx\n",
		(long long)ap->a_loffset,
		(long long)base_offset,
		(long long)last_offset);
	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
		(long long)base_disk_offset,
		(long long)last_disk_offset);
#endif

	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n",
			(long long)ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.  Return the translated disk offset and the
		 * forward/backward run lengths relative to a_loffset.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}
2808
2809 /*
2810  * Write to a regular file.   Because this is a strategy call the OS is
2811  * trying to actually get data onto the media.
2812  */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/*
	 * Writes are not allowed on read-only inodes; complete the bio
	 * with EROFS.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end. 
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffers
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 *
	 * The small-file case rounds the reservation up to the next
	 * 16-byte boundary.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);
	if (record) {
		hammer_io_direct_write(hmp, record, bio);
		/* kick the flusher if too many in-memory records built up */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/* reservation failed; fail the bio with the returned error */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	return(error);
}
2896
2897 /*
2898  * dounlink - disconnect a directory entry
2899  *
2900  * XXX whiteout support not really in yet
2901  */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred, 
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * NOTE(review): cred and flags are currently unused in this path.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  The error code on search
	 * termination could be 0, ENOENT, or something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      dip->hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		if (error == ENOENT) {
			/*
			 * The dirent references a missing inode; warn but
			 * proceed to remove the dangling directory entry.
			 */
			kprintf("HAMMER: WARNING: Removing "
				"dirent w/missing inode \"%s\"\n"
				"\tobj_id = %016llx\n",
				ncp->nc_name,
				(long long)cursor.data->entry.obj_id);
			error = 0;
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0 && ip) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * The check directory code can loop and deadlock/retry.  Our
		 * own cursor's node locks must be released to avoid a 3-way
		 * deadlock with the flusher if the check directory code
		 * blocks.
		 *
		 * If any changes whatsoever have been made to the cursor
		 * set EDEADLK and retry.
		 */
		if (error == 0 && ip && ip->ino_data.obj_type ==
					HAMMER_OBJTYPE_DIRECTORY) {
			hammer_unlock_cursor(&cursor);
			error = hammer_ip_check_directory_empty(trans, ip);
			hammer_lock_cursor(&cursor);
			if (cursor.flags & HAMMER_CURSOR_RETEST) {
				kprintf("HAMMER: Warning: avoided deadlock "
					"on rmdir '%s'\n",
					ncp->nc_name);
				error = EDEADLK;
			}
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/* invalidate the namecache entry for the victim */
			cache_setunresolved(nch);
			cache_setvp(nch, NULL);
			/* XXX locking */
			if (ip && ip->vp) {
				hammer_knote(ip->vp, NOTE_DELETE);
				cache_inval_vp(ip->vp, CINV_DESTROY);
			}
		}
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	if (error == EDEADLK)
		goto retry;

	return (error);
}
3063
3064 /************************************************************************
3065  *                          FIFO AND SPECFS OPS                         *
3066  ************************************************************************
3067  *
3068  */
3069
3070 static int
3071 hammer_vop_fifoclose (struct vop_close_args *ap)
3072 {
3073         /* XXX update itimes */
3074         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3075 }
3076
3077 static int
3078 hammer_vop_fiforead (struct vop_read_args *ap)
3079 {
3080         int error;
3081
3082         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3083         /* XXX update access time */
3084         return (error);
3085 }
3086
3087 static int
3088 hammer_vop_fifowrite (struct vop_write_args *ap)
3089 {
3090         int error;
3091
3092         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3093         /* XXX update access time */
3094         return (error);
3095 }
3096
3097 static
3098 int
3099 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3100 {
3101         int error;
3102
3103         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3104         if (error)
3105                 error = hammer_vop_kqfilter(ap);
3106         return(error);
3107 }
3108
3109 /************************************************************************
3110  *                          KQFILTER OPS                                *
3111  ************************************************************************
3112  *
3113  */
static void filt_hammerdetach(struct knote *kn);
static int filt_hammerread(struct knote *kn, long hint);
static int filt_hammerwrite(struct knote *kn, long hint);
static int filt_hammervnode(struct knote *kn, long hint);

/*
 * Filter ops tables for EVFILT_READ, EVFILT_WRITE and EVFILT_VNODE.
 * Positional initializers — presumably { f_isfd, f_attach, f_detach,
 * f_event } per this kernel's sys/event.h; confirm field order there
 * before reordering.  All three share the same detach routine.
 */
static struct filterops hammerread_filtops =
	{ 1, NULL, filt_hammerdetach, filt_hammerread };
static struct filterops hammerwrite_filtops =
	{ 1, NULL, filt_hammerdetach, filt_hammerwrite };
static struct filterops hammervnode_filtops =
	{ 1, NULL, filt_hammerdetach, filt_hammervnode };
3125
3126 static
3127 int
3128 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3129 {
3130         struct vnode *vp = ap->a_vp;
3131         struct knote *kn = ap->a_kn;
3132         lwkt_tokref vlock;
3133
3134         switch (kn->kn_filter) {
3135         case EVFILT_READ:
3136                 kn->kn_fop = &hammerread_filtops;
3137                 break;
3138         case EVFILT_WRITE:
3139                 kn->kn_fop = &hammerwrite_filtops;
3140                 break;
3141         case EVFILT_VNODE:
3142                 kn->kn_fop = &hammervnode_filtops;
3143                 break;
3144         default:
3145                 return (1);
3146         }
3147
3148         kn->kn_hook = (caddr_t)vp;
3149
3150         lwkt_gettoken(&vlock, &vp->v_token);
3151         SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
3152         lwkt_reltoken(&vlock);
3153
3154         return(0);
3155 }
3156
3157 static void
3158 filt_hammerdetach(struct knote *kn)
3159 {
3160         struct vnode *vp = (void *)kn->kn_hook;
3161         lwkt_tokref vlock;
3162
3163         lwkt_gettoken(&vlock, &vp->v_token);
3164         SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
3165                      kn, knote, kn_selnext);
3166         lwkt_reltoken(&vlock);
3167 }
3168
3169 static int
3170 filt_hammerread(struct knote *kn, long hint)
3171 {
3172         struct vnode *vp = (void *)kn->kn_hook;
3173         hammer_inode_t ip = VTOI(vp);
3174
3175         if (hint == NOTE_REVOKE) {
3176                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3177                 return(1);
3178         }
3179         kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
3180         return (kn->kn_data != 0);
3181 }
3182
3183 static int
3184 filt_hammerwrite(struct knote *kn, long hint)
3185 {
3186         if (hint == NOTE_REVOKE)
3187                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3188         kn->kn_data = 0;
3189         return (1);
3190 }
3191
3192 static int
3193 filt_hammervnode(struct knote *kn, long hint)
3194 {
3195         if (kn->kn_sfflags & hint)
3196                 kn->kn_fflags |= hint;
3197         if (hint == NOTE_REVOKE) {
3198                 kn->kn_flags |= EV_EOF;
3199                 return (1);
3200         }
3201         return (kn->kn_fflags != 0);
3202 }
3203