Fix a buffer cache deadlock which can occur when simulated disk devices
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vfs/fifofs/fifo.h>
50 #include "hammer.h"
51
/*
 * Forward declarations for the vnode operation handlers referenced by the
 * operation tables below.  (The original banner here read "USERFS VNOPS".)
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *ap);*/

/* Handlers wired into hammer_vnode_vops (some are shared by the others). */
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);
static int hammer_vop_kqfilter(struct vop_kqfilter_args *ap);

/* Handlers wired into hammer_fifo_vops. */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);
static int hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap);

/* Handlers wired into hammer_spec_vops. */
static int hammer_vop_specclose(struct vop_close_args *ap);
static int hammer_vop_specread(struct vop_read_args *ap);
static int hammer_vop_specwrite(struct vop_write_args *ap);
93
94 struct vop_ops hammer_vnode_vops = {
95         .vop_default =          vop_defaultop,
96         .vop_fsync =            hammer_vop_fsync,
97         .vop_getpages =         vop_stdgetpages,
98         .vop_putpages =         vop_stdputpages,
99         .vop_read =             hammer_vop_read,
100         .vop_write =            hammer_vop_write,
101         .vop_access =           hammer_vop_access,
102         .vop_advlock =          hammer_vop_advlock,
103         .vop_close =            hammer_vop_close,
104         .vop_ncreate =          hammer_vop_ncreate,
105         .vop_getattr =          hammer_vop_getattr,
106         .vop_inactive =         hammer_vop_inactive,
107         .vop_reclaim =          hammer_vop_reclaim,
108         .vop_nresolve =         hammer_vop_nresolve,
109         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
110         .vop_nlink =            hammer_vop_nlink,
111         .vop_nmkdir =           hammer_vop_nmkdir,
112         .vop_nmknod =           hammer_vop_nmknod,
113         .vop_open =             hammer_vop_open,
114         .vop_pathconf =         vop_stdpathconf,
115         .vop_print =            hammer_vop_print,
116         .vop_readdir =          hammer_vop_readdir,
117         .vop_readlink =         hammer_vop_readlink,
118         .vop_nremove =          hammer_vop_nremove,
119         .vop_nrename =          hammer_vop_nrename,
120         .vop_nrmdir =           hammer_vop_nrmdir,
121         .vop_setattr =          hammer_vop_setattr,
122         .vop_bmap =             hammer_vop_bmap,
123         .vop_strategy =         hammer_vop_strategy,
124         .vop_nsymlink =         hammer_vop_nsymlink,
125         .vop_nwhiteout =        hammer_vop_nwhiteout,
126         .vop_ioctl =            hammer_vop_ioctl,
127         .vop_mountctl =         hammer_vop_mountctl,
128         .vop_kqfilter =         hammer_vop_kqfilter
129 };
130
131 struct vop_ops hammer_spec_vops = {
132         .vop_default =          spec_vnoperate,
133         .vop_fsync =            hammer_vop_fsync,
134         .vop_read =             hammer_vop_specread,
135         .vop_write =            hammer_vop_specwrite,
136         .vop_access =           hammer_vop_access,
137         .vop_close =            hammer_vop_specclose,
138         .vop_getattr =          hammer_vop_getattr,
139         .vop_inactive =         hammer_vop_inactive,
140         .vop_reclaim =          hammer_vop_reclaim,
141         .vop_setattr =          hammer_vop_setattr
142 };
143
144 struct vop_ops hammer_fifo_vops = {
145         .vop_default =          fifo_vnoperate,
146         .vop_fsync =            hammer_vop_fsync,
147         .vop_read =             hammer_vop_fiforead,
148         .vop_write =            hammer_vop_fifowrite,
149         .vop_access =           hammer_vop_access,
150         .vop_close =            hammer_vop_fifoclose,
151         .vop_getattr =          hammer_vop_getattr,
152         .vop_inactive =         hammer_vop_inactive,
153         .vop_reclaim =          hammer_vop_reclaim,
154         .vop_setattr =          hammer_vop_setattr,
155         .vop_kqfilter =         hammer_vop_fifokqfilter
156 };
157
158 static __inline
159 void
160 hammer_knote(struct vnode *vp, int flags)
161 {
162         if (flags)
163                 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
164 }
165
166 #ifdef DEBUG_TRUNCATE
167 struct hammer_inode *HammerTruncIp;
168 #endif
169
170 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
171                            struct vnode *dvp, struct ucred *cred,
172                            int flags, int isdir);
173 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
174 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
175
#if 0
/*
 * Generic entry point that forwards the call through hammer_vnode_vops.
 * Compiled out; kept for reference only.  The parameter is named so the
 * body's use of 'ap' is valid if the block is ever re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
184
185 /*
186  * hammer_vop_fsync { vp, waitfor }
187  *
188  * fsync() an inode to disk and wait for it to be completely committed
189  * such that the information would not be undone if a crash occured after
190  * return.
191  */
192 static
193 int
194 hammer_vop_fsync(struct vop_fsync_args *ap)
195 {
196         hammer_inode_t ip = VTOI(ap->a_vp);
197
198         ++hammer_count_fsyncs;
199         vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
200         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
201         if (ap->a_waitfor == MNT_WAIT) {
202                 vn_unlock(ap->a_vp);
203                 hammer_wait_inode(ip);
204                 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
205         }
206         return (ip->error);
207 }
208
209 /*
210  * hammer_vop_read { vp, uio, ioflag, cred }
211  */
212 static
213 int
214 hammer_vop_read(struct vop_read_args *ap)
215 {
216         struct hammer_transaction trans;
217         hammer_inode_t ip;
218         off_t offset;
219         struct buf *bp;
220         struct uio *uio;
221         int error;
222         int n;
223         int seqcount;
224         int ioseqcount;
225         int blksize;
226
227         if (ap->a_vp->v_type != VREG)
228                 return (EINVAL);
229         ip = VTOI(ap->a_vp);
230         error = 0;
231         uio = ap->a_uio;
232
233         /*
234          * Allow the UIO's size to override the sequential heuristic.
235          */
236         blksize = hammer_blocksize(uio->uio_offset);
237         seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
238         ioseqcount = ap->a_ioflag >> 16;
239         if (seqcount < ioseqcount)
240                 seqcount = ioseqcount;
241
242         hammer_start_transaction(&trans, ip->hmp);
243
244         /*
245          * Access the data typically in HAMMER_BUFSIZE blocks via the
246          * buffer cache, but HAMMER may use a variable block size based
247          * on the offset.
248          */
249         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
250                 int64_t base_offset;
251                 int64_t file_limit;
252
253                 blksize = hammer_blocksize(uio->uio_offset);
254                 offset = (int)uio->uio_offset & (blksize - 1);
255                 base_offset = uio->uio_offset - offset;
256
257                 if (hammer_cluster_enable) {
258                         /*
259                          * Use file_limit to prevent cluster_read() from
260                          * creating buffers of the wrong block size past
261                          * the demarc.
262                          */
263                         file_limit = ip->ino_data.size;
264                         if (base_offset < HAMMER_XDEMARC &&
265                             file_limit > HAMMER_XDEMARC) {
266                                 file_limit = HAMMER_XDEMARC;
267                         }
268                         error = cluster_read(ap->a_vp,
269                                              file_limit, base_offset,
270                                              blksize, MAXPHYS,
271                                              seqcount, &bp);
272                 } else {
273                         error = bread(ap->a_vp, base_offset, blksize, &bp);
274                 }
275                 if (error) {
276                         kprintf("error %d\n", error);
277                         brelse(bp);
278                         break;
279                 }
280
281                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
282                 n = blksize - offset;
283                 if (n > uio->uio_resid)
284                         n = uio->uio_resid;
285                 if (n > ip->ino_data.size - uio->uio_offset)
286                         n = (int)(ip->ino_data.size - uio->uio_offset);
287                 error = uiomove((char *)bp->b_data + offset, n, uio);
288
289                 /* data has a lower priority then meta-data */
290                 bp->b_flags |= B_AGE;
291                 bqrelse(bp);
292                 if (error)
293                         break;
294                 hammer_stats_file_read += n;
295         }
296         if ((ip->flags & HAMMER_INODE_RO) == 0 &&
297             (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
298                 ip->ino_data.atime = trans.time;
299                 hammer_modify_inode(ip, HAMMER_INODE_ATIME);
300         }
301         hammer_done_transaction(&trans);
302         return (error);
303 }
304
305 /*
306  * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Copy the caller's UIO into the vnode's buffer cache buffers one block
 * at a time, growing the inode size when the write extends the file and
 * marking the inode modified after every block.  Only VREG vnodes are
 * accepted (EINVAL otherwise) and read-only inodes are rejected with
 * EROFS.  Returns 0 or the first error hit; the loop stops at that error.
 * A kqueue note built from the accumulated kflags is posted on exit.
 *
 * NOTE(review): the EFBIG range check relies on the signed sum
 * uio_offset + uio_resid becoming non-positive when it overflows.  Signed
 * overflow is undefined in ISO C, which may be what the "GCC-4
 * optimization bug" remark below is about -- confirm before changing it.
307  */
308 static
309 int
310 hammer_vop_write(struct vop_write_args *ap)
311 {
312         struct hammer_transaction trans;
313         struct hammer_inode *ip;
314         hammer_mount_t hmp;
315         struct uio *uio;
316         int offset;
317         off_t base_offset;
318         struct buf *bp;
319         int kflags;
320         int error;
321         int n;
322         int flags;
323         int delta;
324         int seqcount;
325
326         if (ap->a_vp->v_type != VREG)
327                 return (EINVAL);
328         ip = VTOI(ap->a_vp);
329         hmp = ip->hmp;
330         error = 0;
331         kflags = 0;
        /* NOTE(review): seqcount is assigned here but not read again in this function. */
332         seqcount = ap->a_ioflag >> 16;
333
        /* Checked before the transaction starts, so no cleanup is needed. */
334         if (ip->flags & HAMMER_INODE_RO)
335                 return (EROFS);
336
337         /*
338          * Create a transaction to cover the operations we perform.
339          */
340         hammer_start_transaction(&trans, hmp);
341         uio = ap->a_uio;
342
343         /*
344          * Check append mode
         *
         * IO_APPEND forces the write to start at the current file size.
345          */
346         if (ap->a_ioflag & IO_APPEND)
347                 uio->uio_offset = ip->ino_data.size;
348
349         /*
350          * Check for illegal write offsets.  Valid range is 0...2^63-1.
351          *
352          * NOTE: the base_off assignment is required to work around what
353          * I consider to be a GCC-4 optimization bug.
         *
         * Both failure paths must end the transaction started above.
354          */
355         if (uio->uio_offset < 0) {
356                 hammer_done_transaction(&trans);
357                 return (EFBIG);
358         }
359         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
360         if (uio->uio_resid > 0 && base_offset <= 0) {
361                 hammer_done_transaction(&trans);
362                 return (EFBIG);
363         }
364
365         /*
366          * Access the data typically in HAMMER_BUFSIZE blocks via the
367          * buffer cache, but HAMMER may use a variable block size based
368          * on the offset.
369          */
370         while (uio->uio_resid > 0) {
                /* fixsize: set when the VM object was grown for this block */
371                 int fixsize = 0;
372                 int blksize;
373                 int blkmask;
374
                /* Stop with the error from hammer_checkspace() if it fails. */
375                 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
376                         break;
377
378                 blksize = hammer_blocksize(uio->uio_offset);
379
380                 /*
381                  * Do not allow HAMMER to blow out the buffer cache.  Very
382                  * large UIOs can lockout other processes due to bwillwrite()
383                  * mechanics.
384                  *
385                  * The hammer inode is not locked during these operations.
386                  * The vnode is locked which can interfere with the pageout
387                  * daemon for non-UIO_NOCOPY writes but should not interfere
388                  * with the buffer cache.  Even so, we cannot afford to
389                  * allow the pageout daemon to build up too many dirty buffer
390                  * cache buffers.
391                  *
392                  * Only call this if we aren't being recursively called from
393                  * a virtual disk device (vn), else we may deadlock.
394                  */
395                 if ((ap->a_ioflag & IO_RECURSE) == 0)
396                         bwillwrite(blksize);
397
398                 /*
399                  * Do not allow HAMMER to blow out system memory by
400                  * accumulating too many records.   Records are so well
401                  * decoupled from the buffer cache that it is possible
402                  * for userland to push data out to the media via
403                  * direct-write, but build up the records queued to the
404                  * backend faster than the backend can flush them out.
405                  * HAMMER has hit its write limit but the frontend has
406                  * no pushback to slow it down.
407                  */
408                 if (hmp->rsv_recs > hammer_limit_recs / 2) {
409                         /*
410                          * Get the inode on the flush list
411                          */
412                         if (ip->rsv_recs >= 64)
413                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
414                         else if (ip->rsv_recs >= 16)
415                                 hammer_flush_inode(ip, 0);
416
417                         /*
418                          * Keep the flusher going if the system keeps
419                          * queueing records.
420                          */
421                         delta = hmp->count_newrecords -
422                                 hmp->last_newrecords;
423                         if (delta < 0 || delta > hammer_limit_recs / 2) {
424                                 hmp->last_newrecords = hmp->count_newrecords;
425                                 hammer_sync_hmp(hmp, MNT_NOWAIT);
426                         }
427
428                         /*
429                          * If we have gotten behind start slowing
430                          * down the writers.
                         *
                         * The sleep length scales with how far rsv_recs
                         * exceeds hammer_limit_recs; there is no sleep
                         * while the computed delta is not positive.
431                          */
432                         delta = (hmp->rsv_recs - hammer_limit_recs) *
433                                 hz / hammer_limit_recs;
434                         if (delta > 0)
435                                 tsleep(&trans, 0, "hmrslo", delta);
436                 }
437
438                 /*
439                  * Calculate the blocksize at the current offset and figure
440                  * out how much we can actually write.
                 *
                 * offset is the position inside the block, base_offset
                 * the block-aligned file offset, and n the byte count for
                 * this pass.  A write reaching past the current size
                 * grows the VM object first and is flagged via fixsize so
                 * the error path can undo it.
441                  */
442                 blkmask = blksize - 1;
443                 offset = (int)uio->uio_offset & blkmask;
444                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
445                 n = blksize - offset;
446                 if (n > uio->uio_resid)
447                         n = uio->uio_resid;
448                 if (uio->uio_offset + n > ip->ino_data.size) {
449                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
450                         fixsize = 1;
451                         kflags |= NOTE_EXTEND;
452                 }
453
454                 if (uio->uio_segflg == UIO_NOCOPY) {
455                         /*
456                          * Issuing a write with the same data backing the
457                          * buffer.  Instantiate the buffer to collect the
458                          * backing vm pages, then read-in any missing bits.
459                          *
460                          * This case is used by vop_stdputpages().
461                          */
462                         bp = getblk(ap->a_vp, base_offset,
463                                     blksize, GETBLK_BHEAVY, 0);
464                         if ((bp->b_flags & B_CACHE) == 0) {
465                                 bqrelse(bp);
466                                 error = bread(ap->a_vp, base_offset,
467                                               blksize, &bp);
468                         }
469                 } else if (offset == 0 && uio->uio_resid >= blksize) {
470                         /*
471                          * Even though we are entirely overwriting the buffer
472                          * we may still have to zero it out to avoid a
473                          * mmap/write visibility issue.
474                          */
475                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
476                         if ((bp->b_flags & B_CACHE) == 0)
477                                 vfs_bio_clrbuf(bp);
478                 } else if (base_offset >= ip->ino_data.size) {
479                         /*
480                          * If the base offset of the buffer is beyond the
481                          * file EOF, we don't have to issue a read.
482                          */
483                         bp = getblk(ap->a_vp, base_offset,
484                                     blksize, GETBLK_BHEAVY, 0);
485                         vfs_bio_clrbuf(bp);
486                 } else {
487                         /*
488                          * Partial overwrite, read in any missing bits then
489                          * replace the portion being written.
490                          */
491                         error = bread(ap->a_vp, base_offset, blksize, &bp);
492                         if (error == 0)
493                                 bheavy(bp);
494                 }
                /* Copy the caller's data in only if the buffer was obtained cleanly. */
495                 if (error == 0) {
496                         error = uiomove((char *)bp->b_data + offset,
497                                         n, uio);
498                 }
499
500                 /*
501                  * If we screwed up we have to undo any VM size changes we
502                  * made.
                 *
                 * The buffer is released in every error case; the VM
                 * object is truncated back to the inode's recorded size
                 * only when it was grown above.
503                  */
504                 if (error) {
505                         brelse(bp);
506                         if (fixsize) {
507                                 vtruncbuf(ap->a_vp, ip->ino_data.size,
508                                           hammer_blocksize(ip->ino_data.size));
509                         }
510                         break;
511                 }
512                 kflags |= NOTE_WRITE;
513                 hammer_stats_file_write += n;
514                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                /*
                 * uio_offset has been advanced by uiomove(); extend the
                 * inode size if the write went past it.
                 */
515                 if (ip->ino_data.size < uio->uio_offset) {
516                         ip->ino_data.size = uio->uio_offset;
517                         flags = HAMMER_INODE_DDIRTY;
518                         vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
519                 } else {
520                         flags = 0;
521                 }
522                 ip->ino_data.mtime = trans.time;
523                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
524                 hammer_modify_inode(ip, flags);
525
526                 /*
527                  * Once we dirty the buffer any cached zone-X offset
528                  * becomes invalid.  HAMMER NOTE: no-history mode cannot
529                  * allow overwriting over the same data sector unless
530                  * we provide UNDOs for the old data, which we don't.
531                  */
532                 bp->b_bio2.bio_offset = NOOFFSET;
533
534                 /*
535                  * Final buffer disposition.
                 *
                 * IO_SYNC uses bwrite(), IO_DIRECT uses bawrite() and
                 * everything else uses bdwrite().
536                  */
537                 bp->b_flags |= B_AGE;
538                 if (ap->a_ioflag & IO_SYNC) {
539                         bwrite(bp);
540                 } else if (ap->a_ioflag & IO_DIRECT) {
541                         bawrite(bp);
542                 } else {
543                         bdwrite(bp);
544                 }
545         }
546         hammer_done_transaction(&trans);
547         hammer_knote(ap->a_vp, kflags);
548         return (error);
549 }
550
551 /*
552  * hammer_vop_access { vp, mode, cred }
553  */
554 static
555 int
556 hammer_vop_access(struct vop_access_args *ap)
557 {
558         struct hammer_inode *ip = VTOI(ap->a_vp);
559         uid_t uid;
560         gid_t gid;
561         int error;
562
563         ++hammer_stats_file_iopsr;
564         uid = hammer_to_unix_xid(&ip->ino_data.uid);
565         gid = hammer_to_unix_xid(&ip->ino_data.gid);
566
567         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
568                                   ip->ino_data.uflags);
569         return (error);
570 }
571
572 /*
573  * hammer_vop_advlock { vp, id, op, fl, flags }
574  */
575 static
576 int
577 hammer_vop_advlock(struct vop_advlock_args *ap)
578 {
579         hammer_inode_t ip = VTOI(ap->a_vp);
580
581         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
582 }
583
/*
 * hammer_vop_close { vp, fflag }
 *
 * No HAMMER-specific work is done on close; the call goes straight to
 * vop_stdclose().
 */
static int
hammer_vop_close(struct vop_close_args *ap)
{
        int error;

        error = vop_stdclose(ap);
        return (error);
}
594
595 /*
596  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
597  *
598  * The operating system has already ensured that the directory entry
599  * does not exist and done all appropriate namespace locking.
600  */
601 static
602 int
603 hammer_vop_ncreate(struct vop_ncreate_args *ap)
604 {
605         struct hammer_transaction trans;
606         struct hammer_inode *dip;
607         struct hammer_inode *nip;
608         struct nchandle *nch;
609         int error;
610
611         nch = ap->a_nch;
612         dip = VTOI(ap->a_dvp);
613
614         if (dip->flags & HAMMER_INODE_RO)
615                 return (EROFS);
616         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
617                 return (error);
618
619         /*
620          * Create a transaction to cover the operations we perform.
621          */
622         hammer_start_transaction(&trans, dip->hmp);
623         ++hammer_stats_file_iopsw;
624
625         /*
626          * Create a new filesystem object of the requested type.  The
627          * returned inode will be referenced and shared-locked to prevent
628          * it from being moved to the flusher.
629          */
630
631         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
632                                     dip, NULL, &nip);
633         if (error) {
634                 hkprintf("hammer_create_inode error %d\n", error);
635                 hammer_done_transaction(&trans);
636                 *ap->a_vpp = NULL;
637                 return (error);
638         }
639
640         /*
641          * Add the new filesystem object to the directory.  This will also
642          * bump the inode's link count.
643          */
644         error = hammer_ip_add_directory(&trans, dip,
645                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
646                                         nip);
647         if (error)
648                 hkprintf("hammer_ip_add_directory error %d\n", error);
649
650         /*
651          * Finish up.
652          */
653         if (error) {
654                 hammer_rel_inode(nip, 0);
655                 hammer_done_transaction(&trans);
656                 *ap->a_vpp = NULL;
657         } else {
658                 error = hammer_get_vnode(nip, ap->a_vpp);
659                 hammer_done_transaction(&trans);
660                 hammer_rel_inode(nip, 0);
661                 if (error == 0) {
662                         cache_setunresolved(ap->a_nch);
663                         cache_setvp(ap->a_nch, *ap->a_vpp);
664                 }
665                 hammer_knote(ap->a_dvp, NOTE_WRITE);
666         }
667         return (error);
668 }
669
670 /*
671  * hammer_vop_getattr { vp, vap }
672  *
673  * Retrieve an inode's attribute information.  When accessing inodes
674  * historically we fake the atime field to ensure consistent results.
675  * The atime field is stored in the B-Tree element and allowed to be
676  * updated without cycling the element.
677  */
678 static
679 int
680 hammer_vop_getattr(struct vop_getattr_args *ap)
681 {
682         struct hammer_inode *ip = VTOI(ap->a_vp);
683         struct vattr *vap = ap->a_vap;
684
685         /*
686          * We want the fsid to be different when accessing a filesystem
687          * with different as-of's so programs like diff don't think
688          * the files are the same.
689          *
690          * We also want the fsid to be the same when comparing snapshots,
691          * or when comparing mirrors (which might be backed by different
692          * physical devices).  HAMMER fsids are based on the PFS's
693          * shared_uuid field.
694          *
695          * XXX there is a chance of collision here.  The va_fsid reported
696          * by stat is different from the more involved fsid used in the
697          * mount structure.
698          */
699         ++hammer_stats_file_iopsr;
700         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
701                        (u_int32_t)(ip->obj_asof >> 32);
702
703         vap->va_fileid = ip->ino_leaf.base.obj_id;
704         vap->va_mode = ip->ino_data.mode;
705         vap->va_nlink = ip->ino_data.nlinks;
706         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
707         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
708         vap->va_rmajor = 0;
709         vap->va_rminor = 0;
710         vap->va_size = ip->ino_data.size;
711
712         /*
713          * Special case for @@PFS softlinks.  The actual size of the
714          * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
715          */
716         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
717             ip->ino_data.size == 10 &&
718             ip->obj_asof == HAMMER_MAX_TID &&
719             ip->obj_localization == 0 &&
720             strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
721                     vap->va_size = 26;
722         }
723
724         /*
725          * We must provide a consistent atime and mtime for snapshots
726          * so people can do a 'tar cf - ... | md5' on them and get
727          * consistent results.
728          */
729         if (ip->flags & HAMMER_INODE_RO) {
730                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
731                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
732         } else {
733                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
734                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
735         }
736         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
737         vap->va_flags = ip->ino_data.uflags;
738         vap->va_gen = 1;        /* hammer inums are unique for all time */
739         vap->va_blocksize = HAMMER_BUFSIZE;
740         if (ip->ino_data.size >= HAMMER_XDEMARC) {
741                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
742                                 ~HAMMER_XBUFMASK64;
743         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
744                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
745                                 ~HAMMER_BUFMASK64;
746         } else {
747                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
748         }
749
750         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
751         vap->va_filerev = 0;    /* XXX */
752         /* mtime uniquely identifies any adjustments made to the file XXX */
753         vap->va_fsmid = ip->ino_data.mtime;
754         vap->va_uid_uuid = ip->ino_data.uid;
755         vap->va_gid_uuid = ip->ino_data.gid;
756         vap->va_fsid_uuid = ip->hmp->fsid;
757         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
758                           VA_FSID_UUID_VALID;
759
760         switch (ip->ino_data.obj_type) {
761         case HAMMER_OBJTYPE_CDEV:
762         case HAMMER_OBJTYPE_BDEV:
763                 vap->va_rmajor = ip->ino_data.rmajor;
764                 vap->va_rminor = ip->ino_data.rminor;
765                 break;
766         default:
767                 break;
768         }
769         return(0);
770 }
771
772 /*
773  * hammer_vop_nresolve { nch, dvp, cred }
774  *
775  * Locate the requested directory entry.
776  */
777 static
778 int
779 hammer_vop_nresolve(struct vop_nresolve_args *ap)
780 {
781         struct hammer_transaction trans;
782         struct namecache *ncp;
783         hammer_inode_t dip;
784         hammer_inode_t ip;
785         hammer_tid_t asof;
786         struct hammer_cursor cursor;
787         struct vnode *vp;
788         int64_t namekey;
789         int error;
790         int i;
791         int nlen;
792         int flags;
793         int ispfs;
794         int64_t obj_id;
795         u_int32_t localization;
796         u_int32_t max_iterations;
797
798         /*
799          * Misc initialization, plus handle as-of name extensions.  Look for
800          * the '@@' extension.  Note that as-of files and directories cannot
801          * be modified.
802          */
803         dip = VTOI(ap->a_dvp);
804         ncp = ap->a_nch->ncp;
805         asof = dip->obj_asof;
806         localization = dip->obj_localization;   /* for code consistency */
807         nlen = ncp->nc_nlen;
808         flags = dip->flags & HAMMER_INODE_RO;
809         ispfs = 0;
810
811         hammer_simple_transaction(&trans, dip->hmp);
812         ++hammer_stats_file_iopsr;
813
814         for (i = 0; i < nlen; ++i) {
815                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
816                         error = hammer_str_to_tid(ncp->nc_name + i + 2,
817                                                   &ispfs, &asof, &localization);
818                         if (error != 0) {
819                                 i = nlen;
820                                 break;
821                         }
822                         if (asof != HAMMER_MAX_TID)
823                                 flags |= HAMMER_INODE_RO;
824                         break;
825                 }
826         }
827         nlen = i;
828
829         /*
830          * If this is a PFS softlink we dive into the PFS
831          */
832         if (ispfs && nlen == 0) {
833                 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
834                                       asof, localization,
835                                       flags, &error);
836                 if (error == 0) {
837                         error = hammer_get_vnode(ip, &vp);
838                         hammer_rel_inode(ip, 0);
839                 } else {
840                         vp = NULL;
841                 }
842                 if (error == 0) {
843                         vn_unlock(vp);
844                         cache_setvp(ap->a_nch, vp);
845                         vrele(vp);
846                 }
847                 goto done;
848         }
849
850         /*
851          * If there is no path component the time extension is relative to
852          * dip.
853          */
854         if (nlen == 0) {
855                 ip = hammer_get_inode(&trans, dip, dip->obj_id,
856                                       asof, dip->obj_localization,
857                                       flags, &error);
858                 if (error == 0) {
859                         error = hammer_get_vnode(ip, &vp);
860                         hammer_rel_inode(ip, 0);
861                 } else {
862                         vp = NULL;
863                 }
864                 if (error == 0) {
865                         vn_unlock(vp);
866                         cache_setvp(ap->a_nch, vp);
867                         vrele(vp);
868                 }
869                 goto done;
870         }
871
872         /*
873          * Calculate the namekey and setup the key range for the scan.  This
874          * works kinda like a chained hash table where the lower 32 bits
875          * of the namekey synthesize the chain.
876          *
877          * The key range is inclusive of both key_beg and key_end.
878          */
879         namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
880                                            &max_iterations);
881
882         error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
883         cursor.key_beg.localization = dip->obj_localization +
884                                       HAMMER_LOCALIZE_MISC;
885         cursor.key_beg.obj_id = dip->obj_id;
886         cursor.key_beg.key = namekey;
887         cursor.key_beg.create_tid = 0;
888         cursor.key_beg.delete_tid = 0;
889         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
890         cursor.key_beg.obj_type = 0;
891
892         cursor.key_end = cursor.key_beg;
893         cursor.key_end.key += max_iterations;
894         cursor.asof = asof;
895         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
896
897         /*
898          * Scan all matching records (the chain), locate the one matching
899          * the requested path component.
900          *
901          * The hammer_ip_*() functions merge in-memory records with on-disk
902          * records for the purposes of the search.
903          */
904         obj_id = 0;
905         localization = HAMMER_DEF_LOCALIZATION;
906
907         if (error == 0) {
908                 error = hammer_ip_first(&cursor);
909                 while (error == 0) {
910                         error = hammer_ip_resolve_data(&cursor);
911                         if (error)
912                                 break;
913                         if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
914                             bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
915                                 obj_id = cursor.data->entry.obj_id;
916                                 localization = cursor.data->entry.localization;
917                                 break;
918                         }
919                         error = hammer_ip_next(&cursor);
920                 }
921         }
922         hammer_done_cursor(&cursor);
923         if (error == 0) {
924                 ip = hammer_get_inode(&trans, dip, obj_id,
925                                       asof, localization,
926                                       flags, &error);
927                 if (error == 0) {
928                         error = hammer_get_vnode(ip, &vp);
929                         hammer_rel_inode(ip, 0);
930                 } else {
931                         vp = NULL;
932                 }
933                 if (error == 0) {
934                         vn_unlock(vp);
935                         cache_setvp(ap->a_nch, vp);
936                         vrele(vp);
937                 }
938         } else if (error == ENOENT) {
939                 cache_setvp(ap->a_nch, NULL);
940         }
941 done:
942         hammer_done_transaction(&trans);
943         return (error);
944 }
945
946 /*
947  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
948  *
949  * Locate the parent directory of a directory vnode.
950  *
951  * dvp is referenced but not locked.  *vpp must be returned referenced and
952  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
953  * at the root, instead it could indicate that the directory we were in was
954  * removed.
955  *
956  * NOTE: as-of sequences are not linked into the directory structure.  If
957  * we are at the root with a different asof then the mount point, reload
958  * the same directory with the mount point's asof.   I'm not sure what this
959  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
960  * get confused, but it hasn't been tested.
961  */
962 static
963 int
964 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
965 {
966         struct hammer_transaction trans;
967         struct hammer_inode *dip;
968         struct hammer_inode *ip;
969         int64_t parent_obj_id;
970         u_int32_t parent_obj_localization;
971         hammer_tid_t asof;
972         int error;
973
974         dip = VTOI(ap->a_dvp);
975         asof = dip->obj_asof;
976
977         /*
978          * Whos are parent?  This could be the root of a pseudo-filesystem
979          * whos parent is in another localization domain.
980          */
981         parent_obj_id = dip->ino_data.parent_obj_id;
982         if (dip->obj_id == HAMMER_OBJID_ROOT)
983                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
984         else
985                 parent_obj_localization = dip->obj_localization;
986
987         if (parent_obj_id == 0) {
988                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
989                    asof != dip->hmp->asof) {
990                         parent_obj_id = dip->obj_id;
991                         asof = dip->hmp->asof;
992                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
993                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
994                                    dip->obj_asof);
995                 } else {
996                         *ap->a_vpp = NULL;
997                         return ENOENT;
998                 }
999         }
1000
1001         hammer_simple_transaction(&trans, dip->hmp);
1002         ++hammer_stats_file_iopsr;
1003
1004         ip = hammer_get_inode(&trans, dip, parent_obj_id,
1005                               asof, parent_obj_localization,
1006                               dip->flags, &error);
1007         if (ip) {
1008                 error = hammer_get_vnode(ip, ap->a_vpp);
1009                 hammer_rel_inode(ip, 0);
1010         } else {
1011                 *ap->a_vpp = NULL;
1012         }
1013         hammer_done_transaction(&trans);
1014         return (error);
1015 }
1016
1017 /*
1018  * hammer_vop_nlink { nch, dvp, vp, cred }
1019  */
1020 static
1021 int
1022 hammer_vop_nlink(struct vop_nlink_args *ap)
1023 {
1024         struct hammer_transaction trans;
1025         struct hammer_inode *dip;
1026         struct hammer_inode *ip;
1027         struct nchandle *nch;
1028         int error;
1029
1030         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
1031                 return(EXDEV);
1032
1033         nch = ap->a_nch;
1034         dip = VTOI(ap->a_dvp);
1035         ip = VTOI(ap->a_vp);
1036
1037         if (dip->obj_localization != ip->obj_localization)
1038                 return(EXDEV);
1039
1040         if (dip->flags & HAMMER_INODE_RO)
1041                 return (EROFS);
1042         if (ip->flags & HAMMER_INODE_RO)
1043                 return (EROFS);
1044         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1045                 return (error);
1046
1047         /*
1048          * Create a transaction to cover the operations we perform.
1049          */
1050         hammer_start_transaction(&trans, dip->hmp);
1051         ++hammer_stats_file_iopsw;
1052
1053         /*
1054          * Add the filesystem object to the directory.  Note that neither
1055          * dip nor ip are referenced or locked, but their vnodes are
1056          * referenced.  This function will bump the inode's link count.
1057          */
1058         error = hammer_ip_add_directory(&trans, dip,
1059                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1060                                         ip);
1061
1062         /*
1063          * Finish up.
1064          */
1065         if (error == 0) {
1066                 cache_setunresolved(nch);
1067                 cache_setvp(nch, ap->a_vp);
1068         }
1069         hammer_done_transaction(&trans);
1070         hammer_knote(ap->a_vp, NOTE_LINK);
1071         hammer_knote(ap->a_dvp, NOTE_WRITE);
1072         return (error);
1073 }
1074
1075 /*
1076  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1077  *
1078  * The operating system has already ensured that the directory entry
1079  * does not exist and done all appropriate namespace locking.
1080  */
1081 static
1082 int
1083 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1084 {
1085         struct hammer_transaction trans;
1086         struct hammer_inode *dip;
1087         struct hammer_inode *nip;
1088         struct nchandle *nch;
1089         int error;
1090
1091         nch = ap->a_nch;
1092         dip = VTOI(ap->a_dvp);
1093
1094         if (dip->flags & HAMMER_INODE_RO)
1095                 return (EROFS);
1096         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1097                 return (error);
1098
1099         /*
1100          * Create a transaction to cover the operations we perform.
1101          */
1102         hammer_start_transaction(&trans, dip->hmp);
1103         ++hammer_stats_file_iopsw;
1104
1105         /*
1106          * Create a new filesystem object of the requested type.  The
1107          * returned inode will be referenced but not locked.
1108          */
1109         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1110                                     dip, NULL, &nip);
1111         if (error) {
1112                 hkprintf("hammer_mkdir error %d\n", error);
1113                 hammer_done_transaction(&trans);
1114                 *ap->a_vpp = NULL;
1115                 return (error);
1116         }
1117         /*
1118          * Add the new filesystem object to the directory.  This will also
1119          * bump the inode's link count.
1120          */
1121         error = hammer_ip_add_directory(&trans, dip,
1122                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1123                                         nip);
1124         if (error)
1125                 hkprintf("hammer_mkdir (add) error %d\n", error);
1126
1127         /*
1128          * Finish up.
1129          */
1130         if (error) {
1131                 hammer_rel_inode(nip, 0);
1132                 *ap->a_vpp = NULL;
1133         } else {
1134                 error = hammer_get_vnode(nip, ap->a_vpp);
1135                 hammer_rel_inode(nip, 0);
1136                 if (error == 0) {
1137                         cache_setunresolved(ap->a_nch);
1138                         cache_setvp(ap->a_nch, *ap->a_vpp);
1139                 }
1140         }
1141         hammer_done_transaction(&trans);
1142         if (error == 0)
1143                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1144         return (error);
1145 }
1146
1147 /*
1148  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1149  *
1150  * The operating system has already ensured that the directory entry
1151  * does not exist and done all appropriate namespace locking.
1152  */
1153 static
1154 int
1155 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1156 {
1157         struct hammer_transaction trans;
1158         struct hammer_inode *dip;
1159         struct hammer_inode *nip;
1160         struct nchandle *nch;
1161         int error;
1162
1163         nch = ap->a_nch;
1164         dip = VTOI(ap->a_dvp);
1165
1166         if (dip->flags & HAMMER_INODE_RO)
1167                 return (EROFS);
1168         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1169                 return (error);
1170
1171         /*
1172          * Create a transaction to cover the operations we perform.
1173          */
1174         hammer_start_transaction(&trans, dip->hmp);
1175         ++hammer_stats_file_iopsw;
1176
1177         /*
1178          * Create a new filesystem object of the requested type.  The
1179          * returned inode will be referenced but not locked.
1180          *
1181          * If mknod specifies a directory a pseudo-fs is created.
1182          */
1183         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1184                                     dip, NULL, &nip);
1185         if (error) {
1186                 hammer_done_transaction(&trans);
1187                 *ap->a_vpp = NULL;
1188                 return (error);
1189         }
1190
1191         /*
1192          * Add the new filesystem object to the directory.  This will also
1193          * bump the inode's link count.
1194          */
1195         error = hammer_ip_add_directory(&trans, dip,
1196                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1197                                         nip);
1198
1199         /*
1200          * Finish up.
1201          */
1202         if (error) {
1203                 hammer_rel_inode(nip, 0);
1204                 *ap->a_vpp = NULL;
1205         } else {
1206                 error = hammer_get_vnode(nip, ap->a_vpp);
1207                 hammer_rel_inode(nip, 0);
1208                 if (error == 0) {
1209                         cache_setunresolved(ap->a_nch);
1210                         cache_setvp(ap->a_nch, *ap->a_vpp);
1211                 }
1212         }
1213         hammer_done_transaction(&trans);
1214         if (error == 0)
1215                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1216         return (error);
1217 }
1218
1219 /*
1220  * hammer_vop_open { vp, mode, cred, fp }
1221  */
1222 static
1223 int
1224 hammer_vop_open(struct vop_open_args *ap)
1225 {
1226         hammer_inode_t ip;
1227
1228         ++hammer_stats_file_iopsr;
1229         ip = VTOI(ap->a_vp);
1230
1231         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1232                 return (EROFS);
1233         return(vop_stdopen(ap));
1234 }
1235
/*
 * hammer_vop_print { vp }
 *
 * Not implemented; the argument is ignored and EOPNOTSUPP is always
 * returned.
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return (EOPNOTSUPP);
}
1245
1246 /*
1247  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1248  */
1249 static
1250 int
1251 hammer_vop_readdir(struct vop_readdir_args *ap)
1252 {
1253         struct hammer_transaction trans;
1254         struct hammer_cursor cursor;
1255         struct hammer_inode *ip;
1256         struct uio *uio;
1257         hammer_base_elm_t base;
1258         int error;
1259         int cookie_index;
1260         int ncookies;
1261         off_t *cookies;
1262         off_t saveoff;
1263         int r;
1264         int dtype;
1265
1266         ++hammer_stats_file_iopsr;
1267         ip = VTOI(ap->a_vp);
1268         uio = ap->a_uio;
1269         saveoff = uio->uio_offset;
1270
1271         if (ap->a_ncookies) {
1272                 ncookies = uio->uio_resid / 16 + 1;
1273                 if (ncookies > 1024)
1274                         ncookies = 1024;
1275                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1276                 cookie_index = 0;
1277         } else {
1278                 ncookies = -1;
1279                 cookies = NULL;
1280                 cookie_index = 0;
1281         }
1282
1283         hammer_simple_transaction(&trans, ip->hmp);
1284
1285         /*
1286          * Handle artificial entries
1287          */
1288         error = 0;
1289         if (saveoff == 0) {
1290                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1291                 if (r)
1292                         goto done;
1293                 if (cookies)
1294                         cookies[cookie_index] = saveoff;
1295                 ++saveoff;
1296                 ++cookie_index;
1297                 if (cookie_index == ncookies)
1298                         goto done;
1299         }
1300         if (saveoff == 1) {
1301                 if (ip->ino_data.parent_obj_id) {
1302                         r = vop_write_dirent(&error, uio,
1303                                              ip->ino_data.parent_obj_id,
1304                                              DT_DIR, 2, "..");
1305                 } else {
1306                         r = vop_write_dirent(&error, uio,
1307                                              ip->obj_id, DT_DIR, 2, "..");
1308                 }
1309                 if (r)
1310                         goto done;
1311                 if (cookies)
1312                         cookies[cookie_index] = saveoff;
1313                 ++saveoff;
1314                 ++cookie_index;
1315                 if (cookie_index == ncookies)
1316                         goto done;
1317         }
1318
1319         /*
1320          * Key range (begin and end inclusive) to scan.  Directory keys
1321          * directly translate to a 64 bit 'seek' position.
1322          */
1323         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1324         cursor.key_beg.localization = ip->obj_localization +
1325                                       HAMMER_LOCALIZE_MISC;
1326         cursor.key_beg.obj_id = ip->obj_id;
1327         cursor.key_beg.create_tid = 0;
1328         cursor.key_beg.delete_tid = 0;
1329         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1330         cursor.key_beg.obj_type = 0;
1331         cursor.key_beg.key = saveoff;
1332
1333         cursor.key_end = cursor.key_beg;
1334         cursor.key_end.key = HAMMER_MAX_KEY;
1335         cursor.asof = ip->obj_asof;
1336         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1337
1338         error = hammer_ip_first(&cursor);
1339
1340         while (error == 0) {
1341                 error = hammer_ip_resolve_data(&cursor);
1342                 if (error)
1343                         break;
1344                 base = &cursor.leaf->base;
1345                 saveoff = base->key;
1346                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1347
1348                 if (base->obj_id != ip->obj_id)
1349                         panic("readdir: bad record at %p", cursor.node);
1350
1351                 /*
1352                  * Convert pseudo-filesystems into softlinks
1353                  */
1354                 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1355                 r = vop_write_dirent(
1356                              &error, uio, cursor.data->entry.obj_id,
1357                              dtype,
1358                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1359                              (void *)cursor.data->entry.name);
1360                 if (r)
1361                         break;
1362                 ++saveoff;
1363                 if (cookies)
1364                         cookies[cookie_index] = base->key;
1365                 ++cookie_index;
1366                 if (cookie_index == ncookies)
1367                         break;
1368                 error = hammer_ip_next(&cursor);
1369         }
1370         hammer_done_cursor(&cursor);
1371
1372 done:
1373         hammer_done_transaction(&trans);
1374
1375         if (ap->a_eofflag)
1376                 *ap->a_eofflag = (error == ENOENT);
1377         uio->uio_offset = saveoff;
1378         if (error && cookie_index == 0) {
1379                 if (error == ENOENT)
1380                         error = 0;
1381                 if (cookies) {
1382                         kfree(cookies, M_TEMP);
1383                         *ap->a_ncookies = 0;
1384                         *ap->a_cookies = NULL;
1385                 }
1386         } else {
1387                 if (error == ENOENT)
1388                         error = 0;
1389                 if (cookies) {
1390                         *ap->a_ncookies = cookie_index;
1391                         *ap->a_cookies = cookies;
1392                 }
1393         }
1394         return(error);
1395 }
1396
1397 /*
1398  * hammer_vop_readlink { vp, uio, cred }
1399  */
1400 static
1401 int
1402 hammer_vop_readlink(struct vop_readlink_args *ap)
1403 {
1404         struct hammer_transaction trans;
1405         struct hammer_cursor cursor;
1406         struct hammer_inode *ip;
1407         char buf[32];
1408         u_int32_t localization;
1409         hammer_pseudofs_inmem_t pfsm;
1410         int error;
1411
1412         ip = VTOI(ap->a_vp);
1413
1414         /*
1415          * Shortcut if the symlink data was stuffed into ino_data.
1416          *
1417          * Also expand special "@@PFS%05d" softlinks (expansion only
1418          * occurs for non-historical (current) accesses made from the
1419          * primary filesystem).
1420          */
1421         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1422                 char *ptr;
1423                 int bytes;
1424
1425                 ptr = ip->ino_data.ext.symlink;
1426                 bytes = (int)ip->ino_data.size;
1427                 if (bytes == 10 &&
1428                     ip->obj_asof == HAMMER_MAX_TID &&
1429                     ip->obj_localization == 0 &&
1430                     strncmp(ptr, "@@PFS", 5) == 0) {
1431                         hammer_simple_transaction(&trans, ip->hmp);
1432                         bcopy(ptr + 5, buf, 5);
1433                         buf[5] = 0;
1434                         localization = strtoul(buf, NULL, 10) << 16;
1435                         pfsm = hammer_load_pseudofs(&trans, localization,
1436                                                     &error);
1437                         if (error == 0) {
1438                                 if (pfsm->pfsd.mirror_flags &
1439                                     HAMMER_PFSD_SLAVE) {
1440                                         ksnprintf(buf, sizeof(buf),
1441                                                   "@@0x%016llx:%05d",
1442                                                   pfsm->pfsd.sync_end_tid,
1443                                                   localization >> 16);
1444                                 } else {
1445                                         ksnprintf(buf, sizeof(buf),
1446                                                   "@@0x%016llx:%05d",
1447                                                   HAMMER_MAX_TID,
1448                                                   localization >> 16);
1449                                 }
1450                                 ptr = buf;
1451                                 bytes = strlen(buf);
1452                         }
1453                         if (pfsm)
1454                                 hammer_rel_pseudofs(trans.hmp, pfsm);
1455                         hammer_done_transaction(&trans);
1456                 }
1457                 error = uiomove(ptr, bytes, ap->a_uio);
1458                 return(error);
1459         }
1460
1461         /*
1462          * Long version
1463          */
1464         hammer_simple_transaction(&trans, ip->hmp);
1465         ++hammer_stats_file_iopsr;
1466         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1467
1468         /*
1469          * Key range (begin and end inclusive) to scan.  Directory keys
1470          * directly translate to a 64 bit 'seek' position.
1471          */
1472         cursor.key_beg.localization = ip->obj_localization +
1473                                       HAMMER_LOCALIZE_MISC;
1474         cursor.key_beg.obj_id = ip->obj_id;
1475         cursor.key_beg.create_tid = 0;
1476         cursor.key_beg.delete_tid = 0;
1477         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1478         cursor.key_beg.obj_type = 0;
1479         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1480         cursor.asof = ip->obj_asof;
1481         cursor.flags |= HAMMER_CURSOR_ASOF;
1482
1483         error = hammer_ip_lookup(&cursor);
1484         if (error == 0) {
1485                 error = hammer_ip_resolve_data(&cursor);
1486                 if (error == 0) {
1487                         KKASSERT(cursor.leaf->data_len >=
1488                                  HAMMER_SYMLINK_NAME_OFF);
1489                         error = uiomove(cursor.data->symlink.name,
1490                                         cursor.leaf->data_len -
1491                                                 HAMMER_SYMLINK_NAME_OFF,
1492                                         ap->a_uio);
1493                 }
1494         }
1495         hammer_done_cursor(&cursor);
1496         hammer_done_transaction(&trans);
1497         return(error);
1498 }
1499
1500 /*
1501  * hammer_vop_nremove { nch, dvp, cred }
1502  */
1503 static
1504 int
1505 hammer_vop_nremove(struct vop_nremove_args *ap)
1506 {
1507         struct hammer_transaction trans;
1508         struct hammer_inode *dip;
1509         int error;
1510
1511         dip = VTOI(ap->a_dvp);
1512
1513         if (hammer_nohistory(dip) == 0 &&
1514             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1515                 return (error);
1516         }
1517
1518         hammer_start_transaction(&trans, dip->hmp);
1519         ++hammer_stats_file_iopsw;
1520         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1521         hammer_done_transaction(&trans);
1522         if (error == 0)
1523                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1524         return (error);
1525 }
1526
1527 /*
1528  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1529  */
1530 static
1531 int
1532 hammer_vop_nrename(struct vop_nrename_args *ap)
1533 {
1534         struct hammer_transaction trans;
1535         struct namecache *fncp;
1536         struct namecache *tncp;
1537         struct hammer_inode *fdip;
1538         struct hammer_inode *tdip;
1539         struct hammer_inode *ip;
1540         struct hammer_cursor cursor;
1541         int64_t namekey;
1542         u_int32_t max_iterations;
1543         int nlen, error;
1544
1545         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 
1546                 return(EXDEV);
1547         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1548                 return(EXDEV);
1549
1550         fdip = VTOI(ap->a_fdvp);
1551         tdip = VTOI(ap->a_tdvp);
1552         fncp = ap->a_fnch->ncp;
1553         tncp = ap->a_tnch->ncp;
1554         ip = VTOI(fncp->nc_vp);
1555         KKASSERT(ip != NULL);
1556
1557         if (fdip->obj_localization != tdip->obj_localization)
1558                 return(EXDEV);
1559         if (fdip->obj_localization != ip->obj_localization)
1560                 return(EXDEV);
1561
1562         if (fdip->flags & HAMMER_INODE_RO)
1563                 return (EROFS);
1564         if (tdip->flags & HAMMER_INODE_RO)
1565                 return (EROFS);
1566         if (ip->flags & HAMMER_INODE_RO)
1567                 return (EROFS);
1568         if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1569                 return (error);
1570
1571         hammer_start_transaction(&trans, fdip->hmp);
1572         ++hammer_stats_file_iopsw;
1573
1574         /*
1575          * Remove tncp from the target directory and then link ip as
1576          * tncp. XXX pass trans to dounlink
1577          *
1578          * Force the inode sync-time to match the transaction so it is
1579          * in-sync with the creation of the target directory entry.
1580          */
1581         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1582                                 ap->a_cred, 0, -1);
1583         if (error == 0 || error == ENOENT) {
1584                 error = hammer_ip_add_directory(&trans, tdip,
1585                                                 tncp->nc_name, tncp->nc_nlen,
1586                                                 ip);
1587                 if (error == 0) {
1588                         ip->ino_data.parent_obj_id = tdip->obj_id;
1589                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1590                 }
1591         }
1592         if (error)
1593                 goto failed; /* XXX */
1594
1595         /*
1596          * Locate the record in the originating directory and remove it.
1597          *
1598          * Calculate the namekey and setup the key range for the scan.  This
1599          * works kinda like a chained hash table where the lower 32 bits
1600          * of the namekey synthesize the chain.
1601          *
1602          * The key range is inclusive of both key_beg and key_end.
1603          */
1604         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1605                                            &max_iterations);
1606 retry:
1607         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1608         cursor.key_beg.localization = fdip->obj_localization +
1609                                       HAMMER_LOCALIZE_MISC;
1610         cursor.key_beg.obj_id = fdip->obj_id;
1611         cursor.key_beg.key = namekey;
1612         cursor.key_beg.create_tid = 0;
1613         cursor.key_beg.delete_tid = 0;
1614         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1615         cursor.key_beg.obj_type = 0;
1616
1617         cursor.key_end = cursor.key_beg;
1618         cursor.key_end.key += max_iterations;
1619         cursor.asof = fdip->obj_asof;
1620         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1621
1622         /*
1623          * Scan all matching records (the chain), locate the one matching
1624          * the requested path component.
1625          *
1626          * The hammer_ip_*() functions merge in-memory records with on-disk
1627          * records for the purposes of the search.
1628          */
1629         error = hammer_ip_first(&cursor);
1630         while (error == 0) {
1631                 if (hammer_ip_resolve_data(&cursor) != 0)
1632                         break;
1633                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1634                 KKASSERT(nlen > 0);
1635                 if (fncp->nc_nlen == nlen &&
1636                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1637                         break;
1638                 }
1639                 error = hammer_ip_next(&cursor);
1640         }
1641
1642         /*
1643          * If all is ok we have to get the inode so we can adjust nlinks.
1644          *
1645          * WARNING: hammer_ip_del_directory() may have to terminate the
1646          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1647          * twice.
1648          */
1649         if (error == 0)
1650                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1651
1652         /*
1653          * XXX A deadlock here will break rename's atomicy for the purposes
1654          * of crash recovery.
1655          */
1656         if (error == EDEADLK) {
1657                 hammer_done_cursor(&cursor);
1658                 goto retry;
1659         }
1660
1661         /*
1662          * Cleanup and tell the kernel that the rename succeeded.
1663          */
1664         hammer_done_cursor(&cursor);
1665         if (error == 0) {
1666                 cache_rename(ap->a_fnch, ap->a_tnch);
1667                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
1668                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
1669                 if (ip->vp)
1670                         hammer_knote(ip->vp, NOTE_RENAME);
1671         }
1672
1673 failed:
1674         hammer_done_transaction(&trans);
1675         return (error);
1676 }
1677
1678 /*
1679  * hammer_vop_nrmdir { nch, dvp, cred }
1680  */
1681 static
1682 int
1683 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1684 {
1685         struct hammer_transaction trans;
1686         struct hammer_inode *dip;
1687         int error;
1688
1689         dip = VTOI(ap->a_dvp);
1690
1691         if (hammer_nohistory(dip) == 0 &&
1692             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1693                 return (error);
1694         }
1695
1696         hammer_start_transaction(&trans, dip->hmp);
1697         ++hammer_stats_file_iopsw;
1698         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
1699         hammer_done_transaction(&trans);
1700         if (error == 0)
1701                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1702         return (error);
1703 }
1704
1705 /*
1706  * hammer_vop_setattr { vp, vap, cred }
1707  */
1708 static
1709 int
1710 hammer_vop_setattr(struct vop_setattr_args *ap)
1711 {
1712         struct hammer_transaction trans;
1713         struct vattr *vap;
1714         struct hammer_inode *ip;
1715         int modflags;
1716         int error;
1717         int truncating;
1718         int blksize;
1719         int kflags;
1720         int64_t aligned_size;
1721         u_int32_t flags;
1722
1723         vap = ap->a_vap;
1724         ip = ap->a_vp->v_data;
1725         modflags = 0;
1726         kflags = 0;
1727
1728         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1729                 return(EROFS);
1730         if (ip->flags & HAMMER_INODE_RO)
1731                 return (EROFS);
1732         if (hammer_nohistory(ip) == 0 &&
1733             (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1734                 return (error);
1735         }
1736
1737         hammer_start_transaction(&trans, ip->hmp);
1738         ++hammer_stats_file_iopsw;
1739         error = 0;
1740
1741         if (vap->va_flags != VNOVAL) {
1742                 flags = ip->ino_data.uflags;
1743                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1744                                          hammer_to_unix_xid(&ip->ino_data.uid),
1745                                          ap->a_cred);
1746                 if (error == 0) {
1747                         if (ip->ino_data.uflags != flags) {
1748                                 ip->ino_data.uflags = flags;
1749                                 modflags |= HAMMER_INODE_DDIRTY;
1750                                 kflags |= NOTE_ATTRIB;
1751                         }
1752                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1753                                 error = 0;
1754                                 goto done;
1755                         }
1756                 }
1757                 goto done;
1758         }
1759         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1760                 error = EPERM;
1761                 goto done;
1762         }
1763         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
1764                 mode_t cur_mode = ip->ino_data.mode;
1765                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1766                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1767                 uuid_t uuid_uid;
1768                 uuid_t uuid_gid;
1769
1770                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
1771                                          ap->a_cred,
1772                                          &cur_uid, &cur_gid, &cur_mode);
1773                 if (error == 0) {
1774                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
1775                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
1776                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
1777                                  sizeof(uuid_uid)) ||
1778                             bcmp(&uuid_gid, &ip->ino_data.gid,
1779                                  sizeof(uuid_gid)) ||
1780                             ip->ino_data.mode != cur_mode
1781                         ) {
1782                                 ip->ino_data.uid = uuid_uid;
1783                                 ip->ino_data.gid = uuid_gid;
1784                                 ip->ino_data.mode = cur_mode;
1785                         }
1786                         modflags |= HAMMER_INODE_DDIRTY;
1787                         kflags |= NOTE_ATTRIB;
1788                 }
1789         }
1790         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
1791                 switch(ap->a_vp->v_type) {
1792                 case VREG:
1793                         if (vap->va_size == ip->ino_data.size)
1794                                 break;
1795                         /*
1796                          * XXX break atomicy, we can deadlock the backend
1797                          * if we do not release the lock.  Probably not a
1798                          * big deal here.
1799                          */
1800                         blksize = hammer_blocksize(vap->va_size);
1801                         if (vap->va_size < ip->ino_data.size) {
1802                                 vtruncbuf(ap->a_vp, vap->va_size, blksize);
1803                                 truncating = 1;
1804                                 kflags |= NOTE_WRITE;
1805                         } else {
1806                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1807                                 truncating = 0;
1808                                 kflags |= NOTE_WRITE | NOTE_EXTEND;
1809                         }
1810                         ip->ino_data.size = vap->va_size;
1811                         modflags |= HAMMER_INODE_DDIRTY;
1812
1813                         /*
1814                          * on-media truncation is cached in the inode until
1815                          * the inode is synchronized.
1816                          */
1817                         if (truncating) {
1818                                 hammer_ip_frontend_trunc(ip, vap->va_size);
1819 #ifdef DEBUG_TRUNCATE
1820                                 if (HammerTruncIp == NULL)
1821                                         HammerTruncIp = ip;
1822 #endif
1823                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1824                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1825                                         ip->trunc_off = vap->va_size;
1826 #ifdef DEBUG_TRUNCATE
1827                                         if (ip == HammerTruncIp)
1828                                         kprintf("truncate1 %016llx\n", ip->trunc_off);
1829 #endif
1830                                 } else if (ip->trunc_off > vap->va_size) {
1831                                         ip->trunc_off = vap->va_size;
1832 #ifdef DEBUG_TRUNCATE
1833                                         if (ip == HammerTruncIp)
1834                                         kprintf("truncate2 %016llx\n", ip->trunc_off);
1835 #endif
1836                                 } else {
1837 #ifdef DEBUG_TRUNCATE
1838                                         if (ip == HammerTruncIp)
1839                                         kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
1840 #endif
1841                                 }
1842                         }
1843
1844                         /*
1845                          * If truncating we have to clean out a portion of
1846                          * the last block on-disk.  We do this in the
1847                          * front-end buffer cache.
1848                          */
1849                         aligned_size = (vap->va_size + (blksize - 1)) &
1850                                        ~(int64_t)(blksize - 1);
1851                         if (truncating && vap->va_size < aligned_size) {
1852                                 struct buf *bp;
1853                                 int offset;
1854
1855                                 aligned_size -= blksize;
1856
1857                                 offset = (int)vap->va_size & (blksize - 1);
1858                                 error = bread(ap->a_vp, aligned_size,
1859                                               blksize, &bp);
1860                                 hammer_ip_frontend_trunc(ip, aligned_size);
1861                                 if (error == 0) {
1862                                         bzero(bp->b_data + offset,
1863                                               blksize - offset);
1864                                         /* must de-cache direct-io offset */
1865                                         bp->b_bio2.bio_offset = NOOFFSET;
1866                                         bdwrite(bp);
1867                                 } else {
1868                                         kprintf("ERROR %d\n", error);
1869                                         brelse(bp);
1870                                 }
1871                         }
1872                         break;
1873                 case VDATABASE:
1874                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1875                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1876                                 ip->trunc_off = vap->va_size;
1877                         } else if (ip->trunc_off > vap->va_size) {
1878                                 ip->trunc_off = vap->va_size;
1879                         }
1880                         hammer_ip_frontend_trunc(ip, vap->va_size);
1881                         ip->ino_data.size = vap->va_size;
1882                         modflags |= HAMMER_INODE_DDIRTY;
1883                         kflags |= NOTE_ATTRIB;
1884                         break;
1885                 default:
1886                         error = EINVAL;
1887                         goto done;
1888                 }
1889                 break;
1890         }
1891         if (vap->va_atime.tv_sec != VNOVAL) {
1892                 ip->ino_data.atime =
1893                         hammer_timespec_to_time(&vap->va_atime);
1894                 modflags |= HAMMER_INODE_ATIME;
1895                 kflags |= NOTE_ATTRIB;
1896         }
1897         if (vap->va_mtime.tv_sec != VNOVAL) {
1898                 ip->ino_data.mtime =
1899                         hammer_timespec_to_time(&vap->va_mtime);
1900                 modflags |= HAMMER_INODE_MTIME;
1901                 kflags |= NOTE_ATTRIB;
1902         }
1903         if (vap->va_mode != (mode_t)VNOVAL) {
1904                 mode_t   cur_mode = ip->ino_data.mode;
1905                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1906                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1907
1908                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1909                                          cur_uid, cur_gid, &cur_mode);
1910                 if (error == 0 && ip->ino_data.mode != cur_mode) {
1911                         ip->ino_data.mode = cur_mode;
1912                         modflags |= HAMMER_INODE_DDIRTY;
1913                         kflags |= NOTE_ATTRIB;
1914                 }
1915         }
1916 done:
1917         if (error == 0)
1918                 hammer_modify_inode(ip, modflags);
1919         hammer_done_transaction(&trans);
1920         hammer_knote(ap->a_vp, kflags);
1921         return (error);
1922 }
1923
1924 /*
1925  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1926  */
1927 static
1928 int
1929 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1930 {
1931         struct hammer_transaction trans;
1932         struct hammer_inode *dip;
1933         struct hammer_inode *nip;
1934         struct nchandle *nch;
1935         hammer_record_t record;
1936         int error;
1937         int bytes;
1938
1939         ap->a_vap->va_type = VLNK;
1940
1941         nch = ap->a_nch;
1942         dip = VTOI(ap->a_dvp);
1943
1944         if (dip->flags & HAMMER_INODE_RO)
1945                 return (EROFS);
1946         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1947                 return (error);
1948
1949         /*
1950          * Create a transaction to cover the operations we perform.
1951          */
1952         hammer_start_transaction(&trans, dip->hmp);
1953         ++hammer_stats_file_iopsw;
1954
1955         /*
1956          * Create a new filesystem object of the requested type.  The
1957          * returned inode will be referenced but not locked.
1958          */
1959
1960         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1961                                     dip, NULL, &nip);
1962         if (error) {
1963                 hammer_done_transaction(&trans);
1964                 *ap->a_vpp = NULL;
1965                 return (error);
1966         }
1967
1968         /*
1969          * Add a record representing the symlink.  symlink stores the link
1970          * as pure data, not a string, and is no \0 terminated.
1971          */
1972         if (error == 0) {
1973                 bytes = strlen(ap->a_target);
1974
1975                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1976                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1977                 } else {
1978                         record = hammer_alloc_mem_record(nip, bytes);
1979                         record->type = HAMMER_MEM_RECORD_GENERAL;
1980
1981                         record->leaf.base.localization = nip->obj_localization +
1982                                                          HAMMER_LOCALIZE_MISC;
1983                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1984                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1985                         record->leaf.data_len = bytes;
1986                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1987                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1988                         error = hammer_ip_add_record(&trans, record);
1989                 }
1990
1991                 /*
1992                  * Set the file size to the length of the link.
1993                  */
1994                 if (error == 0) {
1995                         nip->ino_data.size = bytes;
1996                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1997                 }
1998         }
1999         if (error == 0)
2000                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2001                                                 nch->ncp->nc_nlen, nip);
2002
2003         /*
2004          * Finish up.
2005          */
2006         if (error) {
2007                 hammer_rel_inode(nip, 0);
2008                 *ap->a_vpp = NULL;
2009         } else {
2010                 error = hammer_get_vnode(nip, ap->a_vpp);
2011                 hammer_rel_inode(nip, 0);
2012                 if (error == 0) {
2013                         cache_setunresolved(ap->a_nch);
2014                         cache_setvp(ap->a_nch, *ap->a_vpp);
2015                         hammer_knote(ap->a_dvp, NOTE_WRITE);
2016                 }
2017         }
2018         hammer_done_transaction(&trans);
2019         return (error);
2020 }
2021
2022 /*
2023  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2024  */
2025 static
2026 int
2027 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2028 {
2029         struct hammer_transaction trans;
2030         struct hammer_inode *dip;
2031         int error;
2032
2033         dip = VTOI(ap->a_dvp);
2034
2035         if (hammer_nohistory(dip) == 0 &&
2036             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2037                 return (error);
2038         }
2039
2040         hammer_start_transaction(&trans, dip->hmp);
2041         ++hammer_stats_file_iopsw;
2042         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2043                                 ap->a_cred, ap->a_flags, -1);
2044         hammer_done_transaction(&trans);
2045
2046         return (error);
2047 }
2048
2049 /*
2050  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2051  */
2052 static
2053 int
2054 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2055 {
2056         struct hammer_inode *ip = ap->a_vp->v_data;
2057
2058         ++hammer_stats_file_iopsr;
2059         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2060                             ap->a_fflag, ap->a_cred));
2061 }
2062
2063 static
2064 int
2065 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2066 {
2067         struct mount *mp;
2068         int error;
2069
2070         mp = ap->a_head.a_ops->head.vv_mount;
2071
2072         switch(ap->a_op) {
2073         case MOUNTCTL_SET_EXPORT:
2074                 if (ap->a_ctllen != sizeof(struct export_args))
2075                         error = EINVAL;
2076                 else
2077                         error = hammer_vfs_export(mp, ap->a_op,
2078                                       (const struct export_args *)ap->a_ctl);
2079                 break;
2080         default:
2081                 error = journal_mountctl(ap);
2082                 break;
2083         }
2084         return(error);
2085 }
2086
2087 /*
2088  * hammer_vop_strategy { vp, bio }
2089  *
2090  * Strategy call, used for regular file read & write only.  Note that the
2091  * bp may represent a cluster.
2092  *
2093  * To simplify operation and allow better optimizations in the future,
2094  * this code does not make any assumptions with regards to buffer alignment
2095  * or size.
2096  */
2097 static
2098 int
2099 hammer_vop_strategy(struct vop_strategy_args *ap)
2100 {
2101         struct buf *bp;
2102         int error;
2103
2104         bp = ap->a_bio->bio_buf;
2105
2106         switch(bp->b_cmd) {
2107         case BUF_CMD_READ:
2108                 error = hammer_vop_strategy_read(ap);
2109                 break;
2110         case BUF_CMD_WRITE:
2111                 error = hammer_vop_strategy_write(ap);
2112                 break;
2113         default:
2114                 bp->b_error = error = EINVAL;
2115                 bp->b_flags |= B_ERROR;
2116                 biodone(ap->a_bio);
2117                 break;
2118         }
2119         return (error);
2120 }
2121
2122 /*
2123  * Read from a regular file.  Iterate the related records and fill in the
2124  * BIO/BUF.  Gaps are zero-filled.
2125  *
2126  * The support code in hammer_object.c should be used to deal with mixed
2127  * in-memory and on-disk records.
2128  *
2129  * NOTE: Can be called from the cluster code with an oversized buf.
2130  *
      * ap->a_vp is the vnode being read (its v_data is the hammer inode) and
      * ap->a_bio the request; the data lands in ap->a_bio->bio_buf.
      *
      * On the copy path the result is stored in bp->b_error (with B_ERROR set
      * on failure), b_resid is cleared and biodone() is called here.  On the
      * two direct-read paths this function does not call biodone() itself and
      * simply returns what hammer_io_direct_read() returned; presumably that
      * routine completes the bio -- TODO confirm.
      *
      * Returns 0 or an error code.
      *
2131  * XXX atime update
2132  */
2133 static
2134 int
2135 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2136 {
2137         struct hammer_transaction trans;
2138         struct hammer_inode *ip;
2139         struct hammer_cursor cursor;
2140         hammer_base_elm_t base;
2141         hammer_off_t disk_offset;
2142         struct bio *bio;
2143         struct bio *nbio;
2144         struct buf *bp;
2145         int64_t rec_offset;
2146         int64_t ran_end;
2147         int64_t tmp64;
2148         int error;
2149         int boff;
2150         int roff;
2151         int n;
2152
2153         bio = ap->a_bio;
2154         bp = bio->bio_buf;
2155         ip = ap->a_vp->v_data;
2156
2157         /*
2158          * The zone-2 disk offset may have been set by the cluster code via
2159          * a BMAP operation, or else should be NOOFFSET.
2160          *
2161          * Checking the high bits for a match against zone-2 should suffice.
              *
              * This early return happens before any transaction or cursor has
              * been set up, so there is nothing to clean up.
2162          */
2163         nbio = push_bio(bio);
2164         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2165             HAMMER_ZONE_LARGE_DATA) {
2166                 error = hammer_io_direct_read(ip->hmp, nbio, NULL);
2167                 return (error);
2168         }
2169
2170         /*
2171          * Well, that sucked.  Do it the hard way.  If all the stars are
2172          * aligned we may still be able to issue a direct-read.
2173          */
2174         hammer_simple_transaction(&trans, ip->hmp);
2175         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2176
2177         /*
2178          * Key range (begin and end inclusive) to scan.  Note that the keys
2179          * stored in the actual records represent BASE+LEN, not BASE.  The
2180          * first record containing bio_offset will have a key > bio_offset.
2181          */
2182         cursor.key_beg.localization = ip->obj_localization +
2183                                       HAMMER_LOCALIZE_MISC;
2184         cursor.key_beg.obj_id = ip->obj_id;
2185         cursor.key_beg.create_tid = 0;
2186         cursor.key_beg.delete_tid = 0;
2187         cursor.key_beg.obj_type = 0;
2188         cursor.key_beg.key = bio->bio_offset + 1;
2189         cursor.asof = ip->obj_asof;
2190         cursor.flags |= HAMMER_CURSOR_ASOF;
2191
2192         cursor.key_end = cursor.key_beg;
2193         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2194 #if 0
2195         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2196                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2197                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2198                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2199         } else
2200 #endif
2201         {
                  /*
                   * The end key is pushed MAXPHYS + 1 past the end of the
                   * requested range; if that addition wraps, the largest
                   * positive key is used instead.
                   */
2202                 ran_end = bio->bio_offset + bp->b_bufsize;
2203                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2204                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2205                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2206                 if (tmp64 < ran_end)
2207                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2208                 else
2209                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2210         }
2211         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2212
2213         error = hammer_ip_first(&cursor);
              /* boff is the fill position within bp->b_data */
2214         boff = 0;
2215
2216         while (error == 0) {
2217                 /*
2218                  * Get the base file offset of the record.  The key for
2219                  * data records is (base + bytes) rather than (base).
2220                  */
2221                 base = &cursor.leaf->base;
2222                 rec_offset = base->key - cursor.leaf->data_len;
2223
2224                 /*
2225                  * Calculate the gap, if any, and zero-fill it.
2226                  *
2227                  * n is the offset of the start of the record versus our
2228                  * current seek offset in the bio.
2229                  */
2230                 n = (int)(rec_offset - (bio->bio_offset + boff));
2231                 if (n > 0) {
2232                         if (n > bp->b_bufsize - boff)
2233                                 n = bp->b_bufsize - boff;
2234                         bzero((char *)bp->b_data + boff, n);
2235                         boff += n;
2236                         n = 0;
2237                 }
2238
2239                 /*
2240                  * Calculate the data offset in the record and the number
2241                  * of bytes we can copy.
2242                  *
2243                  * There are two degenerate cases.  First, boff may already
2244                  * be at bp->b_bufsize.  Secondly, the data offset within
2245                  * the record may exceed the record's size.
                      *
                      * n is <= 0 at this point: zero if a gap was just filled
                      * or the record starts exactly at the seek position,
                      * negative if the record starts before it.  roff is thus
                      * the number of leading record bytes to skip.
2246                  */
2247                 roff = -n;
2248                 rec_offset += roff;
2249                 n = cursor.leaf->data_len - roff;
2250                 if (n <= 0) {
2251                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2252                         n = 0;
2253                 } else if (n > bp->b_bufsize - boff) {
2254                         n = bp->b_bufsize - boff;
2255                 }
2256
2257                 /*
2258                  * Deal with cached truncations.  This cool bit of code
2259                  * allows truncate()/ftruncate() to avoid having to sync
2260                  * the file.
2261                  *
2262                  * If the frontend is truncated then all backend records are
2263                  * subject to the frontend's truncation.
2264                  *
2265                  * If the backend is truncated then backend records on-disk
2266                  * (but not in-memory) are subject to the backend's
2267                  * truncation.  In-memory records owned by the backend
2268                  * represent data written after the truncation point on the
2269                  * backend and must not be truncated.
2270                  *
2271                  * Truncate operations deal with frontend buffer cache
2272                  * buffers and frontend-owned in-memory records synchronously.
2273                  */
2274                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2275                         if (hammer_cursor_ondisk(&cursor) ||
2276                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2277                                 if (ip->trunc_off <= rec_offset)
2278                                         n = 0;
2279                                 else if (ip->trunc_off < rec_offset + n)
2280                                         n = (int)(ip->trunc_off - rec_offset);
2281                         }
2282                 }
2283                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2284                         if (hammer_cursor_ondisk(&cursor)) {
2285                                 if (ip->sync_trunc_off <= rec_offset)
2286                                         n = 0;
2287                                 else if (ip->sync_trunc_off < rec_offset + n)
2288                                         n = (int)(ip->sync_trunc_off - rec_offset);
2289                         }
2290                 }
2291
2292                 /*
2293                  * Try to issue a direct read into our bio if possible,
2294                  * otherwise resolve the element data into a hammer_buffer
2295                  * and copy.
2296                  *
2297                  * The buffer on-disk should be zeroed past any real
2298                  * truncation point, but may not be for any synthesized
2299                  * truncation point from above.
                      *
                      * The direct read is only attempted when this one on-disk
                      * record covers the whole buffer (nothing filled yet and
                      * n equals b_bufsize) and its disk offset is aligned to
                      * HAMMER_BUFMASK.  The goto skips the local biodone().
2300                  */
2301                 disk_offset = cursor.leaf->data_offset + roff;
2302                 if (boff == 0 && n == bp->b_bufsize &&
2303                     hammer_cursor_ondisk(&cursor) &&
2304                     (disk_offset & HAMMER_BUFMASK) == 0) {
2305                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2306                                  HAMMER_ZONE_LARGE_DATA);
2307                         nbio->bio_offset = disk_offset;
2308                         error = hammer_io_direct_read(trans.hmp, nbio,
2309                                                       cursor.leaf);
2310                         goto done;
2311                 } else if (n) {
2312                         error = hammer_ip_resolve_data(&cursor);
2313                         if (error == 0) {
2314                                 bcopy((char *)cursor.data + roff,
2315                                       (char *)bp->b_data + boff, n);
2316                         }
2317                 }
2318                 if (error)
2319                         break;
2320
2321                 /*
2322                  * Iterate until we have filled the request.
2323                  */
2324                 boff += n;
2325                 if (boff == bp->b_bufsize)
2326                         break;
2327                 error = hammer_ip_next(&cursor);
2328         }
2329
2330         /*
2331          * There may have been a gap after the last record
              *
              * ENOENT from the cursor is not treated as a failure here; the
              * remainder of the buffer is zero-filled instead.
2332          */
2333         if (error == ENOENT)
2334                 error = 0;
2335         if (error == 0 && boff != bp->b_bufsize) {
2336                 KKASSERT(boff < bp->b_bufsize);
2337                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2338                 /* boff = bp->b_bufsize; */
2339         }
2340         bp->b_resid = 0;
2341         bp->b_error = error;
2342         if (error)
2343                 bp->b_flags |= B_ERROR;
2344         biodone(ap->a_bio);
2345
2346 done:
              /*
               * Save the node the cursor ended on in ip->cache[1], the same
               * cache slot that was handed to hammer_init_cursor() above.
               */
2347         if (cursor.node)
2348                 hammer_cache_node(&ip->cache[1], cursor.node);
2349         hammer_done_cursor(&cursor);
2350         hammer_done_transaction(&trans);
2351         return(error);
2352 }
2353
2354 /*
2355  * BMAP operation - used to support cluster_read() only.
2356  *
2357  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2358  *
2359  * This routine may return EOPNOTSUPP if the opration is not supported for
2360  * the specified offset.  The contents of the pointer arguments do not
2361  * need to be initialized in that case. 
2362  *
2363  * If a disk address is available and properly aligned return 0 with 
2364  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2365  * to the run-length relative to that offset.  Callers may assume that
2366  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2367  * large, so return EOPNOTSUPP if it is not sufficiently large.
2368  */
2369 static
2370 int
2371 hammer_vop_bmap(struct vop_bmap_args *ap)
2372 {
2373         struct hammer_transaction trans;
2374         struct hammer_inode *ip;
2375         struct hammer_cursor cursor;
2376         hammer_base_elm_t base;
2377         int64_t rec_offset;
2378         int64_t ran_end;
2379         int64_t tmp64;
2380         int64_t base_offset;
2381         int64_t base_disk_offset;
2382         int64_t last_offset;
2383         hammer_off_t last_disk_offset;
2384         hammer_off_t disk_offset;
2385         int     rec_len;
2386         int     error;
2387         int     blksize;
2388
2389         ++hammer_stats_file_iopsr;
2390         ip = ap->a_vp->v_data;
2391
2392         /*
2393          * We can only BMAP regular files.  We can't BMAP database files,
2394          * directories, etc.
2395          */
2396         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2397                 return(EOPNOTSUPP);
2398
2399         /*
2400          * bmap is typically called with runp/runb both NULL when used
2401          * for writing.  We do not support BMAP for writing atm.
2402          */
2403         if (ap->a_cmd != BUF_CMD_READ)
2404                 return(EOPNOTSUPP);
2405
2406         /*
2407          * Scan the B-Tree to acquire blockmap addresses, then translate
2408          * to raw addresses.
2409          */
2410         hammer_simple_transaction(&trans, ip->hmp);
2411 #if 0
2412         kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2413 #endif
2414         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2415
2416         /*
2417          * Key range (begin and end inclusive) to scan.  Note that the key's
2418          * stored in the actual records represent BASE+LEN, not BASE.  The
2419          * first record containing bio_offset will have a key > bio_offset.
2420          */
2421         cursor.key_beg.localization = ip->obj_localization +
2422                                       HAMMER_LOCALIZE_MISC;
2423         cursor.key_beg.obj_id = ip->obj_id;
2424         cursor.key_beg.create_tid = 0;
2425         cursor.key_beg.delete_tid = 0;
2426         cursor.key_beg.obj_type = 0;
2427         if (ap->a_runb)
2428                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2429         else
2430                 cursor.key_beg.key = ap->a_loffset + 1;
2431         if (cursor.key_beg.key < 0)
2432                 cursor.key_beg.key = 0;
2433         cursor.asof = ip->obj_asof;
2434         cursor.flags |= HAMMER_CURSOR_ASOF;
2435
2436         cursor.key_end = cursor.key_beg;
2437         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2438
2439         ran_end = ap->a_loffset + MAXPHYS;
2440         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2441         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2442         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2443         if (tmp64 < ran_end)
2444                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2445         else
2446                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2447
2448         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2449
2450         error = hammer_ip_first(&cursor);
2451         base_offset = last_offset = 0;
2452         base_disk_offset = last_disk_offset = 0;
2453
2454         while (error == 0) {
2455                 /*
2456                  * Get the base file offset of the record.  The key for
2457                  * data records is (base + bytes) rather then (base).
2458                  *
2459                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
2460                  * The extra bytes should be zero on-disk and the BMAP op
2461                  * should still be ok.
2462                  */
2463                 base = &cursor.leaf->base;
2464                 rec_offset = base->key - cursor.leaf->data_len;
2465                 rec_len    = cursor.leaf->data_len;
2466
2467                 /*
2468                  * Incorporate any cached truncation.
2469                  *
2470                  * NOTE: Modifications to rec_len based on synthesized
2471                  * truncation points remove the guarantee that any extended
2472                  * data on disk is zero (since the truncations may not have
2473                  * taken place on-media yet).
2474                  */
2475                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2476                         if (hammer_cursor_ondisk(&cursor) ||
2477                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2478                                 if (ip->trunc_off <= rec_offset)
2479                                         rec_len = 0;
2480                                 else if (ip->trunc_off < rec_offset + rec_len)
2481                                         rec_len = (int)(ip->trunc_off - rec_offset);
2482                         }
2483                 }
2484                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2485                         if (hammer_cursor_ondisk(&cursor)) {
2486                                 if (ip->sync_trunc_off <= rec_offset)
2487                                         rec_len = 0;
2488                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2489                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2490                         }
2491                 }
2492
2493                 /*
2494                  * Accumulate information.  If we have hit a discontiguous
2495                  * block reset base_offset unless we are already beyond the
2496                  * requested offset.  If we are, that's it, we stop.
2497                  */
2498                 if (error)
2499                         break;
2500                 if (hammer_cursor_ondisk(&cursor)) {
2501                         disk_offset = cursor.leaf->data_offset;
2502                         if (rec_offset != last_offset ||
2503                             disk_offset != last_disk_offset) {
2504                                 if (rec_offset > ap->a_loffset)
2505                                         break;
2506                                 base_offset = rec_offset;
2507                                 base_disk_offset = disk_offset;
2508                         }
2509                         last_offset = rec_offset + rec_len;
2510                         last_disk_offset = disk_offset + rec_len;
2511                 }
2512                 error = hammer_ip_next(&cursor);
2513         }
2514
2515 #if 0
2516         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2517                 ap->a_loffset, base_offset, last_offset);
2518         kprintf("BMAP %16s:  %016llx - %016llx\n",
2519                 "", base_disk_offset, last_disk_offset);
2520 #endif
2521
2522         if (cursor.node) {
2523                 hammer_cache_node(&ip->cache[1], cursor.node);
2524 #if 0
2525                 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2526 #endif
2527         }
2528         hammer_done_cursor(&cursor);
2529         hammer_done_transaction(&trans);
2530
2531         /*
2532          * If we couldn't find any records or the records we did find were
2533          * all behind the requested offset, return failure.  A forward
2534          * truncation can leave a hole w/ no on-disk records.
2535          */
2536         if (last_offset == 0 || last_offset < ap->a_loffset)
2537                 return (EOPNOTSUPP);
2538
2539         /*
2540          * Figure out the block size at the requested offset and adjust
2541          * our limits so the cluster_read() does not create inappropriately
2542          * sized buffer cache buffers.
2543          */
2544         blksize = hammer_blocksize(ap->a_loffset);
2545         if (hammer_blocksize(base_offset) != blksize) {
2546                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2547         }
2548         if (last_offset != ap->a_loffset &&
2549             hammer_blocksize(last_offset - 1) != blksize) {
2550                 last_offset = hammer_blockdemarc(ap->a_loffset,
2551                                                  last_offset - 1);
2552         }
2553
2554         /*
2555          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2556          * from occuring.
2557          */
2558         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2559
2560         if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2561                 /*
2562                  * Only large-data zones can be direct-IOd
2563                  */
2564                 error = EOPNOTSUPP;
2565         } else if ((disk_offset & HAMMER_BUFMASK) ||
2566                    (last_offset - ap->a_loffset) < blksize) {
2567                 /*
2568                  * doffsetp is not aligned or the forward run size does
2569                  * not cover a whole buffer, disallow the direct I/O.
2570                  */
2571                 error = EOPNOTSUPP;
2572         } else {
2573                 /*
2574                  * We're good.
2575                  */
2576                 *ap->a_doffsetp = disk_offset;
2577                 if (ap->a_runb) {
2578                         *ap->a_runb = ap->a_loffset - base_offset;
2579                         KKASSERT(*ap->a_runb >= 0);
2580                 }
2581                 if (ap->a_runp) {
2582                         *ap->a_runp = last_offset - ap->a_loffset;
2583                         KKASSERT(*ap->a_runp >= 0);
2584                 }
2585                 error = 0;
2586         }
2587         return(error);
2588 }
2589
2590 /*
2591  * Write to a regular file.   Because this is a strategy call the OS is
2592  * trying to actually get data onto the media.
2593  */
2594 static
2595 int
2596 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2597 {
2598         hammer_record_t record;
2599         hammer_mount_t hmp;
2600         hammer_inode_t ip;
2601         struct bio *bio;
2602         struct buf *bp;
2603         int blksize;
2604         int bytes;
2605         int error;
2606
2607         bio = ap->a_bio;
2608         bp = bio->bio_buf;
2609         ip = ap->a_vp->v_data;
2610         hmp = ip->hmp;
2611
2612         blksize = hammer_blocksize(bio->bio_offset);
2613         KKASSERT(bp->b_bufsize == blksize);
2614
2615         if (ip->flags & HAMMER_INODE_RO) {
2616                 bp->b_error = EROFS;
2617                 bp->b_flags |= B_ERROR;
2618                 biodone(ap->a_bio);
2619                 return(EROFS);
2620         }
2621
2622         /*
2623          * Interlock with inode destruction (no in-kernel or directory
2624          * topology visibility).  If we queue new IO while trying to
2625          * destroy the inode we can deadlock the vtrunc call in
2626          * hammer_inode_unloadable_check().
2627          *
2628          * Besides, there's no point flushing a bp associated with an
2629          * inode that is being destroyed on-media and has no kernel
2630          * references.
2631          */
2632         if ((ip->flags | ip->sync_flags) &
2633             (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2634                 bp->b_resid = 0;
2635                 biodone(ap->a_bio);
2636                 return(0);
2637         }
2638
2639         /*
2640          * Reserve space and issue a direct-write from the front-end. 
2641          * NOTE: The direct_io code will hammer_bread/bcopy smaller
2642          * allocations.
2643          *
2644          * An in-memory record will be installed to reference the storage
2645          * until the flusher can get to it.
2646          *
2647          * Since we own the high level bio the front-end will not try to
2648          * do a direct-read until the write completes.
2649          *
2650          * NOTE: The only time we do not reserve a full-sized buffers
2651          * worth of data is if the file is small.  We do not try to
2652          * allocate a fragment (from the small-data zone) at the end of
2653          * an otherwise large file as this can lead to wildly separated
2654          * data.
2655          */
2656         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2657         KKASSERT(bio->bio_offset < ip->ino_data.size);
2658         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2659                 bytes = bp->b_bufsize;
2660         else
2661                 bytes = ((int)ip->ino_data.size + 15) & ~15;
2662
2663         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2664                                     bytes, &error);
2665         if (record) {
2666                 hammer_io_direct_write(hmp, record, bio);
2667                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2668                         hammer_flush_inode(ip, 0);
2669         } else {
2670                 bp->b_bio2.bio_offset = NOOFFSET;
2671                 bp->b_error = error;
2672                 bp->b_flags |= B_ERROR;
2673                 biodone(ap->a_bio);
2674         }
2675         return(error);
2676 }
2677
2678 /*
2679  * dounlink - disconnect a directory entry
2680  *
2681  * XXX whiteout support not really in yet
2682  */
2683 static int
2684 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2685                 struct vnode *dvp, struct ucred *cred, 
2686                 int flags, int isdir)
2687 {
2688         struct namecache *ncp;
2689         hammer_inode_t dip;
2690         hammer_inode_t ip;
2691         struct hammer_cursor cursor;
2692         int64_t namekey;
2693         u_int32_t max_iterations;
2694         int nlen, error;
2695
2696         /*
2697          * Calculate the namekey and setup the key range for the scan.  This
2698          * works kinda like a chained hash table where the lower 32 bits
2699          * of the namekey synthesize the chain.
2700          *
2701          * The key range is inclusive of both key_beg and key_end.
2702          */
2703         dip = VTOI(dvp);
2704         ncp = nch->ncp;
2705
2706         if (dip->flags & HAMMER_INODE_RO)
2707                 return (EROFS);
2708
2709         namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
2710                                            &max_iterations);
2711 retry:
2712         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2713         cursor.key_beg.localization = dip->obj_localization +
2714                                       HAMMER_LOCALIZE_MISC;
2715         cursor.key_beg.obj_id = dip->obj_id;
2716         cursor.key_beg.key = namekey;
2717         cursor.key_beg.create_tid = 0;
2718         cursor.key_beg.delete_tid = 0;
2719         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2720         cursor.key_beg.obj_type = 0;
2721
2722         cursor.key_end = cursor.key_beg;
2723         cursor.key_end.key += max_iterations;
2724         cursor.asof = dip->obj_asof;
2725         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2726
2727         /*
2728          * Scan all matching records (the chain), locate the one matching
2729          * the requested path component.  info->last_error contains the
2730          * error code on search termination and could be 0, ENOENT, or
2731          * something else.
2732          *
2733          * The hammer_ip_*() functions merge in-memory records with on-disk
2734          * records for the purposes of the search.
2735          */
2736         error = hammer_ip_first(&cursor);
2737
2738         while (error == 0) {
2739                 error = hammer_ip_resolve_data(&cursor);
2740                 if (error)
2741                         break;
2742                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2743                 KKASSERT(nlen > 0);
2744                 if (ncp->nc_nlen == nlen &&
2745                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2746                         break;
2747                 }
2748                 error = hammer_ip_next(&cursor);
2749         }
2750
2751         /*
2752          * If all is ok we have to get the inode so we can adjust nlinks.
2753          * To avoid a deadlock with the flusher we must release the inode
2754          * lock on the directory when acquiring the inode for the entry.
2755          *
2756          * If the target is a directory, it must be empty.
2757          */
2758         if (error == 0) {
2759                 hammer_unlock(&cursor.ip->lock);
2760                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2761                                       dip->hmp->asof,
2762                                       cursor.data->entry.localization,
2763                                       0, &error);
2764                 hammer_lock_sh(&cursor.ip->lock);
2765                 if (error == ENOENT) {
2766                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2767                         Debugger("ENOENT unlinking object that should exist");
2768                 }
2769
2770                 /*
2771                  * If isdir >= 0 we validate that the entry is or is not a
2772                  * directory.  If isdir < 0 we don't care.
2773                  */
2774                 if (error == 0 && isdir >= 0) {
2775                         if (isdir &&
2776                             ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
2777                                 error = ENOTDIR;
2778                         } else if (isdir == 0 &&
2779                             ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
2780                                 error = EISDIR;
2781                         }
2782                 }
2783
2784                 /*
2785                  * If we are trying to remove a directory the directory must
2786                  * be empty.
2787                  *
2788                  * WARNING: hammer_ip_check_directory_empty() may have to
2789                  * terminate the cursor to avoid a deadlock.  It is ok to
2790                  * call hammer_done_cursor() twice.
2791                  */
2792                 if (error == 0 && ip->ino_data.obj_type ==
2793                                   HAMMER_OBJTYPE_DIRECTORY) {
2794                         error = hammer_ip_check_directory_empty(trans, ip);
2795                 }
2796
2797                 /*
2798                  * Delete the directory entry.
2799                  *
2800                  * WARNING: hammer_ip_del_directory() may have to terminate
2801                  * the cursor to avoid a deadlock.  It is ok to call
2802                  * hammer_done_cursor() twice.
2803                  */
2804                 if (error == 0) {
2805                         error = hammer_ip_del_directory(trans, &cursor,
2806                                                         dip, ip);
2807                 }
2808                 hammer_done_cursor(&cursor);
2809                 if (error == 0) {
2810                         cache_setunresolved(nch);
2811                         cache_setvp(nch, NULL);
2812                         /* XXX locking */
2813                         if (ip->vp) {
2814                                 hammer_knote(ip->vp, NOTE_DELETE);
2815                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2816                         }
2817                 }
2818                 if (ip)
2819                         hammer_rel_inode(ip, 0);
2820         } else {
2821                 hammer_done_cursor(&cursor);
2822         }
2823         if (error == EDEADLK)
2824                 goto retry;
2825
2826         return (error);
2827 }
2828
2829 /************************************************************************
2830  *                          FIFO AND SPECFS OPS                         *
2831  ************************************************************************
2832  *
2833  */
2834
2835 static int
2836 hammer_vop_fifoclose (struct vop_close_args *ap)
2837 {
2838         /* XXX update itimes */
2839         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2840 }
2841
2842 static int
2843 hammer_vop_fiforead (struct vop_read_args *ap)
2844 {
2845         int error;
2846
2847         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2848         /* XXX update access time */
2849         return (error);
2850 }
2851
2852 static int
2853 hammer_vop_fifowrite (struct vop_write_args *ap)
2854 {
2855         int error;
2856
2857         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2858         /* XXX update access time */
2859         return (error);
2860 }
2861
2862 static
2863 int
2864 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
2865 {
2866         int error;
2867
2868         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2869         if (error)
2870                 error = hammer_vop_kqfilter(ap);
2871         return(error);
2872 }
2873
2874 static int
2875 hammer_vop_specclose (struct vop_close_args *ap)
2876 {
2877         /* XXX update itimes */
2878         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2879 }
2880
2881 static int
2882 hammer_vop_specread (struct vop_read_args *ap)
2883 {
2884         /* XXX update access time */
2885         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2886 }
2887
2888 static int
2889 hammer_vop_specwrite (struct vop_write_args *ap)
2890 {
2891         /* XXX update last change time */
2892         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2893 }
2894
2895 /************************************************************************
2896  *                          KQFILTER OPS                                *
2897  ************************************************************************
2898  *
2899  */
2900 static void filt_hammerdetach(struct knote *kn);
2901 static int filt_hammerread(struct knote *kn, long hint);
2902 static int filt_hammerwrite(struct knote *kn, long hint);
2903 static int filt_hammervnode(struct knote *kn, long hint);
2904
2905 static struct filterops hammerread_filtops =
2906         { 1, NULL, filt_hammerdetach, filt_hammerread };
2907 static struct filterops hammerwrite_filtops =
2908         { 1, NULL, filt_hammerdetach, filt_hammerwrite };
2909 static struct filterops hammervnode_filtops =
2910         { 1, NULL, filt_hammerdetach, filt_hammervnode };
2911
2912 static
2913 int
2914 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
2915 {
2916         struct vnode *vp = ap->a_vp;
2917         struct knote *kn = ap->a_kn;
2918         lwkt_tokref ilock;
2919
2920         switch (kn->kn_filter) {
2921         case EVFILT_READ:
2922                 kn->kn_fop = &hammerread_filtops;
2923                 break;
2924         case EVFILT_WRITE:
2925                 kn->kn_fop = &hammerwrite_filtops;
2926                 break;
2927         case EVFILT_VNODE:
2928                 kn->kn_fop = &hammervnode_filtops;
2929                 break;
2930         default:
2931                 return (1);
2932         }
2933
2934         kn->kn_hook = (caddr_t)vp;
2935
2936         lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
2937         SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
2938         lwkt_reltoken(&ilock);
2939
2940         return(0);
2941 }
2942
2943 static void
2944 filt_hammerdetach(struct knote *kn)
2945 {
2946         struct vnode *vp = (void *)kn->kn_hook;
2947         lwkt_tokref ilock;
2948
2949         lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
2950         SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
2951                      kn, knote, kn_selnext);
2952         lwkt_reltoken(&ilock);
2953 }
2954
2955 static int
2956 filt_hammerread(struct knote *kn, long hint)
2957 {
2958         struct vnode *vp = (void *)kn->kn_hook;
2959         hammer_inode_t ip = VTOI(vp);
2960
2961         if (hint == NOTE_REVOKE) {
2962                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2963                 return(1);
2964         }
2965         kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
2966         return (kn->kn_data != 0);
2967 }
2968
2969 static int
2970 filt_hammerwrite(struct knote *kn, long hint)
2971 {
2972         if (hint == NOTE_REVOKE)
2973                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2974         kn->kn_data = 0;
2975         return (1);
2976 }
2977
2978 static int
2979 filt_hammervnode(struct knote *kn, long hint)
2980 {
2981         if (kn->kn_sfflags & hint)
2982                 kn->kn_fflags |= hint;
2983         if (hint == NOTE_REVOKE) {
2984                 kn->kn_flags |= EV_EOF;
2985                 return (1);
2986         }
2987         return (kn->kn_fflags != 0);
2988 }
2989