/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         vop_stdpathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_markatime =        hammer_vop_markatime,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl,
        .vop_kqfilter =         hammer_vop_kqfilter
};

struct vop_ops hammer_spec_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             vop_stdnoread,
        .vop_write =            vop_stdnowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_close,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr,
        .vop_kqfilter =         hammer_vop_fifokqfilter
};
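
/*
 * Note on the three tables above: hammer_vnode_vops covers regular files
 * and directories, hammer_spec_vops covers device special files, and
 * hammer_fifo_vops covers fifos.  The latter two tables are sparse and
 * fall through to the handlers named in their .vop_default entries for
 * anything not listed.
 */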

static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
        if (flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}
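
/*
 * hammer_knote() is how the VOPs below publish kqueue events against the
 * vnode's knote list.  For example, the write path accumulates
 * NOTE_WRITE/NOTE_EXTEND in kflags and posts them in one call on the way
 * out:
 *
 *	kflags |= NOTE_WRITE;
 *	...
 *	hammer_knote(ap->a_vp, kflags);
 */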

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred,
                           int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred
 * after return.
 *
 * NOTE: HAMMER's fsync()s are going to remain expensive until we implement
 *       a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *       operation.
 *
 *       Ultimately the combination of a REDO log and use of fast storage
 *       to front-end cluster caches will make fsync fast, but it ain't
 *       here yet.  And, in any case, we need real transactional
 *       all-or-nothing features which are not restricted to a single file.
 */
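/*
 * A quick reference for the hammer_fsync_mode values handled in the
 * switch below (the variable is tunable at run-time; vfs.hammer.fsync_mode
 * is the assumed sysctl name):
 *
 *	0	no REDO, full synchronous flush
 *	1	no REDO, full asynchronous flush
 *	2	REDO semantics, synchronous flush (requires version >= 4,
 *		else falls back to mode 0)
 *	3	REDO semantics, relaxed asynchronous flush (requires
 *		version >= 4, else falls back to mode 1)
 *	4	fsync() is ignored entirely
 *
 * Any other value behaves like mode 3.
 */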
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);
        hammer_mount_t hmp = ip->hmp;
        int waitfor = ap->a_waitfor;
        int mode;

        lwkt_gettoken(&hmp->fs_token);

        /*
         * Fsync rule relaxation (default is either full synchronous flush
         * or REDO semantics with synchronous flush).
         */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
                switch(hammer_fsync_mode) {
                case 0:
mode0:
                        /* no REDO, full synchronous flush */
                        goto skip;
                case 1:
mode1:
                        /* no REDO, full asynchronous flush */
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        goto skip;
                case 2:
                        /* REDO semantics, synchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode0;
                        mode = HAMMER_FLUSH_UNDOS_AUTO;
                        break;
                case 3:
                        /* REDO semantics, relaxed asynchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode1;
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                case 4:
                        /* ignore the fsync() system call */
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                default:
                        /* we have to do something */
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                }

                /*
                 * Fast fsync only needs to flush the UNDO/REDO fifo if
                 * HAMMER_INODE_REDO is non-zero and the only modifications
                 * made to the file are write or write-extends.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) &&
                    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
                ) {
                        ++hammer_count_fsyncs;
                        hammer_flusher_flush_undos(hmp, mode);
                        ip->redo_count = 0;
                        if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
                                vclrisdirty(ip->vp);
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                }

                /*
                 * REDO is enabled by fsync(), the idea being we really only
                 * want to lay down REDO records when programs are using
                 * fsync() heavily.  The first fsync() on the file starts
                 * the gravy train going and later fsync()s keep it hot by
                 * resetting the redo_count.
                 *
                 * We weren't running REDOs before now so we have to fall
                 * through and do a full fsync of what we have.
                 */
                if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
                    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
                        ip->flags |= HAMMER_INODE_REDO;
                        ip->redo_count = 0;
                }
        }
skip:

        /*
         * Do a full flush sequence.
         *
         * Attempt to release the vnode while waiting for the inode to
         * finish flushing.  This can really mess up inactive->reclaim
         * sequences so only do it if the vnode is active.
         *
         * WARNING! The VX lock functions must be used.  vn_lock() will
         *          fail when this is part of a VOP_RECLAIM sequence.
         */
        ++hammer_count_fsyncs;
        vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (waitfor == MNT_WAIT) {
                int dorelock;

                if ((ap->a_vp->v_flag & VRECLAIMED) == 0) {
                        vx_unlock(ap->a_vp);
                        dorelock = 1;
                } else {
                        dorelock = 0;
                }
                hammer_wait_inode(ip);
                if (dorelock)
                        vx_lock(ap->a_vp);
        }
        if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
                vclrisdirty(ip->vp);
        lwkt_reltoken(&hmp->fs_token);
        return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (reads satisfied from the cache do not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        hammer_mount_t hmp;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;
        int ioseqcount;
        int blksize;
        int bigread;
        int got_trans;
        size_t resid;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        got_trans = 0;
        uio = ap->a_uio;

        /*
         * Attempt to shortcut directly to the VM object using lwbufs.
         * This is much faster than instantiating buffer cache buffers.
         */
        resid = uio->uio_resid;
        error = vop_helper_read_shortcut(ap);
        hammer_stats_file_read += resid - uio->uio_resid;
        if (error)
                return (error);
        if (uio->uio_resid == 0)
                goto finished;

        /*
         * Allow the UIO's size to override the sequential heuristic.
         */
        blksize = hammer_blocksize(uio->uio_offset);
        seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
        ioseqcount = (ap->a_ioflag >> 16);
        if (seqcount < ioseqcount)
                seqcount = ioseqcount;

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         */
        bigread = (uio->uio_resid > 100 * 1024 * 1024);

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         *
         * XXX Temporary hack, delay the start transaction while we remain
         *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
         *     locked-shared.
         */
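        /*
         * (A sketch of the block size convention assumed here:
         * hammer_blocksize() returns the small HAMMER_BUFSIZE for offsets
         * below the HAMMER_XDEMARC demarcation and the large
         * HAMMER_XBUFSIZE for offsets at or beyond it, which is why
         * blksize is recomputed on every iteration below.)
         */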
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                int64_t base_offset;
                int64_t file_limit;

                blksize = hammer_blocksize(uio->uio_offset);
                offset = (int)uio->uio_offset & (blksize - 1);
                base_offset = uio->uio_offset - offset;

                if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
                        break;

                /*
                 * MPSAFE
                 */
                bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
                if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
                        bp->b_flags &= ~B_AGE;
                        error = 0;
                        goto skip;
                }
                if (ap->a_ioflag & IO_NRDELAY) {
                        bqrelse(bp);
                        return (EWOULDBLOCK);
                }

                /*
                 * MPUNSAFE
                 */
                if (got_trans == 0) {
                        hammer_start_transaction(&trans, ip->hmp);
                        got_trans = 1;
                }

                /*
                 * NOTE: A valid bp has already been acquired, but was not
                 *       B_CACHE.
                 */
                if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
                         * creating buffers of the wrong block size past
                         * the demarc.
                         */
                        file_limit = ip->ino_data.size;
                        if (base_offset < HAMMER_XDEMARC &&
                            file_limit > HAMMER_XDEMARC) {
                                file_limit = HAMMER_XDEMARC;
                        }
                        error = cluster_readx(ap->a_vp,
                                             file_limit, base_offset,
                                             blksize, uio->uio_resid,
                                             seqcount * BKVASIZE, &bp);
                } else {
                        error = breadnx(ap->a_vp, base_offset, blksize,
                                        NULL, NULL, 0, &bp);
                }
                if (error) {
                        brelse(bp);
                        break;
                }
skip:
                if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
                        kprintf("doff %016jx read file %016jx@%016jx\n",
                                (intmax_t)bp->b_bio2.bio_offset,
                                (intmax_t)ip->obj_id,
                                (intmax_t)bp->b_loffset);
                }
                bp->b_flags &= ~B_IODEBUG;
                if (blksize == HAMMER_XBUFSIZE)
                        bp->b_flags |= B_CLUSTEROK;

                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);

                /*
                 * Set B_AGE, data has a lower priority than meta-data.
                 *
                 * Use a hold/unlock/drop sequence to run the uiomove
                 * with the buffer unlocked, avoiding deadlocks against
                 * read()s on mmap()'d spaces.
                 */
                bp->b_flags |= B_AGE;
                error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
                bqrelse(bp);

                if (error)
                        break;
                hammer_stats_file_read += n;
        }

finished:

        /*
         * Try to update the atime with just the inode lock for maximum
         * concurrency.  If we can't shortcut it we have to get the full
         * blown transaction.
         */
        if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) {
                hammer_start_transaction(&trans, ip->hmp);
                got_trans = 1;
        }

        if (got_trans) {
                if ((ip->flags & HAMMER_INODE_RO) == 0 &&
                    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                        lwkt_gettoken(&hmp->fs_token);
                        ip->ino_data.atime = trans.time;
                        hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
                        hammer_done_transaction(&trans);
                        lwkt_reltoken(&hmp->fs_token);
                } else {
                        hammer_done_transaction(&trans);
                }
        }
        return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        thread_t td;
        struct uio *uio;
        int offset;
        off_t base_offset;
        int64_t cluster_eof;
        struct buf *bp;
        int kflags;
        int error;
        int n;
        int flags;
        int seqcount;
        int bigwrite;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        kflags = 0;
        seqcount = ap->a_ioflag >> 16;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_offset assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }

        if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
            base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
                hammer_done_transaction(&trans);
                lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
                return (EFBIG);
        }

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         *
         * Preset redo_count so we stop generating REDOs earlier if the
         * limit is exceeded.
         *
         * redo_count is heuristical, SMP races are ok
         */
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
        if ((ip->flags & HAMMER_INODE_REDO) &&
            ip->redo_count < hammer_limit_redo) {
                ip->redo_count += uio->uio_resid;
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;
                int blksize;
                int blkmask;
                int trivial;
                int endofblk;
                off_t nsize;

                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;
                if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
                        break;

                blksize = hammer_blocksize(uio->uio_offset);

                /*
                 * Control the number of pending records associated with
                 * this inode.  If too many have accumulated start a
                 * flush.  Try to maintain a pipeline with the flusher.
                 *
                 * NOTE: It is possible for other sources to grow the
                 *       records but not necessarily issue another flush,
                 *       so use a timeout and ensure that a re-flush occurs.
                 */
                if (ip->rsv_recs >= hammer_limit_inode_recs) {
                        lwkt_gettoken(&hmp->fs_token);
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
                                ip->flags |= HAMMER_INODE_RECSW;
                                tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        }
                        lwkt_reltoken(&hmp->fs_token);
                }

                /*
                 * Do not allow HAMMER to blow out the buffer cache.  Very
                 * large UIOs can lockout other processes due to bwillwrite()
                 * mechanics.
                 *
                 * The hammer inode is not locked during these operations.
                 * The vnode is locked which can interfere with the pageout
                 * daemon for non-UIO_NOCOPY writes but should not interfere
                 * with the buffer cache.  Even so, we cannot afford to
                 * allow the pageout daemon to build up too many dirty buffer
                 * cache buffers.
                 *
                 * Only call this if we aren't being recursively called from
                 * a virtual disk device (vn), else we may deadlock.
                 */
                if ((ap->a_ioflag & IO_RECURSE) == 0)
                        bwillwrite(blksize);

                /*
                 * Calculate the blocksize at the current offset and figure
                 * out how much we can actually write.
                 */
                blkmask = blksize - 1;
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
                        endofblk = 0;
                } else {
                        endofblk = 1;
                }
                nsize = uio->uio_offset + n;
                if (nsize > ip->ino_data.size) {
                        if (uio->uio_offset > ip->ino_data.size)
                                trivial = 0;
                        else
                                trivial = 1;
                        nvextendbuf(ap->a_vp,
                                    ip->ino_data.size,
                                    nsize,
                                    hammer_blocksize(ip->ino_data.size),
                                    hammer_blocksize(nsize),
                                    hammer_blockoff(ip->ino_data.size),
                                    hammer_blockoff(nsize),
                                    trivial);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              blksize, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= blksize) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid an
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0)
                        error = uiomovebp(bp, bp->b_data + offset, n, uio);

                lwkt_gettoken(&hmp->fs_token);

                /*
                 * Generate REDO records if enabled and redo_count will not
                 * exceed the limit.
                 *
                 * If redo_count exceeds the limit we stop generating records
                 * and clear HAMMER_INODE_REDO.  This will cause the next
                 * fsync() to do a full meta-data sync instead of just an
                 * UNDO/REDO fifo update.
                 *
                 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
                 * will still be tracked.  The tracks will be terminated
                 * when the related meta-data (including possible data
                 * modifications which are not tracked via REDO) is
                 * flushed.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
                        if (ip->redo_count < hammer_limit_redo) {
                                bp->b_flags |= B_VFSFLAG1;
                                error = hammer_generate_redo(&trans, ip,
                                                     base_offset + offset,
                                                     HAMMER_REDO_WRITE,
                                                     bp->b_data + offset,
                                                     (size_t)n);
                        } else {
                                ip->flags &= ~HAMMER_INODE_REDO;
                        }
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                nvtruncbuf(ap->a_vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size),
                                          hammer_blockoff(ip->ino_data.size),
                                          0);
                        }
                        break;
                }
                kflags |= NOTE_WRITE;
                hammer_stats_file_write += n;
                if (blksize == HAMMER_XBUFSIZE)
                        bp->b_flags |= B_CLUSTEROK;
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_SDIRTY;
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(&trans, ip, flags);

                /*
                 * Once we dirty the buffer any cached zone-X offset
                 * becomes invalid.  HAMMER NOTE: no-history mode cannot
                 * allow overwriting over the same data sector unless
                 * we provide UNDOs for the old data, which we don't.
                 */
                bp->b_bio2.bio_offset = NOOFFSET;

                lwkt_reltoken(&hmp->fs_token);

                /*
                 * Final buffer disposition.
                 *
                 * Because meta-data updates are deferred, HAMMER is
                 * especially sensitive to excessive bdwrite()s because
                 * the I/O stream is not broken up by disk reads.  So the
                 * buffer cache simply cannot keep up.
                 *
                 * WARNING!  blksize is variable.  cluster_write() is
                 *           expected to not blow up if it encounters
                 *           buffers that do not match the passed blksize.
                 *
                 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
                 *        The ip->rsv_recs check should burst-flush the data.
                 *        If we queue it immediately the buf could be left
                 *        locked on the device queue for a very long time.
                 *
                 *        However, failing to flush a dirty buffer out when
                 *        issued from the pageout daemon can result in a low
                 *        memory deadlock against bio_page_alloc(), so we
                 *        have to bawrite() on IO_ASYNC as well.
                 *
                 * NOTE!  To avoid degenerate stalls due to mismatched block
                 *        sizes we only honor IO_DIRECT on the write which
                 *        abuts the end of the buffer.  However, we must
                 *        honor IO_SYNC in case someone is silly enough to
                 *        configure a HAMMER file as swap, or when HAMMER
                 *        is serving NFS (for commits).  Ick ick.
                 */
                bp->b_flags |= B_AGE;
                if (blksize == HAMMER_XBUFSIZE)
                        bp->b_flags |= B_CLUSTEROK;

                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else if (ap->a_ioflag & IO_ASYNC) {
                        bawrite(bp);
                } else if (hammer_cluster_enable &&
                           !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
                        if (base_offset < HAMMER_XDEMARC)
                                cluster_eof = hammer_blockdemarc(base_offset,
                                                         ip->ino_data.size);
                        else
                                cluster_eof = ip->ino_data.size;
                        cluster_write(bp, cluster_eof, blksize, seqcount);
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);

        return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        ++hammer_stats_file_iopsr;
        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
        struct vnode *vp = ap->a_vp;
        hammer_inode_t ip = VTOI(vp);
        int waitfor;
        if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
                if (vn_islocked(vp) == LK_EXCLUSIVE &&
                    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
                        if (ip->flags & HAMMER_INODE_CLOSESYNC)
                                waitfor = MNT_WAIT;
                        else
                                waitfor = MNT_NOWAIT;
                        ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
                                       HAMMER_INODE_CLOSEASYNC);
                        VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
                }
        }
#endif
        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        hammer_mount_t hmp;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
                                    NULL, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                lwkt_reltoken(&hmp->fs_token);
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

        /*
         * We want the fsid to be different when accessing a filesystem
         * with different as-of's so programs like diff don't think
         * the files are the same.
         *
         * We also want the fsid to be the same when comparing snapshots,
         * or when comparing mirrors (which might be backed by different
         * physical devices).  HAMMER fsids are based on the PFS's
         * shared_uuid field.
         *
         * XXX there is a chance of collision here.  The va_fsid reported
         * by stat is different from the more involved fsid used in the
         * mount structure.
         */
        ++hammer_stats_file_iopsr;
        hammer_lock_sh(&ip->lock);
        vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
                       (u_int32_t)(ip->obj_asof >> 32);

        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;

        /*
         * Special case for @@PFS softlinks.  The actual size of the
         * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
         * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
         */
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
            ip->ino_data.size == 10 &&
            ip->obj_asof == HAMMER_MAX_TID &&
            ip->obj_localization == 0 &&
            strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
            if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
                    vap->va_size = 26;
            else
                    vap->va_size = 10;
        }
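        /*
         * (For illustration, the expanded forms look like
         * "@@0x00000001061a8ba0:00001" for the 26 byte case and
         * "@@-1:00001" for the 10 byte case; the TID and PFS id shown
         * are made-up examples.)
         */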

        /*
         * We must provide a consistent atime and mtime for snapshots
         * so people can do a 'tar cf - ... | md5' on them and get
         * consistent results.
         */
        if (ip->flags & HAMMER_INODE_RO) {
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
        } else {
                hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        }
        hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
        if (ip->ino_data.size >= HAMMER_XDEMARC) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
                                ~HAMMER_XBUFMASK64;
        } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
                                ~HAMMER_BUFMASK64;
        } else {
                vap->va_bytes = (ip->ino_data.size + 15) & ~15;
        }
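        /*
         * (i.e. va_bytes is rounded up to the large extended-buffer size
         * for files past the demarc, to the regular buffer size for
         * mid-sized files, and to a 16 byte boundary for very small
         * files.)
         */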

        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }
        hammer_unlock(&ip->lock);
        return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_mount_t hmp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        int ispfs;
        int64_t obj_id;
        u_int32_t localization;
        u_int32_t max_iterations;

        /*
         * Misc initialization, plus handle as-of name extensions.  Look for
         * the '@@' extension.  Note that as-of files and directories cannot
         * be modified.
         */
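        /*
         * For example (a sketch of the convention): a lookup of
         * "foo@@0x00000001061a8ba0" resolves foo as-of that transaction
         * id, while a bare "@@-1:00001" dives into PFS #1 (see the ispfs
         * case below).  hammer_str_to_tid() does the actual parsing; the
         * TID shown is made up.
         */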
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        localization = dip->obj_localization;   /* for code consistency */
        nlen = ncp->nc_nlen;
        flags = dip->flags & HAMMER_INODE_RO;
        ispfs = 0;
        hmp = dip->hmp;

        lwkt_gettoken(&hmp->fs_token);
        hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;

        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        error = hammer_str_to_tid(ncp->nc_name + i + 2,
                                                  &ispfs, &asof, &localization);
                        if (error != 0) {
                                i = nlen;
                                break;
                        }
                        if (asof != HAMMER_MAX_TID)
                                flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;

        /*
         * If this is a PFS softlink we dive into the PFS
         */
        if (ispfs && nlen == 0) {
                ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
                                      asof, localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * If there is no path component the time extension is relative to dip.
         * e.g. "fubar/@@<snapshot>"
         *
         * "." is handled by the kernel, but ".@@<snapshot>" is not.
         * e.g. "fubar/.@@<snapshot>"
         *
         * ".." is handled by the kernel.  We do not currently handle
         * "..@@<snapshot>".
         */
        if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
                ip = hammer_get_inode(&trans, dip, dip->obj_id,
                                      asof, dip->obj_localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
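        /*
         * (In other words: the name is hashed into the high bits of the
         * 64 bit key and the low bits are reserved for collisions, so the
         * scan below covers at most max_iterations keys starting at
         * namekey, inclusive.)
         */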
1224         namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
1225                                            &max_iterations);
1226
1227         error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
1228         cursor.key_beg.localization = dip->obj_localization +
1229                                       hammer_dir_localization(dip);
1230         cursor.key_beg.obj_id = dip->obj_id;
1231         cursor.key_beg.key = namekey;
1232         cursor.key_beg.create_tid = 0;
1233         cursor.key_beg.delete_tid = 0;
1234         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1235         cursor.key_beg.obj_type = 0;
1236
1237         cursor.key_end = cursor.key_beg;
1238         cursor.key_end.key += max_iterations;
1239         cursor.asof = asof;
1240         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1241
1242         /*
1243          * Scan all matching records (the chain), locate the one matching
1244          * the requested path component.
1245          *
1246          * The hammer_ip_*() functions merge in-memory records with on-disk
1247          * records for the purposes of the search.
1248          */
1249         obj_id = 0;
1250         localization = HAMMER_DEF_LOCALIZATION;
1251
1252         if (error == 0) {
1253                 error = hammer_ip_first(&cursor);
1254                 while (error == 0) {
1255                         error = hammer_ip_resolve_data(&cursor);
1256                         if (error)
1257                                 break;
1258                         if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
1259                             bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1260                                 obj_id = cursor.data->entry.obj_id;
1261                                 localization = cursor.data->entry.localization;
1262                                 break;
1263                         }
1264                         error = hammer_ip_next(&cursor);
1265                 }
1266         }
1267         hammer_done_cursor(&cursor);
1268
1269         /*
1270          * Lookup the obj_id.  This should always succeed.  If it does not
1271          * the filesystem may be damaged and we return a dummy inode.
1272          */
1273         if (error == 0) {
1274                 ip = hammer_get_inode(&trans, dip, obj_id,
1275                                       asof, localization,
1276                                       flags, &error);
1277                 if (error == ENOENT) {
1278                         kprintf("HAMMER: WARNING: Missing "
1279                                 "inode for dirent \"%s\"\n"
1280                                 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
1281                                 ncp->nc_name,
1282                                 (long long)obj_id, (long long)asof,
1283                                 localization);
1284                         error = 0;
1285                         ip = hammer_get_dummy_inode(&trans, dip, obj_id,
1286                                                     asof, localization,
1287                                                     flags, &error);
1288                 }
1289                 if (error == 0) {
1290                         error = hammer_get_vnode(ip, &vp);
1291                         hammer_rel_inode(ip, 0);
1292                 } else {
1293                         vp = NULL;
1294                 }
1295                 if (error == 0) {
1296                         vn_unlock(vp);
1297                         cache_setvp(ap->a_nch, vp);
1298                         vrele(vp);
1299                 }
1300         } else if (error == ENOENT) {
1301                 cache_setvp(ap->a_nch, NULL);
1302         }
1303 done:
1304         hammer_done_transaction(&trans);
1305         lwkt_reltoken(&hmp->fs_token);
1306         return (error);
1307 }
1308
1309 /*
1310  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1311  *
1312  * Locate the parent directory of a directory vnode.
1313  *
1314  * dvp is referenced but not locked.  *vpp must be returned referenced and
1315  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
1316  * at the root, instead it could indicate that the directory we were in was
1317  * removed.
1318  *
1319  * NOTE: as-of sequences are not linked into the directory structure.  If
1320  * we are at the root with a different asof than the mount point, reload
1321  * the same directory with the mount point's asof.  I'm not sure what this
1322  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
1323  * get confused, but it hasn't been tested.
1324  */
1325 static
1326 int
1327 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1328 {
1329         struct hammer_transaction trans;
1330         struct hammer_inode *dip;
1331         struct hammer_inode *ip;
1332         hammer_mount_t hmp;
1333         int64_t parent_obj_id;
1334         u_int32_t parent_obj_localization;
1335         hammer_tid_t asof;
1336         int error;
1337
1338         dip = VTOI(ap->a_dvp);
1339         asof = dip->obj_asof;
1340         hmp = dip->hmp;
1341
1342         /*
1343          * Who's our parent?  This could be the root of a pseudo-filesystem
1344          * whose parent is in another localization domain.
1345          */
1346         lwkt_gettoken(&hmp->fs_token);
1347         parent_obj_id = dip->ino_data.parent_obj_id;
1348         if (dip->obj_id == HAMMER_OBJID_ROOT)
1349                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1350         else
1351                 parent_obj_localization = dip->obj_localization;
1352
1353         if (parent_obj_id == 0) {
1354                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
1355                    asof != hmp->asof) {
1356                         parent_obj_id = dip->obj_id;
1357                         asof = hmp->asof;
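			/*
			 * Fake up a ".." name from the as-of TID.
			 * "0x%016llx" formats to 18 characters plus the
			 * terminating NUL, hence the 19 byte buffer.
			 */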
1358                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1359                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
1360                                   (long long)dip->obj_asof);
1361                 } else {
1362                         *ap->a_vpp = NULL;
1363                         lwkt_reltoken(&hmp->fs_token);
1364                         return ENOENT;
1365                 }
1366         }
1367
1368         hammer_simple_transaction(&trans, hmp);
1369         ++hammer_stats_file_iopsr;
1370
1371         ip = hammer_get_inode(&trans, dip, parent_obj_id,
1372                               asof, parent_obj_localization,
1373                               dip->flags, &error);
1374         if (ip) {
1375                 error = hammer_get_vnode(ip, ap->a_vpp);
1376                 hammer_rel_inode(ip, 0);
1377         } else {
1378                 *ap->a_vpp = NULL;
1379         }
1380         hammer_done_transaction(&trans);
1381         lwkt_reltoken(&hmp->fs_token);
1382         return (error);
1383 }
1384
1385 /*
1386  * hammer_vop_nlink { nch, dvp, vp, cred }
1387  */
1388 static
1389 int
1390 hammer_vop_nlink(struct vop_nlink_args *ap)
1391 {
1392         struct hammer_transaction trans;
1393         struct hammer_inode *dip;
1394         struct hammer_inode *ip;
1395         struct nchandle *nch;
1396         hammer_mount_t hmp;
1397         int error;
1398
1399         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1400                 return(EXDEV);
1401
1402         nch = ap->a_nch;
1403         dip = VTOI(ap->a_dvp);
1404         ip = VTOI(ap->a_vp);
1405         hmp = dip->hmp;
1406
1407         if (dip->obj_localization != ip->obj_localization)
1408                 return(EXDEV);
1409
1410         if (dip->flags & HAMMER_INODE_RO)
1411                 return (EROFS);
1412         if (ip->flags & HAMMER_INODE_RO)
1413                 return (EROFS);
1414         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1415                 return (error);
1416
1417         /*
1418          * Create a transaction to cover the operations we perform.
1419          */
1420         lwkt_gettoken(&hmp->fs_token);
1421         hammer_start_transaction(&trans, hmp);
1422         ++hammer_stats_file_iopsw;
1423
1424         /*
1425          * Add the filesystem object to the directory.  Note that neither
1426          * dip nor ip are referenced or locked, but their vnodes are
1427          * referenced.  This function will bump the inode's link count.
1428          */
1429         error = hammer_ip_add_directory(&trans, dip,
1430                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1431                                         ip);
1432
1433         /*
1434          * Finish up.
1435          */
1436         if (error == 0) {
1437                 cache_setunresolved(nch);
1438                 cache_setvp(nch, ap->a_vp);
1439         }
1440         hammer_done_transaction(&trans);
1441         hammer_knote(ap->a_vp, NOTE_LINK);
1442         hammer_knote(ap->a_dvp, NOTE_WRITE);
1443         lwkt_reltoken(&hmp->fs_token);
1444         return (error);
1445 }
1446
1447 /*
1448  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1449  *
1450  * The operating system has already ensured that the directory entry
1451  * does not exist and done all appropriate namespace locking.
1452  */
1453 static
1454 int
1455 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1456 {
1457         struct hammer_transaction trans;
1458         struct hammer_inode *dip;
1459         struct hammer_inode *nip;
1460         struct nchandle *nch;
1461         hammer_mount_t hmp;
1462         int error;
1463
1464         nch = ap->a_nch;
1465         dip = VTOI(ap->a_dvp);
1466         hmp = dip->hmp;
1467
1468         if (dip->flags & HAMMER_INODE_RO)
1469                 return (EROFS);
1470         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1471                 return (error);
1472
1473         /*
1474          * Create a transaction to cover the operations we perform.
1475          */
1476         lwkt_gettoken(&hmp->fs_token);
1477         hammer_start_transaction(&trans, hmp);
1478         ++hammer_stats_file_iopsw;
1479
1480         /*
1481          * Create a new filesystem object of the requested type.  The
1482          * returned inode will be referenced but not locked.
1483          */
1484         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1485                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1486                                     NULL, &nip);
1487         if (error) {
1488                 hkprintf("hammer_mkdir error %d\n", error);
1489                 hammer_done_transaction(&trans);
1490                 *ap->a_vpp = NULL;
1491                 lwkt_reltoken(&hmp->fs_token);
1492                 return (error);
1493         }
1494         /*
1495          * Add the new filesystem object to the directory.  This will also
1496          * bump the inode's link count.
1497          */
1498         error = hammer_ip_add_directory(&trans, dip,
1499                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1500                                         nip);
1501         if (error)
1502                 hkprintf("hammer_mkdir (add) error %d\n", error);
1503
1504         /*
1505          * Finish up.
1506          */
1507         if (error) {
1508                 hammer_rel_inode(nip, 0);
1509                 *ap->a_vpp = NULL;
1510         } else {
1511                 error = hammer_get_vnode(nip, ap->a_vpp);
1512                 hammer_rel_inode(nip, 0);
1513                 if (error == 0) {
1514                         cache_setunresolved(ap->a_nch);
1515                         cache_setvp(ap->a_nch, *ap->a_vpp);
1516                 }
1517         }
1518         hammer_done_transaction(&trans);
1519         if (error == 0)
1520                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1521         lwkt_reltoken(&hmp->fs_token);
1522         return (error);
1523 }
1524
1525 /*
1526  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1527  *
1528  * The operating system has already ensured that the directory entry
1529  * does not exist and done all appropriate namespace locking.
1530  */
1531 static
1532 int
1533 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1534 {
1535         struct hammer_transaction trans;
1536         struct hammer_inode *dip;
1537         struct hammer_inode *nip;
1538         struct nchandle *nch;
1539         hammer_mount_t hmp;
1540         int error;
1541
1542         nch = ap->a_nch;
1543         dip = VTOI(ap->a_dvp);
1544         hmp = dip->hmp;
1545
1546         if (dip->flags & HAMMER_INODE_RO)
1547                 return (EROFS);
1548         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1549                 return (error);
1550
1551         /*
1552          * Create a transaction to cover the operations we perform.
1553          */
1554         lwkt_gettoken(&hmp->fs_token);
1555         hammer_start_transaction(&trans, hmp);
1556         ++hammer_stats_file_iopsw;
1557
1558         /*
1559          * Create a new filesystem object of the requested type.  The
1560          * returned inode will be referenced but not locked.
1561          *
1562          * If mknod specifies a directory, a pseudo-fs is created.
1563          */
1564         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1565                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1566                                     NULL, &nip);
1567         if (error) {
1568                 hammer_done_transaction(&trans);
1569                 *ap->a_vpp = NULL;
1570                 lwkt_reltoken(&hmp->fs_token);
1571                 return (error);
1572         }
1573
1574         /*
1575          * Add the new filesystem object to the directory.  This will also
1576          * bump the inode's link count.
1577          */
1578         error = hammer_ip_add_directory(&trans, dip,
1579                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1580                                         nip);
1581
1582         /*
1583          * Finish up.
1584          */
1585         if (error) {
1586                 hammer_rel_inode(nip, 0);
1587                 *ap->a_vpp = NULL;
1588         } else {
1589                 error = hammer_get_vnode(nip, ap->a_vpp);
1590                 hammer_rel_inode(nip, 0);
1591                 if (error == 0) {
1592                         cache_setunresolved(ap->a_nch);
1593                         cache_setvp(ap->a_nch, *ap->a_vpp);
1594                 }
1595         }
1596         hammer_done_transaction(&trans);
1597         if (error == 0)
1598                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1599         lwkt_reltoken(&hmp->fs_token);
1600         return (error);
1601 }
1602
1603 /*
1604  * hammer_vop_open { vp, mode, cred, fp }
1605  *
1606  * MPSAFE (does not require fs_token)
1607  */
1608 static
1609 int
1610 hammer_vop_open(struct vop_open_args *ap)
1611 {
1612         hammer_inode_t ip;
1613
1614         ++hammer_stats_file_iopsr;
1615         ip = VTOI(ap->a_vp);
1616
1617         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1618                 return (EROFS);
1619         return(vop_stdopen(ap));
1620 }
1621
1622 /*
1623  * hammer_vop_print { vp }
1624  */
1625 static
1626 int
1627 hammer_vop_print(struct vop_print_args *ap)
1628 {
1629         return EOPNOTSUPP;
1630 }
1631
1632 /*
1633  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1634  */
1635 static
1636 int
1637 hammer_vop_readdir(struct vop_readdir_args *ap)
1638 {
1639         struct hammer_transaction trans;
1640         struct hammer_cursor cursor;
1641         struct hammer_inode *ip;
1642         hammer_mount_t hmp;
1643         struct uio *uio;
1644         hammer_base_elm_t base;
1645         int error;
1646         int cookie_index;
1647         int ncookies;
1648         off_t *cookies;
1649         off_t saveoff;
1650         int r;
1651         int dtype;
1652
1653         ++hammer_stats_file_iopsr;
1654         ip = VTOI(ap->a_vp);
1655         uio = ap->a_uio;
1656         saveoff = uio->uio_offset;
1657         hmp = ip->hmp;
1658
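	/*
	 * Allocate the seek cookie array, estimating roughly one
	 * directory entry per 16 bytes of uio space (a conservative
	 * lower bound on the size of a returned dirent) and capping
	 * the allocation at 1024 cookies.
	 */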
1659         if (ap->a_ncookies) {
1660                 ncookies = uio->uio_resid / 16 + 1;
1661                 if (ncookies > 1024)
1662                         ncookies = 1024;
1663                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1664                 cookie_index = 0;
1665         } else {
1666                 ncookies = -1;
1667                 cookies = NULL;
1668                 cookie_index = 0;
1669         }
1670
1671         lwkt_gettoken(&hmp->fs_token);
1672         hammer_simple_transaction(&trans, hmp);
1673
1674         /*
1675          * Handle artificial entries
1676          *
1677          * It should be noted that the minimum value for a directory
1678          * hash key on-media is 0x0000000100000000, so we can use anything
1679          * less than that to represent our 'special' key space.
1680          */
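	/*
	 * Illustrative layout of the synthetic key space used here:
	 *
	 *	saveoff 0 - synthesize "."
	 *	saveoff 1 - synthesize ".."
	 *	saveoff >= 2 - fall through to the B-Tree scan below;
	 *	    real directory keys are >= 0x0000000100000000
	 */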
1681         error = 0;
1682         if (saveoff == 0) {
1683                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1684                 if (r)
1685                         goto done;
1686                 if (cookies)
1687                         cookies[cookie_index] = saveoff;
1688                 ++saveoff;
1689                 ++cookie_index;
1690                 if (cookie_index == ncookies)
1691                         goto done;
1692         }
1693         if (saveoff == 1) {
1694                 if (ip->ino_data.parent_obj_id) {
1695                         r = vop_write_dirent(&error, uio,
1696                                              ip->ino_data.parent_obj_id,
1697                                              DT_DIR, 2, "..");
1698                 } else {
1699                         r = vop_write_dirent(&error, uio,
1700                                              ip->obj_id, DT_DIR, 2, "..");
1701                 }
1702                 if (r)
1703                         goto done;
1704                 if (cookies)
1705                         cookies[cookie_index] = saveoff;
1706                 ++saveoff;
1707                 ++cookie_index;
1708                 if (cookie_index == ncookies)
1709                         goto done;
1710         }
1711
1712         /*
1713          * Key range (begin and end inclusive) to scan.  Directory keys
1714          * directly translate to a 64 bit 'seek' position.
1715          */
1716         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1717         cursor.key_beg.localization = ip->obj_localization +
1718                                       hammer_dir_localization(ip);
1719         cursor.key_beg.obj_id = ip->obj_id;
1720         cursor.key_beg.create_tid = 0;
1721         cursor.key_beg.delete_tid = 0;
1722         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1723         cursor.key_beg.obj_type = 0;
1724         cursor.key_beg.key = saveoff;
1725
1726         cursor.key_end = cursor.key_beg;
1727         cursor.key_end.key = HAMMER_MAX_KEY;
1728         cursor.asof = ip->obj_asof;
1729         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1730
1731         error = hammer_ip_first(&cursor);
1732
1733         while (error == 0) {
1734                 error = hammer_ip_resolve_data(&cursor);
1735                 if (error)
1736                         break;
1737                 base = &cursor.leaf->base;
1738                 saveoff = base->key;
1739                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1740
1741                 if (base->obj_id != ip->obj_id)
1742                         panic("readdir: bad record at %p", cursor.node);
1743
1744                 /*
1745                  * Convert pseudo-filesystems into softlinks
1746                  */
1747                 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1748                 r = vop_write_dirent(
1749                              &error, uio, cursor.data->entry.obj_id,
1750                              dtype,
1751                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
1752                              (void *)cursor.data->entry.name);
1753                 if (r)
1754                         break;
1755                 ++saveoff;
1756                 if (cookies)
1757                         cookies[cookie_index] = base->key;
1758                 ++cookie_index;
1759                 if (cookie_index == ncookies)
1760                         break;
1761                 error = hammer_ip_next(&cursor);
1762         }
1763         hammer_done_cursor(&cursor);
1764
1765 done:
1766         hammer_done_transaction(&trans);
1767
1768         if (ap->a_eofflag)
1769                 *ap->a_eofflag = (error == ENOENT);
1770         uio->uio_offset = saveoff;
1771         if (error && cookie_index == 0) {
1772                 if (error == ENOENT)
1773                         error = 0;
1774                 if (cookies) {
1775                         kfree(cookies, M_TEMP);
1776                         *ap->a_ncookies = 0;
1777                         *ap->a_cookies = NULL;
1778                 }
1779         } else {
1780                 if (error == ENOENT)
1781                         error = 0;
1782                 if (cookies) {
1783                         *ap->a_ncookies = cookie_index;
1784                         *ap->a_cookies = cookies;
1785                 }
1786         }
1787         lwkt_reltoken(&hmp->fs_token);
1788         return(error);
1789 }
1790
1791 /*
1792  * hammer_vop_readlink { vp, uio, cred }
1793  */
1794 static
1795 int
1796 hammer_vop_readlink(struct vop_readlink_args *ap)
1797 {
1798         struct hammer_transaction trans;
1799         struct hammer_cursor cursor;
1800         struct hammer_inode *ip;
1801         hammer_mount_t hmp;
1802         char buf[32];
1803         u_int32_t localization;
1804         hammer_pseudofs_inmem_t pfsm;
1805         int error;
1806
1807         ip = VTOI(ap->a_vp);
1808         hmp = ip->hmp;
1809
1810         lwkt_gettoken(&hmp->fs_token);
1811
1812         /*
1813          * Shortcut if the symlink data was stuffed into ino_data.
1814          *
1815          * Also expand special "@@PFS%05d" softlinks (expansion only
1816          * occurs for non-historical (current) accesses made from the
1817          * primary filesystem).
1818          */
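	/*
	 * Example expansion (TID value illustrative only): "@@PFS00005"
	 * becomes "@@-1:00005" for a master PFS, or a form such as
	 * "@@0x00000001061a8ba0:00005" (the slave's sync_end_tid) for a
	 * slave PFS, matching the ksnprintf() calls below.
	 */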
1819         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1820                 char *ptr;
1821                 int bytes;
1822
1823                 ptr = ip->ino_data.ext.symlink;
1824                 bytes = (int)ip->ino_data.size;
1825                 if (bytes == 10 &&
1826                     ip->obj_asof == HAMMER_MAX_TID &&
1827                     ip->obj_localization == 0 &&
1828                     strncmp(ptr, "@@PFS", 5) == 0) {
1829                         hammer_simple_transaction(&trans, hmp);
1830                         bcopy(ptr + 5, buf, 5);
1831                         buf[5] = 0;
1832                         localization = strtoul(buf, NULL, 10) << 16;
1833                         pfsm = hammer_load_pseudofs(&trans, localization,
1834                                                     &error);
1835                         if (error == 0) {
1836                                 if (pfsm->pfsd.mirror_flags &
1837                                     HAMMER_PFSD_SLAVE) {
1838                                         /* vap->va_size == 26 */
1839                                         ksnprintf(buf, sizeof(buf),
1840                                                   "@@0x%016llx:%05d",
1841                                                   (long long)pfsm->pfsd.sync_end_tid,
1842                                                   localization >> 16);
1843                                 } else {
1844                                         /* vap->va_size == 10 */
1845                                         ksnprintf(buf, sizeof(buf),
1846                                                   "@@-1:%05d",
1847                                                   localization >> 16);
1848 #if 0
1849                                         ksnprintf(buf, sizeof(buf),
1850                                                   "@@0x%016llx:%05d",
1851                                                   (long long)HAMMER_MAX_TID,
1852                                                   localization >> 16);
1853 #endif
1854                                 }
1855                                 ptr = buf;
1856                                 bytes = strlen(buf);
1857                         }
1858                         if (pfsm)
1859                                 hammer_rel_pseudofs(hmp, pfsm);
1860                         hammer_done_transaction(&trans);
1861                 }
1862                 error = uiomove(ptr, bytes, ap->a_uio);
1863                 lwkt_reltoken(&hmp->fs_token);
1864                 return(error);
1865         }
1866
1867         /*
1868          * Long version
1869          */
1870         hammer_simple_transaction(&trans, hmp);
1871         ++hammer_stats_file_iopsr;
1872         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1873
1874         /*
1875          * Lookup the symlink's FIX record.  The target is stored as a
1876          * HAMMER_RECTYPE_FIX record keyed by HAMMER_FIXKEY_SYMLINK.
1877          */
1878         cursor.key_beg.localization = ip->obj_localization +
1879                                       HAMMER_LOCALIZE_MISC;
1880         cursor.key_beg.obj_id = ip->obj_id;
1881         cursor.key_beg.create_tid = 0;
1882         cursor.key_beg.delete_tid = 0;
1883         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1884         cursor.key_beg.obj_type = 0;
1885         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1886         cursor.asof = ip->obj_asof;
1887         cursor.flags |= HAMMER_CURSOR_ASOF;
1888
1889         error = hammer_ip_lookup(&cursor);
1890         if (error == 0) {
1891                 error = hammer_ip_resolve_data(&cursor);
1892                 if (error == 0) {
1893                         KKASSERT(cursor.leaf->data_len >=
1894                                  HAMMER_SYMLINK_NAME_OFF);
1895                         error = uiomove(cursor.data->symlink.name,
1896                                         cursor.leaf->data_len -
1897                                                 HAMMER_SYMLINK_NAME_OFF,
1898                                         ap->a_uio);
1899                 }
1900         }
1901         hammer_done_cursor(&cursor);
1902         hammer_done_transaction(&trans);
1903         lwkt_reltoken(&hmp->fs_token);
1904         return(error);
1905 }
1906
1907 /*
1908  * hammer_vop_nremove { nch, dvp, cred }
1909  */
1910 static
1911 int
1912 hammer_vop_nremove(struct vop_nremove_args *ap)
1913 {
1914         struct hammer_transaction trans;
1915         struct hammer_inode *dip;
1916         hammer_mount_t hmp;
1917         int error;
1918
1919         dip = VTOI(ap->a_dvp);
1920         hmp = dip->hmp;
1921
1922         if (hammer_nohistory(dip) == 0 &&
1923             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1924                 return (error);
1925         }
1926
1927         lwkt_gettoken(&hmp->fs_token);
1928         hammer_start_transaction(&trans, hmp);
1929         ++hammer_stats_file_iopsw;
1930         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1931         hammer_done_transaction(&trans);
1932         if (error == 0)
1933                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1934         lwkt_reltoken(&hmp->fs_token);
1935         return (error);
1936 }
1937
1938 /*
1939  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1940  */
1941 static
1942 int
1943 hammer_vop_nrename(struct vop_nrename_args *ap)
1944 {
1945         struct hammer_transaction trans;
1946         struct namecache *fncp;
1947         struct namecache *tncp;
1948         struct hammer_inode *fdip;
1949         struct hammer_inode *tdip;
1950         struct hammer_inode *ip;
1951         hammer_mount_t hmp;
1952         struct hammer_cursor cursor;
1953         int64_t namekey;
1954         u_int32_t max_iterations;
1955         int nlen, error;
1956
1957         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1958                 return(EXDEV);
1959         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1960                 return(EXDEV);
1961
1962         fdip = VTOI(ap->a_fdvp);
1963         tdip = VTOI(ap->a_tdvp);
1964         fncp = ap->a_fnch->ncp;
1965         tncp = ap->a_tnch->ncp;
1966         ip = VTOI(fncp->nc_vp);
1967         KKASSERT(ip != NULL);
1968
1969         hmp = ip->hmp;
1970
1971         if (fdip->obj_localization != tdip->obj_localization)
1972                 return(EXDEV);
1973         if (fdip->obj_localization != ip->obj_localization)
1974                 return(EXDEV);
1975
1976         if (fdip->flags & HAMMER_INODE_RO)
1977                 return (EROFS);
1978         if (tdip->flags & HAMMER_INODE_RO)
1979                 return (EROFS);
1980         if (ip->flags & HAMMER_INODE_RO)
1981                 return (EROFS);
1982         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1983                 return (error);
1984
1985         lwkt_gettoken(&hmp->fs_token);
1986         hammer_start_transaction(&trans, hmp);
1987         ++hammer_stats_file_iopsw;
1988
1989         /*
1990          * Remove tncp from the target directory and then link ip as
1991          * tncp. XXX pass trans to dounlink
1992          *
1993          * Force the inode sync-time to match the transaction so it is
1994          * in-sync with the creation of the target directory entry.
1995          */
1996         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1997                                 ap->a_cred, 0, -1);
1998         if (error == 0 || error == ENOENT) {
1999                 error = hammer_ip_add_directory(&trans, tdip,
2000                                                 tncp->nc_name, tncp->nc_nlen,
2001                                                 ip);
2002                 if (error == 0) {
2003                         ip->ino_data.parent_obj_id = tdip->obj_id;
2004                         ip->ino_data.ctime = trans.time;
2005                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
2006                 }
2007         }
2008         if (error)
2009                 goto failed; /* XXX */
2010
2011         /*
2012          * Locate the record in the originating directory and remove it.
2013          *
2014          * Calculate the namekey and set up the key range for the scan.  This
2015          * works kinda like a chained hash table where the lower 32 bits
2016          * of the namekey synthesize the chain.
2017          *
2018          * The key range is inclusive of both key_beg and key_end.
2019          */
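	/*
	 * Conceptually (the exact hash composition lives in
	 * hammer_directory_namekey()):
	 *
	 *	key_beg.key = namekey
	 *	key_end.key = namekey + max_iterations
	 *
	 * Entries whose names hash to the same upper bits are
	 * disambiguated by the low iteration bits, hence the scan
	 * loop below.
	 */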
2020         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
2021                                            &max_iterations);
2022 retry:
2023         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
2024         cursor.key_beg.localization = fdip->obj_localization +
2025                                       hammer_dir_localization(fdip);
2026         cursor.key_beg.obj_id = fdip->obj_id;
2027         cursor.key_beg.key = namekey;
2028         cursor.key_beg.create_tid = 0;
2029         cursor.key_beg.delete_tid = 0;
2030         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2031         cursor.key_beg.obj_type = 0;
2032
2033         cursor.key_end = cursor.key_beg;
2034         cursor.key_end.key += max_iterations;
2035         cursor.asof = fdip->obj_asof;
2036         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2037
2038         /*
2039          * Scan all matching records (the chain), locate the one matching
2040          * the requested path component.
2041          *
2042          * The hammer_ip_*() functions merge in-memory records with on-disk
2043          * records for the purposes of the search.
2044          */
2045         error = hammer_ip_first(&cursor);
2046         while (error == 0) {
2047                 if (hammer_ip_resolve_data(&cursor) != 0)
2048                         break;
2049                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2050                 KKASSERT(nlen > 0);
2051                 if (fncp->nc_nlen == nlen &&
2052                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2053                         break;
2054                 }
2055                 error = hammer_ip_next(&cursor);
2056         }
2057
2058         /*
2059          * If all is ok we have to get the inode so we can adjust nlinks.
2060          *
2061          * WARNING: hammer_ip_del_directory() may have to terminate the
2062          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
2063          * twice.
2064          */
2065         if (error == 0)
2066                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
2067
2068         /*
2069          * XXX A deadlock here will break rename's atomicity for the purposes
2070          * of crash recovery.
2071          */
2072         if (error == EDEADLK) {
2073                 hammer_done_cursor(&cursor);
2074                 goto retry;
2075         }
2076
2077         /*
2078          * Cleanup and tell the kernel that the rename succeeded.
2079          *
2080          * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2081          *       without formally acquiring the vp since the vp might
2082          *       have zero refs on it, or in the middle of a reclaim,
2083          *       etc.
2084          */
2085         hammer_done_cursor(&cursor);
2086         if (error == 0) {
2087                 cache_rename(ap->a_fnch, ap->a_tnch);
2088                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
2089                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
2090                 while (ip->vp) {
2091                         struct vnode *vp;
2092
2093                         error = hammer_get_vnode(ip, &vp);
2094                         if (error == 0 && vp) {
2095                                 vn_unlock(vp);
2096                                 hammer_knote(ip->vp, NOTE_RENAME);
2097                                 vrele(vp);
2098                                 break;
2099                         }
2100                         kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2101                 }
2102         }
2103
2104 failed:
2105         hammer_done_transaction(&trans);
2106         lwkt_reltoken(&hmp->fs_token);
2107         return (error);
2108 }
2109
2110 /*
2111  * hammer_vop_nrmdir { nch, dvp, cred }
2112  */
2113 static
2114 int
2115 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2116 {
2117         struct hammer_transaction trans;
2118         struct hammer_inode *dip;
2119         hammer_mount_t hmp;
2120         int error;
2121
2122         dip = VTOI(ap->a_dvp);
2123         hmp = dip->hmp;
2124
2125         if (hammer_nohistory(dip) == 0 &&
2126             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2127                 return (error);
2128         }
2129
2130         lwkt_gettoken(&hmp->fs_token);
2131         hammer_start_transaction(&trans, hmp);
2132         ++hammer_stats_file_iopsw;
2133         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2134         hammer_done_transaction(&trans);
2135         if (error == 0)
2136                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2137         lwkt_reltoken(&hmp->fs_token);
2138         return (error);
2139 }
2140
2141 /*
2142  * hammer_vop_markatime { vp, cred }
2143  */
2144 static
2145 int
2146 hammer_vop_markatime(struct vop_markatime_args *ap)
2147 {
2148         struct hammer_transaction trans;
2149         struct hammer_inode *ip;
2150         hammer_mount_t hmp;
2151
2152         ip = VTOI(ap->a_vp);
2153         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2154                 return (EROFS);
2155         if (ip->flags & HAMMER_INODE_RO)
2156                 return (EROFS);
2157         hmp = ip->hmp;
2158         if (hmp->mp->mnt_flag & MNT_NOATIME)
2159                 return (0);
2160         lwkt_gettoken(&hmp->fs_token);
2161         hammer_start_transaction(&trans, hmp);
2162         ++hammer_stats_file_iopsw;
2163
2164         ip->ino_data.atime = trans.time;
2165         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2166         hammer_done_transaction(&trans);
2167         hammer_knote(ap->a_vp, NOTE_ATTRIB);
2168         lwkt_reltoken(&hmp->fs_token);
2169         return (0);
2170 }
2171
2172 /*
2173  * hammer_vop_setattr { vp, vap, cred }
2174  */
2175 static
2176 int
2177 hammer_vop_setattr(struct vop_setattr_args *ap)
2178 {
2179         struct hammer_transaction trans;
2180         struct hammer_inode *ip;
2181         struct vattr *vap;
2182         hammer_mount_t hmp;
2183         int modflags;
2184         int error;
2185         int truncating;
2186         int blksize;
2187         int kflags;
2188 #if 0
2189         int64_t aligned_size;
2190 #endif
2191         u_int32_t flags;
2192
2193         vap = ap->a_vap;
2194         ip = ap->a_vp->v_data;
2195         modflags = 0;
2196         kflags = 0;
2197         hmp = ip->hmp;
2198
2199         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2200                 return(EROFS);
2201         if (ip->flags & HAMMER_INODE_RO)
2202                 return (EROFS);
2203         if (hammer_nohistory(ip) == 0 &&
2204             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2205                 return (error);
2206         }
2207
2208         lwkt_gettoken(&hmp->fs_token);
2209         hammer_start_transaction(&trans, hmp);
2210         ++hammer_stats_file_iopsw;
2211         error = 0;
2212
2213         if (vap->va_flags != VNOVAL) {
2214                 flags = ip->ino_data.uflags;
2215                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2216                                          hammer_to_unix_xid(&ip->ino_data.uid),
2217                                          ap->a_cred);
2218                 if (error == 0) {
2219                         if (ip->ino_data.uflags != flags) {
2220                                 ip->ino_data.uflags = flags;
2221                                 ip->ino_data.ctime = trans.time;
2222                                 modflags |= HAMMER_INODE_DDIRTY;
2223                                 kflags |= NOTE_ATTRIB;
2224                         }
2225                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2226                                 error = 0;
2227                                 goto done;
2228                         }
2229                 }
2230                 goto done;
2231         }
2232         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2233                 error = EPERM;
2234                 goto done;
2235         }
2236         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2237                 mode_t cur_mode = ip->ino_data.mode;
2238                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2239                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2240                 uuid_t uuid_uid;
2241                 uuid_t uuid_gid;
2242
2243                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2244                                          ap->a_cred,
2245                                          &cur_uid, &cur_gid, &cur_mode);
2246                 if (error == 0) {
2247                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
2248                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
2249                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
2250                                  sizeof(uuid_uid)) ||
2251                             bcmp(&uuid_gid, &ip->ino_data.gid,
2252                                  sizeof(uuid_gid)) ||
2253                             ip->ino_data.mode != cur_mode
2254                         ) {
2255                                 ip->ino_data.uid = uuid_uid;
2256                                 ip->ino_data.gid = uuid_gid;
2257                                 ip->ino_data.mode = cur_mode;
2258                                 ip->ino_data.ctime = trans.time;
2259                                 modflags |= HAMMER_INODE_DDIRTY;
2260                         }
2261                         kflags |= NOTE_ATTRIB;
2262                 }
2263         }
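	/*
	 * Resize the file if requested.  Note that the while() is not
	 * a real loop; the unconditional break at the bottom makes it
	 * a single-pass block.
	 */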
2264         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2265                 switch(ap->a_vp->v_type) {
2266                 case VREG:
2267                         if (vap->va_size == ip->ino_data.size)
2268                                 break;
2269
2270                         /*
2271                          * Log the operation if in fast-fsync mode or if
2272                          * there are unterminated redo write records present.
2273                          *
2274                          * The second check is needed so the recovery code
2275                          * properly truncates write redos even if nominal
2276                          * REDO operations are turned off due to excessive
2277                          * writes, because the related records might be
2278                          * destroyed and never lay down a TERM_WRITE.
2279                          */
2280                         if ((ip->flags & HAMMER_INODE_REDO) ||
2281                             (ip->flags & HAMMER_INODE_RDIRTY)) {
2282                                 error = hammer_generate_redo(&trans, ip,
2283                                                              vap->va_size,
2284                                                              HAMMER_REDO_TRUNC,
2285                                                              NULL, 0);
2286                         }
2287                         blksize = hammer_blocksize(vap->va_size);
2288
2289                         /*
2290                          * XXX breaks atomicity; we can deadlock the backend
2291                          * if we do not release the lock.  Probably not a
2292                          * big deal here.
2293                          */
2294                         if (vap->va_size < ip->ino_data.size) {
2295                                 nvtruncbuf(ap->a_vp, vap->va_size,
2296                                            blksize,
2297                                            hammer_blockoff(vap->va_size),
2298                                            0);
2299                                 truncating = 1;
2300                                 kflags |= NOTE_WRITE;
2301                         } else {
2302                                 nvextendbuf(ap->a_vp,
2303                                             ip->ino_data.size,
2304                                             vap->va_size,
2305                                             hammer_blocksize(ip->ino_data.size),
2306                                             hammer_blocksize(vap->va_size),
2307                                             hammer_blockoff(ip->ino_data.size),
2308                                             hammer_blockoff(vap->va_size),
2309                                             0);
2310                                 truncating = 0;
2311                                 kflags |= NOTE_WRITE | NOTE_EXTEND;
2312                         }
2313                         ip->ino_data.size = vap->va_size;
2314                         ip->ino_data.mtime = trans.time;
2315                         /* XXX safe to use SDIRTY instead of DDIRTY here? */
2316                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2317
2318                         /*
2319                          * On-media truncation is cached in the inode until
2320                          * the inode is synchronized.  We must immediately
2321                          * handle any frontend records.
2322                          */
2323                         if (truncating) {
2324                                 hammer_ip_frontend_trunc(ip, vap->va_size);
2325 #ifdef DEBUG_TRUNCATE
2326                                 if (HammerTruncIp == NULL)
2327                                         HammerTruncIp = ip;
2328 #endif
2329                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2330                                         ip->flags |= HAMMER_INODE_TRUNCATED;
2331                                         ip->trunc_off = vap->va_size;
2332                                         hammer_inode_dirty(ip);
2333 #ifdef DEBUG_TRUNCATE
2334                                         if (ip == HammerTruncIp)
2335                                         kprintf("truncate1 %016llx\n",
2336                                                 (long long)ip->trunc_off);
2337 #endif
2338                                 } else if (ip->trunc_off > vap->va_size) {
2339                                         ip->trunc_off = vap->va_size;
2340 #ifdef DEBUG_TRUNCATE
2341                                         if (ip == HammerTruncIp)
2342                                         kprintf("truncate2 %016llx\n",
2343                                                 (long long)ip->trunc_off);
2344 #endif
2345                                 } else {
2346 #ifdef DEBUG_TRUNCATE
2347                                         if (ip == HammerTruncIp)
2348                                         kprintf("truncate3 %016llx (ignored)\n",
2349                                                 (long long)vap->va_size);
2350 #endif
2351                                 }
2352                         }
2353
2354 #if 0
2355                         /*
2356                          * When truncating, nvtruncbuf() may have cleaned out
2357                          * a portion of the last block on-disk in the buffer
2358                          * cache.  We must clean out any frontend records
2359                          * for blocks beyond the new last block.
2360                          */
2361                         aligned_size = (vap->va_size + (blksize - 1)) &
2362                                        ~(int64_t)(blksize - 1);
2363                         if (truncating && vap->va_size < aligned_size) {
2364                                 aligned_size -= blksize;
2365                                 hammer_ip_frontend_trunc(ip, aligned_size);
2366                         }
2367 #endif
2368                         break;
2369                 case VDATABASE:
2370                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2371                                 ip->flags |= HAMMER_INODE_TRUNCATED;
2372                                 ip->trunc_off = vap->va_size;
2373                                 hammer_inode_dirty(ip);
2374                         } else if (ip->trunc_off > vap->va_size) {
2375                                 ip->trunc_off = vap->va_size;
2376                         }
2377                         hammer_ip_frontend_trunc(ip, vap->va_size);
2378                         ip->ino_data.size = vap->va_size;
2379                         ip->ino_data.mtime = trans.time;
2380                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2381                         kflags |= NOTE_ATTRIB;
2382                         break;
2383                 default:
2384                         error = EINVAL;
2385                         goto done;
2386                 }
2387                 break;
2388         }
2389         if (vap->va_atime.tv_sec != VNOVAL) {
2390                 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2391                 modflags |= HAMMER_INODE_ATIME;
2392                 kflags |= NOTE_ATTRIB;
2393         }
2394         if (vap->va_mtime.tv_sec != VNOVAL) {
2395                 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2396                 modflags |= HAMMER_INODE_MTIME;
2397                 kflags |= NOTE_ATTRIB;
2398         }
2399         if (vap->va_mode != (mode_t)VNOVAL) {
2400                 mode_t   cur_mode = ip->ino_data.mode;
2401                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2402                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2403
2404                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2405                                          cur_uid, cur_gid, &cur_mode);
2406                 if (error == 0 && ip->ino_data.mode != cur_mode) {
2407                         ip->ino_data.mode = cur_mode;
2408                         ip->ino_data.ctime = trans.time;
2409                         modflags |= HAMMER_INODE_DDIRTY;
2410                         kflags |= NOTE_ATTRIB;
2411                 }
2412         }
2413 done:
2414         if (error == 0)
2415                 hammer_modify_inode(&trans, ip, modflags);
2416         hammer_done_transaction(&trans);
2417         hammer_knote(ap->a_vp, kflags);
2418         lwkt_reltoken(&hmp->fs_token);
2419         return (error);
2420 }
2421
2422 /*
2423  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2424  */
2425 static
2426 int
2427 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2428 {
2429         struct hammer_transaction trans;
2430         struct hammer_inode *dip;
2431         struct hammer_inode *nip;
2432         hammer_record_t record;
2433         struct nchandle *nch;
2434         hammer_mount_t hmp;
2435         int error;
2436         int bytes;
2437
2438         ap->a_vap->va_type = VLNK;
2439
2440         nch = ap->a_nch;
2441         dip = VTOI(ap->a_dvp);
2442         hmp = dip->hmp;
2443
2444         if (dip->flags & HAMMER_INODE_RO)
2445                 return (EROFS);
2446         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2447                 return (error);
2448
2449         /*
2450          * Create a transaction to cover the operations we perform.
2451          */
2452         lwkt_gettoken(&hmp->fs_token);
2453         hammer_start_transaction(&trans, hmp);
2454         ++hammer_stats_file_iopsw;
2455
2456         /*
2457          * Create a new filesystem object of the requested type.  The
2458          * returned inode will be referenced but not locked.
2459          */
2460
2461         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2462                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2463                                     NULL, &nip);
2464         if (error) {
2465                 hammer_done_transaction(&trans);
2466                 *ap->a_vpp = NULL;
2467                 lwkt_reltoken(&hmp->fs_token);
2468                 return (error);
2469         }
2470
2471         /*
2472          * Add a record representing the symlink.  The symlink target is
2473          * stored as pure data, not a string, and is not \0-terminated.
2474          */
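	/*
	 * Short targets (<= HAMMER_INODE_BASESYMLEN bytes) are inlined
	 * in the inode's extended data; longer targets get a separate
	 * HAMMER_RECTYPE_FIX record keyed by HAMMER_FIXKEY_SYMLINK,
	 * which is what hammer_vop_readlink() looks up in the long case.
	 */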
2475         if (error == 0) {
2476                 bytes = strlen(ap->a_target);
2477
2478                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2479                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2480                 } else {
2481                         record = hammer_alloc_mem_record(nip, bytes);
2482                         record->type = HAMMER_MEM_RECORD_GENERAL;
2483
2484                         record->leaf.base.localization = nip->obj_localization +
2485                                                          HAMMER_LOCALIZE_MISC;
2486                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2487                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2488                         record->leaf.data_len = bytes;
2489                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2490                         bcopy(ap->a_target, record->data->symlink.name, bytes);
2491                         error = hammer_ip_add_record(&trans, record);
2492                 }
2493
2494                 /*
2495                  * Set the file size to the length of the link.
2496                  */
2497                 if (error == 0) {
2498                         nip->ino_data.size = bytes;
2499                         hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2500                 }
2501         }
2502         if (error == 0)
2503                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2504                                                 nch->ncp->nc_nlen, nip);
2505
2506         /*
2507          * Finish up.
2508          */
2509         if (error) {
2510                 hammer_rel_inode(nip, 0);
2511                 *ap->a_vpp = NULL;
2512         } else {
2513                 error = hammer_get_vnode(nip, ap->a_vpp);
2514                 hammer_rel_inode(nip, 0);
2515                 if (error == 0) {
2516                         cache_setunresolved(ap->a_nch);
2517                         cache_setvp(ap->a_nch, *ap->a_vpp);
2518                         hammer_knote(ap->a_dvp, NOTE_WRITE);
2519                 }
2520         }
2521         hammer_done_transaction(&trans);
2522         lwkt_reltoken(&hmp->fs_token);
2523         return (error);
2524 }
2525
2526 /*
2527  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2528  */
2529 static
2530 int
2531 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2532 {
2533         struct hammer_transaction trans;
2534         struct hammer_inode *dip;
2535         hammer_mount_t hmp;
2536         int error;
2537
2538         dip = VTOI(ap->a_dvp);
2539         hmp = dip->hmp;
2540
2541         if (hammer_nohistory(dip) == 0 &&
2542             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2543                 return (error);
2544         }
2545
2546         lwkt_gettoken(&hmp->fs_token);
2547         hammer_start_transaction(&trans, hmp);
2548         ++hammer_stats_file_iopsw;
2549         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2550                                 ap->a_cred, ap->a_flags, -1);
2551         hammer_done_transaction(&trans);
2552         lwkt_reltoken(&hmp->fs_token);
2553
2554         return (error);
2555 }
2556
2557 /*
2558  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2559  */
2560 static
2561 int
2562 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2563 {
2564         struct hammer_inode *ip = ap->a_vp->v_data;
2565         hammer_mount_t hmp = ip->hmp;
2566         int error;
2567
2568         ++hammer_stats_file_iopsr;
2569         lwkt_gettoken(&hmp->fs_token);
2570         error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2571                              ap->a_fflag, ap->a_cred);
2572         lwkt_reltoken(&hmp->fs_token);
2573         return (error);
2574 }
2575
2576 static
2577 int
2578 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2579 {
2580         static const struct mountctl_opt extraopt[] = {
2581                 { HMNT_NOHISTORY,       "nohistory" },
2582                 { HMNT_MASTERID,        "master" },
2583                 { 0, NULL }
2584         };
2585
2586         struct hammer_mount *hmp;
2587         struct mount *mp;
2588         int usedbytes;
2589         int error;
2590
2591         error = 0;
2592         usedbytes = 0;
2593         mp = ap->a_head.a_ops->head.vv_mount;
2594         KKASSERT(mp->mnt_data != NULL);
2595         hmp = (struct hammer_mount *)mp->mnt_data;
2596
2597         lwkt_gettoken(&hmp->fs_token);
2598
2599         switch(ap->a_op) {
2600         case MOUNTCTL_SET_EXPORT:
2601                 if (ap->a_ctllen != sizeof(struct export_args))
2602                         error = EINVAL;
2603                 else
2604                         error = hammer_vfs_export(mp, ap->a_op,
2605                                       (const struct export_args *)ap->a_ctl);
2606                 break;
2607         case MOUNTCTL_MOUNTFLAGS:
2608         {
2609                 /*
2610                  * Call standard mountctl VOP function
2611                  * so we get user mount flags.
2612                  */
2613                 error = vop_stdmountctl(ap);
2614                 if (error)
2615                         break;
2616
2617                 usedbytes = *ap->a_res;
2618
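		/*
		 * Append HAMMER-specific flag strings (see extraopt
		 * above) after the standard flags when buffer space
		 * remains.
		 */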
2619                 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2620                         usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2621                                                     ap->a_buf,
2622                                                     ap->a_buflen - usedbytes,
2623                                                     &error);
2624                 }
2625
2626                 *ap->a_res += usedbytes;
2627                 break;
2628         }
2629         default:
2630                 error = vop_stdmountctl(ap);
2631                 break;
2632         }
2633         lwkt_reltoken(&hmp->fs_token);
2634         return(error);
2635 }
2636
2637 /*
2638  * hammer_vop_strategy { vp, bio }
2639  *
2640  * Strategy call, used for regular file read & write only.  Note that the
2641  * bp may represent a cluster.
2642  *
2643  * To simplify operation and allow better optimizations in the future,
2644  * this code does not make any assumptions with regard to buffer alignment
2645  * or size.
2646  */
2647 static
2648 int
2649 hammer_vop_strategy(struct vop_strategy_args *ap)
2650 {
2651         struct buf *bp;
2652         int error;
2653
2654         bp = ap->a_bio->bio_buf;
2655
2656         switch(bp->b_cmd) {
2657         case BUF_CMD_READ:
2658                 error = hammer_vop_strategy_read(ap);
2659                 break;
2660         case BUF_CMD_WRITE:
2661                 error = hammer_vop_strategy_write(ap);
2662                 break;
2663         default:
2664                 bp->b_error = error = EINVAL;
2665                 bp->b_flags |= B_ERROR;
2666                 biodone(ap->a_bio);
2667                 break;
2668         }
2669
2670         /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2671
2672         return (error);
2673 }
2674
2675 /*
2676  * Read from a regular file.  Iterate the related records and fill in the
2677  * BIO/BUF.  Gaps are zero-filled.
2678  *
2679  * The support code in hammer_object.c should be used to deal with mixed
2680  * in-memory and on-disk records.
2681  *
2682  * NOTE: Can be called from the cluster code with an oversized buf.
2683  *
2684  * XXX atime update
2685  */
2686 static
2687 int
2688 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2689 {
2690         struct hammer_transaction trans;
2691         struct hammer_inode *ip;
2692         struct hammer_inode *dip;
2693         hammer_mount_t hmp;
2694         struct hammer_cursor cursor;
2695         hammer_base_elm_t base;
2696         hammer_off_t disk_offset;
2697         struct bio *bio;
2698         struct bio *nbio;
2699         struct buf *bp;
2700         int64_t rec_offset;
2701         int64_t ran_end;
2702         int64_t tmp64;
2703         int error;
2704         int boff;
2705         int roff;
2706         int n;
2707         int isdedupable;
2708
2709         bio = ap->a_bio;
2710         bp = bio->bio_buf;
2711         ip = ap->a_vp->v_data;
2712         hmp = ip->hmp;
2713
2714         /*
2715          * The zone-2 disk offset may have been set by the cluster code via
2716          * a BMAP operation, or else should be NOOFFSET.
2717          *
2718          * Checking the high bits for a match against zone-2 should suffice.
2719          *
2720          * In cases where a lot of data duplication is present it may be
2721          * more beneficial to drop through and double-buffer through the
2722          * device.
2723          */
2724         nbio = push_bio(bio);
2725         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2726             HAMMER_ZONE_LARGE_DATA) {
2727                 if (hammer_double_buffer == 0) {
2728                         lwkt_gettoken(&hmp->fs_token);
2729                         error = hammer_io_direct_read(hmp, nbio, NULL);
2730                         lwkt_reltoken(&hmp->fs_token);
2731                         return (error);
2732                 }
2733
2734                 /*
2735                  * Try to shortcut requests for double_buffer mode too.
2736                  * Since this mode runs through the device buffer cache,
2737                  * only compatible buffer sizes (meaning those generated
2738                  * by normal filesystem buffers) are legal.
2739                  */
2740                 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
2741                         lwkt_gettoken(&hmp->fs_token);
2742                         error = hammer_io_indirect_read(hmp, nbio, NULL);
2743                         lwkt_reltoken(&hmp->fs_token);
2744                         return (error);
2745                 }
2746         }
2747
2748         /*
2749          * Well, that sucked.  Do it the hard way.  If all the stars are
2750          * aligned we may still be able to issue a direct-read.
2751          */
2752         lwkt_gettoken(&hmp->fs_token);
2753         hammer_simple_transaction(&trans, hmp);
2754         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2755
2756         /*
2757          * Key range (begin and end inclusive) to scan.  Note that the keys
2758          * stored in the actual records represent BASE+LEN, not BASE.  The
2759          * first record containing bio_offset will have a key > bio_offset.
2760          */
2761         cursor.key_beg.localization = ip->obj_localization +
2762                                       HAMMER_LOCALIZE_MISC;
2763         cursor.key_beg.obj_id = ip->obj_id;
2764         cursor.key_beg.create_tid = 0;
2765         cursor.key_beg.delete_tid = 0;
2766         cursor.key_beg.obj_type = 0;
2767         cursor.key_beg.key = bio->bio_offset + 1;
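             /*
              * Example (illustrative, added): data record keys store
              * BASE+LEN, so a 16KB record covering [16384, 32768) has
              * key 32768.  A read at bio_offset 16384 sets key_beg.key
              * to 16385 and that record is the scan's first hit.
              */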
2768         cursor.asof = ip->obj_asof;
2769         cursor.flags |= HAMMER_CURSOR_ASOF;
2770
2771         cursor.key_end = cursor.key_beg;
2772         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2773 #if 0
2774         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2775                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2776                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2777                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2778         } else
2779 #endif
2780         {
2781                 ran_end = bio->bio_offset + bp->b_bufsize;
2782                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2783                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2784                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2785                 if (tmp64 < ran_end)
2786                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2787                 else
2788                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2789         }
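             /*
              * Note (added): the tmp64 comparison detects 64-bit signed
              * wrap-around when the request ends near the top of the key
              * space; if the addition overflowed, the end key is clamped
              * to the maximum positive key instead.
              */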
2790         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2791
2792         /*
2793          * Set NOSWAPCACHE for cursor data extraction if double buffering
2794          * is disabled, or if the file is not marked cacheable via chflags
2795          * and vm.swapcache_use_chflags is enabled.
2796          */
2797         if (hammer_double_buffer == 0 ||
2798             ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
2799              vm_swapcache_use_chflags)) {
2800                 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
2801         }
2802
2803         error = hammer_ip_first(&cursor);
2804         boff = 0;
2805
2806         while (error == 0) {
2807                 /*
2808                  * Get the base file offset of the record.  The key for
2809                  * data records is (base + bytes) rather than (base).
2810                  */
2811                 base = &cursor.leaf->base;
2812                 rec_offset = base->key - cursor.leaf->data_len;
2813
2814                 /*
2815                  * Calculate the gap, if any, and zero-fill it.
2816                  *
2817                  * n is the offset of the start of the record versus our
2818                  * current seek offset in the bio.
2819                  */
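                     /*
                      * Example (illustrative, added): reading at
                      * bio_offset 0 with boff 0 against a first record
                      * starting at file offset 4096 gives n = 4096, so
                      * bytes [0, 4096) of the buffer are zeroed and boff
                      * advances to 4096.
                      */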
2820                 n = (int)(rec_offset - (bio->bio_offset + boff));
2821                 if (n > 0) {
2822                         if (n > bp->b_bufsize - boff)
2823                                 n = bp->b_bufsize - boff;
2824                         bzero((char *)bp->b_data + boff, n);
2825                         boff += n;
2826                         n = 0;
2827                 }
2828
2829                 /*
2830                  * Calculate the data offset in the record and the number
2831                  * of bytes we can copy.
2832                  *
2833                  * There are two degenerate cases.  First, boff may already
2834                  * be at bp->b_bufsize.  Secondly, the data offset within
2835                  * the record may exceed the record's size.
2836                  */
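                     /*
                      * Illustrative (added): if the record began 512
                      * bytes before the current position the gap step
                      * above left n = -512, so roff = 512 is the byte
                      * offset into the record's data where copying
                      * resumes.
                      */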
2837                 roff = -n;
2838                 rec_offset += roff;
2839                 n = cursor.leaf->data_len - roff;
2840                 if (n <= 0) {
2841                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2842                         n = 0;
2843                 } else if (n > bp->b_bufsize - boff) {
2844                         n = bp->b_bufsize - boff;
2845                 }
2846
2847                 /*
2848                  * Deal with cached truncations.  This cool bit of code
2849                  * allows truncate()/ftruncate() to avoid having to sync
2850                  * the file.
2851                  *
2852                  * If the frontend is truncated then all backend records are
2853                  * subject to the frontend's truncation.
2854                  *
2855                  * If the backend is truncated then backend records on-disk
2856                  * (but not in-memory) are subject to the backend's
2857                  * truncation.  In-memory records owned by the backend
2858                  * represent data written after the truncation point on the
2859                  * backend and must not be truncated.
2860                  *
2861                  * Truncate operations deal with frontend buffer cache
2862                  * buffers and frontend-owned in-memory records synchronously.
2863                  */
2864                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2865                         if (hammer_cursor_ondisk(&cursor)/* ||
2866                             cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2867                                 if (ip->trunc_off <= rec_offset)
2868                                         n = 0;
2869                                 else if (ip->trunc_off < rec_offset + n)
2870                                         n = (int)(ip->trunc_off - rec_offset);
2871                         }
2872                 }
2873                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2874                         if (hammer_cursor_ondisk(&cursor)) {
2875                                 if (ip->sync_trunc_off <= rec_offset)
2876                                         n = 0;
2877                                 else if (ip->sync_trunc_off < rec_offset + n)
2878                                         n = (int)(ip->sync_trunc_off - rec_offset);
2879                         }
2880                 }
2881
2882                 /*
2883                  * Try to issue a direct read into our bio if possible,
2884                  * otherwise resolve the element data into a hammer_buffer
2885                  * and copy.
2886                  *
2887                  * The buffer on-disk should be zeroed past any real
2888                  * truncation point, but may not be for any synthesized
2889                  * truncation point from above.
2890                  *
2891                  * NOTE: disk_offset is only valid if the cursor data is
2892                  *       on-disk.
2893                  */
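                     /*
                      * Note (added): the whole-buffer shortcuts below
                      * require the request to span the entire buffer
                      * starting at boff 0, the record data to be
                      * on-media, and the media offset to be
                      * buffer-aligned.
                      */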
2894                 disk_offset = cursor.leaf->data_offset + roff;
2895                 isdedupable = (boff == 0 && n == bp->b_bufsize &&
2896                                hammer_cursor_ondisk(&cursor) &&
2897                                ((int)disk_offset & HAMMER_BUFMASK) == 0);
2898
2899                 if (isdedupable && hammer_double_buffer == 0) {
2900                         /*
2901                          * Direct read case
2902                          */
2903                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2904                                  HAMMER_ZONE_LARGE_DATA);
2905                         nbio->bio_offset = disk_offset;
2906                         error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
2907                         if (hammer_live_dedup && error == 0)
2908                                 hammer_dedup_cache_add(ip, cursor.leaf);
2909                         goto done;
2910                 } else if (isdedupable) {
2911                         /*
2912                          * Async I/O case for reading from backing store
2913                          * and copying the data to the filesystem buffer.
2914                          * Live-dedup has to verify the data anyway if it
2915                          * gets a hit later so we can just add the entry
2916                          * now.
2917                          */
2918                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2919                                  HAMMER_ZONE_LARGE_DATA);
2920                         nbio->bio_offset = disk_offset;
2921                         if (hammer_live_dedup)
2922                                 hammer_dedup_cache_add(ip, cursor.leaf);
2923                         error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
2924                         goto done;
2925                 } else if (n) {
2926                         error = hammer_ip_resolve_data(&cursor);
2927                         if (error == 0) {
2928                                 if (hammer_live_dedup && isdedupable)
2929                                         hammer_dedup_cache_add(ip, cursor.leaf);
2930                                 bcopy((char *)cursor.data + roff,
2931                                       (char *)bp->b_data + boff, n);
2932                         }
2933                 }
2934                 if (error)
2935                         break;
2936
2937                 /*
2938                  * We have to be sure that the only elements added to the
2939                  * dedup cache are those which are already on-media.
2940                  */
2941                 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2942                         hammer_dedup_cache_add(ip, cursor.leaf);
2943
2944                 /*
2945                  * Iterate until we have filled the request.
2946                  */
2947                 boff += n;
2948                 if (boff == bp->b_bufsize)
2949                         break;
2950                 error = hammer_ip_next(&cursor);
2951         }
2952
2953         /*
2954          * There may have been a gap after the last record
2955          */
2956         if (error == ENOENT)
2957                 error = 0;
2958         if (error == 0 && boff != bp->b_bufsize) {
2959                 KKASSERT(boff < bp->b_bufsize);
2960                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2961                 /* boff = bp->b_bufsize; */
2962         }
2963
2964         /*
2965          * Disallow swapcache operation on the vnode buffer if double
2966          * buffering is enabled; the swapcache will get the data via
2967          * the block device buffer.
2968          */
2969         if (hammer_double_buffer)
2970                 bp->b_flags |= B_NOTMETA;
2971
2972         /*
2973          * Cleanup
2974          */
2975         bp->b_resid = 0;
2976         bp->b_error = error;
2977         if (error)
2978                 bp->b_flags |= B_ERROR;
2979         biodone(ap->a_bio);
2980
2981 done:
2982         /*
2983          * Cache the b-tree node for the last data read in cache[1].
2984          *
2985          * If we hit the file EOF then also cache the node in the
2986          * governing directory's cache[3]; it will be used to initialize
2987          * the inode's cache[1] for any inodes looked up via the directory.
2988          *
2989          * This doesn't reduce disk accesses since the B-Tree chain is
2990          * likely cached, but it does reduce cpu overhead when looking
2991          * up file offsets for cpdup/tar/cpio style iterations.
2992          */
2993         if (cursor.node)
2994                 hammer_cache_node(&ip->cache[1], cursor.node);
2995         if (ran_end >= ip->ino_data.size) {
2996                 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2997                                         ip->obj_asof, ip->obj_localization);
2998                 if (dip) {
2999                         hammer_cache_node(&dip->cache[3], cursor.node);
3000                         hammer_rel_inode(dip, 0);
3001                 }
3002         }
3003         hammer_done_cursor(&cursor);
3004         hammer_done_transaction(&trans);
3005         lwkt_reltoken(&hmp->fs_token);
3006         return(error);
3007 }
3008
3009 /*
3010  * BMAP operation - used to support cluster_read() only.
3011  *
3012  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
3013  *
3014  * This routine may return EOPNOTSUPP if the operation is not supported for
3015  * the specified offset.  The contents of the pointer arguments do not
3016  * need to be initialized in that case.
3017  *
3018  * If a disk address is available and properly aligned, return 0 with
3019  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
3020  * to the run-length relative to that offset.  Callers may assume that
3021  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
3022  * large, so return EOPNOTSUPP if it is not sufficiently large.
3023  */
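     /*
      * Sketch (illustrative, added, not code from this file): a caller
      * typically does something like
      *
      *      if (VOP_BMAP(vp, loffset, &doffset, &runp, &runb,
      *                   BUF_CMD_READ) == 0)
      *              issue one large device read covering the run;
      *      else
      *              fall back to buffer-at-a-time strategy reads;
      *
      * so returning EOPNOTSUPP here simply disables clustering for
      * the offset.
      */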
3024 static
3025 int
3026 hammer_vop_bmap(struct vop_bmap_args *ap)
3027 {
3028         struct hammer_transaction trans;
3029         struct hammer_inode *ip;
3030         hammer_mount_t hmp;
3031         struct hammer_cursor cursor;
3032         hammer_base_elm_t base;
3033         int64_t rec_offset;
3034         int64_t ran_end;
3035         int64_t tmp64;
3036         int64_t base_offset;
3037         int64_t base_disk_offset;
3038         int64_t last_offset;
3039         hammer_off_t last_disk_offset;
3040         hammer_off_t disk_offset;
3041         int     rec_len;
3042         int     error;
3043         int     blksize;
3044
3045         ++hammer_stats_file_iopsr;
3046         ip = ap->a_vp->v_data;
3047         hmp = ip->hmp;
3048
3049         /*
3050          * We can only BMAP regular files.  We can't BMAP database files,
3051          * directories, etc.
3052          */
3053         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
3054                 return(EOPNOTSUPP);
3055
3056         /*
3057          * bmap is typically called with runp/runb both NULL when used
3058          * for writing.  We do not support BMAP for writing at the moment.
3059          */
3060         if (ap->a_cmd != BUF_CMD_READ)
3061                 return(EOPNOTSUPP);
3062
3063         /*
3064          * Scan the B-Tree to acquire blockmap addresses, then translate
3065          * to raw addresses.
3066          */
3067         lwkt_gettoken(&hmp->fs_token);
3068         hammer_simple_transaction(&trans, hmp);
3069 #if 0
3070         kprintf("bmap_beg %016llx ip->cache %p\n",
3071                 (long long)ap->a_loffset, ip->cache[1]);
3072 #endif
3073         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
3074
3075         /*
3076          * Key range (begin and end inclusive) to scan.  Note that the keys
3077          * stored in the actual records represent BASE+LEN, not BASE.  The
3078          * first record containing bio_offset will have a key > bio_offset.
3079          */
3080         cursor.key_beg.localization = ip->obj_localization +
3081                                       HAMMER_LOCALIZE_MISC;
3082         cursor.key_beg.obj_id = ip->obj_id;
3083         cursor.key_beg.create_tid = 0;
3084         cursor.key_beg.delete_tid = 0;
3085         cursor.key_beg.obj_type = 0;
3086         if (ap->a_runb)
3087                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
3088         else
3089                 cursor.key_beg.key = ap->a_loffset + 1;
3090         if (cursor.key_beg.key < 0)
3091                 cursor.key_beg.key = 0;
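             /*
              * Note (added): when a backward run is requested the scan
              * begins roughly MAXPHYS bytes before the requested offset
              * so contiguous preceding records can be accumulated into
              * *runb.
              */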
3092         cursor.asof = ip->obj_asof;
3093         cursor.flags |= HAMMER_CURSOR_ASOF;
3094
3095         cursor.key_end = cursor.key_beg;
3096         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
3097
3098         ran_end = ap->a_loffset + MAXPHYS;
3099         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
3100         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
3101         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
3102         if (tmp64 < ran_end)
3103                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
3104         else
3105                 cursor.key_end.key = ran_end + MAXPHYS + 1;
3106
3107         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
3108
3109         error = hammer_ip_first(&cursor);
3110         base_offset = last_offset = 0;
3111         base_disk_offset = last_disk_offset = 0;
3112
3113         while (error == 0) {
3114                 /*
3115                  * Get the base file offset of the record.  The key for
3116                  * data records is (base + bytes) rather than (base).
3117                  *
3118                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
3119                  * The extra bytes should be zero on-disk and the BMAP op
3120                  * should still be ok.
3121                  */
3122                 base = &cursor.leaf->base;
3123                 rec_offset = base->key - cursor.leaf->data_len;
3124                 rec_len    = cursor.leaf->data_len;
3125
3126                 /*
3127                  * Incorporate any cached truncation.
3128                  *
3129                  * NOTE: Modifications to rec_len based on synthesized
3130                  * truncation points remove the guarantee that any extended
3131                  * data on disk is zero (since the truncations may not have
3132                  * taken place on-media yet).
3133                  */
3134                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
3135                         if (hammer_cursor_ondisk(&cursor) ||
3136                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3137                                 if (ip->trunc_off <= rec_offset)
3138                                         rec_len = 0;
3139                                 else if (ip->trunc_off < rec_offset + rec_len)
3140                                         rec_len = (int)(ip->trunc_off - rec_offset);
3141                         }
3142                 }
3143                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3144                         if (hammer_cursor_ondisk(&cursor)) {
3145                                 if (ip->sync_trunc_off <= rec_offset)
3146                                         rec_len = 0;
3147                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
3148                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
3149                         }
3150                 }
3151
3152                 /*
3153                  * Accumulate information.  If we have hit a discontiguous
3154                  * block, reset base_offset unless we are already beyond the
3155                  * requested offset.  If we are, that's it, we stop.
3156                  */
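                     /*
                      * Illustrative (added): a record continues the
                      * current run only if both its file offset and its
                      * media offset resume exactly where the previous
                      * record ended; otherwise base_offset restarts at
                      * this record.
                      */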
3157                 if (error)
3158                         break;
3159                 if (hammer_cursor_ondisk(&cursor)) {
3160                         disk_offset = cursor.leaf->data_offset;
3161                         if (rec_offset != last_offset ||
3162                             disk_offset != last_disk_offset) {
3163                                 if (rec_offset > ap->a_loffset)
3164                                         break;
3165                                 base_offset = rec_offset;
3166                                 base_disk_offset = disk_offset;
3167                         }
3168                         last_offset = rec_offset + rec_len;
3169                         last_disk_offset = disk_offset + rec_len;
3170
3171                         if (hammer_live_dedup)
3172                                 hammer_dedup_cache_add(ip, cursor.leaf);
3173                 }
3174
3175                 error = hammer_ip_next(&cursor);
3176         }
3177
3178 #if 0
3179         kprintf("BMAP %016llx:  %016llx - %016llx\n",
3180                 (long long)ap->a_loffset,
3181                 (long long)base_offset,
3182                 (long long)last_offset);
3183         kprintf("BMAP %16s:  %016llx - %016llx\n", "",
3184                 (long long)base_disk_offset,
3185                 (long long)last_disk_offset);
3186 #endif
3187
3188         if (cursor.node) {
3189                 hammer_cache_node(&ip->cache[1], cursor.node);
3190 #if 0
3191                 kprintf("bmap_end2 %016llx ip->cache %p\n",
3192                         (long long)ap->a_loffset, ip->cache[1]);
3193 #endif
3194         }
3195         hammer_done_cursor(&cursor);
3196         hammer_done_transaction(&trans);
3197         lwkt_reltoken(&hmp->fs_token);
3198
3199         /*
3200          * If we couldn't find any records or the records we did find were
3201          * all behind the requested offset, return failure.  A forward
3202          * truncation can leave a hole with no on-disk records.
3203          */
3204         if (last_offset == 0 || last_offset < ap->a_loffset)
3205                 return (EOPNOTSUPP);
3206
3207         /*
3208          * Figure out the block size at the requested offset and adjust
3209          * our limits so the cluster_read() does not create inappropriately
3210          * sized buffer cache buffers.
3211          */
3212         blksize = hammer_blocksize(ap->a_loffset);
3213         if (hammer_blocksize(base_offset) != blksize) {
3214                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
3215         }
3216         if (last_offset != ap->a_loffset &&
3217             hammer_blocksize(last_offset - 1) != blksize) {
3218                 last_offset = hammer_blockdemarc(ap->a_loffset,
3219                                                  last_offset - 1);
3220         }
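             /*
              * Note (added, per HAMMER's layout): small offsets use
              * 16KB buffers and offsets past a fixed demarcation use
              * 64KB buffers; hammer_blockdemarc() clips the run so a
              * single cluster operation never mixes the two sizes.
              */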
3221
3222         /*
3223          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
3224          * from occurring.
3225          */
3226         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
3227
3228         if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
3229                 /*
3230                  * Only large-data zones can be direct-IOd
3231                  */
3232                 error = EOPNOTSUPP;
3233         } else if ((disk_offset & HAMMER_BUFMASK) ||
3234                    (last_offset - ap->a_loffset) < blksize) {
3235                 /*
3236                  * doffsetp is not aligned or the forward run size does
3237                  * not cover a whole buffer, disallow the direct I/O.
3238                  */
3239                 error = EOPNOTSUPP;
3240         } else {
3241                 /*
3242                  * We're good.
3243                  */
3244                 *ap->a_doffsetp = disk_offset;
3245                 if (ap->a_runb) {
3246                         *ap->a_runb = ap->a_loffset - base_offset;
3247                         KKASSERT(*ap->a_runb >= 0);
3248                 }
3249                 if (ap->a_runp) {
3250                         *ap->a_runp = last_offset - ap->a_loffset;
3251                         KKASSERT(*ap->a_runp >= 0);
3252                 }
3253                 error = 0;
3254         }
3255         return(error);
3256 }
3257
3258 /*
3259  * Write to a regular file.  Because this is a strategy call, the OS is
3260  * trying to actually get data onto the media.
3261  */
3262 static
3263 int
3264 hammer_vop_strategy_write(struct vop_strategy_args *ap)
3265 {
3266         hammer_record_t record;
3267         hammer_mount_t hmp;
3268         hammer_inode_t ip;
3269         struct bio *bio;
3270         struct buf *bp;
3271         int blksize __debugvar;
3272         int bytes;
3273         int error;
3274
3275         bio = ap->a_bio;
3276         bp = bio->bio_buf;
3277         ip = ap->a_vp->v_data;
3278         hmp = ip->hmp;
3279
3280         blksize = hammer_blocksize(bio->bio_offset);
3281         KKASSERT(bp->b_bufsize == blksize);
3282
3283         if (ip->flags & HAMMER_INODE_RO) {
3284                 bp->b_error = EROFS;
3285                 bp->b_flags |= B_ERROR;
3286                 biodone(ap->a_bio);
3287                 return(EROFS);
3288         }
3289
3290         lwkt_gettoken(&hmp->fs_token);
3291
3292         /*
3293          * Disallow swapcache operation on the vnode buffer if double
3294          * buffering is enabled; the swapcache will get the data via
3295          * the block device buffer.
3296          */
3297         if (hammer_double_buffer)
3298                 bp->b_flags |= B_NOTMETA;
3299
3300         /*
3301          * Interlock with inode destruction (no in-kernel or directory
3302          * topology visibility).  If we queue new IO while trying to
3303          * destroy the inode we can deadlock the vtrunc call in
3304          * hammer_inode_unloadable_check().
3305          *
3306          * Besides, there's no point flushing a bp associated with an
3307          * inode that is being destroyed on-media and has no kernel
3308          * references.
3309          */
3310         if ((ip->flags | ip->sync_flags) &
3311             (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
3312                 bp->b_resid = 0;
3313                 biodone(ap->a_bio);
3314                 lwkt_reltoken(&hmp->fs_token);
3315                 return(0);
3316         }
3317
3318         /*
3319          * Reserve space and issue a direct-write from the front-end. 
3320          * NOTE: The direct_io code will hammer_bread/bcopy smaller
3321          * allocations.
3322          *
3323          * An in-memory record will be installed to reference the storage
3324          * until the flusher can get to it.
3325          *
3326          * Since we own the high level bio the front-end will not try to
3327          * do a direct-read until the write completes.
3328          *
3329          * NOTE: The only time we do not reserve a full-sized buffer's
3330          * worth of data is if the file is small.  We do not try to
3331          * allocate a fragment (from the small-data zone) at the end of
3332          * an otherwise large file as this can lead to wildly separated
3333          * data.
3334          */
3335         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3336         KKASSERT(bio->bio_offset < ip->ino_data.size);
3337         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
3338                 bytes = bp->b_bufsize;
3339         else
3340                 bytes = ((int)ip->ino_data.size + 15) & ~15;
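             /*
              * Example (illustrative, added): a 1000-byte small file
              * reserves (1000 + 15) & ~15 = 1008 bytes instead of a
              * full buffer.
              */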
3341
3342         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3343                                     bytes, &error);
3344
3345         /*
3346          * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
3347          * in hammer_vop_write().  We must flag the record so the proper
3348          * REDO_TERM_WRITE entry is generated during the flush.
3349          */
3350         if (record) {
3351                 if (bp->b_flags & B_VFSFLAG1) {
3352                         record->flags |= HAMMER_RECF_REDO;
3353                         bp->b_flags &= ~B_VFSFLAG1;
3354                 }
3355                 if (record->flags & HAMMER_RECF_DEDUPED) {
3356                         bp->b_resid = 0;
3357                         hammer_ip_replace_bulk(hmp, record);
3358                         biodone(ap->a_bio);
3359                 } else {
3360                         hammer_io_direct_write(hmp, bio, record);
3361                 }
3362                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3363                         hammer_flush_inode(ip, 0);
3364         } else {
3365                 bp->b_bio2.bio_offset = NOOFFSET;
3366                 bp->b_error = error;
3367                 bp->b_flags |= B_ERROR;
3368                 biodone(ap->a_bio);
3369         }
3370         lwkt_reltoken(&hmp->fs_token);
3371         return(error);
3372 }
3373
3374 /*
3375  * dounlink - disconnect a directory entry
3376  *
3377  * XXX whiteout support not really in yet
3378  */
3379 static int
3380 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
3381                 struct vnode *dvp, struct ucred *cred, 
3382                 int flags, int isdir)
3383 {
3384         struct namecache *ncp;
3385         hammer_inode_t dip;
3386         hammer_inode_t ip;
3387         hammer_mount_t hmp;
3388         struct hammer_cursor cursor;
3389         int64_t namekey;
3390         u_int32_t max_iterations;
3391         int nlen, error;
3392
3393         /*
3394          * Calculate the namekey and setup the key range for the scan.  This
3395          * works kinda like a chained hash table where the lower 32 bits
3396          * of the namekey synthesize the chain.
3397          *
3398          * The key range is inclusive of both key_beg and key_end.
3399          */
3400         dip = VTOI(dvp);
3401         ncp = nch->ncp;
3402         hmp = dip->hmp;
3403
3404         if (dip->flags & HAMMER_INODE_RO)
3405                 return (EROFS);
3406
3407         namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3408                                            &max_iterations);
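             /*
              * Note (added): the scan below covers the inclusive key
              * window [namekey, namekey + max_iterations]; names whose
              * hashes collide are chained within this window.
              */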
3409 retry:
3410         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3411         cursor.key_beg.localization = dip->obj_localization +
3412                                       hammer_dir_localization(dip);
3413         cursor.key_beg.obj_id = dip->obj_id;
3414         cursor.key_beg.key = namekey;
3415         cursor.key_beg.create_tid = 0;
3416         cursor.key_beg.delete_tid = 0;
3417         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3418         cursor.key_beg.obj_type = 0;
3419
3420         cursor.key_end = cursor.key_beg;
3421         cursor.key_end.key += max_iterations;
3422         cursor.asof = dip->obj_asof;
3423         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3424
3425         /*
3426          * Scan all matching records (the chain), locate the one matching
3427          * the requested path component.  info->last_error contains the
3428          * error code on search termination and could be 0, ENOENT, or
3429          * something else.
3430          *
3431          * The hammer_ip_*() functions merge in-memory records with on-disk
3432          * records for the purposes of the search.
3433          */
3434         error = hammer_ip_first(&cursor);
3435
3436         while (error == 0) {
3437                 error = hammer_ip_resolve_data(&cursor);
3438                 if (error)
3439                         break;
3440                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3441                 KKASSERT(nlen > 0);
3442                 if (ncp->nc_nlen == nlen &&
3443                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3444                         break;
3445                 }
3446                 error = hammer_ip_next(&cursor);
3447         }
3448
3449         /*
3450          * If all is ok we have to get the inode so we can adjust nlinks.
3451          * To avoid a deadlock with the flusher we must release the inode
3452          * lock on the directory when acquiring the inode for the entry.
3453          *
3454          * If the target is a directory, it must be empty.
3455          */
3456         if (error == 0) {
3457                 hammer_unlock(&cursor.ip->lock);
3458                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3459                                       hmp->asof,
3460                                       cursor.data->entry.localization,
3461                                       0, &error);
3462                 hammer_lock_sh(&cursor.ip->lock);
3463                 if (error == ENOENT) {
3464                         kprintf("HAMMER: WARNING: Removing "
3465                                 "dirent w/missing inode \"%s\"\n"
3466                                 "\tobj_id = %016llx\n",
3467                                 ncp->nc_name,
3468                                 (long long)cursor.data->entry.obj_id);
3469                         error = 0;
3470                 }
3471
3472                 /*
3473                  * If isdir >= 0 we validate that the entry is or is not a
3474                  * directory.  If isdir < 0 we don't care.
3475                  */
3476                 if (error == 0 && isdir >= 0 && ip) {
3477                         if (isdir &&
3478                             ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3479                                 error = ENOTDIR;
3480                         } else if (isdir == 0 &&
3481                             ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3482                                 error = EISDIR;
3483                         }
3484                 }
3485
3486                 /*
3487                  * If we are trying to remove a directory the directory must
3488                  * be empty.
3489                  *
3490                  * The check directory code can loop and deadlock/retry.  Our
3491                  * own cursor's node locks must be released to avoid a 3-way
3492                  * deadlock with the flusher if the check directory code
3493                  * blocks.
3494                  *
3495                  * If any changes whatsoever have been made to the cursor
3496                  * set EDEADLK and retry.
3497                  *
3498                  * WARNING: See warnings in hammer_unlock_cursor()
3499                  *          function.
3500                  */
3501                 if (error == 0 && ip && ip->ino_data.obj_type ==
3502                                         HAMMER_OBJTYPE_DIRECTORY) {
3503                         hammer_unlock_cursor(&cursor);
3504                         error = hammer_ip_check_directory_empty(trans, ip);
3505                         hammer_lock_cursor(&cursor);
3506                         if (cursor.flags & HAMMER_CURSOR_RETEST) {
3507                                 kprintf("HAMMER: Warning: avoided deadlock "
3508                                         "on rmdir '%s'\n",
3509                                         ncp->nc_name);
3510                                 error = EDEADLK;
3511                         }
3512                 }
3513
3514                 /*
3515                  * Delete the directory entry.
3516                  *
3517                  * WARNING: hammer_ip_del_directory() may have to terminate
3518                  * the cursor to avoid a deadlock.  It is ok to call
3519                  * hammer_done_cursor() twice.
3520                  */
3521                 if (error == 0) {
3522                         error = hammer_ip_del_directory(trans, &cursor,
3523                                                         dip, ip);
3524                 }
3525                 hammer_done_cursor(&cursor);
3526                 if (error == 0) {
3527                         /*
3528                          * Tell the namecache that we are now unlinked.
3529                          */
3530                         cache_unlink(nch);
3531
3532                         /*
3533                          * NOTE: ip->vp, if non-NULL, cannot be directly
3534                          *       referenced without formally acquiring the
3535                          *       vp since the vp might have zero refs on it,
3536                          *       or in the middle of a reclaim, etc.
3537                          *
3538                          * NOTE: The cache_setunresolved() can rip the vp
3539                          *       out from under us since the vp may not have
3540                          *       any refs, in which case ip->vp will be NULL
3541                          *       from the outset.
3542                          */
3543                         while (ip && ip->vp) {
3544                                 struct vnode *vp;
3545
3546                                 error = hammer_get_vnode(ip, &vp);
3547                                 if (error == 0 && vp) {
3548                                         vn_unlock(vp);
3549                                         hammer_knote(ip->vp, NOTE_DELETE);
3550 #if 0
3551                                         /*
3552                                          * Don't do this, it can deadlock
3553                                          * on concurrent rm's of hardlinks.
3554                                          * Shouldn't be needed any more.
3555                                          */
3556                                         cache_inval_vp(ip->vp, CINV_DESTROY);
3557 #endif
3558                                         vrele(vp);
3559                                         break;
3560                                 }
3561                                 kprintf("Debug: HAMMER ip/vp race1 avoided\n");
3562                         }
3563                 }
3564                 if (ip)
3565                         hammer_rel_inode(ip, 0);
3566         } else {
3567                 hammer_done_cursor(&cursor);
3568         }
3569         if (error == EDEADLK)
3570                 goto retry;
3571
3572         return (error);
3573 }
3574
3575 /************************************************************************
3576  *                          FIFO AND SPECFS OPS                         *
3577  ************************************************************************
3578  *
3579  */
3580 static int
3581 hammer_vop_fifoclose (struct vop_close_args *ap)
3582 {
3583         /* XXX update itimes */
3584         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3585 }
3586
3587 static int
3588 hammer_vop_fiforead (struct vop_read_args *ap)
3589 {
3590         int error;
3591
3592         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3593         /* XXX update access time */
3594         return (error);
3595 }
3596
3597 static int
3598 hammer_vop_fifowrite (struct vop_write_args *ap)
3599 {
3600         int error;
3601
3602         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3603         /* XXX update access time */
3604         return (error);
3605 }
3606
3607 static
3608 int
3609 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3610 {
3611         int error;
3612
3613         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3614         if (error)
3615                 error = hammer_vop_kqfilter(ap);
3616         return(error);
3617 }
3618
3619 /************************************************************************
3620  *                          KQFILTER OPS                                *
3621  ************************************************************************
3622  *
3623  */
3624 static void filt_hammerdetach(struct knote *kn);
3625 static int filt_hammerread(struct knote *kn, long hint);
3626 static int filt_hammerwrite(struct knote *kn, long hint);
3627 static int filt_hammervnode(struct knote *kn, long hint);
3628
3629 static struct filterops hammerread_filtops =
3630         { FILTEROP_ISFD | FILTEROP_MPSAFE,
3631           NULL, filt_hammerdetach, filt_hammerread };
3632 static struct filterops hammerwrite_filtops =
3633         { FILTEROP_ISFD | FILTEROP_MPSAFE,
3634           NULL, filt_hammerdetach, filt_hammerwrite };
3635 static struct filterops hammervnode_filtops =
3636         { FILTEROP_ISFD | FILTEROP_MPSAFE,
3637           NULL, filt_hammerdetach, filt_hammervnode };
3638
3639 static
3640 int
3641 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3642 {
3643         struct vnode *vp = ap->a_vp;
3644         struct knote *kn = ap->a_kn;
3645
3646         switch (kn->kn_filter) {
3647         case EVFILT_READ:
3648                 kn->kn_fop = &hammerread_filtops;
3649                 break;
3650         case EVFILT_WRITE:
3651                 kn->kn_fop = &hammerwrite_filtops;
3652                 break;
3653         case EVFILT_VNODE:
3654                 kn->kn_fop = &hammervnode_filtops;
3655                 break;
3656         default:
3657                 return (EOPNOTSUPP);
3658         }
3659
3660         kn->kn_hook = (caddr_t)vp;
3661
3662         knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3663
3664         return(0);
3665 }
3666
3667 static void
3668 filt_hammerdetach(struct knote *kn)
3669 {
3670         struct vnode *vp = (void *)kn->kn_hook;
3671
3672         knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3673 }
3674
3675 static int
3676 filt_hammerread(struct knote *kn, long hint)
3677 {
3678         struct vnode *vp = (void *)kn->kn_hook;
3679         hammer_inode_t ip = VTOI(vp);
3680         hammer_mount_t hmp = ip->hmp;
3681         off_t off;
3682
3683         if (hint == NOTE_REVOKE) {
3684                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
3685                 return(1);
3686         }
3687         lwkt_gettoken(&hmp->fs_token);  /* XXX use per-ip-token */
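             /*
              * Note (added): report the bytes readable between the
              * descriptor's current offset and EOF, clamped to
              * INTPTR_MAX so a huge 64-bit file size cannot overflow
              * kn_data.
              */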
3688         off = ip->ino_data.size - kn->kn_fp->f_offset;
3689         kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
3690         lwkt_reltoken(&hmp->fs_token);
3691         if (kn->kn_sfflags & NOTE_OLDAPI)
3692                 return(1);
3693         return (kn->kn_data != 0);
3694 }
3695
3696 static int
3697 filt_hammerwrite(struct knote *kn, long hint)
3698 {
3699         if (hint == NOTE_REVOKE)
3700                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
3701         kn->kn_data = 0;
3702         return (1);
3703 }
3704
3705 static int
3706 filt_hammervnode(struct knote *kn, long hint)
3707 {
3708         if (kn->kn_sfflags & hint)
3709                 kn->kn_fflags |= hint;
3710         if (hint == NOTE_REVOKE) {
3711                 kn->kn_flags |= (EV_EOF | EV_NODATA);
3712                 return (1);
3713         }
3714         return (kn->kn_fflags != 0);
3715 }
3716