abebc5233294e1802d4b297da923b000de78aa1e
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34
35 #include <sys/fcntl.h>
36 #include <sys/namecache.h>
37 #include <sys/event.h>
38 #include <sys/dirent.h>
39 #include <sys/file.h>
40 #include <vm/swap_pager.h>
41 #include <vfs/fifofs/fifo.h>
42
43 #include "hammer.h"
44
/*
 * HAMMER VNOPS
 *
 * Forward declarations for the file-local handlers that are installed
 * in the vop_ops tables defined below.
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *ap);*/
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_markatime(struct vop_markatime_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);
static int hammer_vop_kqfilter(struct vop_kqfilter_args *ap);

/* FIFO-specific variants, installed in hammer_fifo_vops. */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);
static int hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap);
83
84 struct vop_ops hammer_vnode_vops = {
85         .vop_default =          vop_defaultop,
86         .vop_fsync =            hammer_vop_fsync,
87         .vop_getpages =         vop_stdgetpages,
88         .vop_putpages =         vop_stdputpages,
89         .vop_read =             hammer_vop_read,
90         .vop_write =            hammer_vop_write,
91         .vop_access =           hammer_vop_access,
92         .vop_advlock =          hammer_vop_advlock,
93         .vop_close =            hammer_vop_close,
94         .vop_ncreate =          hammer_vop_ncreate,
95         .vop_getattr =          hammer_vop_getattr,
96         .vop_inactive =         hammer_vop_inactive,
97         .vop_reclaim =          hammer_vop_reclaim,
98         .vop_nresolve =         hammer_vop_nresolve,
99         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
100         .vop_nlink =            hammer_vop_nlink,
101         .vop_nmkdir =           hammer_vop_nmkdir,
102         .vop_nmknod =           hammer_vop_nmknod,
103         .vop_open =             hammer_vop_open,
104         .vop_pathconf =         vop_stdpathconf,
105         .vop_print =            hammer_vop_print,
106         .vop_readdir =          hammer_vop_readdir,
107         .vop_readlink =         hammer_vop_readlink,
108         .vop_nremove =          hammer_vop_nremove,
109         .vop_nrename =          hammer_vop_nrename,
110         .vop_nrmdir =           hammer_vop_nrmdir,
111         .vop_markatime =        hammer_vop_markatime,
112         .vop_setattr =          hammer_vop_setattr,
113         .vop_bmap =             hammer_vop_bmap,
114         .vop_strategy =         hammer_vop_strategy,
115         .vop_nsymlink =         hammer_vop_nsymlink,
116         .vop_nwhiteout =        hammer_vop_nwhiteout,
117         .vop_ioctl =            hammer_vop_ioctl,
118         .vop_mountctl =         hammer_vop_mountctl,
119         .vop_kqfilter =         hammer_vop_kqfilter
120 };
121
122 struct vop_ops hammer_spec_vops = {
123         .vop_default =          vop_defaultop,
124         .vop_fsync =            hammer_vop_fsync,
125         .vop_read =             vop_stdnoread,
126         .vop_write =            vop_stdnowrite,
127         .vop_access =           hammer_vop_access,
128         .vop_close =            hammer_vop_close,
129         .vop_markatime =        hammer_vop_markatime,
130         .vop_getattr =          hammer_vop_getattr,
131         .vop_inactive =         hammer_vop_inactive,
132         .vop_reclaim =          hammer_vop_reclaim,
133         .vop_setattr =          hammer_vop_setattr
134 };
135
136 struct vop_ops hammer_fifo_vops = {
137         .vop_default =          fifo_vnoperate,
138         .vop_fsync =            hammer_vop_fsync,
139         .vop_read =             hammer_vop_fiforead,
140         .vop_write =            hammer_vop_fifowrite,
141         .vop_access =           hammer_vop_access,
142         .vop_close =            hammer_vop_fifoclose,
143         .vop_markatime =        hammer_vop_markatime,
144         .vop_getattr =          hammer_vop_getattr,
145         .vop_inactive =         hammer_vop_inactive,
146         .vop_reclaim =          hammer_vop_reclaim,
147         .vop_setattr =          hammer_vop_setattr,
148         .vop_kqfilter =         hammer_vop_fifokqfilter
149 };
150
151 static __inline
152 void
153 hammer_knote(struct vnode *vp, int flags)
154 {
155         if (flags)
156                 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
157 }
158
159 #ifdef DEBUG_TRUNCATE
160 struct hammer_inode *HammerTruncIp;
161 #endif
162
163 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
164                            struct vnode *dvp, struct ucred *cred,
165                            int flags, int isdir);
166 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
167 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
168
#if 0
/*
 * Disabled generic entry point: forwards the operation described by
 * 'ap' to the hammer_vnode_vops table via VOCALL().
 *
 * The parameter must be named in the definition because the body
 * references it; the block is kept compiled-out as in the original.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
177
178 /*
179  * hammer_vop_fsync { vp, waitfor }
180  *
181  * fsync() an inode to disk and wait for it to be completely committed
182  * such that the information would not be undone if a crash occured after
183  * return.
184  *
185  * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
186  *       a REDO log.  A sysctl is provided to relax HAMMER's fsync()
187  *       operation.
188  *
189  *       Ultimately the combination of a REDO log and use of fast storage
190  *       to front-end cluster caches will make fsync fast, but it aint
191  *       here yet.  And, in anycase, we need real transactional
192  *       all-or-nothing features which are not restricted to a single file.
193  */
194 static
195 int
196 hammer_vop_fsync(struct vop_fsync_args *ap)
197 {
198         hammer_inode_t ip = VTOI(ap->a_vp);
199         hammer_mount_t hmp = ip->hmp;
200         int waitfor = ap->a_waitfor;
201         int mode;
202
203         lwkt_gettoken(&hmp->fs_token);
204
205         /*
206          * Fsync rule relaxation (default is either full synchronous flush
207          * or REDO semantics with synchronous flush).
208          */
209         if (ap->a_flags & VOP_FSYNC_SYSCALL) {
210                 switch(hammer_fsync_mode) {
211                 case 0:
212 mode0:
213                         /* no REDO, full synchronous flush */
214                         goto skip;
215                 case 1:
216 mode1:
217                         /* no REDO, full asynchronous flush */
218                         if (waitfor == MNT_WAIT)
219                                 waitfor = MNT_NOWAIT;
220                         goto skip;
221                 case 2:
222                         /* REDO semantics, synchronous flush */
223                         if (hmp->version < HAMMER_VOL_VERSION_FOUR)
224                                 goto mode0;
225                         mode = HAMMER_FLUSH_UNDOS_AUTO;
226                         break;
227                 case 3:
228                         /* REDO semantics, relaxed asynchronous flush */
229                         if (hmp->version < HAMMER_VOL_VERSION_FOUR)
230                                 goto mode1;
231                         mode = HAMMER_FLUSH_UNDOS_RELAXED;
232                         if (waitfor == MNT_WAIT)
233                                 waitfor = MNT_NOWAIT;
234                         break;
235                 case 4:
236                         /* ignore the fsync() system call */
237                         lwkt_reltoken(&hmp->fs_token);
238                         return(0);
239                 default:
240                         /* we have to do something */
241                         mode = HAMMER_FLUSH_UNDOS_RELAXED;
242                         if (waitfor == MNT_WAIT)
243                                 waitfor = MNT_NOWAIT;
244                         break;
245                 }
246
247                 /*
248                  * Fast fsync only needs to flush the UNDO/REDO fifo if
249                  * HAMMER_INODE_REDO is non-zero and the only modifications
250                  * made to the file are write or write-extends.
251                  */
252                 if ((ip->flags & HAMMER_INODE_REDO) &&
253                     (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0) {
254                         ++hammer_count_fsyncs;
255                         hammer_flusher_flush_undos(hmp, mode);
256                         ip->redo_count = 0;
257                         if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
258                                 vclrisdirty(ip->vp);
259                         lwkt_reltoken(&hmp->fs_token);
260                         return(0);
261                 }
262
263                 /*
264                  * REDO is enabled by fsync(), the idea being we really only
265                  * want to lay down REDO records when programs are using
266                  * fsync() heavily.  The first fsync() on the file starts
267                  * the gravy train going and later fsync()s keep it hot by
268                  * resetting the redo_count.
269                  *
270                  * We weren't running REDOs before now so we have to fall
271                  * through and do a full fsync of what we have.
272                  */
273                 if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
274                     (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
275                         ip->flags |= HAMMER_INODE_REDO;
276                         ip->redo_count = 0;
277                 }
278         }
279 skip:
280
281         /*
282          * Do a full flush sequence.
283          *
284          * Attempt to release the vnode while waiting for the inode to
285          * finish flushing.  This can really mess up inactive->reclaim
286          * sequences so only do it if the vnode is active.
287          *
288          * WARNING! The VX lock functions must be used.  vn_lock() will
289          *          fail when this is part of a VOP_RECLAIM sequence.
290          */
291         ++hammer_count_fsyncs;
292         vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
293         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
294         if (waitfor == MNT_WAIT) {
295                 int dorelock;
296
297                 if ((ap->a_vp->v_flag & VRECLAIMED) == 0) {
298                         vx_unlock(ap->a_vp);
299                         dorelock = 1;
300                 } else {
301                         dorelock = 0;
302                 }
303                 hammer_wait_inode(ip);
304                 if (dorelock)
305                         vx_lock(ap->a_vp);
306         }
307         if (ip->vp && (ip->flags & HAMMER_INODE_MODMASK) == 0)
308                 vclrisdirty(ip->vp);
309         lwkt_reltoken(&hmp->fs_token);
310         return (ip->error);
311 }
312
313 /*
314  * hammer_vop_read { vp, uio, ioflag, cred }
315  *
316  * MPSAFE (for the cache safe does not require fs_token)
317  */
318 static
319 int
320 hammer_vop_read(struct vop_read_args *ap)
321 {
322         struct hammer_transaction trans;
323         hammer_inode_t ip;
324         hammer_mount_t hmp;
325         off_t offset;
326         struct buf *bp;
327         struct uio *uio;
328         int error;
329         int n;
330         int seqcount;
331         int ioseqcount;
332         int blksize;
333         int bigread;
334         int got_trans;
335         size_t resid;
336
337         if (ap->a_vp->v_type != VREG)
338                 return (EINVAL);
339         ip = VTOI(ap->a_vp);
340         hmp = ip->hmp;
341         error = 0;
342         got_trans = 0;
343         uio = ap->a_uio;
344
345         /*
346          * Attempt to shortcut directly to the VM object using lwbufs.
347          * This is much faster than instantiating buffer cache buffers.
348          */
349         resid = uio->uio_resid;
350         error = vop_helper_read_shortcut(ap);
351         hammer_stats_file_read += resid - uio->uio_resid;
352         if (error)
353                 return (error);
354         if (uio->uio_resid == 0)
355                 goto finished;
356
357         /*
358          * Allow the UIO's size to override the sequential heuristic.
359          */
360         blksize = hammer_blocksize(uio->uio_offset);
361         seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
362         ioseqcount = (ap->a_ioflag >> 16);
363         if (seqcount < ioseqcount)
364                 seqcount = ioseqcount;
365
366         /*
367          * If reading or writing a huge amount of data we have to break
368          * atomicy and allow the operation to be interrupted by a signal
369          * or it can DOS the machine.
370          */
371         bigread = (uio->uio_resid > 100 * 1024 * 1024);
372
373         /*
374          * Access the data typically in HAMMER_BUFSIZE blocks via the
375          * buffer cache, but HAMMER may use a variable block size based
376          * on the offset.
377          *
378          * XXX Temporary hack, delay the start transaction while we remain
379          *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
380          *     locked-shared.
381          */
382         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
383                 int64_t base_offset;
384                 int64_t file_limit;
385
386                 blksize = hammer_blocksize(uio->uio_offset);
387                 offset = (int)uio->uio_offset & (blksize - 1);
388                 base_offset = uio->uio_offset - offset;
389
390                 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
391                         break;
392
393                 /*
394                  * MPSAFE
395                  */
396                 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
397                 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
398                         bp->b_flags &= ~B_AGE;
399                         error = 0;
400                         goto skip;
401                 }
402                 if (ap->a_ioflag & IO_NRDELAY) {
403                         bqrelse(bp);
404                         return (EWOULDBLOCK);
405                 }
406
407                 /*
408                  * MPUNSAFE
409                  */
410                 if (got_trans == 0) {
411                         hammer_start_transaction(&trans, ip->hmp);
412                         got_trans = 1;
413                 }
414
415                 /*
416                  * NOTE: A valid bp has already been acquired, but was not
417                  *       B_CACHE.
418                  */
419                 if (hammer_cluster_enable) {
420                         /*
421                          * Use file_limit to prevent cluster_read() from
422                          * creating buffers of the wrong block size past
423                          * the demarc.
424                          */
425                         file_limit = ip->ino_data.size;
426                         if (base_offset < HAMMER_XDEMARC &&
427                             file_limit > HAMMER_XDEMARC) {
428                                 file_limit = HAMMER_XDEMARC;
429                         }
430                         error = cluster_readx(ap->a_vp,
431                                              file_limit, base_offset,
432                                              blksize, uio->uio_resid,
433                                              seqcount * BKVASIZE, &bp);
434                 } else {
435                         error = breadnx(ap->a_vp, base_offset, blksize,
436                                         NULL, NULL, 0, &bp);
437                 }
438                 if (error) {
439                         brelse(bp);
440                         break;
441                 }
442 skip:
443                 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
444                         kprintf("doff %016jx read file %016jx@%016jx\n",
445                                 (intmax_t)bp->b_bio2.bio_offset,
446                                 (intmax_t)ip->obj_id,
447                                 (intmax_t)bp->b_loffset);
448                 }
449                 bp->b_flags &= ~B_IODEBUG;
450                 if (blksize == HAMMER_XBUFSIZE)
451                         bp->b_flags |= B_CLUSTEROK;
452
453                 n = blksize - offset;
454                 if (n > uio->uio_resid)
455                         n = uio->uio_resid;
456                 if (n > ip->ino_data.size - uio->uio_offset)
457                         n = (int)(ip->ino_data.size - uio->uio_offset);
458
459                 /*
460                  * Set B_AGE, data has a lower priority than meta-data.
461                  *
462                  * Use a hold/unlock/drop sequence to run the uiomove
463                  * with the buffer unlocked, avoiding deadlocks against
464                  * read()s on mmap()'d spaces.
465                  */
466                 bp->b_flags |= B_AGE;
467                 error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
468                 bqrelse(bp);
469
470                 if (error)
471                         break;
472                 hammer_stats_file_read += n;
473         }
474
475 finished:
476
477         /*
478          * Try to update the atime with just the inode lock for maximum
479          * concurrency.  If we can't shortcut it we have to get the full
480          * blown transaction.
481          */
482         if (got_trans == 0 && hammer_update_atime_quick(ip) < 0) {
483                 hammer_start_transaction(&trans, ip->hmp);
484                 got_trans = 1;
485         }
486
487         if (got_trans) {
488                 if ((ip->flags & HAMMER_INODE_RO) == 0 &&
489                     (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
490                         lwkt_gettoken(&hmp->fs_token);
491                         ip->ino_data.atime = trans.time;
492                         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
493                         hammer_done_transaction(&trans);
494                         lwkt_reltoken(&hmp->fs_token);
495                 } else {
496                         hammer_done_transaction(&trans);
497                 }
498         }
499         return (error);
500 }
501
502 /*
503  * hammer_vop_write { vp, uio, ioflag, cred }
504  */
505 static
506 int
507 hammer_vop_write(struct vop_write_args *ap)
508 {
509         struct hammer_transaction trans;
510         struct hammer_inode *ip;
511         hammer_mount_t hmp;
512         thread_t td;
513         struct uio *uio;
514         int offset;
515         off_t base_offset;
516         int64_t cluster_eof;
517         struct buf *bp;
518         int kflags;
519         int error;
520         int n;
521         int flags;
522         int seqcount;
523         int bigwrite;
524
525         if (ap->a_vp->v_type != VREG)
526                 return (EINVAL);
527         ip = VTOI(ap->a_vp);
528         hmp = ip->hmp;
529         error = 0;
530         kflags = 0;
531         seqcount = ap->a_ioflag >> 16;
532
533         if (ip->flags & HAMMER_INODE_RO)
534                 return (EROFS);
535
536         /*
537          * Create a transaction to cover the operations we perform.
538          */
539         hammer_start_transaction(&trans, hmp);
540         uio = ap->a_uio;
541
542         /*
543          * Check append mode
544          */
545         if (ap->a_ioflag & IO_APPEND)
546                 uio->uio_offset = ip->ino_data.size;
547
548         /*
549          * Check for illegal write offsets.  Valid range is 0...2^63-1.
550          *
551          * NOTE: the base_off assignment is required to work around what
552          * I consider to be a GCC-4 optimization bug.
553          */
554         if (uio->uio_offset < 0) {
555                 hammer_done_transaction(&trans);
556                 return (EFBIG);
557         }
558         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
559         if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
560                 hammer_done_transaction(&trans);
561                 return (EFBIG);
562         }
563
564         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
565             base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
566                 hammer_done_transaction(&trans);
567                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
568                 return (EFBIG);
569         }
570
571         /*
572          * If reading or writing a huge amount of data we have to break
573          * atomicy and allow the operation to be interrupted by a signal
574          * or it can DOS the machine.
575          *
576          * Preset redo_count so we stop generating REDOs earlier if the
577          * limit is exceeded.
578          *
579          * redo_count is heuristical, SMP races are ok
580          */
581         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
582         if ((ip->flags & HAMMER_INODE_REDO) &&
583             ip->redo_count < hammer_limit_redo) {
584                 ip->redo_count += uio->uio_resid;
585         }
586
587         /*
588          * Access the data typically in HAMMER_BUFSIZE blocks via the
589          * buffer cache, but HAMMER may use a variable block size based
590          * on the offset.
591          */
592         while (uio->uio_resid > 0) {
593                 int fixsize = 0;
594                 int blksize;
595                 int blkmask;
596                 int trivial;
597                 int endofblk;
598                 off_t nsize;
599
600                 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
601                         break;
602                 if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
603                         break;
604
605                 blksize = hammer_blocksize(uio->uio_offset);
606
607                 /*
608                  * Control the number of pending records associated with
609                  * this inode.  If too many have accumulated start a
610                  * flush.  Try to maintain a pipeline with the flusher.
611                  *
612                  * NOTE: It is possible for other sources to grow the
613                  *       records but not necessarily issue another flush,
614                  *       so use a timeout and ensure that a re-flush occurs.
615                  */
616                 if (ip->rsv_recs >= hammer_limit_inode_recs) {
617                         lwkt_gettoken(&hmp->fs_token);
618                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
619                         while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
620                                 ip->flags |= HAMMER_INODE_RECSW;
621                                 tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
622                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
623                         }
624                         lwkt_reltoken(&hmp->fs_token);
625                 }
626
627                 /*
628                  * Do not allow HAMMER to blow out the buffer cache.  Very
629                  * large UIOs can lockout other processes due to bwillwrite()
630                  * mechanics.
631                  *
632                  * The hammer inode is not locked during these operations.
633                  * The vnode is locked which can interfere with the pageout
634                  * daemon for non-UIO_NOCOPY writes but should not interfere
635                  * with the buffer cache.  Even so, we cannot afford to
636                  * allow the pageout daemon to build up too many dirty buffer
637                  * cache buffers.
638                  *
639                  * Only call this if we aren't being recursively called from
640                  * a virtual disk device (vn), else we may deadlock.
641                  */
642                 if ((ap->a_ioflag & IO_RECURSE) == 0)
643                         bwillwrite(blksize);
644
645                 /*
646                  * Calculate the blocksize at the current offset and figure
647                  * out how much we can actually write.
648                  */
649                 blkmask = blksize - 1;
650                 offset = (int)uio->uio_offset & blkmask;
651                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
652                 n = blksize - offset;
653                 if (n > uio->uio_resid) {
654                         n = uio->uio_resid;
655                         endofblk = 0;
656                 } else {
657                         endofblk = 1;
658                 }
659                 nsize = uio->uio_offset + n;
660                 if (nsize > ip->ino_data.size) {
661                         if (uio->uio_offset > ip->ino_data.size)
662                                 trivial = 0;
663                         else
664                                 trivial = 1;
665                         nvextendbuf(ap->a_vp,
666                                     ip->ino_data.size,
667                                     nsize,
668                                     hammer_blocksize(ip->ino_data.size),
669                                     hammer_blocksize(nsize),
670                                     hammer_blockoff(ip->ino_data.size),
671                                     hammer_blockoff(nsize),
672                                     trivial);
673                         fixsize = 1;
674                         kflags |= NOTE_EXTEND;
675                 }
676
677                 if (uio->uio_segflg == UIO_NOCOPY) {
678                         /*
679                          * Issuing a write with the same data backing the
680                          * buffer.  Instantiate the buffer to collect the
681                          * backing vm pages, then read-in any missing bits.
682                          *
683                          * This case is used by vop_stdputpages().
684                          */
685                         bp = getblk(ap->a_vp, base_offset,
686                                     blksize, GETBLK_BHEAVY, 0);
687                         if ((bp->b_flags & B_CACHE) == 0) {
688                                 bqrelse(bp);
689                                 error = bread(ap->a_vp, base_offset,
690                                               blksize, &bp);
691                         }
692                 } else if (offset == 0 && uio->uio_resid >= blksize) {
693                         /*
694                          * Even though we are entirely overwriting the buffer
695                          * we may still have to zero it out to avoid a
696                          * mmap/write visibility issue.
697                          */
698                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
699                         if ((bp->b_flags & B_CACHE) == 0)
700                                 vfs_bio_clrbuf(bp);
701                 } else if (base_offset >= ip->ino_data.size) {
702                         /*
703                          * If the base offset of the buffer is beyond the
704                          * file EOF, we don't have to issue a read.
705                          */
706                         bp = getblk(ap->a_vp, base_offset,
707                                     blksize, GETBLK_BHEAVY, 0);
708                         vfs_bio_clrbuf(bp);
709                 } else {
710                         /*
711                          * Partial overwrite, read in any missing bits then
712                          * replace the portion being written.
713                          */
714                         error = bread(ap->a_vp, base_offset, blksize, &bp);
715                         if (error == 0)
716                                 bheavy(bp);
717                 }
718                 if (error == 0)
719                         error = uiomovebp(bp, bp->b_data + offset, n, uio);
720
721                 lwkt_gettoken(&hmp->fs_token);
722
723                 /*
724                  * Generate REDO records if enabled and redo_count will not
725                  * exceeded the limit.
726                  *
727                  * If redo_count exceeds the limit we stop generating records
728                  * and clear HAMMER_INODE_REDO.  This will cause the next
729                  * fsync() to do a full meta-data sync instead of just an
730                  * UNDO/REDO fifo update.
731                  *
732                  * When clearing HAMMER_INODE_REDO any pre-existing REDOs
733                  * will still be tracked.  The tracks will be terminated
734                  * when the related meta-data (including possible data
735                  * modifications which are not tracked via REDO) is
736                  * flushed.
737                  */
738                 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
739                         if (ip->redo_count < hammer_limit_redo) {
740                                 bp->b_flags |= B_VFSFLAG1;
741                                 error = hammer_generate_redo(&trans, ip,
742                                                      base_offset + offset,
743                                                      HAMMER_REDO_WRITE,
744                                                      bp->b_data + offset,
745                                                      (size_t)n);
746                         } else {
747                                 ip->flags &= ~HAMMER_INODE_REDO;
748                         }
749                 }
750
751                 /*
752                  * If we screwed up we have to undo any VM size changes we
753                  * made.
754                  */
755                 if (error) {
756                         brelse(bp);
757                         if (fixsize) {
758                                 nvtruncbuf(ap->a_vp, ip->ino_data.size,
759                                           hammer_blocksize(ip->ino_data.size),
760                                           hammer_blockoff(ip->ino_data.size),
761                                           0);
762                         }
763                         lwkt_reltoken(&hmp->fs_token);
764                         break;
765                 }
766                 kflags |= NOTE_WRITE;
767                 hammer_stats_file_write += n;
768                 if (blksize == HAMMER_XBUFSIZE)
769                         bp->b_flags |= B_CLUSTEROK;
770                 if (ip->ino_data.size < uio->uio_offset) {
771                         ip->ino_data.size = uio->uio_offset;
772                         flags = HAMMER_INODE_SDIRTY;
773                 } else {
774                         flags = 0;
775                 }
776                 ip->ino_data.mtime = trans.time;
777                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
778                 hammer_modify_inode(&trans, ip, flags);
779
780                 /*
781                  * Once we dirty the buffer any cached zone-X offset
782                  * becomes invalid.  HAMMER NOTE: no-history mode cannot
783                  * allow overwriting over the same data sector unless
784                  * we provide UNDOs for the old data, which we don't.
785                  */
786                 bp->b_bio2.bio_offset = NOOFFSET;
787
788                 lwkt_reltoken(&hmp->fs_token);
789
790                 /*
791                  * Final buffer disposition.
792                  *
793                  * Because meta-data updates are deferred, HAMMER is
794                  * especially sensitive to excessive bdwrite()s because
795                  * the I/O stream is not broken up by disk reads.  So the
796                  * buffer cache simply cannot keep up.
797                  *
798                  * WARNING!  blksize is variable.  cluster_write() is
799                  *           expected to not blow up if it encounters
800                  *           buffers that do not match the passed blksize.
801                  *
802                  * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
803                  *        The ip->rsv_recs check should burst-flush the data.
804                  *        If we queue it immediately the buf could be left
805                  *        locked on the device queue for a very long time.
806                  *
807                  *        However, failing to flush a dirty buffer out when
808                  *        issued from the pageout daemon can result in a low
809                  *        memory deadlock against bio_page_alloc(), so we
810                  *        have to bawrite() on IO_ASYNC as well.
811                  *
812                  * NOTE!  To avoid degenerate stalls due to mismatched block
813                  *        sizes we only honor IO_DIRECT on the write which
814                  *        abuts the end of the buffer.  However, we must
815                  *        honor IO_SYNC in case someone is silly enough to
816                  *        configure a HAMMER file as swap, or when HAMMER
817                  *        is serving NFS (for commits).  Ick ick.
818                  */
819                 bp->b_flags |= B_AGE;
820                 if (blksize == HAMMER_XBUFSIZE)
821                         bp->b_flags |= B_CLUSTEROK;
822
823                 if (ap->a_ioflag & IO_SYNC) {
824                         bwrite(bp);
825                 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
826                         bawrite(bp);
827                 } else if (ap->a_ioflag & IO_ASYNC) {
828                         bawrite(bp);
829                 } else if (hammer_cluster_enable &&
830                            !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
831                         if (base_offset < HAMMER_XDEMARC)
832                                 cluster_eof = hammer_blockdemarc(base_offset,
833                                                          ip->ino_data.size);
834                         else
835                                 cluster_eof = ip->ino_data.size;
836                         cluster_write(bp, cluster_eof, blksize, seqcount);
837                 } else {
838                         bdwrite(bp);
839                 }
840         }
841         hammer_done_transaction(&trans);
842         hammer_knote(ap->a_vp, kflags);
843
844         return (error);
845 }
846
847 /*
848  * hammer_vop_access { vp, mode, cred }
849  *
850  * MPSAFE - does not require fs_token
851  */
852 static
853 int
854 hammer_vop_access(struct vop_access_args *ap)
855 {
856         struct hammer_inode *ip = VTOI(ap->a_vp);
857         uid_t uid;
858         gid_t gid;
859         int error;
860
861         ++hammer_stats_file_iopsr;
862         uid = hammer_to_unix_xid(&ip->ino_data.uid);
863         gid = hammer_to_unix_xid(&ip->ino_data.gid);
864
865         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
866                                   ip->ino_data.uflags);
867         return (error);
868 }
869
870 /*
871  * hammer_vop_advlock { vp, id, op, fl, flags }
872  *
873  * MPSAFE - does not require fs_token
874  */
875 static
876 int
877 hammer_vop_advlock(struct vop_advlock_args *ap)
878 {
879         hammer_inode_t ip = VTOI(ap->a_vp);
880
881         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
882 }
883
/*
 * hammer_vop_close { vp, fflag }
 *
 * The close itself is handled by vop_stdclose().
 *
 * We can only sync-on-close for normal closes.  XXX the sync-on-close
 * code is disabled for now (compiled out below).
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	int error;

#if 0
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;

	/*
	 * Only act when a close-sync was requested, the vnode is held
	 * exclusively, and it is neither inactive nor reclaimed.
	 */
	if ((ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) &&
	    vn_islocked(vp) == LK_EXCLUSIVE &&
	    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
		waitfor = (ip->flags & HAMMER_INODE_CLOSESYNC) ?
			  MNT_WAIT : MNT_NOWAIT;
		ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
			       HAMMER_INODE_CLOSEASYNC);
		VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
	}
#endif
	error = vop_stdclose(ap);
	return (error);
}
912
913 /*
914  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
915  *
916  * The operating system has already ensured that the directory entry
917  * does not exist and done all appropriate namespace locking.
918  */
919 static
920 int
921 hammer_vop_ncreate(struct vop_ncreate_args *ap)
922 {
923         struct hammer_transaction trans;
924         struct hammer_inode *dip;
925         struct hammer_inode *nip;
926         struct nchandle *nch;
927         hammer_mount_t hmp;
928         int error;
929
930         nch = ap->a_nch;
931         dip = VTOI(ap->a_dvp);
932         hmp = dip->hmp;
933
934         if (dip->flags & HAMMER_INODE_RO)
935                 return (EROFS);
936         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
937                 return (error);
938
939         /*
940          * Create a transaction to cover the operations we perform.
941          */
942         lwkt_gettoken(&hmp->fs_token);
943         hammer_start_transaction(&trans, hmp);
944         ++hammer_stats_file_iopsw;
945
946         /*
947          * Create a new filesystem object of the requested type.  The
948          * returned inode will be referenced and shared-locked to prevent
949          * it from being moved to the flusher.
950          */
951         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
952                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
953                                     NULL, &nip);
954         if (error) {
955                 hkprintf("hammer_create_inode error %d\n", error);
956                 hammer_done_transaction(&trans);
957                 *ap->a_vpp = NULL;
958                 lwkt_reltoken(&hmp->fs_token);
959                 return (error);
960         }
961
962         /*
963          * Add the new filesystem object to the directory.  This will also
964          * bump the inode's link count.
965          */
966         error = hammer_ip_add_directory(&trans, dip,
967                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
968                                         nip);
969         if (error)
970                 hkprintf("hammer_ip_add_directory error %d\n", error);
971
972         /*
973          * Finish up.
974          */
975         if (error) {
976                 hammer_rel_inode(nip, 0);
977                 hammer_done_transaction(&trans);
978                 *ap->a_vpp = NULL;
979         } else {
980                 error = hammer_get_vnode(nip, ap->a_vpp);
981                 hammer_done_transaction(&trans);
982                 hammer_rel_inode(nip, 0);
983                 if (error == 0) {
984                         cache_setunresolved(ap->a_nch);
985                         cache_setvp(ap->a_nch, *ap->a_vpp);
986                 }
987                 hammer_knote(ap->a_dvp, NOTE_WRITE);
988         }
989         lwkt_reltoken(&hmp->fs_token);
990         return (error);
991 }
992
993 /*
994  * hammer_vop_getattr { vp, vap }
995  *
996  * Retrieve an inode's attribute information.  When accessing inodes
997  * historically we fake the atime field to ensure consistent results.
998  * The atime field is stored in the B-Tree element and allowed to be
999  * updated without cycling the element.
1000  *
1001  * MPSAFE - does not require fs_token
1002  */
1003 static
1004 int
1005 hammer_vop_getattr(struct vop_getattr_args *ap)
1006 {
1007         struct hammer_inode *ip = VTOI(ap->a_vp);
1008         struct vattr *vap = ap->a_vap;
1009
1010         /*
1011          * We want the fsid to be different when accessing a filesystem
1012          * with different as-of's so programs like diff don't think
1013          * the files are the same.
1014          *
1015          * We also want the fsid to be the same when comparing snapshots,
1016          * or when comparing mirrors (which might be backed by different
1017          * physical devices).  HAMMER fsids are based on the PFS's
1018          * shared_uuid field.
1019          *
1020          * XXX there is a chance of collision here.  The va_fsid reported
1021          * by stat is different from the more involved fsid used in the
1022          * mount structure.
1023          */
1024         ++hammer_stats_file_iopsr;
1025         hammer_lock_sh(&ip->lock);
1026         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
1027                        (u_int32_t)(ip->obj_asof >> 32);
1028
1029         vap->va_fileid = ip->ino_leaf.base.obj_id;
1030         vap->va_mode = ip->ino_data.mode;
1031         vap->va_nlink = ip->ino_data.nlinks;
1032         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1033         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1034         vap->va_rmajor = 0;
1035         vap->va_rminor = 0;
1036         vap->va_size = ip->ino_data.size;
1037
1038         /*
1039          * Special case for @@PFS softlinks.  The actual size of the
1040          * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
1041          * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
1042          *
1043          * Note that userspace hammer command does not allow users to
1044          * create a @@PFS softlink under an existing other PFS (id!=0)
1045          * so the ip localization here for @@PFS softlink is always 0.
1046          */
1047         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
1048             ip->ino_data.size == 10 &&
1049             ip->obj_asof == HAMMER_MAX_TID &&
1050             ip->obj_localization == 0 &&
1051             strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
1052                     if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
1053                             vap->va_size = 26;
1054                     else
1055                             vap->va_size = 10;
1056         }
1057
1058         /*
1059          * We must provide a consistent atime and mtime for snapshots
1060          * so people can do a 'tar cf - ... | md5' on them and get
1061          * consistent results.
1062          */
1063         if (ip->flags & HAMMER_INODE_RO) {
1064                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
1065                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
1066         } else {
1067                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
1068                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
1069         }
1070         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
1071         vap->va_flags = ip->ino_data.uflags;
1072         vap->va_gen = 1;        /* hammer inums are unique for all time */
1073         vap->va_blocksize = HAMMER_BUFSIZE;
1074         if (ip->ino_data.size >= HAMMER_XDEMARC) {
1075                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
1076                                 ~HAMMER_XBUFMASK64;
1077         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
1078                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
1079                                 ~HAMMER_BUFMASK64;
1080         } else {
1081                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
1082         }
1083
1084         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
1085         vap->va_filerev = 0;    /* XXX */
1086         vap->va_uid_uuid = ip->ino_data.uid;
1087         vap->va_gid_uuid = ip->ino_data.gid;
1088         vap->va_fsid_uuid = ip->hmp->fsid;
1089         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
1090                           VA_FSID_UUID_VALID;
1091
1092         switch (ip->ino_data.obj_type) {
1093         case HAMMER_OBJTYPE_CDEV:
1094         case HAMMER_OBJTYPE_BDEV:
1095                 vap->va_rmajor = ip->ino_data.rmajor;
1096                 vap->va_rminor = ip->ino_data.rminor;
1097                 break;
1098         default:
1099                 break;
1100         }
1101         hammer_unlock(&ip->lock);
1102         return(0);
1103 }
1104
1105 /*
1106  * hammer_vop_nresolve { nch, dvp, cred }
1107  *
1108  * Locate the requested directory entry.
1109  */
1110 static
1111 int
1112 hammer_vop_nresolve(struct vop_nresolve_args *ap)
1113 {
1114         struct hammer_transaction trans;
1115         struct namecache *ncp;
1116         hammer_mount_t hmp;
1117         hammer_inode_t dip;
1118         hammer_inode_t ip;
1119         hammer_tid_t asof;
1120         struct hammer_cursor cursor;
1121         struct vnode *vp;
1122         int64_t namekey;
1123         int error;
1124         int i;
1125         int nlen;
1126         int flags;
1127         int ispfs;
1128         int64_t obj_id;
1129         u_int32_t localization;
1130         u_int32_t max_iterations;
1131
1132         /*
1133          * Misc initialization, plus handle as-of name extensions.  Look for
1134          * the '@@' extension.  Note that as-of files and directories cannot
1135          * be modified.
1136          */
1137         dip = VTOI(ap->a_dvp);
1138         ncp = ap->a_nch->ncp;
1139         asof = dip->obj_asof;
1140         localization = dip->obj_localization;   /* for code consistency */
1141         nlen = ncp->nc_nlen;
1142         flags = dip->flags & HAMMER_INODE_RO;
1143         ispfs = 0;
1144         hmp = dip->hmp;
1145
1146         lwkt_gettoken(&hmp->fs_token);
1147         hammer_simple_transaction(&trans, hmp);
1148         ++hammer_stats_file_iopsr;
1149
1150         for (i = 0; i < nlen; ++i) {
1151                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
1152                         error = hammer_str_to_tid(ncp->nc_name + i + 2,
1153                                                   &ispfs, &asof, &localization);
1154                         if (error != 0) {
1155                                 i = nlen;
1156                                 break;
1157                         }
1158                         if (asof != HAMMER_MAX_TID)
1159                                 flags |= HAMMER_INODE_RO;
1160                         break;
1161                 }
1162         }
1163         nlen = i;
1164
1165         /*
1166          * If this is a PFS softlink we dive into the PFS
1167          */
1168         if (ispfs && nlen == 0) {
1169                 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
1170                                       asof, localization,
1171                                       flags, &error);
1172                 if (error == 0) {
1173                         error = hammer_get_vnode(ip, &vp);
1174                         hammer_rel_inode(ip, 0);
1175                 } else {
1176                         vp = NULL;
1177                 }
1178                 if (error == 0) {
1179                         vn_unlock(vp);
1180                         cache_setvp(ap->a_nch, vp);
1181                         vrele(vp);
1182                 }
1183                 goto done;
1184         }
1185
1186         /*
1187          * If there is no path component the time extension is relative to dip.
1188          * e.g. "fubar/@@<snapshot>"
1189          *
1190          * "." is handled by the kernel, but ".@@<snapshot>" is not.
1191          * e.g. "fubar/.@@<snapshot>"
1192          *
1193          * ".." is handled by the kernel.  We do not currently handle
1194          * "..@<snapshot>".
1195          */
1196         if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
1197                 ip = hammer_get_inode(&trans, dip, dip->obj_id,
1198                                       asof, dip->obj_localization,
1199                                       flags, &error);
1200                 if (error == 0) {
1201                         error = hammer_get_vnode(ip, &vp);
1202                         hammer_rel_inode(ip, 0);
1203                 } else {
1204                         vp = NULL;
1205                 }
1206                 if (error == 0) {
1207                         vn_unlock(vp);
1208                         cache_setvp(ap->a_nch, vp);
1209                         vrele(vp);
1210                 }
1211                 goto done;
1212         }
1213
1214         /*
1215          * Calculate the namekey and setup the key range for the scan.  This
1216          * works kinda like a chained hash table where the lower 32 bits
1217          * of the namekey synthesize the chain.
1218          *
1219          * The key range is inclusive of both key_beg and key_end.
1220          */
1221         namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
1222                                            &max_iterations);
1223
1224         error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
1225         cursor.key_beg.localization = dip->obj_localization +
1226                                       hammer_dir_localization(dip);
1227         cursor.key_beg.obj_id = dip->obj_id;
1228         cursor.key_beg.key = namekey;
1229         cursor.key_beg.create_tid = 0;
1230         cursor.key_beg.delete_tid = 0;
1231         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1232         cursor.key_beg.obj_type = 0;
1233
1234         cursor.key_end = cursor.key_beg;
1235         cursor.key_end.key += max_iterations;
1236         cursor.asof = asof;
1237         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1238
1239         /*
1240          * Scan all matching records (the chain), locate the one matching
1241          * the requested path component.
1242          *
1243          * The hammer_ip_*() functions merge in-memory records with on-disk
1244          * records for the purposes of the search.
1245          */
1246         obj_id = 0;
1247         localization = HAMMER_DEF_LOCALIZATION;
1248
1249         if (error == 0) {
1250                 error = hammer_ip_first(&cursor);
1251                 while (error == 0) {
1252                         error = hammer_ip_resolve_data(&cursor);
1253                         if (error)
1254                                 break;
1255                         if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
1256                             bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1257                                 obj_id = cursor.data->entry.obj_id;
1258                                 localization = cursor.data->entry.localization;
1259                                 break;
1260                         }
1261                         error = hammer_ip_next(&cursor);
1262                 }
1263         }
1264         hammer_done_cursor(&cursor);
1265
1266         /*
1267          * Lookup the obj_id.  This should always succeed.  If it does not
1268          * the filesystem may be damaged and we return a dummy inode.
1269          */
1270         if (error == 0) {
1271                 ip = hammer_get_inode(&trans, dip, obj_id,
1272                                       asof, localization,
1273                                       flags, &error);
1274                 if (error == ENOENT) {
1275                         kprintf("HAMMER: WARNING: Missing "
1276                                 "inode for dirent \"%s\"\n"
1277                                 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
1278                                 ncp->nc_name,
1279                                 (long long)obj_id, (long long)asof,
1280                                 localization);
1281                         error = 0;
1282                         ip = hammer_get_dummy_inode(&trans, dip, obj_id,
1283                                                     asof, localization,
1284                                                     flags, &error);
1285                 }
1286                 if (error == 0) {
1287                         error = hammer_get_vnode(ip, &vp);
1288                         hammer_rel_inode(ip, 0);
1289                 } else {
1290                         vp = NULL;
1291                 }
1292                 if (error == 0) {
1293                         vn_unlock(vp);
1294                         cache_setvp(ap->a_nch, vp);
1295                         vrele(vp);
1296                 }
1297         } else if (error == ENOENT) {
1298                 cache_setvp(ap->a_nch, NULL);
1299         }
1300 done:
1301         hammer_done_transaction(&trans);
1302         lwkt_reltoken(&hmp->fs_token);
1303         return (error);
1304 }
1305
1306 /*
1307  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1308  *
1309  * Locate the parent directory of a directory vnode.
1310  *
1311  * dvp is referenced but not locked.  *vpp must be returned referenced and
1312  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
1313  * at the root, instead it could indicate that the directory we were in was
1314  * removed.
1315  *
1316  * NOTE: as-of sequences are not linked into the directory structure.  If
1317  * we are at the root with a different asof then the mount point, reload
1318  * the same directory with the mount point's asof.   I'm not sure what this
1319  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
1320  * get confused, but it hasn't been tested.
1321  */
1322 static
1323 int
1324 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1325 {
1326         struct hammer_transaction trans;
1327         struct hammer_inode *dip;
1328         struct hammer_inode *ip;
1329         hammer_mount_t hmp;
1330         int64_t parent_obj_id;
1331         u_int32_t parent_obj_localization;
1332         hammer_tid_t asof;
1333         int error;
1334
1335         dip = VTOI(ap->a_dvp);
1336         asof = dip->obj_asof;
1337         hmp = dip->hmp;
1338
1339         /*
1340          * Whos are parent?  This could be the root of a pseudo-filesystem
1341          * whos parent is in another localization domain.
1342          */
1343         lwkt_gettoken(&hmp->fs_token);
1344         parent_obj_id = dip->ino_data.parent_obj_id;
1345         if (dip->obj_id == HAMMER_OBJID_ROOT)
1346                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1347         else
1348                 parent_obj_localization = dip->obj_localization;
1349
1350         /*
1351          * It's probably a PFS root when dip->ino_data.parent_obj_id is 0.
1352          */
1353         if (parent_obj_id == 0) {
1354                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
1355                    asof != hmp->asof) {
1356                         parent_obj_id = dip->obj_id;
1357                         asof = hmp->asof;
1358                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1359                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
1360                                   (long long)dip->obj_asof);
1361                 } else {
1362                         *ap->a_vpp = NULL;
1363                         lwkt_reltoken(&hmp->fs_token);
1364                         return ENOENT;
1365                 }
1366         }
1367
1368         hammer_simple_transaction(&trans, hmp);
1369         ++hammer_stats_file_iopsr;
1370
1371         ip = hammer_get_inode(&trans, dip, parent_obj_id,
1372                               asof, parent_obj_localization,
1373                               dip->flags, &error);
1374         if (ip) {
1375                 error = hammer_get_vnode(ip, ap->a_vpp);
1376                 hammer_rel_inode(ip, 0);
1377         } else {
1378                 *ap->a_vpp = NULL;
1379         }
1380         hammer_done_transaction(&trans);
1381         lwkt_reltoken(&hmp->fs_token);
1382         return (error);
1383 }
1384
1385 /*
1386  * hammer_vop_nlink { nch, dvp, vp, cred }
1387  */
1388 static
1389 int
1390 hammer_vop_nlink(struct vop_nlink_args *ap)
1391 {
1392         struct hammer_transaction trans;
1393         struct hammer_inode *dip;
1394         struct hammer_inode *ip;
1395         struct nchandle *nch;
1396         hammer_mount_t hmp;
1397         int error;
1398
1399         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1400                 return(EXDEV);
1401
1402         nch = ap->a_nch;
1403         dip = VTOI(ap->a_dvp);
1404         ip = VTOI(ap->a_vp);
1405         hmp = dip->hmp;
1406
1407         if (dip->obj_localization != ip->obj_localization)
1408                 return(EXDEV);
1409
1410         if (dip->flags & HAMMER_INODE_RO)
1411                 return (EROFS);
1412         if (ip->flags & HAMMER_INODE_RO)
1413                 return (EROFS);
1414         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1415                 return (error);
1416
1417         /*
1418          * Create a transaction to cover the operations we perform.
1419          */
1420         lwkt_gettoken(&hmp->fs_token);
1421         hammer_start_transaction(&trans, hmp);
1422         ++hammer_stats_file_iopsw;
1423
1424         /*
1425          * Add the filesystem object to the directory.  Note that neither
1426          * dip nor ip are referenced or locked, but their vnodes are
1427          * referenced.  This function will bump the inode's link count.
1428          */
1429         error = hammer_ip_add_directory(&trans, dip,
1430                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1431                                         ip);
1432
1433         /*
1434          * Finish up.
1435          */
1436         if (error == 0) {
1437                 cache_setunresolved(nch);
1438                 cache_setvp(nch, ap->a_vp);
1439         }
1440         hammer_done_transaction(&trans);
1441         hammer_knote(ap->a_vp, NOTE_LINK);
1442         hammer_knote(ap->a_dvp, NOTE_WRITE);
1443         lwkt_reltoken(&hmp->fs_token);
1444         return (error);
1445 }
1446
1447 /*
1448  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1449  *
1450  * The operating system has already ensured that the directory entry
1451  * does not exist and done all appropriate namespace locking.
1452  */
1453 static
1454 int
1455 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1456 {
1457         struct hammer_transaction trans;
1458         struct hammer_inode *dip;
1459         struct hammer_inode *nip;
1460         struct nchandle *nch;
1461         hammer_mount_t hmp;
1462         int error;
1463
1464         nch = ap->a_nch;
1465         dip = VTOI(ap->a_dvp);
1466         hmp = dip->hmp;
1467
1468         if (dip->flags & HAMMER_INODE_RO)
1469                 return (EROFS);
1470         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1471                 return (error);
1472
1473         /*
1474          * Create a transaction to cover the operations we perform.
1475          */
1476         lwkt_gettoken(&hmp->fs_token);
1477         hammer_start_transaction(&trans, hmp);
1478         ++hammer_stats_file_iopsw;
1479
1480         /*
1481          * Create a new filesystem object of the requested type.  The
1482          * returned inode will be referenced but not locked.
1483          */
1484         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1485                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1486                                     NULL, &nip);
1487         if (error) {
1488                 hkprintf("hammer_mkdir error %d\n", error);
1489                 hammer_done_transaction(&trans);
1490                 *ap->a_vpp = NULL;
1491                 lwkt_reltoken(&hmp->fs_token);
1492                 return (error);
1493         }
1494         /*
1495          * Add the new filesystem object to the directory.  This will also
1496          * bump the inode's link count.
1497          */
1498         error = hammer_ip_add_directory(&trans, dip,
1499                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1500                                         nip);
1501         if (error)
1502                 hkprintf("hammer_mkdir (add) error %d\n", error);
1503
1504         /*
1505          * Finish up.
1506          */
1507         if (error) {
1508                 hammer_rel_inode(nip, 0);
1509                 *ap->a_vpp = NULL;
1510         } else {
1511                 error = hammer_get_vnode(nip, ap->a_vpp);
1512                 hammer_rel_inode(nip, 0);
1513                 if (error == 0) {
1514                         cache_setunresolved(ap->a_nch);
1515                         cache_setvp(ap->a_nch, *ap->a_vpp);
1516                 }
1517         }
1518         hammer_done_transaction(&trans);
1519         if (error == 0)
1520                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1521         lwkt_reltoken(&hmp->fs_token);
1522         return (error);
1523 }
1524
1525 /*
1526  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1527  *
1528  * The operating system has already ensured that the directory entry
1529  * does not exist and done all appropriate namespace locking.
1530  */
1531 static
1532 int
1533 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1534 {
1535         struct hammer_transaction trans;
1536         struct hammer_inode *dip;
1537         struct hammer_inode *nip;
1538         struct nchandle *nch;
1539         hammer_mount_t hmp;
1540         int error;
1541
1542         nch = ap->a_nch;
1543         dip = VTOI(ap->a_dvp);
1544         hmp = dip->hmp;
1545
1546         if (dip->flags & HAMMER_INODE_RO)
1547                 return (EROFS);
1548         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1549                 return (error);
1550
1551         /*
1552          * Create a transaction to cover the operations we perform.
1553          */
1554         lwkt_gettoken(&hmp->fs_token);
1555         hammer_start_transaction(&trans, hmp);
1556         ++hammer_stats_file_iopsw;
1557
1558         /*
1559          * Create a new filesystem object of the requested type.  The
1560          * returned inode will be referenced but not locked.
1561          *
1562          * If mknod specifies a directory a pseudo-fs is created.
1563          */
1564         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1565                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1566                                     NULL, &nip);
1567         if (error) {
1568                 hammer_done_transaction(&trans);
1569                 *ap->a_vpp = NULL;
1570                 lwkt_reltoken(&hmp->fs_token);
1571                 return (error);
1572         }
1573
1574         /*
1575          * Add the new filesystem object to the directory.  This will also
1576          * bump the inode's link count.
1577          */
1578         error = hammer_ip_add_directory(&trans, dip,
1579                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1580                                         nip);
1581
1582         /*
1583          * Finish up.
1584          */
1585         if (error) {
1586                 hammer_rel_inode(nip, 0);
1587                 *ap->a_vpp = NULL;
1588         } else {
1589                 error = hammer_get_vnode(nip, ap->a_vpp);
1590                 hammer_rel_inode(nip, 0);
1591                 if (error == 0) {
1592                         cache_setunresolved(ap->a_nch);
1593                         cache_setvp(ap->a_nch, *ap->a_vpp);
1594                 }
1595         }
1596         hammer_done_transaction(&trans);
1597         if (error == 0)
1598                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1599         lwkt_reltoken(&hmp->fs_token);
1600         return (error);
1601 }
1602
1603 /*
1604  * hammer_vop_open { vp, mode, cred, fp }
1605  *
1606  * MPSAFE (does not require fs_token)
1607  */
1608 static
1609 int
1610 hammer_vop_open(struct vop_open_args *ap)
1611 {
1612         hammer_inode_t ip;
1613
1614         ++hammer_stats_file_iopsr;
1615         ip = VTOI(ap->a_vp);
1616
1617         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1618                 return (EROFS);
1619         return(vop_stdopen(ap));
1620 }
1621
/*
 * hammer_vop_print { vp }
 *
 * Not implemented: the argument is ignored and EOPNOTSUPP is always
 * returned.
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
        return (EOPNOTSUPP);
}
1631
1632 /*
1633  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1634  */
1635 static
1636 int
1637 hammer_vop_readdir(struct vop_readdir_args *ap)
1638 {
1639         struct hammer_transaction trans;
1640         struct hammer_cursor cursor;
1641         struct hammer_inode *ip;
1642         hammer_mount_t hmp;
1643         struct uio *uio;
1644         hammer_base_elm_t base;
1645         int error;
1646         int cookie_index;
1647         int ncookies;
1648         off_t *cookies;
1649         off_t saveoff;
1650         int r;
1651         int dtype;
1652
1653         ++hammer_stats_file_iopsr;
1654         ip = VTOI(ap->a_vp);
1655         uio = ap->a_uio;
1656         saveoff = uio->uio_offset;
1657         hmp = ip->hmp;
1658
1659         if (ap->a_ncookies) {
1660                 ncookies = uio->uio_resid / 16 + 1;
1661                 if (ncookies > 1024)
1662                         ncookies = 1024;
1663                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1664                 cookie_index = 0;
1665         } else {
1666                 ncookies = -1;
1667                 cookies = NULL;
1668                 cookie_index = 0;
1669         }
1670
1671         lwkt_gettoken(&hmp->fs_token);
1672         hammer_simple_transaction(&trans, hmp);
1673
1674         /*
1675          * Handle artificial entries
1676          *
1677          * It should be noted that the minimum value for a directory
1678          * hash key on-media is 0x0000000100000000, so we can use anything
1679          * less then that to represent our 'special' key space.
1680          */
1681         error = 0;
1682         if (saveoff == 0) {
1683                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1684                 if (r)
1685                         goto done;
1686                 if (cookies)
1687                         cookies[cookie_index] = saveoff;
1688                 ++saveoff;
1689                 ++cookie_index;
1690                 if (cookie_index == ncookies)
1691                         goto done;
1692         }
1693         if (saveoff == 1) {
1694                 if (ip->ino_data.parent_obj_id) {
1695                         r = vop_write_dirent(&error, uio,
1696                                              ip->ino_data.parent_obj_id,
1697                                              DT_DIR, 2, "..");
1698                 } else {
1699                         r = vop_write_dirent(&error, uio,
1700                                              ip->obj_id, DT_DIR, 2, "..");
1701                 }
1702                 if (r)
1703                         goto done;
1704                 if (cookies)
1705                         cookies[cookie_index] = saveoff;
1706                 ++saveoff;
1707                 ++cookie_index;
1708                 if (cookie_index == ncookies)
1709                         goto done;
1710         }
1711
1712         /*
1713          * Key range (begin and end inclusive) to scan.  Directory keys
1714          * directly translate to a 64 bit 'seek' position.
1715          */
1716         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1717         cursor.key_beg.localization = ip->obj_localization +
1718                                       hammer_dir_localization(ip);
1719         cursor.key_beg.obj_id = ip->obj_id;
1720         cursor.key_beg.create_tid = 0;
1721         cursor.key_beg.delete_tid = 0;
1722         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1723         cursor.key_beg.obj_type = 0;
1724         cursor.key_beg.key = saveoff;
1725
1726         cursor.key_end = cursor.key_beg;
1727         cursor.key_end.key = HAMMER_MAX_KEY;
1728         cursor.asof = ip->obj_asof;
1729         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1730
1731         error = hammer_ip_first(&cursor);
1732
1733         while (error == 0) {
1734                 error = hammer_ip_resolve_data(&cursor);
1735                 if (error)
1736                         break;
1737                 base = &cursor.leaf->base;
1738                 saveoff = base->key;
1739                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1740
1741                 if (base->obj_id != ip->obj_id)
1742                         panic("readdir: bad record at %p", cursor.node);
1743
1744                 /*
1745                  * Convert pseudo-filesystems into softlinks
1746                  */
1747                 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1748                 r = vop_write_dirent(
1749                              &error, uio, cursor.data->entry.obj_id,
1750                              dtype,
1751                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1752                              (void *)cursor.data->entry.name);
1753                 if (r)
1754                         break;
1755                 ++saveoff;
1756                 if (cookies)
1757                         cookies[cookie_index] = base->key;
1758                 ++cookie_index;
1759                 if (cookie_index == ncookies)
1760                         break;
1761                 error = hammer_ip_next(&cursor);
1762         }
1763         hammer_done_cursor(&cursor);
1764
1765 done:
1766         hammer_done_transaction(&trans);
1767
1768         if (ap->a_eofflag)
1769                 *ap->a_eofflag = (error == ENOENT);
1770         uio->uio_offset = saveoff;
1771         if (error && cookie_index == 0) {
1772                 if (error == ENOENT)
1773                         error = 0;
1774                 if (cookies) {
1775                         kfree(cookies, M_TEMP);
1776                         *ap->a_ncookies = 0;
1777                         *ap->a_cookies = NULL;
1778                 }
1779         } else {
1780                 if (error == ENOENT)
1781                         error = 0;
1782                 if (cookies) {
1783                         *ap->a_ncookies = cookie_index;
1784                         *ap->a_cookies = cookies;
1785                 }
1786         }
1787         lwkt_reltoken(&hmp->fs_token);
1788         return(error);
1789 }
1790
1791 /*
1792  * hammer_vop_readlink { vp, uio, cred }
1793  */
1794 static
1795 int
1796 hammer_vop_readlink(struct vop_readlink_args *ap)
1797 {
1798         struct hammer_transaction trans;
1799         struct hammer_cursor cursor;
1800         struct hammer_inode *ip;
1801         hammer_mount_t hmp;
1802         char buf[32];
1803         u_int32_t localization;
1804         hammer_pseudofs_inmem_t pfsm;
1805         int error;
1806
1807         ip = VTOI(ap->a_vp);
1808         hmp = ip->hmp;
1809
1810         lwkt_gettoken(&hmp->fs_token);
1811
1812         /*
1813          * Shortcut if the symlink data was stuffed into ino_data.
1814          *
1815          * Also expand special "@@PFS%05d" softlinks (expansion only
1816          * occurs for non-historical (current) accesses made from the
1817          * primary filesystem).
1818          *
1819          * Note that userspace hammer command does not allow users to
1820          * create a @@PFS softlink under an existing other PFS (id!=0)
1821          * so the ip localization here for @@PFS softlink is always 0.
1822          */
1823         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1824                 char *ptr;
1825                 int bytes;
1826
1827                 ptr = ip->ino_data.ext.symlink;
1828                 bytes = (int)ip->ino_data.size;
1829                 if (bytes == 10 &&
1830                     ip->obj_asof == HAMMER_MAX_TID &&
1831                     ip->obj_localization == 0 &&
1832                     strncmp(ptr, "@@PFS", 5) == 0) {
1833                         hammer_simple_transaction(&trans, hmp);
1834                         bcopy(ptr + 5, buf, 5);
1835                         buf[5] = 0;
1836                         localization = strtoul(buf, NULL, 10) << 16;
1837                         pfsm = hammer_load_pseudofs(&trans, localization,
1838                                                     &error);
1839                         if (error == 0) {
1840                                 if (pfsm->pfsd.mirror_flags &
1841                                     HAMMER_PFSD_SLAVE) {
1842                                         /* vap->va_size == 26 */
1843                                         ksnprintf(buf, sizeof(buf),
1844                                                   "@@0x%016llx:%05d",
1845                                                   (long long)pfsm->pfsd.sync_end_tid,
1846                                                   localization >> 16);
1847                                 } else {
1848                                         /* vap->va_size == 10 */
1849                                         ksnprintf(buf, sizeof(buf),
1850                                                   "@@-1:%05d",
1851                                                   localization >> 16);
1852 #if 0
1853                                         ksnprintf(buf, sizeof(buf),
1854                                                   "@@0x%016llx:%05d",
1855                                                   (long long)HAMMER_MAX_TID,
1856                                                   localization >> 16);
1857 #endif
1858                                 }
1859                                 ptr = buf;
1860                                 bytes = strlen(buf);
1861                         }
1862                         if (pfsm)
1863                                 hammer_rel_pseudofs(hmp, pfsm);
1864                         hammer_done_transaction(&trans);
1865                 }
1866                 error = uiomove(ptr, bytes, ap->a_uio);
1867                 lwkt_reltoken(&hmp->fs_token);
1868                 return(error);
1869         }
1870
1871         /*
1872          * Long version
1873          */
1874         hammer_simple_transaction(&trans, hmp);
1875         ++hammer_stats_file_iopsr;
1876         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1877
1878         /*
1879          * Key range (begin and end inclusive) to scan.  Directory keys
1880          * directly translate to a 64 bit 'seek' position.
1881          */
1882         cursor.key_beg.localization = ip->obj_localization +
1883                                       HAMMER_LOCALIZE_MISC;
1884         cursor.key_beg.obj_id = ip->obj_id;
1885         cursor.key_beg.create_tid = 0;
1886         cursor.key_beg.delete_tid = 0;
1887         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1888         cursor.key_beg.obj_type = 0;
1889         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1890         cursor.asof = ip->obj_asof;
1891         cursor.flags |= HAMMER_CURSOR_ASOF;
1892
1893         error = hammer_ip_lookup(&cursor);
1894         if (error == 0) {
1895                 error = hammer_ip_resolve_data(&cursor);
1896                 if (error == 0) {
1897                         KKASSERT(cursor.leaf->data_len >=
1898                                  HAMMER_SYMLINK_NAME_OFF);
1899                         error = uiomove(cursor.data->symlink.name,
1900                                         cursor.leaf->data_len -
1901                                                 HAMMER_SYMLINK_NAME_OFF,
1902                                         ap->a_uio);
1903                 }
1904         }
1905         hammer_done_cursor(&cursor);
1906         hammer_done_transaction(&trans);
1907         lwkt_reltoken(&hmp->fs_token);
1908         return(error);
1909 }
1910
1911 /*
1912  * hammer_vop_nremove { nch, dvp, cred }
1913  */
1914 static
1915 int
1916 hammer_vop_nremove(struct vop_nremove_args *ap)
1917 {
1918         struct hammer_transaction trans;
1919         struct hammer_inode *dip;
1920         hammer_mount_t hmp;
1921         int error;
1922
1923         dip = VTOI(ap->a_dvp);
1924         hmp = dip->hmp;
1925
1926         if (hammer_nohistory(dip) == 0 &&
1927             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1928                 return (error);
1929         }
1930
1931         lwkt_gettoken(&hmp->fs_token);
1932         hammer_start_transaction(&trans, hmp);
1933         ++hammer_stats_file_iopsw;
1934         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1935         hammer_done_transaction(&trans);
1936         if (error == 0)
1937                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1938         lwkt_reltoken(&hmp->fs_token);
1939         return (error);
1940 }
1941
1942 /*
1943  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1944  */
1945 static
1946 int
1947 hammer_vop_nrename(struct vop_nrename_args *ap)
1948 {
1949         struct hammer_transaction trans;
1950         struct namecache *fncp;
1951         struct namecache *tncp;
1952         struct hammer_inode *fdip;
1953         struct hammer_inode *tdip;
1954         struct hammer_inode *ip;
1955         hammer_mount_t hmp;
1956         struct hammer_cursor cursor;
1957         int64_t namekey;
1958         u_int32_t max_iterations;
1959         int nlen, error;
1960
1961         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1962                 return(EXDEV);
1963         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1964                 return(EXDEV);
1965
1966         fdip = VTOI(ap->a_fdvp);
1967         tdip = VTOI(ap->a_tdvp);
1968         fncp = ap->a_fnch->ncp;
1969         tncp = ap->a_tnch->ncp;
1970         ip = VTOI(fncp->nc_vp);
1971         KKASSERT(ip != NULL);
1972
1973         hmp = ip->hmp;
1974
1975         if (fdip->obj_localization != tdip->obj_localization)
1976                 return(EXDEV);
1977         if (fdip->obj_localization != ip->obj_localization)
1978                 return(EXDEV);
1979
1980         if (fdip->flags & HAMMER_INODE_RO)
1981                 return (EROFS);
1982         if (tdip->flags & HAMMER_INODE_RO)
1983                 return (EROFS);
1984         if (ip->flags & HAMMER_INODE_RO)
1985                 return (EROFS);
1986         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1987                 return (error);
1988
1989         lwkt_gettoken(&hmp->fs_token);
1990         hammer_start_transaction(&trans, hmp);
1991         ++hammer_stats_file_iopsw;
1992
1993         /*
1994          * Remove tncp from the target directory and then link ip as
1995          * tncp. XXX pass trans to dounlink
1996          *
1997          * Force the inode sync-time to match the transaction so it is
1998          * in-sync with the creation of the target directory entry.
1999          */
2000         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
2001                                 ap->a_cred, 0, -1);
2002         if (error == 0 || error == ENOENT) {
2003                 error = hammer_ip_add_directory(&trans, tdip,
2004                                                 tncp->nc_name, tncp->nc_nlen,
2005                                                 ip);
2006                 if (error == 0) {
2007                         ip->ino_data.parent_obj_id = tdip->obj_id;
2008                         ip->ino_data.ctime = trans.time;
2009                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
2010                 }
2011         }
2012         if (error)
2013                 goto failed; /* XXX */
2014
2015         /*
2016          * Locate the record in the originating directory and remove it.
2017          *
2018          * Calculate the namekey and setup the key range for the scan.  This
2019          * works kinda like a chained hash table where the lower 32 bits
2020          * of the namekey synthesize the chain.
2021          *
2022          * The key range is inclusive of both key_beg and key_end.
2023          */
2024         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
2025                                            &max_iterations);
2026 retry:
2027         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
2028         cursor.key_beg.localization = fdip->obj_localization +
2029                                       hammer_dir_localization(fdip);
2030         cursor.key_beg.obj_id = fdip->obj_id;
2031         cursor.key_beg.key = namekey;
2032         cursor.key_beg.create_tid = 0;
2033         cursor.key_beg.delete_tid = 0;
2034         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2035         cursor.key_beg.obj_type = 0;
2036
2037         cursor.key_end = cursor.key_beg;
2038         cursor.key_end.key += max_iterations;
2039         cursor.asof = fdip->obj_asof;
2040         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2041
2042         /*
2043          * Scan all matching records (the chain), locate the one matching
2044          * the requested path component.
2045          *
2046          * The hammer_ip_*() functions merge in-memory records with on-disk
2047          * records for the purposes of the search.
2048          */
2049         error = hammer_ip_first(&cursor);
2050         while (error == 0) {
2051                 if (hammer_ip_resolve_data(&cursor) != 0)
2052                         break;
2053                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2054                 KKASSERT(nlen > 0);
2055                 if (fncp->nc_nlen == nlen &&
2056                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2057                         break;
2058                 }
2059                 error = hammer_ip_next(&cursor);
2060         }
2061
2062         /*
2063          * If all is ok we have to get the inode so we can adjust nlinks.
2064          *
2065          * WARNING: hammer_ip_del_directory() may have to terminate the
2066          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
2067          * twice.
2068          */
2069         if (error == 0)
2070                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
2071
2072         /*
2073          * XXX A deadlock here will break rename's atomicy for the purposes
2074          * of crash recovery.
2075          */
2076         if (error == EDEADLK) {
2077                 hammer_done_cursor(&cursor);
2078                 goto retry;
2079         }
2080
2081         /*
2082          * Cleanup and tell the kernel that the rename succeeded.
2083          *
2084          * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2085          *       without formally acquiring the vp since the vp might
2086          *       have zero refs on it, or in the middle of a reclaim,
2087          *       etc.
2088          */
2089         hammer_done_cursor(&cursor);
2090         if (error == 0) {
2091                 cache_rename(ap->a_fnch, ap->a_tnch);
2092                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
2093                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
2094                 while (ip->vp) {
2095                         struct vnode *vp;
2096
2097                         error = hammer_get_vnode(ip, &vp);
2098                         if (error == 0 && vp) {
2099                                 vn_unlock(vp);
2100                                 hammer_knote(ip->vp, NOTE_RENAME);
2101                                 vrele(vp);
2102                                 break;
2103                         }
2104                         kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2105                 }
2106         }
2107
2108 failed:
2109         hammer_done_transaction(&trans);
2110         lwkt_reltoken(&hmp->fs_token);
2111         return (error);
2112 }
2113
2114 /*
2115  * hammer_vop_nrmdir { nch, dvp, cred }
2116  */
2117 static
2118 int
2119 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2120 {
2121         struct hammer_transaction trans;
2122         struct hammer_inode *dip;
2123         hammer_mount_t hmp;
2124         int error;
2125
2126         dip = VTOI(ap->a_dvp);
2127         hmp = dip->hmp;
2128
2129         if (hammer_nohistory(dip) == 0 &&
2130             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2131                 return (error);
2132         }
2133
2134         lwkt_gettoken(&hmp->fs_token);
2135         hammer_start_transaction(&trans, hmp);
2136         ++hammer_stats_file_iopsw;
2137         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2138         hammer_done_transaction(&trans);
2139         if (error == 0)
2140                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2141         lwkt_reltoken(&hmp->fs_token);
2142         return (error);
2143 }
2144
2145 /*
2146  * hammer_vop_markatime { vp, cred }
2147  */
2148 static
2149 int
2150 hammer_vop_markatime(struct vop_markatime_args *ap)
2151 {
2152         struct hammer_transaction trans;
2153         struct hammer_inode *ip;
2154         hammer_mount_t hmp;
2155
2156         ip = VTOI(ap->a_vp);
2157         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2158                 return (EROFS);
2159         if (ip->flags & HAMMER_INODE_RO)
2160                 return (EROFS);
2161         hmp = ip->hmp;
2162         if (hmp->mp->mnt_flag & MNT_NOATIME)
2163                 return (0);
2164         lwkt_gettoken(&hmp->fs_token);
2165         hammer_start_transaction(&trans, hmp);
2166         ++hammer_stats_file_iopsw;
2167
2168         ip->ino_data.atime = trans.time;
2169         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2170         hammer_done_transaction(&trans);
2171         hammer_knote(ap->a_vp, NOTE_ATTRIB);
2172         lwkt_reltoken(&hmp->fs_token);
2173         return (0);
2174 }
2175
2176 /*
2177  * hammer_vop_setattr { vp, vap, cred }
2178  */
2179 static
2180 int
2181 hammer_vop_setattr(struct vop_setattr_args *ap)
2182 {
2183         struct hammer_transaction trans;
2184         struct hammer_inode *ip;
2185         struct vattr *vap;
2186         hammer_mount_t hmp;
2187         int modflags;
2188         int error;
2189         int truncating;
2190         int blksize;
2191         int kflags;
2192 #if 0
2193         int64_t aligned_size;
2194 #endif
2195         u_int32_t flags;
2196
2197         vap = ap->a_vap;
2198         ip = ap->a_vp->v_data;
2199         modflags = 0;
2200         kflags = 0;
2201         hmp = ip->hmp;
2202
2203         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2204                 return(EROFS);
2205         if (ip->flags & HAMMER_INODE_RO)
2206                 return (EROFS);
2207         if (hammer_nohistory(ip) == 0 &&
2208             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2209                 return (error);
2210         }
2211
2212         lwkt_gettoken(&hmp->fs_token);
2213         hammer_start_transaction(&trans, hmp);
2214         ++hammer_stats_file_iopsw;
2215         error = 0;
2216
2217         if (vap->va_flags != VNOVAL) {
2218                 flags = ip->ino_data.uflags;
2219                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2220                                          hammer_to_unix_xid(&ip->ino_data.uid),
2221                                          ap->a_cred);
2222                 if (error == 0) {
2223                         if (ip->ino_data.uflags != flags) {
2224                                 ip->ino_data.uflags = flags;
2225                                 ip->ino_data.ctime = trans.time;
2226                                 modflags |= HAMMER_INODE_DDIRTY;
2227                                 kflags |= NOTE_ATTRIB;
2228                         }
2229                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2230                                 error = 0;
2231                                 goto done;
2232                         }
2233                 }
2234                 goto done;
2235         }
2236         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2237                 error = EPERM;
2238                 goto done;
2239         }
2240         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2241                 mode_t cur_mode = ip->ino_data.mode;
2242                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2243                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2244                 uuid_t uuid_uid;
2245                 uuid_t uuid_gid;
2246
2247                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2248                                          ap->a_cred,
2249                                          &cur_uid, &cur_gid, &cur_mode);
2250                 if (error == 0) {
2251                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
2252                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
2253                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
2254                                  sizeof(uuid_uid)) ||
2255                             bcmp(&uuid_gid, &ip->ino_data.gid,
2256                                  sizeof(uuid_gid)) ||
2257                             ip->ino_data.mode != cur_mode) {
2258                                 ip->ino_data.uid = uuid_uid;
2259                                 ip->ino_data.gid = uuid_gid;
2260                                 ip->ino_data.mode = cur_mode;
2261                                 ip->ino_data.ctime = trans.time;
2262                                 modflags |= HAMMER_INODE_DDIRTY;
2263                         }
2264                         kflags |= NOTE_ATTRIB;
2265                 }
2266         }
2267         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2268                 switch(ap->a_vp->v_type) {
2269                 case VREG:
2270                         if (vap->va_size == ip->ino_data.size)
2271                                 break;
2272
2273                         /*
2274                          * Log the operation if in fast-fsync mode or if
2275                          * there are unterminated redo write records present.
2276                          *
2277                          * The second check is needed so the recovery code
2278                          * properly truncates write redos even if nominal
2279                          * REDO operations is turned off due to excessive
2280                          * writes, because the related records might be
2281                          * destroyed and never lay down a TERM_WRITE.
2282                          */
2283                         if ((ip->flags & HAMMER_INODE_REDO) ||
2284                             (ip->flags & HAMMER_INODE_RDIRTY)) {
2285                                 error = hammer_generate_redo(&trans, ip,
2286                                                              vap->va_size,
2287                                                              HAMMER_REDO_TRUNC,
2288                                                              NULL, 0);
2289                         }
2290                         blksize = hammer_blocksize(vap->va_size);
2291
2292                         /*
2293                          * XXX break atomicy, we can deadlock the backend
2294                          * if we do not release the lock.  Probably not a
2295                          * big deal here.
2296                          */
2297                         if (vap->va_size < ip->ino_data.size) {
2298                                 nvtruncbuf(ap->a_vp, vap->va_size,
2299                                            blksize,
2300                                            hammer_blockoff(vap->va_size),
2301                                            0);
2302                                 truncating = 1;
2303                                 kflags |= NOTE_WRITE;
2304                         } else {
2305                                 nvextendbuf(ap->a_vp,
2306                                             ip->ino_data.size,
2307                                             vap->va_size,
2308                                             hammer_blocksize(ip->ino_data.size),
2309                                             hammer_blocksize(vap->va_size),
2310                                             hammer_blockoff(ip->ino_data.size),
2311                                             hammer_blockoff(vap->va_size),
2312                                             0);
2313                                 truncating = 0;
2314                                 kflags |= NOTE_WRITE | NOTE_EXTEND;
2315                         }
2316                         ip->ino_data.size = vap->va_size;
2317                         ip->ino_data.mtime = trans.time;
2318                         /* XXX safe to use SDIRTY instead of DDIRTY here? */
2319                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2320
2321                         /*
2322                          * On-media truncation is cached in the inode until
2323                          * the inode is synchronized.  We must immediately
2324                          * handle any frontend records.
2325                          */
2326                         if (truncating) {
2327                                 hammer_ip_frontend_trunc(ip, vap->va_size);
2328 #ifdef DEBUG_TRUNCATE
2329                                 if (HammerTruncIp == NULL)
2330                                         HammerTruncIp = ip;
2331 #endif
2332                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2333                                         ip->flags |= HAMMER_INODE_TRUNCATED;
2334                                         ip->trunc_off = vap->va_size;
2335                                         hammer_inode_dirty(ip);
2336 #ifdef DEBUG_TRUNCATE
2337                                         if (ip == HammerTruncIp)
2338                                         kprintf("truncate1 %016llx\n",
2339                                                 (long long)ip->trunc_off);
2340 #endif
2341                                 } else if (ip->trunc_off > vap->va_size) {
2342                                         ip->trunc_off = vap->va_size;
2343 #ifdef DEBUG_TRUNCATE
2344                                         if (ip == HammerTruncIp)
2345                                         kprintf("truncate2 %016llx\n",
2346                                                 (long long)ip->trunc_off);
2347 #endif
2348                                 } else {
2349 #ifdef DEBUG_TRUNCATE
2350                                         if (ip == HammerTruncIp)
2351                                         kprintf("truncate3 %016llx (ignored)\n",
2352                                                 (long long)vap->va_size);
2353 #endif
2354                                 }
2355                         }
2356
2357 #if 0
2358                         /*
2359                          * When truncating, nvtruncbuf() may have cleaned out
2360                          * a portion of the last block on-disk in the buffer
2361                          * cache.  We must clean out any frontend records
2362                          * for blocks beyond the new last block.
2363                          */
2364                         aligned_size = (vap->va_size + (blksize - 1)) &
2365                                        ~(int64_t)(blksize - 1);
2366                         if (truncating && vap->va_size < aligned_size) {
2367                                 aligned_size -= blksize;
2368                                 hammer_ip_frontend_trunc(ip, aligned_size);
2369                         }
2370 #endif
2371                         break;
2372                 case VDATABASE:
2373                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2374                                 ip->flags |= HAMMER_INODE_TRUNCATED;
2375                                 ip->trunc_off = vap->va_size;
2376                                 hammer_inode_dirty(ip);
2377                         } else if (ip->trunc_off > vap->va_size) {
2378                                 ip->trunc_off = vap->va_size;
2379                         }
2380                         hammer_ip_frontend_trunc(ip, vap->va_size);
2381                         ip->ino_data.size = vap->va_size;
2382                         ip->ino_data.mtime = trans.time;
2383                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2384                         kflags |= NOTE_ATTRIB;
2385                         break;
2386                 default:
2387                         error = EINVAL;
2388                         goto done;
2389                 }
2390                 break;
2391         }
2392         if (vap->va_atime.tv_sec != VNOVAL) {
2393                 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2394                 modflags |= HAMMER_INODE_ATIME;
2395                 kflags |= NOTE_ATTRIB;
2396         }
2397         if (vap->va_mtime.tv_sec != VNOVAL) {
2398                 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2399                 modflags |= HAMMER_INODE_MTIME;
2400                 kflags |= NOTE_ATTRIB;
2401         }
2402         if (vap->va_mode != (mode_t)VNOVAL) {
2403                 mode_t   cur_mode = ip->ino_data.mode;
2404                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2405                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2406
2407                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2408                                          cur_uid, cur_gid, &cur_mode);
2409                 if (error == 0 && ip->ino_data.mode != cur_mode) {
2410                         ip->ino_data.mode = cur_mode;
2411                         ip->ino_data.ctime = trans.time;
2412                         modflags |= HAMMER_INODE_DDIRTY;
2413                         kflags |= NOTE_ATTRIB;
2414                 }
2415         }
2416 done:
2417         if (error == 0)
2418                 hammer_modify_inode(&trans, ip, modflags);
2419         hammer_done_transaction(&trans);
2420         hammer_knote(ap->a_vp, kflags);
2421         lwkt_reltoken(&hmp->fs_token);
2422         return (error);
2423 }
2424
2425 /*
2426  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2427  */
2428 static
2429 int
2430 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2431 {
2432         struct hammer_transaction trans;
2433         struct hammer_inode *dip;
2434         struct hammer_inode *nip;
2435         hammer_record_t record;
2436         struct nchandle *nch;
2437         hammer_mount_t hmp;
2438         int error;
2439         int bytes;
2440
2441         ap->a_vap->va_type = VLNK;
2442
2443         nch = ap->a_nch;
2444         dip = VTOI(ap->a_dvp);
2445         hmp = dip->hmp;
2446
2447         if (dip->flags & HAMMER_INODE_RO)
2448                 return (EROFS);
2449         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2450                 return (error);
2451
2452         /*
2453          * Create a transaction to cover the operations we perform.
2454          */
2455         lwkt_gettoken(&hmp->fs_token);
2456         hammer_start_transaction(&trans, hmp);
2457         ++hammer_stats_file_iopsw;
2458
2459         /*
2460          * Create a new filesystem object of the requested type.  The
2461          * returned inode will be referenced but not locked.
2462          */
2463
2464         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2465                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2466                                     NULL, &nip);
2467         if (error) {
2468                 hammer_done_transaction(&trans);
2469                 *ap->a_vpp = NULL;
2470                 lwkt_reltoken(&hmp->fs_token);
2471                 return (error);
2472         }
2473
2474         /*
2475          * Add a record representing the symlink.  symlink stores the link
2476          * as pure data, not a string, and is no \0 terminated.
2477          */
2478         if (error == 0) {
2479                 bytes = strlen(ap->a_target);
2480
2481                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2482                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2483                 } else {
2484                         record = hammer_alloc_mem_record(nip, bytes);
2485                         record->type = HAMMER_MEM_RECORD_GENERAL;
2486
2487                         record->leaf.base.localization = nip->obj_localization +
2488                                                          HAMMER_LOCALIZE_MISC;
2489                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2490                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2491                         record->leaf.data_len = bytes;
2492                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2493                         bcopy(ap->a_target, record->data->symlink.name, bytes);
2494                         error = hammer_ip_add_record(&trans, record);
2495                 }
2496
2497                 /*
2498                  * Set the file size to the length of the link.
2499                  */
2500                 if (error == 0) {
2501                         nip->ino_data.size = bytes;
2502                         hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2503                 }
2504         }
2505         if (error == 0)
2506                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2507                                                 nch->ncp->nc_nlen, nip);
2508
2509         /*
2510          * Finish up.
2511          */
2512         if (error) {
2513                 hammer_rel_inode(nip, 0);
2514                 *ap->a_vpp = NULL;
2515         } else {
2516                 error = hammer_get_vnode(nip, ap->a_vpp);
2517                 hammer_rel_inode(nip, 0);
2518                 if (error == 0) {
2519                         cache_setunresolved(ap->a_nch);
2520                         cache_setvp(ap->a_nch, *ap->a_vpp);
2521                         hammer_knote(ap->a_dvp, NOTE_WRITE);
2522                 }
2523         }
2524         hammer_done_transaction(&trans);
2525         lwkt_reltoken(&hmp->fs_token);
2526         return (error);
2527 }
2528
2529 /*
2530  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2531  */
2532 static
2533 int
2534 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2535 {
2536         struct hammer_transaction trans;
2537         struct hammer_inode *dip;
2538         hammer_mount_t hmp;
2539         int error;
2540
2541         dip = VTOI(ap->a_dvp);
2542         hmp = dip->hmp;
2543
2544         if (hammer_nohistory(dip) == 0 &&
2545             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2546                 return (error);
2547         }
2548
2549         lwkt_gettoken(&hmp->fs_token);
2550         hammer_start_transaction(&trans, hmp);
2551         ++hammer_stats_file_iopsw;
2552         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2553                                 ap->a_cred, ap->a_flags, -1);
2554         hammer_done_transaction(&trans);
2555         lwkt_reltoken(&hmp->fs_token);
2556
2557         return (error);
2558 }
2559
2560 /*
2561  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2562  */
2563 static
2564 int
2565 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2566 {
2567         struct hammer_inode *ip = ap->a_vp->v_data;
2568         hammer_mount_t hmp = ip->hmp;
2569         int error;
2570
2571         ++hammer_stats_file_iopsr;
2572         lwkt_gettoken(&hmp->fs_token);
2573         error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2574                              ap->a_fflag, ap->a_cred);
2575         lwkt_reltoken(&hmp->fs_token);
2576         return (error);
2577 }
2578
2579 static
2580 int
2581 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2582 {
2583         static const struct mountctl_opt extraopt[] = {
2584                 { HMNT_NOHISTORY,       "nohistory" },
2585                 { HMNT_MASTERID,        "master" },
2586                 { 0, NULL}
2587
2588         };
2589         struct hammer_mount *hmp;
2590         struct mount *mp;
2591         int usedbytes;
2592         int error;
2593
2594         error = 0;
2595         usedbytes = 0;
2596         mp = ap->a_head.a_ops->head.vv_mount;
2597         KKASSERT(mp->mnt_data != NULL);
2598         hmp = (struct hammer_mount *)mp->mnt_data;
2599
2600         lwkt_gettoken(&hmp->fs_token);
2601
2602         switch(ap->a_op) {
2603         case MOUNTCTL_SET_EXPORT:
2604                 if (ap->a_ctllen != sizeof(struct export_args))
2605                         error = EINVAL;
2606                 else
2607                         error = hammer_vfs_export(mp, ap->a_op,
2608                                       (const struct export_args *)ap->a_ctl);
2609                 break;
2610         case MOUNTCTL_MOUNTFLAGS:
2611         {
2612                 /*
2613                  * Call standard mountctl VOP function
2614                  * so we get user mount flags.
2615                  */
2616                 error = vop_stdmountctl(ap);
2617                 if (error)
2618                         break;
2619
2620                 usedbytes = *ap->a_res;
2621
2622                 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2623                         usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2624                                                     ap->a_buf,
2625                                                     ap->a_buflen - usedbytes,
2626                                                     &error);
2627                 }
2628
2629                 *ap->a_res += usedbytes;
2630                 break;
2631         }
2632         default:
2633                 error = vop_stdmountctl(ap);
2634                 break;
2635         }
2636         lwkt_reltoken(&hmp->fs_token);
2637         return(error);
2638 }
2639
2640 /*
2641  * hammer_vop_strategy { vp, bio }
2642  *
2643  * Strategy call, used for regular file read & write only.  Note that the
2644  * bp may represent a cluster.
2645  *
2646  * To simplify operation and allow better optimizations in the future,
2647  * this code does not make any assumptions with regards to buffer alignment
2648  * or size.
2649  */
2650 static
2651 int
2652 hammer_vop_strategy(struct vop_strategy_args *ap)
2653 {
2654         struct buf *bp;
2655         int error;
2656
2657         bp = ap->a_bio->bio_buf;
2658
2659         switch(bp->b_cmd) {
2660         case BUF_CMD_READ:
2661                 error = hammer_vop_strategy_read(ap);
2662                 break;
2663         case BUF_CMD_WRITE:
2664                 error = hammer_vop_strategy_write(ap);
2665                 break;
2666         default:
2667                 bp->b_error = error = EINVAL;
2668                 bp->b_flags |= B_ERROR;
2669                 biodone(ap->a_bio);
2670                 break;
2671         }
2672
2673         /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2674
2675         return (error);
2676 }
2677
2678 /*
2679  * Read from a regular file.  Iterate the related records and fill in the
2680  * BIO/BUF.  Gaps are zero-filled.
2681  *
2682  * The support code in hammer_object.c should be used to deal with mixed
2683  * in-memory and on-disk records.
2684  *
2685  * NOTE: Can be called from the cluster code with an oversized buf.
2686  *
2687  * XXX atime update
2688  */
2689 static
2690 int
2691 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2692 {
2693         struct hammer_transaction trans;
2694         struct hammer_inode *ip;
2695         struct hammer_inode *dip;
2696         hammer_mount_t hmp;
2697         struct hammer_cursor cursor;
2698         hammer_base_elm_t base;
2699         hammer_off_t disk_offset;
2700         struct bio *bio;
2701         struct bio *nbio;
2702         struct buf *bp;
2703         int64_t rec_offset;
2704         int64_t ran_end;
2705         int64_t tmp64;
2706         int error;
2707         int boff;
2708         int roff;
2709         int n;
2710         int isdedupable;
2711
2712         bio = ap->a_bio;
2713         bp = bio->bio_buf;
2714         ip = ap->a_vp->v_data;
2715         hmp = ip->hmp;
2716
2717         /*
2718          * The zone-2 disk offset may have been set by the cluster code via
2719          * a BMAP operation, or else should be NOOFFSET.
2720          *
2721          * Checking the high bits for a match against zone-2 should suffice.
2722          *
2723          * In cases where a lot of data duplication is present it may be
2724          * more beneficial to drop through and doubule-buffer through the
2725          * device.
2726          */
2727         nbio = push_bio(bio);
2728         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2729             HAMMER_ZONE_LARGE_DATA) {
2730                 if (hammer_double_buffer == 0) {
2731                         lwkt_gettoken(&hmp->fs_token);
2732                         error = hammer_io_direct_read(hmp, nbio, NULL);
2733                         lwkt_reltoken(&hmp->fs_token);
2734                         return (error);
2735                 }
2736
2737                 /*
2738                  * Try to shortcut requests for double_buffer mode too.
2739                  * Since this mode runs through the device buffer cache
2740                  * only compatible buffer sizes (meaning those generated
2741                  * by normal filesystem buffers) are legal.
2742                  */
2743                 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
2744                         lwkt_gettoken(&hmp->fs_token);
2745                         error = hammer_io_indirect_read(hmp, nbio, NULL);
2746                         lwkt_reltoken(&hmp->fs_token);
2747                         return (error);
2748                 }
2749         }
2750
2751         /*
2752          * Well, that sucked.  Do it the hard way.  If all the stars are
2753          * aligned we may still be able to issue a direct-read.
2754          */
2755         lwkt_gettoken(&hmp->fs_token);
2756         hammer_simple_transaction(&trans, hmp);
2757         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2758
2759         /*
2760          * Key range (begin and end inclusive) to scan.  Note that the key's
2761          * stored in the actual records represent BASE+LEN, not BASE.  The
2762          * first record containing bio_offset will have a key > bio_offset.
2763          */
2764         cursor.key_beg.localization = ip->obj_localization +
2765                                       HAMMER_LOCALIZE_MISC;
2766         cursor.key_beg.obj_id = ip->obj_id;
2767         cursor.key_beg.create_tid = 0;
2768         cursor.key_beg.delete_tid = 0;
2769         cursor.key_beg.obj_type = 0;
2770         cursor.key_beg.key = bio->bio_offset + 1;
2771         cursor.asof = ip->obj_asof;
2772         cursor.flags |= HAMMER_CURSOR_ASOF;
2773
2774         cursor.key_end = cursor.key_beg;
2775         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2776 #if 0
2777         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2778                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2779                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2780                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2781         } else
2782 #endif
2783         {
2784                 ran_end = bio->bio_offset + bp->b_bufsize;
2785                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2786                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2787                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2788                 if (tmp64 < ran_end)
2789                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2790                 else
2791                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2792         }
2793         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2794
2795         /*
2796          * Set NOSWAPCACHE for cursor data extraction if double buffering
2797          * is disabled or (if the file is not marked cacheable via chflags
2798          * and vm.swapcache_use_chflags is enabled).
2799          */
2800         if (hammer_double_buffer == 0 ||
2801             ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
2802              vm_swapcache_use_chflags)) {
2803                 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
2804         }
2805
2806         error = hammer_ip_first(&cursor);
2807         boff = 0;
2808
2809         while (error == 0) {
2810                 /*
2811                  * Get the base file offset of the record.  The key for
2812                  * data records is (base + bytes) rather then (base).
2813                  */
2814                 base = &cursor.leaf->base;
2815                 rec_offset = base->key - cursor.leaf->data_len;
2816
2817                 /*
2818                  * Calculate the gap, if any, and zero-fill it.
2819                  *
2820                  * n is the offset of the start of the record verses our
2821                  * current seek offset in the bio.
2822                  */
2823                 n = (int)(rec_offset - (bio->bio_offset + boff));
2824                 if (n > 0) {
2825                         if (n > bp->b_bufsize - boff)
2826                                 n = bp->b_bufsize - boff;
2827                         bzero((char *)bp->b_data + boff, n);
2828                         boff += n;
2829                         n = 0;
2830                 }
2831
2832                 /*
2833                  * Calculate the data offset in the record and the number
2834                  * of bytes we can copy.
2835                  *
2836                  * There are two degenerate cases.  First, boff may already
2837                  * be at bp->b_bufsize.  Secondly, the data offset within
2838                  * the record may exceed the record's size.
2839                  */
2840                 roff = -n;
2841                 rec_offset += roff;
2842                 n = cursor.leaf->data_len - roff;
2843                 if (n <= 0) {
2844                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2845                         n = 0;
2846                 } else if (n > bp->b_bufsize - boff) {
2847                         n = bp->b_bufsize - boff;
2848                 }
2849
2850                 /*
2851                  * Deal with cached truncations.  This cool bit of code
2852                  * allows truncate()/ftruncate() to avoid having to sync
2853                  * the file.
2854                  *
2855                  * If the frontend is truncated then all backend records are
2856                  * subject to the frontend's truncation.
2857                  *
2858                  * If the backend is truncated then backend records on-disk
2859                  * (but not in-memory) are subject to the backend's
2860                  * truncation.  In-memory records owned by the backend
2861                  * represent data written after the truncation point on the
2862                  * backend and must not be truncated.
2863                  *
2864                  * Truncate operations deal with frontend buffer cache
2865                  * buffers and frontend-owned in-memory records synchronously.
2866                  */
2867                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2868                         if (hammer_cursor_ondisk(&cursor)/* ||
2869                             cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2870                                 if (ip->trunc_off <= rec_offset)
2871                                         n = 0;
2872                                 else if (ip->trunc_off < rec_offset + n)
2873                                         n = (int)(ip->trunc_off - rec_offset);
2874                         }
2875                 }
2876                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2877                         if (hammer_cursor_ondisk(&cursor)) {
2878                                 if (ip->sync_trunc_off <= rec_offset)
2879                                         n = 0;
2880                                 else if (ip->sync_trunc_off < rec_offset + n)
2881                                         n = (int)(ip->sync_trunc_off - rec_offset);
2882                         }
2883                 }
2884
2885                 /*
2886                  * Try to issue a direct read into our bio if possible,
2887                  * otherwise resolve the element data into a hammer_buffer
2888                  * and copy.
2889                  *
2890                  * The buffer on-disk should be zerod past any real
2891                  * truncation point, but may not be for any synthesized
2892                  * truncation point from above.
2893                  *
2894                  * NOTE: disk_offset is only valid if the cursor data is
2895                  *       on-disk.
2896                  */
2897                 disk_offset = cursor.leaf->data_offset + roff;
2898                 isdedupable = (boff == 0 && n == bp->b_bufsize &&
2899                                hammer_cursor_ondisk(&cursor) &&
2900                                ((int)disk_offset & HAMMER_BUFMASK) == 0);
2901
2902                 if (isdedupable && hammer_double_buffer == 0) {
2903                         /*
2904                          * Direct read case
2905                          */
2906                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2907                                  HAMMER_ZONE_LARGE_DATA);
2908                         nbio->bio_offset = disk_offset;
2909                         error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
2910                         if (hammer_live_dedup && error == 0)
2911                                 hammer_dedup_cache_add(ip, cursor.leaf);
2912                         goto done;
2913                 } else if (isdedupable) {
2914                         /*
2915                          * Async I/O case for reading from backing store
2916                          * and copying the data to the filesystem buffer.
2917                          * live-dedup has to verify the data anyway if it
2918                          * gets a hit later so we can just add the entry
2919                          * now.
2920                          */
2921                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2922                                  HAMMER_ZONE_LARGE_DATA);
2923                         nbio->bio_offset = disk_offset;
2924                         if (hammer_live_dedup)
2925                                 hammer_dedup_cache_add(ip, cursor.leaf);
2926                         error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
2927                         goto done;
2928                 } else if (n) {
2929                         error = hammer_ip_resolve_data(&cursor);
2930                         if (error == 0) {
2931                                 if (hammer_live_dedup && isdedupable)
2932                                         hammer_dedup_cache_add(ip, cursor.leaf);
2933                                 bcopy((char *)cursor.data + roff,
2934                                       (char *)bp->b_data + boff, n);
2935                         }
2936                 }
2937                 if (error)
2938                         break;
2939
2940                 /*
2941                  * We have to be sure that the only elements added to the
2942                  * dedup cache are those which are already on-media.
2943                  */
2944                 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2945                         hammer_dedup_cache_add(ip, cursor.leaf);
2946
2947                 /*
2948                  * Iterate until we have filled the request.
2949                  */
2950                 boff += n;
2951                 if (boff == bp->b_bufsize)
2952                         break;
2953                 error = hammer_ip_next(&cursor);
2954         }
2955
2956         /*
2957          * There may have been a gap after the last record
2958          */
2959         if (error == ENOENT)
2960                 error = 0;
2961         if (error == 0 && boff != bp->b_bufsize) {
2962                 KKASSERT(boff < bp->b_bufsize);
2963                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2964                 /* boff = bp->b_bufsize; */
2965         }
2966
2967         /*
2968          * Disallow swapcache operation on the vnode buffer if double
2969          * buffering is enabled, the swapcache will get the data via
2970          * the block device buffer.
2971          */
2972         if (hammer_double_buffer)
2973                 bp->b_flags |= B_NOTMETA;
2974
2975         /*
2976          * Cleanup
2977          */
2978         bp->b_resid = 0;
2979         bp->b_error = error;
2980         if (error)
2981                 bp->b_flags |= B_ERROR;
2982         biodone(ap->a_bio);
2983
2984 done:
2985         /*
2986          * Cache the b-tree node for the last data read in cache[1].
2987          *
2988          * If we hit the file EOF then also cache the node in the
2989          * governing director's cache[3], it will be used to initialize
2990          * the inode's cache[1] for any inodes looked up via the directory.
2991          *
2992          * This doesn't reduce disk accesses since the B-Tree chain is
2993          * likely cached, but it does reduce cpu overhead when looking
2994          * up file offsets for cpdup/tar/cpio style iterations.
2995          */
2996         if (cursor.node)
2997                 hammer_cache_node(&ip->cache[1], cursor.node);
2998         if (ran_end >= ip->ino_data.size) {
2999                 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
3000                                         ip->obj_asof, ip->obj_localization);
3001                 if (dip) {
3002                         hammer_cache_node(&dip->cache[3], cursor.node);
3003                         hammer_rel_inode(dip, 0);
3004                 }
3005         }
3006         hammer_done_cursor(&cursor);
3007         hammer_done_transaction(&trans);
3008         lwkt_reltoken(&hmp->fs_token);
3009         return(error);
3010 }
3011
3012 /*
3013  * BMAP operation - used to support cluster_read() only.
3014  *
3015  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
3016  *
3017  * This routine may return EOPNOTSUPP if the operation is not supported for
3018  * the specified offset.  The contents of the pointer arguments do not
3019  * need to be initialized in that case.
3020  *
3021  * If a disk address is available and properly aligned return 0 with
3022  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
3023  * to the run-length relative to that offset.  Callers may assume that
3024  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
3025  * large, so return EOPNOTSUPP if it is not sufficiently large.
3026  */
3027 static
3028 int
3029 hammer_vop_bmap(struct vop_bmap_args *ap)
3030 {
3031         struct hammer_transaction trans;
3032         struct hammer_inode *ip;
3033         hammer_mount_t hmp;
3034         struct hammer_cursor cursor;
3035         hammer_base_elm_t base;
3036         int64_t rec_offset;
3037         int64_t ran_end;
3038         int64_t tmp64;
3039         int64_t base_offset;
3040         int64_t base_disk_offset;
3041         int64_t last_offset;
3042         hammer_off_t last_disk_offset;
3043         hammer_off_t disk_offset;
3044         int     rec_len;
3045         int     error;
3046         int     blksize;
3047
3048         ++hammer_stats_file_iopsr;
3049         ip = ap->a_vp->v_data;
3050         hmp = ip->hmp;
3051
3052         /*
3053          * We can only BMAP regular files.  We can't BMAP database files,
3054          * directories, etc.
3055          */
3056         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
3057                 return(EOPNOTSUPP);
3058
3059         /*
3060          * bmap is typically called with runp/runb both NULL when used
3061          * for writing.  We do not support BMAP for writing atm.
3062          */
3063         if (ap->a_cmd != BUF_CMD_READ)
3064                 return(EOPNOTSUPP);
3065
3066         /*
3067          * Scan the B-Tree to acquire blockmap addresses, then translate
3068          * to raw addresses.
3069          */
3070         lwkt_gettoken(&hmp->fs_token);
3071         hammer_simple_transaction(&trans, hmp);
3072 #if 0
3073         kprintf("bmap_beg %016llx ip->cache %p\n",
3074                 (long long)ap->a_loffset, ip->cache[1]);
3075 #endif
3076         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
3077
3078         /*
3079          * Key range (begin and end inclusive) to scan.  Note that the key's
3080          * stored in the actual records represent BASE+LEN, not BASE.  The
3081          * first record containing bio_offset will have a key > bio_offset.
3082          */
3083         cursor.key_beg.localization = ip->obj_localization +
3084                                       HAMMER_LOCALIZE_MISC;
3085         cursor.key_beg.obj_id = ip->obj_id;
3086         cursor.key_beg.create_tid = 0;
3087         cursor.key_beg.delete_tid = 0;
3088         cursor.key_beg.obj_type = 0;
3089         if (ap->a_runb)
3090                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
3091         else
3092                 cursor.key_beg.key = ap->a_loffset + 1;
3093         if (cursor.key_beg.key < 0)
3094                 cursor.key_beg.key = 0;
3095         cursor.asof = ip->obj_asof;
3096         cursor.flags |= HAMMER_CURSOR_ASOF;
3097
3098         cursor.key_end = cursor.key_beg;
3099         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
3100
3101         ran_end = ap->a_loffset + MAXPHYS;
3102         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
3103         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
3104         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
3105         if (tmp64 < ran_end)
3106                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
3107         else
3108                 cursor.key_end.key = ran_end + MAXPHYS + 1;
3109
3110         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
3111
3112         error = hammer_ip_first(&cursor);
3113         base_offset = last_offset = 0;
3114         base_disk_offset = last_disk_offset = 0;
3115
3116         while (error == 0) {
3117                 /*
3118                  * Get the base file offset of the record.  The key for
3119                  * data records is (base + bytes) rather then (base).
3120                  *
3121                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
3122                  * The extra bytes should be zero on-disk and the BMAP op
3123                  * should still be ok.
3124                  */
3125                 base = &cursor.leaf->base;
3126                 rec_offset = base->key - cursor.leaf->data_len;
3127                 rec_len    = cursor.leaf->data_len;
3128
3129                 /*
3130                  * Incorporate any cached truncation.
3131                  *
3132                  * NOTE: Modifications to rec_len based on synthesized
3133                  * truncation points remove the guarantee that any extended
3134                  * data on disk is zero (since the truncations may not have
3135                  * taken place on-media yet).
3136                  */
3137                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
3138                         if (hammer_cursor_ondisk(&cursor) ||
3139                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3140                                 if (ip->trunc_off <= rec_offset)
3141                                         rec_len = 0;
3142                                 else if (ip->trunc_off < rec_offset + rec_len)
3143                                         rec_len = (int)(ip->trunc_off - rec_offset);
3144                         }
3145                 }
3146                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3147                         if (hammer_cursor_ondisk(&cursor)) {
3148                                 if (ip->sync_trunc_off <= rec_offset)
3149                                         rec_len = 0;
3150                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
3151                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
3152                         }
3153                 }
3154
3155                 /*
3156                  * Accumulate information.  If we have hit a discontiguous
3157                  * block reset base_offset unless we are already beyond the
3158                  * requested offset.  If we are, that's it, we stop.
3159                  */
3160                 if (error)
3161                         break;
3162                 if (hammer_cursor_ondisk(&cursor)) {
3163                         disk_offset = cursor.leaf->data_offset;
3164                         if (rec_offset != last_offset ||
3165                             disk_offset != last_disk_offset) {
3166                                 if (rec_offset > ap->a_loffset)
3167                                         break;
3168                                 base_offset = rec_offset;
3169                                 base_disk_offset = disk_offset;
3170                         }
3171                         last_offset = rec_offset + rec_len;
3172                         last_disk_offset = disk_offset + rec_len;
3173
3174                         if (hammer_live_dedup)
3175                                 hammer_dedup_cache_add(ip, cursor.leaf);
3176                 }
3177
3178                 error = hammer_ip_next(&cursor);
3179         }
3180
3181 #if 0
3182         kprintf("BMAP %016llx:  %016llx - %016llx\n",
3183                 (long long)ap->a_loffset,
3184                 (long long)base_offset,
3185                 (long long)last_offset);
3186         kprintf("BMAP %16s:  %016llx - %016llx\n", "",
3187                 (long long)base_disk_offset,
3188                 (long long)last_disk_offset);
3189 #endif
3190
3191         if (cursor.node) {
3192                 hammer_cache_node(&ip->cache[1], cursor.node);
3193 #if 0
3194                 kprintf("bmap_end2 %016llx ip->cache %p\n",
3195                         (long long)ap->a_loffset, ip->cache[1]);
3196 #endif
3197         }
3198         hammer_done_cursor(&cursor);
3199         hammer_done_transaction(&trans);
3200         lwkt_reltoken(&hmp->fs_token);
3201
3202         /*
3203          * If we couldn't find any records or the records we did find were
3204          * all behind the requested offset, return failure.  A forward
3205          * truncation can leave a hole w/ no on-disk records.
3206          */
3207         if (last_offset == 0 || last_offset < ap->a_loffset)
3208                 return (EOPNOTSUPP);
3209
3210         /*
3211          * Figure out the block size at the requested offset and adjust
3212          * our limits so the cluster_read() does not create inappropriately
3213          * sized buffer cache buffers.
3214          */
3215         blksize = hammer_blocksize(ap->a_loffset);
3216         if (hammer_blocksize(base_offset) != blksize) {
3217                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
3218         }
3219         if (last_offset != ap->a_loffset &&
3220             hammer_blocksize(last_offset - 1) != blksize) {
3221                 last_offset = hammer_blockdemarc(ap->a_loffset,
3222                                                  last_offset - 1);
3223         }
3224
3225         /*
3226          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
3227          * from occuring.
3228          */
3229         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
3230
3231         if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
3232                 /*
3233                  * Only large-data zones can be direct-IOd
3234                  */
3235                 error = EOPNOTSUPP;
3236         } else if ((disk_offset & HAMMER_BUFMASK) ||
3237                    (last_offset - ap->a_loffset) < blksize) {
3238                 /*
3239                  * doffsetp is not aligned or the forward run size does
3240                  * not cover a whole buffer, disallow the direct I/O.
3241                  */
3242                 error = EOPNOTSUPP;
3243         } else {
3244                 /*
3245                  * We're good.
3246                  */
3247                 *ap->a_doffsetp = disk_offset;
3248                 if (ap->a_runb) {
3249                         *ap->a_runb = ap->a_loffset - base_offset;
3250                         KKASSERT(*ap->a_runb >= 0);
3251                 }
3252                 if (ap->a_runp) {
3253                         *ap->a_runp = last_offset - ap->a_loffset;
3254                         KKASSERT(*ap->a_runp >= 0);
3255                 }
3256                 error = 0;
3257         }
3258         return(error);
3259 }
3260
3261 /*
3262  * Write to a regular file.   Because this is a strategy call the OS is
3263  * trying to actually get data onto the media.
3264  */
3265 static
3266 int
3267 hammer_vop_strategy_write(struct vop_strategy_args *ap)
3268 {
3269         hammer_record_t record;
3270         hammer_mount_t hmp;
3271         hammer_inode_t ip;
3272         struct bio *bio;
3273         struct buf *bp;
3274         int blksize __debugvar;
3275         int bytes;
3276         int error;
3277
3278         bio = ap->a_bio;
3279         bp = bio->bio_buf;
3280         ip = ap->a_vp->v_data;
3281         hmp = ip->hmp;
3282
3283         blksize = hammer_blocksize(bio->bio_offset);
3284         KKASSERT(bp->b_bufsize == blksize);
3285
3286         if (ip->flags & HAMMER_INODE_RO) {
3287                 bp->b_error = EROFS;
3288                 bp->b_flags |= B_ERROR;
3289                 biodone(ap->a_bio);
3290                 return(EROFS);
3291         }
3292
3293         lwkt_gettoken(&hmp->fs_token);
3294
3295         /*
3296          * Disallow swapcache operation on the vnode buffer if double
3297          * buffering is enabled, the swapcache will get the data via
3298          * the block device buffer.
3299          */
3300         if (hammer_double_buffer)
3301                 bp->b_flags |= B_NOTMETA;
3302
3303         /*
3304          * Interlock with inode destruction (no in-kernel or directory
3305          * topology visibility).  If we queue new IO while trying to
3306          * destroy the inode we can deadlock the vtrunc call in
3307          * hammer_inode_unloadable_check().
3308          *
3309          * Besides, there's no point flushing a bp associated with an
3310          * inode that is being destroyed on-media and has no kernel
3311          * references.
3312          */
3313         if ((ip->flags | ip->sync_flags) &
3314             (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
3315                 bp->b_resid = 0;
3316                 biodone(ap->a_bio);
3317                 lwkt_reltoken(&hmp->fs_token);
3318                 return(0);
3319         }
3320
3321         /*
3322          * Reserve space and issue a direct-write from the front-end.
3323          * NOTE: The direct_io code will hammer_bread/bcopy smaller
3324          * allocations.
3325          *
3326          * An in-memory record will be installed to reference the storage
3327          * until the flusher can get to it.
3328          *
3329          * Since we own the high level bio the front-end will not try to
3330          * do a direct-read until the write completes.
3331          *
3332          * NOTE: The only time we do not reserve a full-sized buffers
3333          * worth of data is if the file is small.  We do not try to
3334          * allocate a fragment (from the small-data zone) at the end of
3335          * an otherwise large file as this can lead to wildly separated
3336          * data.
3337          */
3338         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3339         KKASSERT(bio->bio_offset < ip->ino_data.size);
3340         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
3341                 bytes = bp->b_bufsize;
3342         else
3343                 bytes = ((int)ip->ino_data.size + 15) & ~15;
3344
3345         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3346                                     bytes, &error);
3347
3348         /*
3349          * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
3350          * in hammer_vop_write().  We must flag the record so the proper
3351          * REDO_TERM_WRITE entry is generated during the flush.
3352          */
3353         if (record) {
3354                 if (bp->b_flags & B_VFSFLAG1) {
3355                         record->flags |= HAMMER_RECF_REDO;
3356                         bp->b_flags &= ~B_VFSFLAG1;
3357                 }
3358                 if (record->flags & HAMMER_RECF_DEDUPED) {
3359                         bp->b_resid = 0;
3360                         hammer_ip_replace_bulk(hmp, record);
3361                         biodone(ap->a_bio);
3362                 } else {
3363                         hammer_io_direct_write(hmp, bio, record);
3364                 }
3365                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3366                         hammer_flush_inode(ip, 0);
3367         } else {
3368                 bp->b_bio2.bio_offset = NOOFFSET;
3369                 bp->b_error = error;
3370                 bp->b_flags |= B_ERROR;
3371                 biodone(ap->a_bio);
3372         }
3373         lwkt_reltoken(&hmp->fs_token);
3374         return(error);
3375 }
3376
3377 /*
3378  * dounlink - disconnect a directory entry
3379  *
3380  * XXX whiteout support not really in yet
3381  */
3382 static int
3383 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
3384                 struct vnode *dvp, struct ucred *cred,
3385                 int flags, int isdir)
3386 {
3387         struct namecache *ncp;
3388         hammer_inode_t dip;
3389         hammer_inode_t ip;
3390         hammer_mount_t hmp;
3391         struct hammer_cursor cursor;
3392         int64_t namekey;
3393         u_int32_t max_iterations;
3394         int nlen, error;
3395
3396         /*
3397          * Calculate the namekey and setup the key range for the scan.  This
3398          * works kinda like a chained hash table where the lower 32 bits
3399          * of the namekey synthesize the chain.
3400          *
3401          * The key range is inclusive of both key_beg and key_end.
3402          */
3403         dip = VTOI(dvp);
3404         ncp = nch->ncp;
3405         hmp = dip->hmp;
3406
3407         if (dip->flags & HAMMER_INODE_RO)
3408                 return (EROFS);
3409
3410         namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3411                                            &max_iterations);
3412 retry:
3413         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3414         cursor.key_beg.localization = dip->obj_localization +
3415                                       hammer_dir_localization(dip);
3416         cursor.key_beg.obj_id = dip->obj_id;
3417         cursor.key_beg.key = namekey;
3418         cursor.key_beg.create_tid = 0;
3419         cursor.key_beg.delete_tid = 0;
3420         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3421         cursor.key_beg.obj_type = 0;
3422
3423         cursor.key_end = cursor.key_beg;
3424         cursor.key_end.key += max_iterations;
3425         cursor.asof = dip->obj_asof;
3426         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3427
3428         /*
3429          * Scan all matching records (the chain), locate the one matching
3430          * the requested path component.  info->last_error contains the
3431          * error code on search termination and could be 0, ENOENT, or
3432          * something else.
3433          *
3434          * The hammer_ip_*() functions merge in-memory records with on-disk
3435          * records for the purposes of the search.
3436          */
3437         error = hammer_ip_first(&cursor);
3438
3439         while (error == 0) {
3440                 error = hammer_ip_resolve_data(&cursor);
3441                 if (error)
3442                         break;
3443                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3444                 KKASSERT(nlen > 0);
3445                 if (ncp->nc_nlen == nlen &&
3446                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3447                         break;
3448                 }
3449                 error = hammer_ip_next(&cursor);
3450         }
3451
3452         /*
3453          * If all is ok we have to get the inode so we can adjust nlinks.
3454          * To avoid a deadlock with the flusher we must release the inode
3455          * lock on the directory when acquiring the inode for the entry.
3456          *
3457          * If the target is a directory, it must be empty.
3458          */
3459         if (error == 0) {
3460                 hammer_unlock(&cursor.ip->lock);
3461                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3462                                       hmp->asof,
3463                                       cursor.data->entry.localization,
3464                                       0, &error);
3465                 hammer_lock_sh(&cursor.ip->lock);
3466                 if (error == ENOENT) {
3467                         kprintf("HAMMER: WARNING: Removing "
3468                                 "dirent w/missing inode \"%s\"\n"
3469                                 "\tobj_id = %016llx\n",
3470                                 ncp->nc_name,
3471                                 (long long)cursor.data->entry.obj_id);
3472                         error = 0;
3473                 }
3474
3475                 /*
3476                  * If isdir >= 0 we validate that the entry is or is not a
3477                  * directory.  If isdir < 0 we don't care.
3478                  */
3479                 if (error == 0 && isdir >= 0 && ip) {
3480                         if (isdir &&
3481                             ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3482                                 error = ENOTDIR;
3483                         } else if (isdir == 0 &&
3484                             ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3485                                 error = EISDIR;
3486                         }
3487                 }
3488
3489                 /*
3490                  * If we are trying to remove a directory the directory must
3491                  * be empty.
3492                  *
3493                  * The check directory code can loop and deadlock/retry.  Our
3494                  * own cursor's node locks must be released to avoid a 3-way
3495                  * deadlock with the flusher if the check directory code
3496                  * blocks.
3497                  *
3498                  * If any changes whatsoever have been made to the cursor
3499                  * set EDEADLK and retry.
3500                  *
3501                  * WARNING: See warnings in hammer_unlock_cursor()
3502                  *          function.
3503                  */
3504                 if (error == 0 && ip && ip->ino_data.obj_type ==
3505                                         HAMMER_OBJTYPE_DIRECTORY) {
3506                         hammer_unlock_cursor(&cursor);
3507                         error = hammer_ip_check_directory_empty(trans, ip);
3508                         hammer_lock_cursor(&cursor);
3509                         if (cursor.flags & HAMMER_CURSOR_RETEST) {
3510                                 kprintf("HAMMER: Warning: avoided deadlock "
3511                                         "on rmdir '%s'\n",
3512                                         ncp->nc_name);
3513                                 error = EDEADLK;
3514                         }
3515                 }
3516
3517                 /*
3518                  * Delete the directory entry.
3519                  *
3520                  * WARNING: hammer_ip_del_directory() may have to terminate
3521                  * the cursor to avoid a deadlock.  It is ok to call
3522                  * hammer_done_cursor() twice.
3523                  */
3524                 if (error == 0) {
3525                         error = hammer_ip_del_directory(trans, &cursor,
3526                                                         dip, ip);
3527                 }
3528                 hammer_done_cursor(&cursor);
3529                 if (error == 0) {
3530                         /*
3531                          * Tell the namecache that we are now unlinked.
3532                          */
3533                         cache_unlink(nch);
3534
3535                         /*
3536                          * NOTE: ip->vp, if non-NULL, cannot be directly
3537                          *       referenced without formally acquiring the
3538                          *       vp since the vp might have zero refs on it,
3539                          *       or in the middle of a reclaim, etc.
3540                          *
3541                          * NOTE: The cache_setunresolved() can rip the vp
3542                          *       out from under us since the vp may not have
3543                          *       any refs, in which case ip->vp will be NULL
3544                          *       from the outset.
3545                          */
3546                         while (ip && ip->vp) {
3547                                 struct vnode *vp;
3548
3549                                 error = hammer_get_vnode(ip, &vp);
3550                                 if (error == 0 && vp) {
3551                                         vn_unlock(vp);
3552                                         hammer_knote(ip->vp, NOTE_DELETE);
3553 #if 0
3554                                         /*
3555                                          * Don't do this, it can deadlock
3556                                          * on concurrent rm's of hardlinks.
3557                                          * Shouldn't be needed any more.
3558                                          */
3559                                         cache_inval_vp(ip->vp, CINV_DESTROY);
3560 #endif
3561                                         vrele(vp);
3562                                         break;
3563                                 }
3564                                 kprintf("Debug: HAMMER ip/vp race1 avoided\n");
3565                         }
3566                 }
3567                 if (ip)
3568                         hammer_rel_inode(ip, 0);
3569         } else {
3570                 hammer_done_cursor(&cursor);
3571         }
3572         if (error == EDEADLK)
3573                 goto retry;
3574
3575         return (error);
3576 }
3577
3578 /************************************************************************
3579  *                          FIFO AND SPECFS OPS                         *
3580  ************************************************************************
3581  *
3582  */
3583 static int
3584 hammer_vop_fifoclose (struct vop_close_args *ap)
3585 {
3586         /* XXX update itimes */
3587         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3588 }
3589
3590 static int
3591 hammer_vop_fiforead (struct vop_read_args *ap)
3592 {
3593         int error;