Merge branch 'vendor/GCC44'
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vm/swap_pager.h>
50 #include <vfs/fifofs/fifo.h>
51
52 #include "hammer.h"
53
54 /*
55  * USERFS VNOPS
56  */
57 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
58 static int hammer_vop_fsync(struct vop_fsync_args *);
59 static int hammer_vop_read(struct vop_read_args *);
60 static int hammer_vop_write(struct vop_write_args *);
61 static int hammer_vop_access(struct vop_access_args *);
62 static int hammer_vop_advlock(struct vop_advlock_args *);
63 static int hammer_vop_close(struct vop_close_args *);
64 static int hammer_vop_ncreate(struct vop_ncreate_args *);
65 static int hammer_vop_getattr(struct vop_getattr_args *);
66 static int hammer_vop_nresolve(struct vop_nresolve_args *);
67 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
68 static int hammer_vop_nlink(struct vop_nlink_args *);
69 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
70 static int hammer_vop_nmknod(struct vop_nmknod_args *);
71 static int hammer_vop_open(struct vop_open_args *);
72 static int hammer_vop_print(struct vop_print_args *);
73 static int hammer_vop_readdir(struct vop_readdir_args *);
74 static int hammer_vop_readlink(struct vop_readlink_args *);
75 static int hammer_vop_nremove(struct vop_nremove_args *);
76 static int hammer_vop_nrename(struct vop_nrename_args *);
77 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
78 static int hammer_vop_markatime(struct vop_markatime_args *);
79 static int hammer_vop_setattr(struct vop_setattr_args *);
80 static int hammer_vop_strategy(struct vop_strategy_args *);
81 static int hammer_vop_bmap(struct vop_bmap_args *ap);
82 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
83 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
84 static int hammer_vop_ioctl(struct vop_ioctl_args *);
85 static int hammer_vop_mountctl(struct vop_mountctl_args *);
86 static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
87
88 static int hammer_vop_fifoclose (struct vop_close_args *);
89 static int hammer_vop_fiforead (struct vop_read_args *);
90 static int hammer_vop_fifowrite (struct vop_write_args *);
91 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
92
/*
 * Vnode operations vector for regular HAMMER vnodes (regular files,
 * directories, symlinks).  Any operation not explicitly listed here
 * falls through to vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,	/* generic VM paging via VOP_READ/WRITE */
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		vop_stdpathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_markatime =	hammer_vop_markatime,
	.vop_setattr =		hammer_vop_setattr,
	.vop_bmap =		hammer_vop_bmap,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout,
	.vop_ioctl =		hammer_vop_ioctl,
	.vop_mountctl =		hammer_vop_mountctl,
	.vop_kqfilter =		hammer_vop_kqfilter
};
130
/*
 * Vnode operations vector for special vnodes (presumably device special
 * files, judging by the name -- TODO confirm against the mount code that
 * selects this vector).  Direct read/write through this vector is
 * explicitly rejected (vop_stdnoread/vop_stdnowrite); only metadata
 * operations are backed by HAMMER.
 */
struct vop_ops hammer_spec_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		vop_stdnoread,
	.vop_write =		vop_stdnowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_close,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};
144
/*
 * Vnode operations vector for FIFOs.  Data operations (read/write/close/
 * kqfilter) are wrapped by hammer_vop_fifo* shims; everything not listed
 * defaults to fifofs via fifo_vnoperate, while metadata operations are
 * backed by HAMMER.
 */
struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_markatime =	hammer_vop_markatime,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr,
	.vop_kqfilter =		hammer_vop_fifokqfilter
};
159
160 static __inline
161 void
162 hammer_knote(struct vnode *vp, int flags)
163 {
164         if (flags)
165                 KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
166 }
167
168 #ifdef DEBUG_TRUNCATE
169 struct hammer_inode *HammerTruncIp;
170 #endif
171
172 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
173                            struct vnode *dvp, struct ucred *cred,
174                            int flags, int isdir);
175 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
176 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
177
#if 0
/*
 * Pass any vnode operation through the standard HAMMER vnode vector.
 * Currently compiled out (vop_default dispatch is used instead).
 *
 * Fix: the parameter was previously unnamed while the body referenced
 * 'ap', so this would not have compiled if re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
186
/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *       a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *       operation.
 *
 *       Ultimately the combination of a REDO log and use of fast storage
 *       to front-end cluster caches will make fsync fast, but it isn't
 *       here yet.  And, in any case, we need real transactional
 *       all-or-nothing features which are not restricted to a single file.
 *
 * Returns 0 on success or the inode's accumulated error code.  The
 * hammer_fsync_mode sysctl (0-4) selects progressively weaker semantics,
 * applied only to fsync()s issued from the system call (VOP_FSYNC_SYSCALL).
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
	hammer_inode_t ip = VTOI(ap->a_vp);
	hammer_mount_t hmp = ip->hmp;
	int waitfor = ap->a_waitfor;
	int mode;	/* UNDO/REDO fifo flush mode for the fast path */

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Fsync rule relaxation (default is either full synchronous flush
	 * or REDO semantics with synchronous flush).
	 *
	 * NOTE: the mode0/mode1 labels allow the REDO-capable cases (2/3)
	 *	 to degrade to the non-REDO cases (0/1) when the volume
	 *	 version predates REDO support (HAMMER_VOL_VERSION_FOUR).
	 */
	if (ap->a_flags & VOP_FSYNC_SYSCALL) {
		switch(hammer_fsync_mode) {
		case 0:
mode0:
			/* no REDO, full synchronous flush */
			goto skip;
		case 1:
mode1:
			/* no REDO, full asynchronous flush */
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			goto skip;
		case 2:
			/* REDO semantics, synchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode0;
			mode = HAMMER_FLUSH_UNDOS_AUTO;
			break;
		case 3:
			/* REDO semantics, relaxed asynchronous flush */
			if (hmp->version < HAMMER_VOL_VERSION_FOUR)
				goto mode1;
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		case 4:
			/* ignore the fsync() system call */
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		default:
			/* we have to do something */
			mode = HAMMER_FLUSH_UNDOS_RELAXED;
			if (waitfor == MNT_WAIT)
				waitfor = MNT_NOWAIT;
			break;
		}

		/*
		 * Fast fsync only needs to flush the UNDO/REDO fifo if
		 * HAMMER_INODE_REDO is non-zero and the only modifications
		 * made to the file are write or write-extends.
		 */
		if ((ip->flags & HAMMER_INODE_REDO) &&
		    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
		) {
			++hammer_count_fsyncs;
			hammer_flusher_flush_undos(hmp, mode);
			ip->redo_count = 0;
			lwkt_reltoken(&hmp->fs_token);
			return(0);
		}

		/*
		 * REDO is enabled by fsync(), the idea being we really only
		 * want to lay down REDO records when programs are using
		 * fsync() heavily.  The first fsync() on the file starts
		 * the gravy train going and later fsync()s keep it hot by
		 * resetting the redo_count.
		 *
		 * We weren't running REDOs before now so we have to fall
		 * through and do a full fsync of what we have.
		 */
		if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
		    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
			ip->flags |= HAMMER_INODE_REDO;
			ip->redo_count = 0;
		}
	}
skip:

	/*
	 * Do a full flush sequence: sync the vnode's dirty buffers, then
	 * signal the flusher to commit the inode.  For MNT_WAIT the vnode
	 * lock is dropped while waiting so the flusher is not deadlocked
	 * against our hold on the vnode.
	 */
	++hammer_count_fsyncs;
	vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
	hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
	if (waitfor == MNT_WAIT) {
		vn_unlock(ap->a_vp);
		hammer_wait_inode(ip);
		vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
	}
	lwkt_reltoken(&hmp->fs_token);
	return (ip->error);
}
303
304 /*
305  * hammer_vop_read { vp, uio, ioflag, cred }
306  *
307  * MPSAFE (for the cache safe does not require fs_token)
308  */
309 static
310 int
311 hammer_vop_read(struct vop_read_args *ap)
312 {
313         struct hammer_transaction trans;
314         hammer_inode_t ip;
315         hammer_mount_t hmp;
316         off_t offset;
317         struct buf *bp;
318         struct uio *uio;
319         int error;
320         int n;
321         int seqcount;
322         int ioseqcount;
323         int blksize;
324         int bigread;
325         int got_fstoken;
326
327         if (ap->a_vp->v_type != VREG)
328                 return (EINVAL);
329         ip = VTOI(ap->a_vp);
330         hmp = ip->hmp;
331         error = 0;
332         uio = ap->a_uio;
333
334         /*
335          * Allow the UIO's size to override the sequential heuristic.
336          */
337         blksize = hammer_blocksize(uio->uio_offset);
338         seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
339         ioseqcount = (ap->a_ioflag >> 16);
340         if (seqcount < ioseqcount)
341                 seqcount = ioseqcount;
342
343         /*
344          * If reading or writing a huge amount of data we have to break
345          * atomicy and allow the operation to be interrupted by a signal
346          * or it can DOS the machine.
347          */
348         bigread = (uio->uio_resid > 100 * 1024 * 1024);
349         got_fstoken = 0;
350
351         /*
352          * Access the data typically in HAMMER_BUFSIZE blocks via the
353          * buffer cache, but HAMMER may use a variable block size based
354          * on the offset.
355          *
356          * XXX Temporary hack, delay the start transaction while we remain
357          *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
358          *     locked-shared.
359          */
360         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
361                 int64_t base_offset;
362                 int64_t file_limit;
363
364                 blksize = hammer_blocksize(uio->uio_offset);
365                 offset = (int)uio->uio_offset & (blksize - 1);
366                 base_offset = uio->uio_offset - offset;
367
368                 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
369                         break;
370
371                 /*
372                  * MPSAFE
373                  */
374                 bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
375                 if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
376                         bp->b_flags &= ~B_AGE;
377                         error = 0;
378                         goto skip;
379                 }
380                 if (ap->a_ioflag & IO_NRDELAY) {
381                         bqrelse(bp);
382                         return (EWOULDBLOCK);
383                 }
384
385                 /*
386                  * MPUNSAFE
387                  */
388                 if (got_fstoken == 0) {
389                         lwkt_gettoken(&hmp->fs_token);
390                         got_fstoken = 1;
391                         hammer_start_transaction(&trans, ip->hmp);
392                 }
393
394                 /*
395                  * NOTE: A valid bp has already been acquired, but was not
396                  *       B_CACHE.
397                  */
398                 if (hammer_cluster_enable) {
399                         /*
400                          * Use file_limit to prevent cluster_read() from
401                          * creating buffers of the wrong block size past
402                          * the demarc.
403                          */
404                         file_limit = ip->ino_data.size;
405                         if (base_offset < HAMMER_XDEMARC &&
406                             file_limit > HAMMER_XDEMARC) {
407                                 file_limit = HAMMER_XDEMARC;
408                         }
409                         error = cluster_readx(ap->a_vp,
410                                              file_limit, base_offset,
411                                              blksize, uio->uio_resid,
412                                              seqcount * BKVASIZE, &bp);
413                 } else {
414                         error = breadnx(ap->a_vp, base_offset, blksize,
415                                         NULL, NULL, 0, &bp);
416                 }
417                 if (error) {
418                         brelse(bp);
419                         break;
420                 }
421 skip:
422                 if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
423                         kprintf("doff %016jx read file %016jx@%016jx\n",
424                                 (intmax_t)bp->b_bio2.bio_offset,
425                                 (intmax_t)ip->obj_id,
426                                 (intmax_t)bp->b_loffset);
427                 }
428                 bp->b_flags &= ~B_IODEBUG;
429
430                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
431                 n = blksize - offset;
432                 if (n > uio->uio_resid)
433                         n = uio->uio_resid;
434                 if (n > ip->ino_data.size - uio->uio_offset)
435                         n = (int)(ip->ino_data.size - uio->uio_offset);
436                 if (got_fstoken)
437                         lwkt_reltoken(&hmp->fs_token);
438
439                 /*
440                  * Set B_AGE, data has a lower priority than meta-data.
441                  *
442                  * Use a hold/unlock/drop sequence to run the uiomove
443                  * with the buffer unlocked, avoiding deadlocks against
444                  * read()s on mmap()'d spaces.
445                  */
446                 bp->b_flags |= B_AGE;
447                 bqhold(bp);
448                 bqrelse(bp);
449                 error = uiomove((char *)bp->b_data + offset, n, uio);
450                 bqdrop(bp);
451
452                 if (got_fstoken)
453                         lwkt_gettoken(&hmp->fs_token);
454
455                 if (error)
456                         break;
457                 hammer_stats_file_read += n;
458         }
459
460         /*
461          * Try to update the atime with just the inode lock for maximum
462          * concurrency.  If we can't shortcut it we have to get the full
463          * blown transaction.
464          */
465         if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) {
466                 lwkt_gettoken(&hmp->fs_token);
467                 got_fstoken = 1;
468                 hammer_start_transaction(&trans, ip->hmp);
469         }
470
471         if (got_fstoken) {
472                 if ((ip->flags & HAMMER_INODE_RO) == 0 &&
473                     (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
474                         ip->ino_data.atime = trans.time;
475                         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
476                 }
477                 hammer_done_transaction(&trans);
478                 lwkt_reltoken(&hmp->fs_token);
479         }
480         return (error);
481 }
482
483 /*
484  * hammer_vop_write { vp, uio, ioflag, cred }
485  */
486 static
487 int
488 hammer_vop_write(struct vop_write_args *ap)
489 {
490         struct hammer_transaction trans;
491         struct hammer_inode *ip;
492         hammer_mount_t hmp;
493         thread_t td;
494         struct uio *uio;
495         int offset;
496         off_t base_offset;
497         struct buf *bp;
498         int kflags;
499         int error;
500         int n;
501         int flags;
502         int seqcount;
503         int bigwrite;
504
505         if (ap->a_vp->v_type != VREG)
506                 return (EINVAL);
507         ip = VTOI(ap->a_vp);
508         hmp = ip->hmp;
509         error = 0;
510         kflags = 0;
511         seqcount = ap->a_ioflag >> 16;
512
513         if (ip->flags & HAMMER_INODE_RO)
514                 return (EROFS);
515
516         /*
517          * Create a transaction to cover the operations we perform.
518          */
519         lwkt_gettoken(&hmp->fs_token);
520         hammer_start_transaction(&trans, hmp);
521         uio = ap->a_uio;
522
523         /*
524          * Check append mode
525          */
526         if (ap->a_ioflag & IO_APPEND)
527                 uio->uio_offset = ip->ino_data.size;
528
529         /*
530          * Check for illegal write offsets.  Valid range is 0...2^63-1.
531          *
532          * NOTE: the base_off assignment is required to work around what
533          * I consider to be a GCC-4 optimization bug.
534          */
535         if (uio->uio_offset < 0) {
536                 hammer_done_transaction(&trans);
537                 lwkt_reltoken(&hmp->fs_token);
538                 return (EFBIG);
539         }
540         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
541         if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
542                 hammer_done_transaction(&trans);
543                 lwkt_reltoken(&hmp->fs_token);
544                 return (EFBIG);
545         }
546
547         if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
548             base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
549                 hammer_done_transaction(&trans);
550                 lwkt_reltoken(&hmp->fs_token);
551                 lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
552                 return (EFBIG);
553         }
554
555         /*
556          * If reading or writing a huge amount of data we have to break
557          * atomicy and allow the operation to be interrupted by a signal
558          * or it can DOS the machine.
559          *
560          * Preset redo_count so we stop generating REDOs earlier if the
561          * limit is exceeded.
562          */
563         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
564         if ((ip->flags & HAMMER_INODE_REDO) &&
565             ip->redo_count < hammer_limit_redo) {
566                 ip->redo_count += uio->uio_resid;
567         }
568
569         /*
570          * Access the data typically in HAMMER_BUFSIZE blocks via the
571          * buffer cache, but HAMMER may use a variable block size based
572          * on the offset.
573          */
574         while (uio->uio_resid > 0) {
575                 int fixsize = 0;
576                 int blksize;
577                 int blkmask;
578                 int trivial;
579                 int endofblk;
580                 off_t nsize;
581
582                 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
583                         break;
584                 if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
585                         break;
586
587                 blksize = hammer_blocksize(uio->uio_offset);
588
589                 /*
590                  * Do not allow HAMMER to blow out the buffer cache.  Very
591                  * large UIOs can lockout other processes due to bwillwrite()
592                  * mechanics.
593                  *
594                  * The hammer inode is not locked during these operations.
595                  * The vnode is locked which can interfere with the pageout
596                  * daemon for non-UIO_NOCOPY writes but should not interfere
597                  * with the buffer cache.  Even so, we cannot afford to
598                  * allow the pageout daemon to build up too many dirty buffer
599                  * cache buffers.
600                  *
601                  * Only call this if we aren't being recursively called from
602                  * a virtual disk device (vn), else we may deadlock.
603                  */
604                 if ((ap->a_ioflag & IO_RECURSE) == 0)
605                         bwillwrite(blksize);
606
607                 /*
608                  * Control the number of pending records associated with
609                  * this inode.  If too many have accumulated start a
610                  * flush.  Try to maintain a pipeline with the flusher.
611                  *
612                  * NOTE: It is possible for other sources to grow the
613                  *       records but not necessarily issue another flush,
614                  *       so use a timeout and ensure that a re-flush occurs.
615                  */
616                 if (ip->rsv_recs >= hammer_limit_inode_recs) {
617                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
618                         while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
619                                 ip->flags |= HAMMER_INODE_RECSW;
620                                 tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
621                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
622                         }
623                 }
624
625 #if 0
626                 /*
627                  * Do not allow HAMMER to blow out system memory by
628                  * accumulating too many records.   Records are so well
629                  * decoupled from the buffer cache that it is possible
630                  * for userland to push data out to the media via
631                  * direct-write, but build up the records queued to the
632                  * backend faster then the backend can flush them out.
633                  * HAMMER has hit its write limit but the frontend has
634                  * no pushback to slow it down.
635                  */
636                 if (hmp->rsv_recs > hammer_limit_recs / 2) {
637                         /*
638                          * Get the inode on the flush list
639                          */
640                         if (ip->rsv_recs >= 64)
641                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
642                         else if (ip->rsv_recs >= 16)
643                                 hammer_flush_inode(ip, 0);
644
645                         /*
646                          * Keep the flusher going if the system keeps
647                          * queueing records.
648                          */
649                         delta = hmp->count_newrecords -
650                                 hmp->last_newrecords;
651                         if (delta < 0 || delta > hammer_limit_recs / 2) {
652                                 hmp->last_newrecords = hmp->count_newrecords;
653                                 hammer_sync_hmp(hmp, MNT_NOWAIT);
654                         }
655
656                         /*
657                          * If we have gotten behind start slowing
658                          * down the writers.
659                          */
660                         delta = (hmp->rsv_recs - hammer_limit_recs) *
661                                 hz / hammer_limit_recs;
662                         if (delta > 0)
663                                 tsleep(&trans, 0, "hmrslo", delta);
664                 }
665 #endif
666
667                 /*
668                  * Calculate the blocksize at the current offset and figure
669                  * out how much we can actually write.
670                  */
671                 blkmask = blksize - 1;
672                 offset = (int)uio->uio_offset & blkmask;
673                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
674                 n = blksize - offset;
675                 if (n > uio->uio_resid) {
676                         n = uio->uio_resid;
677                         endofblk = 0;
678                 } else {
679                         endofblk = 1;
680                 }
681                 nsize = uio->uio_offset + n;
682                 if (nsize > ip->ino_data.size) {
683                         if (uio->uio_offset > ip->ino_data.size)
684                                 trivial = 0;
685                         else
686                                 trivial = 1;
687                         nvextendbuf(ap->a_vp,
688                                     ip->ino_data.size,
689                                     nsize,
690                                     hammer_blocksize(ip->ino_data.size),
691                                     hammer_blocksize(nsize),
692                                     hammer_blockoff(ip->ino_data.size),
693                                     hammer_blockoff(nsize),
694                                     trivial);
695                         fixsize = 1;
696                         kflags |= NOTE_EXTEND;
697                 }
698
699                 if (uio->uio_segflg == UIO_NOCOPY) {
700                         /*
701                          * Issuing a write with the same data backing the
702                          * buffer.  Instantiate the buffer to collect the
703                          * backing vm pages, then read-in any missing bits.
704                          *
705                          * This case is used by vop_stdputpages().
706                          */
707                         bp = getblk(ap->a_vp, base_offset,
708                                     blksize, GETBLK_BHEAVY, 0);
709                         if ((bp->b_flags & B_CACHE) == 0) {
710                                 bqrelse(bp);
711                                 error = bread(ap->a_vp, base_offset,
712                                               blksize, &bp);
713                         }
714                 } else if (offset == 0 && uio->uio_resid >= blksize) {
715                         /*
716                          * Even though we are entirely overwriting the buffer
717                          * we may still have to zero it out to avoid a 
718                          * mmap/write visibility issue.
719                          */
720                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
721                         if ((bp->b_flags & B_CACHE) == 0)
722                                 vfs_bio_clrbuf(bp);
723                 } else if (base_offset >= ip->ino_data.size) {
724                         /*
725                          * If the base offset of the buffer is beyond the
726                          * file EOF, we don't have to issue a read.
727                          */
728                         bp = getblk(ap->a_vp, base_offset,
729                                     blksize, GETBLK_BHEAVY, 0);
730                         vfs_bio_clrbuf(bp);
731                 } else {
732                         /*
733                          * Partial overwrite, read in any missing bits then
734                          * replace the portion being written.
735                          */
736                         error = bread(ap->a_vp, base_offset, blksize, &bp);
737                         if (error == 0)
738                                 bheavy(bp);
739                 }
740                 if (error == 0) {
741                         lwkt_reltoken(&hmp->fs_token);
742                         error = uiomove(bp->b_data + offset, n, uio);
743                         lwkt_gettoken(&hmp->fs_token);
744                 }
745
746                 /*
747                  * Generate REDO records if enabled and redo_count will not
748                  * exceeded the limit.
749                  *
750                  * If redo_count exceeds the limit we stop generating records
751                  * and clear HAMMER_INODE_REDO.  This will cause the next
752                  * fsync() to do a full meta-data sync instead of just an
753                  * UNDO/REDO fifo update.
754                  *
755                  * When clearing HAMMER_INODE_REDO any pre-existing REDOs
756                  * will still be tracked.  The tracks will be terminated
757                  * when the related meta-data (including possible data
758                  * modifications which are not tracked via REDO) is
759                  * flushed.
760                  */
761                 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
762                         if (ip->redo_count < hammer_limit_redo) {
763                                 bp->b_flags |= B_VFSFLAG1;
764                                 error = hammer_generate_redo(&trans, ip,
765                                                      base_offset + offset,
766                                                      HAMMER_REDO_WRITE,
767                                                      bp->b_data + offset,
768                                                      (size_t)n);
769                         } else {
770                                 ip->flags &= ~HAMMER_INODE_REDO;
771                         }
772                 }
773
774                 /*
775                  * If we screwed up we have to undo any VM size changes we
776                  * made.
777                  */
778                 if (error) {
779                         brelse(bp);
780                         if (fixsize) {
781                                 nvtruncbuf(ap->a_vp, ip->ino_data.size,
782                                           hammer_blocksize(ip->ino_data.size),
783                                           hammer_blockoff(ip->ino_data.size));
784                         }
785                         break;
786                 }
787                 kflags |= NOTE_WRITE;
788                 hammer_stats_file_write += n;
789                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
790                 if (ip->ino_data.size < uio->uio_offset) {
791                         ip->ino_data.size = uio->uio_offset;
792                         flags = HAMMER_INODE_SDIRTY;
793                 } else {
794                         flags = 0;
795                 }
796                 ip->ino_data.mtime = trans.time;
797                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
798                 hammer_modify_inode(&trans, ip, flags);
799
800                 /*
801                  * Once we dirty the buffer any cached zone-X offset
802                  * becomes invalid.  HAMMER NOTE: no-history mode cannot 
803                  * allow overwriting over the same data sector unless
804                  * we provide UNDOs for the old data, which we don't.
805                  */
806                 bp->b_bio2.bio_offset = NOOFFSET;
807
808                 /*
809                  * Final buffer disposition.
810                  *
811                  * Because meta-data updates are deferred, HAMMER is
812                  * especially sensitive to excessive bdwrite()s because
813                  * the I/O stream is not broken up by disk reads.  So the
814                  * buffer cache simply cannot keep up.
815                  *
816                  * WARNING!  blksize is variable.  cluster_write() is
817                  *           expected to not blow up if it encounters
818                  *           buffers that do not match the passed blksize.
819                  *
820                  * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
821                  *        The ip->rsv_recs check should burst-flush the data.
822                  *        If we queue it immediately the buf could be left
823                  *        locked on the device queue for a very long time.
824                  *
825                  *        However, failing to flush a dirty buffer out when
826                  *        issued from the pageout daemon can result in a low
827                  *        memory deadlock against bio_page_alloc(), so we
828                  *        have to bawrite() on IO_ASYNC as well.
829                  *
830                  * NOTE!  To avoid degenerate stalls due to mismatched block
831                  *        sizes we only honor IO_DIRECT on the write which
832                  *        abuts the end of the buffer.  However, we must
833                  *        honor IO_SYNC in case someone is silly enough to
834                  *        configure a HAMMER file as swap, or when HAMMER
835                  *        is serving NFS (for commits).  Ick ick.
836                  */
837                 bp->b_flags |= B_AGE;
838                 if (ap->a_ioflag & IO_SYNC) {
839                         bwrite(bp);
840                 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
841                         bawrite(bp);
842                 } else if (ap->a_ioflag & IO_ASYNC) {
843                         bawrite(bp);
844                 } else {
845 #if 0
846                 if (offset + n == blksize) {
847                         if (hammer_cluster_enable == 0 ||
848                             (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
849                                 bawrite(bp);
850                         } else {
851                                 cluster_write(bp, ip->ino_data.size,
852                                               blksize, seqcount);
853                         }
854                 } else {
855 #endif
856                         bdwrite(bp);
857                 }
858         }
859         hammer_done_transaction(&trans);
860         hammer_knote(ap->a_vp, kflags);
861         lwkt_reltoken(&hmp->fs_token);
862         return (error);
863 }
864
865 /*
866  * hammer_vop_access { vp, mode, cred }
867  *
868  * MPSAFE - does not require fs_token
869  */
870 static
871 int
872 hammer_vop_access(struct vop_access_args *ap)
873 {
874         struct hammer_inode *ip = VTOI(ap->a_vp);
875         uid_t uid;
876         gid_t gid;
877         int error;
878
879         ++hammer_stats_file_iopsr;
880         uid = hammer_to_unix_xid(&ip->ino_data.uid);
881         gid = hammer_to_unix_xid(&ip->ino_data.gid);
882
883         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
884                                   ip->ino_data.uflags);
885         return (error);
886 }
887
888 /*
889  * hammer_vop_advlock { vp, id, op, fl, flags }
890  *
891  * MPSAFE - does not require fs_token
892  */
893 static
894 int
895 hammer_vop_advlock(struct vop_advlock_args *ap)
896 {
897         hammer_inode_t ip = VTOI(ap->a_vp);
898
899         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
900 }
901
/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
	/*
	 * Disabled sync-on-close support.  If re-enabled this would
	 * fsync the inode on close when a CLOSESYNC/CLOSEASYNC flag is
	 * set, waiting (MNT_WAIT) or not depending on which flag, but
	 * only when the vnode is exclusively locked and not being
	 * deactivated or reclaimed.
	 */
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;
	if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
		if (vn_islocked(vp) == LK_EXCLUSIVE &&
		    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
			if (ip->flags & HAMMER_INODE_CLOSESYNC)
				waitfor = MNT_WAIT;
			else
				waitfor = MNT_NOWAIT;
			ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
				       HAMMER_INODE_CLOSEASYNC);
			VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
		}
	}
#endif
	/* Fall through to the standard close handling */
	return (vop_stdclose(ap));
}
930
/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	/* Cannot create in a read-only (historical/as-of) directory */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced and shared-locked to prevent
	 * it from being moved to the flusher.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_create_inode error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_ip_add_directory error %d\n", error);

	/*
	 * Finish up.  On success acquire the vnode before dropping our
	 * inode reference, then resolve the namecache entry against it
	 * and post a write note on the parent directory.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_done_transaction(&trans);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	}
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1010
1011 /*
1012  * hammer_vop_getattr { vp, vap }
1013  *
1014  * Retrieve an inode's attribute information.  When accessing inodes
1015  * historically we fake the atime field to ensure consistent results.
1016  * The atime field is stored in the B-Tree element and allowed to be
1017  * updated without cycling the element.
1018  *
1019  * MPSAFE - does not require fs_token
1020  */
1021 static
1022 int
1023 hammer_vop_getattr(struct vop_getattr_args *ap)
1024 {
1025         struct hammer_inode *ip = VTOI(ap->a_vp);
1026         struct vattr *vap = ap->a_vap;
1027
1028         /*
1029          * We want the fsid to be different when accessing a filesystem
1030          * with different as-of's so programs like diff don't think
1031          * the files are the same.
1032          *
1033          * We also want the fsid to be the same when comparing snapshots,
1034          * or when comparing mirrors (which might be backed by different
1035          * physical devices).  HAMMER fsids are based on the PFS's
1036          * shared_uuid field.
1037          *
1038          * XXX there is a chance of collision here.  The va_fsid reported
1039          * by stat is different from the more involved fsid used in the
1040          * mount structure.
1041          */
1042         ++hammer_stats_file_iopsr;
1043         hammer_lock_sh(&ip->lock);
1044         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
1045                        (u_int32_t)(ip->obj_asof >> 32);
1046
1047         vap->va_fileid = ip->ino_leaf.base.obj_id;
1048         vap->va_mode = ip->ino_data.mode;
1049         vap->va_nlink = ip->ino_data.nlinks;
1050         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1051         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1052         vap->va_rmajor = 0;
1053         vap->va_rminor = 0;
1054         vap->va_size = ip->ino_data.size;
1055
1056         /*
1057          * Special case for @@PFS softlinks.  The actual size of the
1058          * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
1059          * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
1060          */
1061         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
1062             ip->ino_data.size == 10 &&
1063             ip->obj_asof == HAMMER_MAX_TID &&
1064             ip->obj_localization == 0 &&
1065             strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
1066                     if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
1067                             vap->va_size = 26;
1068                     else
1069                             vap->va_size = 10;
1070         }
1071
1072         /*
1073          * We must provide a consistent atime and mtime for snapshots
1074          * so people can do a 'tar cf - ... | md5' on them and get
1075          * consistent results.
1076          */
1077         if (ip->flags & HAMMER_INODE_RO) {
1078                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
1079                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
1080         } else {
1081                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
1082                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
1083         }
1084         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
1085         vap->va_flags = ip->ino_data.uflags;
1086         vap->va_gen = 1;        /* hammer inums are unique for all time */
1087         vap->va_blocksize = HAMMER_BUFSIZE;
1088         if (ip->ino_data.size >= HAMMER_XDEMARC) {
1089                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
1090                                 ~HAMMER_XBUFMASK64;
1091         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
1092                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
1093                                 ~HAMMER_BUFMASK64;
1094         } else {
1095                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
1096         }
1097
1098         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
1099         vap->va_filerev = 0;    /* XXX */
1100         vap->va_uid_uuid = ip->ino_data.uid;
1101         vap->va_gid_uuid = ip->ino_data.gid;
1102         vap->va_fsid_uuid = ip->hmp->fsid;
1103         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
1104                           VA_FSID_UUID_VALID;
1105
1106         switch (ip->ino_data.obj_type) {
1107         case HAMMER_OBJTYPE_CDEV:
1108         case HAMMER_OBJTYPE_BDEV:
1109                 vap->va_rmajor = ip->ino_data.rmajor;
1110                 vap->va_rminor = ip->ino_data.rminor;
1111                 break;
1112         default:
1113                 break;
1114         }
1115         hammer_unlock(&ip->lock);
1116         return(0);
1117 }
1118
/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 *
 * Handles '@@<tid>' as-of extensions embedded in the name (such
 * lookups produce read-only inodes) and '@@PFS' style components
 * which dive into a pseudo-filesystem root.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_mount_t hmp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;
	hmp = dip->hmp;

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan for a '@@' extension and parse the tid/localization that
	 * follows it.  On a parse error the whole name is treated as a
	 * plain component (i = nlen).
	 *
	 * NOTE(review): the nc_name[i+1] probe at i == nlen-1 assumes
	 * the namecache name is NUL-terminated at nc_nlen — confirm.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			/* any non-head as-of lookup is read-only */
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;	/* length of the name component before '@@' */

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			/* return an unlocked, resolved namecache entry */
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			/* compare both name length and name bytes */
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
				ncp->nc_name,
				(long long)obj_id, (long long)asof,
				localization);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* negative-cache the miss */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1319
/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof then the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;
	hmp = dip->hmp;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	lwkt_gettoken(&hmp->fs_token);
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	/*
	 * Parent id 0: either we are at the mount root under a
	 * non-default as-of (fabricate a ".." name stripping the
	 * as-of), or our directory was removed (ENOENT).
	 */
	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = hmp->asof;
			/* 19 bytes holds "0x%016llx" (18 chars) + NUL */
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			lwkt_reltoken(&hmp->fs_token);
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Resolve the parent inode and convert it to a vnode for the
	 * caller; hammer_get_vnode() returns it referenced and locked.
	 */
	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1395
1396 /*
1397  * hammer_vop_nlink { nch, dvp, vp, cred }
1398  */
1399 static
1400 int
1401 hammer_vop_nlink(struct vop_nlink_args *ap)
1402 {
1403         struct hammer_transaction trans;
1404         struct hammer_inode *dip;
1405         struct hammer_inode *ip;
1406         struct nchandle *nch;
1407         hammer_mount_t hmp;
1408         int error;
1409
1410         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
1411                 return(EXDEV);
1412
1413         nch = ap->a_nch;
1414         dip = VTOI(ap->a_dvp);
1415         ip = VTOI(ap->a_vp);
1416         hmp = dip->hmp;
1417
1418         if (dip->obj_localization != ip->obj_localization)
1419                 return(EXDEV);
1420
1421         if (dip->flags & HAMMER_INODE_RO)
1422                 return (EROFS);
1423         if (ip->flags & HAMMER_INODE_RO)
1424                 return (EROFS);
1425         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1426                 return (error);
1427
1428         /*
1429          * Create a transaction to cover the operations we perform.
1430          */
1431         lwkt_gettoken(&hmp->fs_token);
1432         hammer_start_transaction(&trans, hmp);
1433         ++hammer_stats_file_iopsw;
1434
1435         /*
1436          * Add the filesystem object to the directory.  Note that neither
1437          * dip nor ip are referenced or locked, but their vnodes are
1438          * referenced.  This function will bump the inode's link count.
1439          */
1440         error = hammer_ip_add_directory(&trans, dip,
1441                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1442                                         ip);
1443
1444         /*
1445          * Finish up.
1446          */
1447         if (error == 0) {
1448                 cache_setunresolved(nch);
1449                 cache_setvp(nch, ap->a_vp);
1450         }
1451         hammer_done_transaction(&trans);
1452         hammer_knote(ap->a_vp, NOTE_LINK);
1453         hammer_knote(ap->a_dvp, NOTE_WRITE);
1454         lwkt_reltoken(&hmp->fs_token);
1455         return (error);
1456 }
1457
/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* newly created directory inode */
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	/*
	 * Refuse to create in a read-only parent and make sure the
	 * filesystem has space for the new object before starting.
	 */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.  On error drop our inode reference; on success
	 * obtain the vnode (dropping the inode ref either way) and
	 * resolve the namecache entry to it.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1535
/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	hammer_mount_t hmp;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	hmp = dip->hmp;

	/*
	 * Refuse to create in a read-only parent and make sure the
	 * filesystem has space for the new object before starting.
	 */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		lwkt_reltoken(&hmp->fs_token);
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.  On error drop our inode reference; on success
	 * obtain the vnode (dropping the inode ref either way) and
	 * resolve the namecache entry to it.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
1613
1614 /*
1615  * hammer_vop_open { vp, mode, cred, fp }
1616  *
1617  * MPSAFE (does not require fs_token)
1618  */
1619 static
1620 int
1621 hammer_vop_open(struct vop_open_args *ap)
1622 {
1623         hammer_inode_t ip;
1624
1625         ++hammer_stats_file_iopsr;
1626         ip = VTOI(ap->a_vp);
1627
1628         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1629                 return (EROFS);
1630         return(vop_stdopen(ap));
1631 }
1632
1633 /*
1634  * hammer_vop_print { vp }
1635  */
1636 static
1637 int
1638 hammer_vop_print(struct vop_print_args *ap)
1639 {
1640         return EOPNOTSUPP;
1641 }
1642
/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;	/* next free slot in cookies[] */
	int ncookies;		/* capacity of cookies[], -1 if unused */
	off_t *cookies;
	off_t saveoff;		/* current directory seek position */
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;
	hmp = ip->hmp;

	/*
	 * If the caller wants seek cookies allocate up to 1024 of them,
	 * scaled by the uio's residual.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);

	/*
	 * Handle artificial entries
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less then that to represent our 'special' key space.
	 */
	error = 0;
	if (saveoff == 0) {
		/* offset 0: synthesize "." */
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/*
		 * offset 1: synthesize ".."; a root directory (no parent
		 * object id) points ".." at itself.
		 */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      hammer_dir_localization(ip);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	/*
	 * Iterate the directory entries, emitting one dirent per record
	 * until the uio fills up, the cookie array fills up, or the
	 * cursor runs out of records.
	 */
	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/*
	 * ENOENT from the scan means end-of-directory, not failure.
	 * Hand the cookies back to the caller, or free them if we
	 * failed before emitting anything.
	 */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}
1801
/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	char buf[32];		/* scratch for expanded @@PFS softlink text */
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);
	hmp = ip->hmp;

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, hmp);
			/* parse the 5-digit decimal PFS id after "@@PFS" */
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				/*
				 * Slaves expand to a fixed transaction id,
				 * masters expand to "@@-1" (current).
				 */
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* vap->va_size == 26 */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					/* vap->va_size == 10 */
					ksnprintf(buf, sizeof(buf),
						  "@@-1:%05d",
						  localization >> 16);
#if 0
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)HAMMER_MAX_TID,
						  localization >> 16);
#endif
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		lwkt_reltoken(&hmp->fs_token);
		return(error);
	}

	/*
	 * Long version: the symlink target lives in a separate
	 * HAMMER_RECTYPE_FIX record which must be looked up.
	 */
	hammer_simple_transaction(&trans, hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}
1917
1918 /*
1919  * hammer_vop_nremove { nch, dvp, cred }
1920  */
1921 static
1922 int
1923 hammer_vop_nremove(struct vop_nremove_args *ap)
1924 {
1925         struct hammer_transaction trans;
1926         struct hammer_inode *dip;
1927         hammer_mount_t hmp;
1928         int error;
1929
1930         dip = VTOI(ap->a_dvp);
1931         hmp = dip->hmp;
1932
1933         if (hammer_nohistory(dip) == 0 &&
1934             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1935                 return (error);
1936         }
1937
1938         lwkt_gettoken(&hmp->fs_token);
1939         hammer_start_transaction(&trans, hmp);
1940         ++hammer_stats_file_iopsw;
1941         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1942         hammer_done_transaction(&trans);
1943         if (error == 0)
1944                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1945         lwkt_reltoken(&hmp->fs_token);
1946         return (error);
1947 }
1948
/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;		/* source name */
	struct namecache *tncp;		/* target name */
	struct hammer_inode *fdip;	/* source directory */
	struct hammer_inode *tdip;	/* target directory */
	struct hammer_inode *ip;	/* inode being renamed */
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * Cross-mount and cross-localization (PFS) renames are not
	 * supported.
	 */
	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	hmp = ip->hmp;

	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	lwkt_gettoken(&hmp->fs_token);
	hammer_start_transaction(&trans, hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			ip->ino_data.ctime = trans.time;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      hammer_dir_localization(fdip);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicy for the purposes
	 * of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 *
	 * NOTE: ip->vp, if non-NULL, cannot be directly referenced
	 *       without formally acquiring the vp since the vp might
	 *       have zero refs on it, or in the middle of a reclaim,
	 *       etc.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		/*
		 * Loop until a formally referenced vp is obtained so the
		 * NOTE_RENAME knote can be delivered safely, or until the
		 * inode no longer has a vp at all.
		 */
		while (ip->vp) {
			struct vnode *vp;

			error = hammer_get_vnode(ip, &vp);
			if (error == 0 && vp) {
				vn_unlock(vp);
				hammer_knote(ip->vp, NOTE_RENAME);
				vrele(vp);
				break;
			}
			kprintf("Debug: HAMMER ip/vp race2 avoided\n");
		}
	}

failed:
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return (error);
}
2120
2121 /*
2122  * hammer_vop_nrmdir { nch, dvp, cred }
2123  */
2124 static
2125 int
2126 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2127 {
2128         struct hammer_transaction trans;
2129         struct hammer_inode *dip;
2130         hammer_mount_t hmp;
2131         int error;
2132
2133         dip = VTOI(ap->a_dvp);
2134         hmp = dip->hmp;
2135
2136         if (hammer_nohistory(dip) == 0 &&
2137             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2138                 return (error);
2139         }
2140
2141         lwkt_gettoken(&hmp->fs_token);
2142         hammer_start_transaction(&trans, hmp);
2143         ++hammer_stats_file_iopsw;
2144         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2145         hammer_done_transaction(&trans);
2146         if (error == 0)
2147                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2148         lwkt_reltoken(&hmp->fs_token);
2149         return (error);
2150 }
2151
2152 /*
2153  * hammer_vop_markatime { vp, cred }
2154  */
2155 static
2156 int
2157 hammer_vop_markatime(struct vop_markatime_args *ap)
2158 {
2159         struct hammer_transaction trans;
2160         struct hammer_inode *ip;
2161         hammer_mount_t hmp;
2162
2163         ip = VTOI(ap->a_vp);
2164         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2165                 return (EROFS);
2166         if (ip->flags & HAMMER_INODE_RO)
2167                 return (EROFS);
2168         hmp = ip->hmp;
2169         if (hmp->mp->mnt_flag & MNT_NOATIME)
2170                 return (0);
2171         lwkt_gettoken(&hmp->fs_token);
2172         hammer_start_transaction(&trans, hmp);
2173         ++hammer_stats_file_iopsw;
2174
2175         ip->ino_data.atime = trans.time;
2176         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2177         hammer_done_transaction(&trans);
2178         hammer_knote(ap->a_vp, NOTE_ATTRIB);
2179         lwkt_reltoken(&hmp->fs_token);
2180         return (0);
2181 }
2182
2183 /*
2184  * hammer_vop_setattr { vp, vap, cred }
2185  */
2186 static
2187 int
2188 hammer_vop_setattr(struct vop_setattr_args *ap)
2189 {
2190         struct hammer_transaction trans;
2191         struct hammer_inode *ip;
2192         struct vattr *vap;
2193         hammer_mount_t hmp;
2194         int modflags;
2195         int error;
2196         int truncating;
2197         int blksize;
2198         int kflags;
2199 #if 0
2200         int64_t aligned_size;
2201 #endif
2202         u_int32_t flags;
2203
2204         vap = ap->a_vap;
2205         ip = ap->a_vp->v_data;
2206         modflags = 0;
2207         kflags = 0;
2208         hmp = ip->hmp;
2209
2210         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2211                 return(EROFS);
2212         if (ip->flags & HAMMER_INODE_RO)
2213                 return (EROFS);
2214         if (hammer_nohistory(ip) == 0 &&
2215             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2216                 return (error);
2217         }
2218
2219         lwkt_gettoken(&hmp->fs_token);
2220         hammer_start_transaction(&trans, hmp);
2221         ++hammer_stats_file_iopsw;
2222         error = 0;
2223
2224         if (vap->va_flags != VNOVAL) {
2225                 flags = ip->ino_data.uflags;
2226                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2227                                          hammer_to_unix_xid(&ip->ino_data.uid),
2228                                          ap->a_cred);
2229                 if (error == 0) {
2230                         if (ip->ino_data.uflags != flags) {
2231                                 ip->ino_data.uflags = flags;
2232                                 ip->ino_data.ctime = trans.time;
2233                                 modflags |= HAMMER_INODE_DDIRTY;
2234                                 kflags |= NOTE_ATTRIB;
2235                         }
2236                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2237                                 error = 0;
2238                                 goto done;
2239                         }
2240                 }
2241                 goto done;
2242         }
2243         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2244                 error = EPERM;
2245                 goto done;
2246         }
2247         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2248                 mode_t cur_mode = ip->ino_data.mode;
2249                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2250                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2251                 uuid_t uuid_uid;
2252                 uuid_t uuid_gid;
2253
2254                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2255                                          ap->a_cred,
2256                                          &cur_uid, &cur_gid, &cur_mode);
2257                 if (error == 0) {
2258                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
2259                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
2260                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
2261                                  sizeof(uuid_uid)) ||
2262                             bcmp(&uuid_gid, &ip->ino_data.gid,
2263                                  sizeof(uuid_gid)) ||
2264                             ip->ino_data.mode != cur_mode
2265                         ) {
2266                                 ip->ino_data.uid = uuid_uid;
2267                                 ip->ino_data.gid = uuid_gid;
2268                                 ip->ino_data.mode = cur_mode;
2269                                 ip->ino_data.ctime = trans.time;
2270                                 modflags |= HAMMER_INODE_DDIRTY;
2271                         }
2272                         kflags |= NOTE_ATTRIB;
2273                 }
2274         }
2275         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2276                 switch(ap->a_vp->v_type) {
2277                 case VREG:
2278                         if (vap->va_size == ip->ino_data.size)
2279                                 break;
2280
2281                         /*
2282                          * Log the operation if in fast-fsync mode or if
2283                          * there are unterminated redo write records present.
2284                          *
2285                          * The second check is needed so the recovery code
2286                          * properly truncates write redos even if nominal
2287                          * REDO operations is turned off due to excessive
2288                          * writes, because the related records might be
2289                          * destroyed and never lay down a TERM_WRITE.
2290                          */
2291                         if ((ip->flags & HAMMER_INODE_REDO) ||
2292                             (ip->flags & HAMMER_INODE_RDIRTY)) {
2293                                 error = hammer_generate_redo(&trans, ip,
2294                                                              vap->va_size,
2295                                                              HAMMER_REDO_TRUNC,
2296                                                              NULL, 0);
2297                         }
2298                         blksize = hammer_blocksize(vap->va_size);
2299
2300                         /*
2301                          * XXX break atomicy, we can deadlock the backend
2302                          * if we do not release the lock.  Probably not a
2303                          * big deal here.
2304                          */
2305                         if (vap->va_size < ip->ino_data.size) {
2306                                 nvtruncbuf(ap->a_vp, vap->va_size,
2307                                            blksize,
2308                                            hammer_blockoff(vap->va_size));
2309                                 truncating = 1;
2310                                 kflags |= NOTE_WRITE;
2311                         } else {
2312                                 nvextendbuf(ap->a_vp,
2313                                             ip->ino_data.size,
2314                                             vap->va_size,
2315                                             hammer_blocksize(ip->ino_data.size),
2316                                             hammer_blocksize(vap->va_size),
2317                                             hammer_blockoff(ip->ino_data.size),
2318                                             hammer_blockoff(vap->va_size),
2319                                             0);
2320                                 truncating = 0;
2321                                 kflags |= NOTE_WRITE | NOTE_EXTEND;
2322                         }
2323                         ip->ino_data.size = vap->va_size;
2324                         ip->ino_data.mtime = trans.time;
2325                         /* XXX safe to use SDIRTY instead of DDIRTY here? */
2326                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2327
2328                         /*
2329                          * On-media truncation is cached in the inode until
2330                          * the inode is synchronized.  We must immediately
2331                          * handle any frontend records.
2332                          */
2333                         if (truncating) {
2334                                 hammer_ip_frontend_trunc(ip, vap->va_size);
2335 #ifdef DEBUG_TRUNCATE
2336                                 if (HammerTruncIp == NULL)
2337                                         HammerTruncIp = ip;
2338 #endif
2339                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2340                                         ip->flags |= HAMMER_INODE_TRUNCATED;
2341                                         ip->trunc_off = vap->va_size;
2342 #ifdef DEBUG_TRUNCATE
2343                                         if (ip == HammerTruncIp)
2344                                         kprintf("truncate1 %016llx\n",
2345                                                 (long long)ip->trunc_off);
2346 #endif
2347                                 } else if (ip->trunc_off > vap->va_size) {
2348                                         ip->trunc_off = vap->va_size;
2349 #ifdef DEBUG_TRUNCATE
2350                                         if (ip == HammerTruncIp)
2351                                         kprintf("truncate2 %016llx\n",
2352                                                 (long long)ip->trunc_off);
2353 #endif
2354                                 } else {
2355 #ifdef DEBUG_TRUNCATE
2356                                         if (ip == HammerTruncIp)
2357                                         kprintf("truncate3 %016llx (ignored)\n",
2358                                                 (long long)vap->va_size);
2359 #endif
2360                                 }
2361                         }
2362
2363 #if 0
2364                         /*
2365                          * When truncating, nvtruncbuf() may have cleaned out
2366                          * a portion of the last block on-disk in the buffer
2367                          * cache.  We must clean out any frontend records
2368                          * for blocks beyond the new last block.
2369                          */
2370                         aligned_size = (vap->va_size + (blksize - 1)) &
2371                                        ~(int64_t)(blksize - 1);
2372                         if (truncating && vap->va_size < aligned_size) {
2373                                 aligned_size -= blksize;
2374                                 hammer_ip_frontend_trunc(ip, aligned_size);
2375                         }
2376 #endif
2377                         break;
2378                 case VDATABASE:
2379                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2380                                 ip->flags |= HAMMER_INODE_TRUNCATED;
2381                                 ip->trunc_off = vap->va_size;
2382                         } else if (ip->trunc_off > vap->va_size) {
2383                                 ip->trunc_off = vap->va_size;
2384                         }
2385                         hammer_ip_frontend_trunc(ip, vap->va_size);
2386                         ip->ino_data.size = vap->va_size;
2387                         ip->ino_data.mtime = trans.time;
2388                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2389                         kflags |= NOTE_ATTRIB;
2390                         break;
2391                 default:
2392                         error = EINVAL;
2393                         goto done;
2394                 }
2395                 break;
2396         }
2397         if (vap->va_atime.tv_sec != VNOVAL) {
2398                 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2399                 modflags |= HAMMER_INODE_ATIME;
2400                 kflags |= NOTE_ATTRIB;
2401         }
2402         if (vap->va_mtime.tv_sec != VNOVAL) {
2403                 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2404                 modflags |= HAMMER_INODE_MTIME;
2405                 kflags |= NOTE_ATTRIB;
2406         }
2407         if (vap->va_mode != (mode_t)VNOVAL) {
2408                 mode_t   cur_mode = ip->ino_data.mode;
2409                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2410                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2411
2412                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2413                                          cur_uid, cur_gid, &cur_mode);
2414                 if (error == 0 && ip->ino_data.mode != cur_mode) {
2415                         ip->ino_data.mode = cur_mode;
2416                         ip->ino_data.ctime = trans.time;
2417                         modflags |= HAMMER_INODE_DDIRTY;
2418                         kflags |= NOTE_ATTRIB;
2419                 }
2420         }
2421 done:
2422         if (error == 0)
2423                 hammer_modify_inode(&trans, ip, modflags);
2424         hammer_done_transaction(&trans);
2425         hammer_knote(ap->a_vp, kflags);
2426         lwkt_reltoken(&hmp->fs_token);
2427         return (error);
2428 }
2429
2430 /*
2431  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2432  */
2433 static
2434 int
2435 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2436 {
2437         struct hammer_transaction trans;
2438         struct hammer_inode *dip;
2439         struct hammer_inode *nip;
2440         hammer_record_t record;
2441         struct nchandle *nch;
2442         hammer_mount_t hmp;
2443         int error;
2444         int bytes;
2445
2446         ap->a_vap->va_type = VLNK;
2447
2448         nch = ap->a_nch;
2449         dip = VTOI(ap->a_dvp);
2450         hmp = dip->hmp;
2451
2452         if (dip->flags & HAMMER_INODE_RO)
2453                 return (EROFS);
2454         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2455                 return (error);
2456
2457         /*
2458          * Create a transaction to cover the operations we perform.
2459          */
2460         lwkt_gettoken(&hmp->fs_token);
2461         hammer_start_transaction(&trans, hmp);
2462         ++hammer_stats_file_iopsw;
2463
2464         /*
2465          * Create a new filesystem object of the requested type.  The
2466          * returned inode will be referenced but not locked.
2467          */
2468
2469         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2470                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2471                                     NULL, &nip);
2472         if (error) {
2473                 hammer_done_transaction(&trans);
2474                 *ap->a_vpp = NULL;
2475                 lwkt_reltoken(&hmp->fs_token);
2476                 return (error);
2477         }
2478
2479         /*
2480          * Add a record representing the symlink.  symlink stores the link
2481          * as pure data, not a string, and is no \0 terminated.
2482          */
2483         if (error == 0) {
2484                 bytes = strlen(ap->a_target);
2485
2486                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2487                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2488                 } else {
2489                         record = hammer_alloc_mem_record(nip, bytes);
2490                         record->type = HAMMER_MEM_RECORD_GENERAL;
2491
2492                         record->leaf.base.localization = nip->obj_localization +
2493                                                          HAMMER_LOCALIZE_MISC;
2494                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2495                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2496                         record->leaf.data_len = bytes;
2497                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2498                         bcopy(ap->a_target, record->data->symlink.name, bytes);
2499                         error = hammer_ip_add_record(&trans, record);
2500                 }
2501
2502                 /*
2503                  * Set the file size to the length of the link.
2504                  */
2505                 if (error == 0) {
2506                         nip->ino_data.size = bytes;
2507                         hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2508                 }
2509         }
2510         if (error == 0)
2511                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2512                                                 nch->ncp->nc_nlen, nip);
2513
2514         /*
2515          * Finish up.
2516          */
2517         if (error) {
2518                 hammer_rel_inode(nip, 0);
2519                 *ap->a_vpp = NULL;
2520         } else {
2521                 error = hammer_get_vnode(nip, ap->a_vpp);
2522                 hammer_rel_inode(nip, 0);
2523                 if (error == 0) {
2524                         cache_setunresolved(ap->a_nch);
2525                         cache_setvp(ap->a_nch, *ap->a_vpp);
2526                         hammer_knote(ap->a_dvp, NOTE_WRITE);
2527                 }
2528         }
2529         hammer_done_transaction(&trans);
2530         lwkt_reltoken(&hmp->fs_token);
2531         return (error);
2532 }
2533
2534 /*
2535  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2536  */
2537 static
2538 int
2539 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2540 {
2541         struct hammer_transaction trans;
2542         struct hammer_inode *dip;
2543         hammer_mount_t hmp;
2544         int error;
2545
2546         dip = VTOI(ap->a_dvp);
2547         hmp = dip->hmp;
2548
2549         if (hammer_nohistory(dip) == 0 &&
2550             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2551                 return (error);
2552         }
2553
2554         lwkt_gettoken(&hmp->fs_token);
2555         hammer_start_transaction(&trans, hmp);
2556         ++hammer_stats_file_iopsw;
2557         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2558                                 ap->a_cred, ap->a_flags, -1);
2559         hammer_done_transaction(&trans);
2560         lwkt_reltoken(&hmp->fs_token);
2561
2562         return (error);
2563 }
2564
2565 /*
2566  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2567  */
2568 static
2569 int
2570 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2571 {
2572         struct hammer_inode *ip = ap->a_vp->v_data;
2573         hammer_mount_t hmp = ip->hmp;
2574         int error;
2575
2576         ++hammer_stats_file_iopsr;
2577         lwkt_gettoken(&hmp->fs_token);
2578         error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2579                              ap->a_fflag, ap->a_cred);
2580         lwkt_reltoken(&hmp->fs_token);
2581         return (error);
2582 }
2583
/*
 * hammer_vop_mountctl - handle mount-level control operations.
 *
 * Supports export-list configuration and augments the standard
 * MOUNTCTL_MOUNTFLAGS output with HAMMER-specific flag strings;
 * everything else falls through to vop_stdmountctl().
 */
static
int
hammer_vop_mountctl(struct vop_mountctl_args *ap)
{
        /* HAMMER-specific mount flags reported in addition to the standard set */
        static const struct mountctl_opt extraopt[] = {
                { HMNT_NOHISTORY,       "nohistory" },
                { HMNT_MASTERID,        "master" },
                { 0, NULL}

        };
        struct hammer_mount *hmp;
        struct mount *mp;
        int usedbytes;
        int error;

        error = 0;
        usedbytes = 0;
        mp = ap->a_head.a_ops->head.vv_mount;
        KKASSERT(mp->mnt_data != NULL);
        hmp = (struct hammer_mount *)mp->mnt_data;

        lwkt_gettoken(&hmp->fs_token);

        switch(ap->a_op) {
        case MOUNTCTL_SET_EXPORT:
                if (ap->a_ctllen != sizeof(struct export_args))
                        error = EINVAL;
                else
                        error = hammer_vfs_export(mp, ap->a_op,
                                      (const struct export_args *)ap->a_ctl);
                break;
        case MOUNTCTL_MOUNTFLAGS:
        {
                /*
                 * Call standard mountctl VOP function
                 * so we get user mount flags.
                 */
                error = vop_stdmountctl(ap);
                if (error)
                        break;

                usedbytes = *ap->a_res;

                if (usedbytes > 0 && usedbytes < ap->a_buflen) {
                        usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
                                                    ap->a_buf,
                                                    ap->a_buflen - usedbytes,
                                                    &error);
                }

                /*
                 * NOTE(review): usedbytes was seeded from *ap->a_res above,
                 * so this += appears to count the vop_stdmountctl() output
                 * twice.  Confirm against vop_stdmountctl()'s *a_res
                 * semantics before changing.
                 */
                *ap->a_res += usedbytes;
                break;
        }
        default:
                error = vop_stdmountctl(ap);
                break;
        }
        lwkt_reltoken(&hmp->fs_token);
        return(error);
}
2644
2645 /*
2646  * hammer_vop_strategy { vp, bio }
2647  *
2648  * Strategy call, used for regular file read & write only.  Note that the
2649  * bp may represent a cluster.
2650  *
2651  * To simplify operation and allow better optimizations in the future,
2652  * this code does not make any assumptions with regards to buffer alignment
2653  * or size.
2654  */
2655 static
2656 int
2657 hammer_vop_strategy(struct vop_strategy_args *ap)
2658 {
2659         struct buf *bp;
2660         int error;
2661
2662         bp = ap->a_bio->bio_buf;
2663
2664         switch(bp->b_cmd) {
2665         case BUF_CMD_READ:
2666                 error = hammer_vop_strategy_read(ap);
2667                 break;
2668         case BUF_CMD_WRITE:
2669                 error = hammer_vop_strategy_write(ap);
2670                 break;
2671         default:
2672                 bp->b_error = error = EINVAL;
2673                 bp->b_flags |= B_ERROR;
2674                 biodone(ap->a_bio);
2675                 break;
2676         }
2677
2678         /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2679
2680         return (error);
2681 }
2682
2683 /*
2684  * Read from a regular file.  Iterate the related records and fill in the
2685  * BIO/BUF.  Gaps are zero-filled.
2686  *
2687  * The support code in hammer_object.c should be used to deal with mixed
2688  * in-memory and on-disk records.
2689  *
2690  * NOTE: Can be called from the cluster code with an oversized buf.
2691  *
2692  * XXX atime update
2693  */
2694 static
2695 int
2696 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2697 {
2698         struct hammer_transaction trans;
2699         struct hammer_inode *ip;
2700         struct hammer_inode *dip;
2701         hammer_mount_t hmp;
2702         struct hammer_cursor cursor;
2703         hammer_base_elm_t base;
2704         hammer_off_t disk_offset;
2705         struct bio *bio;
2706         struct bio *nbio;
2707         struct buf *bp;
2708         int64_t rec_offset;
2709         int64_t ran_end;
2710         int64_t tmp64;
2711         int error;
2712         int boff;
2713         int roff;
2714         int n;
2715         int isdedupable;
2716
2717         bio = ap->a_bio;
2718         bp = bio->bio_buf;
2719         ip = ap->a_vp->v_data;
2720         hmp = ip->hmp;
2721
2722         /*
2723          * The zone-2 disk offset may have been set by the cluster code via
2724          * a BMAP operation, or else should be NOOFFSET.
2725          *
2726          * Checking the high bits for a match against zone-2 should suffice.
2727          *
2728          * In cases where a lot of data duplication is present it may be
2729          * more beneficial to drop through and doubule-buffer through the
2730          * device.
2731          */
2732         nbio = push_bio(bio);
2733         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2734             HAMMER_ZONE_LARGE_DATA) {
2735                 if (hammer_double_buffer == 0) {
2736                         lwkt_gettoken(&hmp->fs_token);
2737                         error = hammer_io_direct_read(hmp, nbio, NULL);
2738                         lwkt_reltoken(&hmp->fs_token);
2739                         return (error);
2740                 }
2741
2742                 /*
2743                  * Try to shortcut requests for double_buffer mode too.
2744                  * Since this mode runs through the device buffer cache
2745                  * only compatible buffer sizes (meaning those generated
2746                  * by normal filesystem buffers) are legal.
2747                  */
2748                 if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
2749                         error = hammer_io_indirect_read(hmp, nbio, NULL);
2750                         return (error);
2751                 }
2752         }
2753
2754         /*
2755          * Well, that sucked.  Do it the hard way.  If all the stars are
2756          * aligned we may still be able to issue a direct-read.
2757          */
2758         lwkt_gettoken(&hmp->fs_token);
2759         hammer_simple_transaction(&trans, hmp);
2760         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2761
2762         /*
2763          * Key range (begin and end inclusive) to scan.  Note that the key's
2764          * stored in the actual records represent BASE+LEN, not BASE.  The
2765          * first record containing bio_offset will have a key > bio_offset.
2766          */
2767         cursor.key_beg.localization = ip->obj_localization +
2768                                       HAMMER_LOCALIZE_MISC;
2769         cursor.key_beg.obj_id = ip->obj_id;
2770         cursor.key_beg.create_tid = 0;
2771         cursor.key_beg.delete_tid = 0;
2772         cursor.key_beg.obj_type = 0;
2773         cursor.key_beg.key = bio->bio_offset + 1;
2774         cursor.asof = ip->obj_asof;
2775         cursor.flags |= HAMMER_CURSOR_ASOF;
2776
2777         cursor.key_end = cursor.key_beg;
2778         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2779 #if 0
2780         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2781                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2782                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2783                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2784         } else
2785 #endif
2786         {
2787                 ran_end = bio->bio_offset + bp->b_bufsize;
2788                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2789                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2790                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2791                 if (tmp64 < ran_end)
2792                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2793                 else
2794                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2795         }
2796         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2797
2798         /*
2799          * Set NOSWAPCACHE for cursor data extraction if double buffering
2800          * is disabled or (if the file is not marked cacheable via chflags
2801          * and vm.swapcache_use_chflags is enabled).
2802          */
2803         if (hammer_double_buffer == 0 ||
2804             ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
2805              vm_swapcache_use_chflags)) {
2806                 cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
2807         }
2808
2809         error = hammer_ip_first(&cursor);
2810         boff = 0;
2811
2812         while (error == 0) {
2813                 /*
2814                  * Get the base file offset of the record.  The key for
2815                  * data records is (base + bytes) rather then (base).
2816                  */
2817                 base = &cursor.leaf->base;
2818                 rec_offset = base->key - cursor.leaf->data_len;
2819
2820                 /*
2821                  * Calculate the gap, if any, and zero-fill it.
2822                  *
2823                  * n is the offset of the start of the record verses our
2824                  * current seek offset in the bio.
2825                  */
2826                 n = (int)(rec_offset - (bio->bio_offset + boff));
2827                 if (n > 0) {
2828                         if (n > bp->b_bufsize - boff)
2829                                 n = bp->b_bufsize - boff;
2830                         bzero((char *)bp->b_data + boff, n);
2831                         boff += n;
2832                         n = 0;
2833                 }
2834
2835                 /*
2836                  * Calculate the data offset in the record and the number
2837                  * of bytes we can copy.
2838                  *
2839                  * There are two degenerate cases.  First, boff may already
2840                  * be at bp->b_bufsize.  Secondly, the data offset within
2841                  * the record may exceed the record's size.
2842                  */
2843                 roff = -n;
2844                 rec_offset += roff;
2845                 n = cursor.leaf->data_len - roff;
2846                 if (n <= 0) {
2847                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2848                         n = 0;
2849                 } else if (n > bp->b_bufsize - boff) {
2850                         n = bp->b_bufsize - boff;
2851                 }
2852
2853                 /*
2854                  * Deal with cached truncations.  This cool bit of code
2855                  * allows truncate()/ftruncate() to avoid having to sync
2856                  * the file.
2857                  *
2858                  * If the frontend is truncated then all backend records are
2859                  * subject to the frontend's truncation.
2860                  *
2861                  * If the backend is truncated then backend records on-disk
2862                  * (but not in-memory) are subject to the backend's
2863                  * truncation.  In-memory records owned by the backend
2864                  * represent data written after the truncation point on the
2865                  * backend and must not be truncated.
2866                  *
2867                  * Truncate operations deal with frontend buffer cache
2868                  * buffers and frontend-owned in-memory records synchronously.
2869                  */
2870                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2871                         if (hammer_cursor_ondisk(&cursor)/* ||
2872                             cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2873                                 if (ip->trunc_off <= rec_offset)
2874                                         n = 0;
2875                                 else if (ip->trunc_off < rec_offset + n)
2876                                         n = (int)(ip->trunc_off - rec_offset);
2877                         }
2878                 }
2879                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2880                         if (hammer_cursor_ondisk(&cursor)) {
2881                                 if (ip->sync_trunc_off <= rec_offset)
2882                                         n = 0;
2883                                 else if (ip->sync_trunc_off < rec_offset + n)
2884                                         n = (int)(ip->sync_trunc_off - rec_offset);
2885                         }
2886                 }
2887
2888                 /*
2889                  * Try to issue a direct read into our bio if possible,
2890                  * otherwise resolve the element data into a hammer_buffer
2891                  * and copy.
2892                  *
2893                  * The buffer on-disk should be zerod past any real
2894                  * truncation point, but may not be for any synthesized
2895                  * truncation point from above.
2896                  *
2897                  * NOTE: disk_offset is only valid if the cursor data is
2898                  *       on-disk.
2899                  */
2900                 disk_offset = cursor.leaf->data_offset + roff;
2901                 isdedupable = (boff == 0 && n == bp->b_bufsize &&
2902                                hammer_cursor_ondisk(&cursor) &&
2903                                ((int)disk_offset & HAMMER_BUFMASK) == 0);
2904
2905                 if (isdedupable && hammer_double_buffer == 0) {
2906                         /*
2907                          * Direct read case
2908                          */
2909                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2910                                  HAMMER_ZONE_LARGE_DATA);
2911                         nbio->bio_offset = disk_offset;
2912                         error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
2913                         if (hammer_live_dedup && error == 0)
2914                                 hammer_dedup_cache_add(ip, cursor.leaf);
2915                         goto done;
2916                 } else if (isdedupable) {
2917                         /*
2918                          * Async I/O case for reading from backing store
2919                          * and copying the data to the filesystem buffer.
2920                          * live-dedup has to verify the data anyway if it
2921                          * gets a hit later so we can just add the entry
2922                          * now.
2923                          */
2924                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2925                                  HAMMER_ZONE_LARGE_DATA);
2926                         nbio->bio_offset = disk_offset;
2927                         if (hammer_live_dedup)
2928                                 hammer_dedup_cache_add(ip, cursor.leaf);
2929                         error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
2930                         goto done;
2931                 } else if (n) {
2932                         error = hammer_ip_resolve_data(&cursor);
2933                         if (error == 0) {
2934                                 if (hammer_live_dedup && isdedupable)
2935                                         hammer_dedup_cache_add(ip, cursor.leaf);
2936                                 bcopy((char *)cursor.data + roff,
2937                                       (char *)bp->b_data + boff, n);
2938                         }
2939                 }
2940                 if (error)
2941                         break;
2942
2943                 /*
2944                  * We have to be sure that the only elements added to the
2945                  * dedup cache are those which are already on-media.
2946                  */
2947                 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2948                         hammer_dedup_cache_add(ip, cursor.leaf);
2949
2950                 /*
2951                  * Iterate until we have filled the request.
2952                  */
2953                 boff += n;
2954                 if (boff == bp->b_bufsize)
2955                         break;
2956                 error = hammer_ip_next(&cursor);
2957         }
2958
2959         /*
2960          * There may have been a gap after the last record
2961          */
2962         if (error == ENOENT)
2963                 error = 0;
2964         if (error == 0 && boff != bp->b_bufsize) {
2965                 KKASSERT(boff < bp->b_bufsize);
2966                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2967                 /* boff = bp->b_bufsize; */
2968         }
2969
2970         /*
2971          * Disallow swapcache operation on the vnode buffer if double
2972          * buffering is enabled, the swapcache will get the data via
2973          * the block device buffer.
2974          */
2975         if (hammer_double_buffer)
2976                 bp->b_flags |= B_NOTMETA;
2977
2978         /*
2979          * Cleanup
2980          */
2981         bp->b_resid = 0;
2982         bp->b_error = error;
2983         if (error)
2984                 bp->b_flags |= B_ERROR;
2985         biodone(ap->a_bio);
2986
2987 done:
2988         /*
2989          * Cache the b-tree node for the last data read in cache[1].
2990          *
2991          * If we hit the file EOF then also cache the node in the
2992          * governing director's cache[3], it will be used to initialize
2993          * the inode's cache[1] for any inodes looked up via the directory.
2994          *
2995          * This doesn't reduce disk accesses since the B-Tree chain is
2996          * likely cached, but it does reduce cpu overhead when looking
2997          * up file offsets for cpdup/tar/cpio style iterations.
2998          */
2999         if (cursor.node)
3000                 hammer_cache_node(&ip->cache[1], cursor.node);
3001         if (ran_end >= ip->ino_data.size) {
3002                 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
3003                                         ip->obj_asof, ip->obj_localization);
3004                 if (dip) {
3005                         hammer_cache_node(&dip->cache[3], cursor.node);
3006                         hammer_rel_inode(dip, 0);
3007                 }
3008         }
3009         hammer_done_cursor(&cursor);
3010         hammer_done_transaction(&trans);
3011         lwkt_reltoken(&hmp->fs_token);
3012         return(error);
3013 }
3014
3015 /*
3016  * BMAP operation - used to support cluster_read() only.
3017  *
3018  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
3019  *
3020  * This routine may return EOPNOTSUPP if the opration is not supported for
3021  * the specified offset.  The contents of the pointer arguments do not
3022  * need to be initialized in that case. 
3023  *
3024  * If a disk address is available and properly aligned return 0 with 
3025  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
3026  * to the run-length relative to that offset.  Callers may assume that
3027  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
3028  * large, so return EOPNOTSUPP if it is not sufficiently large.
3029  */
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;		/* file offset of the current record */
	int64_t ran_end;		/* inclusive end of the scan range */
	int64_t tmp64;
	int64_t base_offset;		/* file offset of contiguous run start */
	int64_t base_disk_offset;	/* zone-2 offset of contiguous run start */
	int64_t last_offset;		/* file offset just past end of run */
	hammer_off_t last_disk_offset;	/* zone-2 offset just past end of run */
	hammer_off_t disk_offset;
	int	rec_len;
	int	error;
	int	blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n",
		(long long)ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the key's
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 *
	 * If a backwards run (a_runb) is requested start the scan MAXPHYS
	 * before the requested offset so preceding contiguous records are
	 * picked up too.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len    = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;

			/* feed the live dedup cache as a side effect */
			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
		}
		
		error = hammer_ip_next(&cursor);
	}

#if 0
	kprintf("BMAP %016llx:  %016llx - %016llx\n",
		(long long)ap->a_loffset,
		(long long)base_offset,
		(long long)last_offset);
	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
		(long long)base_disk_offset,
		(long long)last_disk_offset);
#endif

	/* Cache the cursor position to speed up future lookups. */
	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n",
			(long long)ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.  Return the zone-2 address and the run
		 * lengths relative to the requested offset.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}
3263
3264 /*
3265  * Write to a regular file.   Because this is a strategy call the OS is
3266  * trying to actually get data onto the media.
3267  */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/* Read-only inodes cannot accept writes; fail the bio immediately. */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled, the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		lwkt_reltoken(&hmp->fs_token);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end.
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffers
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;	/* 16-byte align small files */

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);

	/*
	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	 * in hammer_vop_write().  We must flag the record so the proper
	 * REDO_TERM_WRITE entry is generated during the flush.
	 */
	if (record) {
		if (bp->b_flags & B_VFSFLAG1) {
			record->flags |= HAMMER_RECF_REDO;
			bp->b_flags &= ~B_VFSFLAG1;
		}
		if (record->flags & HAMMER_RECF_DEDUPED) {
			/* data already on media; complete the bio without IO */
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(ap->a_bio);
		} else {
			hammer_io_direct_write(hmp, bio, record);
		}
		/* kick the flusher if too many reserved records accumulate */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/* bulk add failed; report the error through the buffer */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}
3379
3380 /*
3381  * dounlink - disconnect a directory entry
3382  *
3383  * XXX whiteout support not really in yet
3384  */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred,
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  info->last_error contains the
	 * error code on search termination and could be 0, ENOENT, or
	 * something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		/* hash chain may collide; confirm the actual name matches */
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		if (error == ENOENT) {
			/*
			 * The directory entry points at a non-existent
			 * inode.  Remove the stale entry anyway.
			 */
			kprintf("HAMMER: WARNING: Removing "
				"dirent w/missing inode \"%s\"\n"
				"\tobj_id = %016llx\n",
				ncp->nc_name,
				(long long)cursor.data->entry.obj_id);
			error = 0;
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0 && ip) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * The check directory code can loop and deadlock/retry.  Our
		 * own cursor's node locks must be released to avoid a 3-way
		 * deadlock with the flusher if the check directory code
		 * blocks.
		 *
		 * If any changes whatsoever have been made to the cursor
		 * set EDEADLK and retry.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor()
		 *          function.
		 */
		if (error == 0 && ip && ip->ino_data.obj_type ==
					HAMMER_OBJTYPE_DIRECTORY) {
			hammer_unlock_cursor(&cursor);
			error = hammer_ip_check_directory_empty(trans, ip);
			hammer_lock_cursor(&cursor);
			if (cursor.flags & HAMMER_CURSOR_RETEST) {
				kprintf("HAMMER: Warning: avoided deadlock "
					"on rmdir '%s'\n",
					ncp->nc_name);
				error = EDEADLK;
			}
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/* drop the name from the namecache */
			cache_setunresolved(nch);
			cache_setvp(nch, NULL);

			/*
			 * NOTE: ip->vp, if non-NULL, cannot be directly
			 *       referenced without formally acquiring the
			 *       vp since the vp might have zero refs on it,
			 *       or in the middle of a reclaim, etc.
			 *
			 * NOTE: The cache_setunresolved() can rip the vp
			 *       out from under us since the vp may not have
			 *       any refs, in which case ip->vp will be NULL
			 *       from the outset.
			 */
			while (ip && ip->vp) {
				struct vnode *vp;

				error = hammer_get_vnode(ip, &vp);
				if (error == 0 && vp) {
					vn_unlock(vp);
					hammer_knote(ip->vp, NOTE_DELETE);
					cache_inval_vp(ip->vp, CINV_DESTROY);
					vrele(vp);
					break;
				}
				kprintf("Debug: HAMMER ip/vp race1 avoided\n");
			}
		}
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	if (error == EDEADLK)
		goto retry;

	return (error);
}
3571
3572 /************************************************************************
3573  *                          FIFO AND SPECFS OPS                         *
3574  ************************************************************************
3575  *
3576  */
3577 static int
3578 hammer_vop_fifoclose (struct vop_close_args *ap)
3579 {
3580         /* XXX update itimes */
3581         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3582 }
3583
3584 static int
3585 hammer_vop_fiforead (struct vop_read_args *ap)
3586 {
3587         int error;
3588
3589         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3590         /* XXX update access time */
3591         return (error);
3592 }
3593
3594 static int
3595 hammer_vop_fifowrite (struct vop_write_args *ap)
3596 {
3597         int error;
3598
3599         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3600         /* XXX update access time */
3601         return (error);
3602 }
3603
3604 static
3605 int
3606 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3607 {
3608         int error;
3609
3610         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3611         if (error)
3612                 error = hammer_vop_kqfilter(ap);
3613         return(error);
3614 }
3615
3616 /************************************************************************
3617  *                          KQFILTER OPS                                *
3618  ************************************************************************
3619  *
3620  */
/* knote filter callbacks for HAMMER vnodes */
static void filt_hammerdetach(struct knote *kn);
static int filt_hammerread(struct knote *kn, long hint);
static int filt_hammerwrite(struct knote *kn, long hint);
static int filt_hammervnode(struct knote *kn, long hint);

/* filterops tables: fd-based, no attach hook, shared detach */
static struct filterops hammerread_filtops =
	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
static struct filterops hammerwrite_filtops =
	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
static struct filterops hammervnode_filtops =
	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };
3632
3633 static
3634 int
3635 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3636 {
3637         struct vnode *vp = ap->a_vp;
3638         struct knote *kn = ap->a_kn;
3639
3640         switch (kn->kn_filter) {
3641         case EVFILT_READ:
3642                 kn->kn_fop = &hammerread_filtops;
3643                 break;
3644         case EVFILT_WRITE:
3645                 kn->kn_fop = &hammerwrite_filtops;
3646                 break;
3647         case EVFILT_VNODE:
3648                 kn->kn_fop = &hammervnode_filtops;
3649                 break;
3650         default:
3651                 return (EOPNOTSUPP);
3652         }
3653
3654         kn->kn_hook = (caddr_t)vp;
3655
3656         knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3657
3658         return(0);
3659 }
3660
3661 static void
3662 filt_hammerdetach(struct knote *kn)
3663 {
3664         struct vnode *vp = (void *)kn->kn_hook;
3665
3666         knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3667 }
3668
static int
filt_hammerread(struct knote *kn, long hint)
{
	struct vnode *vp = (void *)kn->kn_hook;
	hammer_inode_t ip = VTOI(vp);
	hammer_mount_t hmp = ip->hmp;
	off_t off;

	/* A revoke terminates the knote immediately. */
	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
		return(1);
	}
	/*
	 * Report the remaining readable bytes, clamped to INTPTR_MAX.
	 * NOTE(review): off can go negative if f_offset is past EOF;
	 * the clamp only bounds the upper end — presumably callers
	 * tolerate a negative kn_data here, confirm against kqueue users.
	 */
	lwkt_gettoken(&hmp->fs_token);	/* XXX use per-ip-token */
	off = ip->ino_data.size - kn->kn_fp->f_offset;
	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
	lwkt_reltoken(&hmp->fs_token);
	/* old-API callers always get an event */
	if (kn->kn_sfflags & NOTE_OLDAPI)
		return(1);
	return (kn->kn_data != 0);
}
3689
3690 static int
3691 filt_hammerwrite(struct knote *kn, long hint)
3692 {
3693         if (hint == NOTE_REVOKE)
3694                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
3695         kn->kn_data = 0;
3696         return (1);
3697 }
3698
3699 static int
3700 filt_hammervnode(struct knote *kn, long hint)
3701 {
3702         if (kn->kn_sfflags & hint)
3703                 kn->kn_fflags |= hint;
3704         if (hint == NOTE_REVOKE) {
3705                 kn->kn_flags |= (EV_EOF | EV_NODATA);
3706                 return (1);
3707         }
3708         return (kn->kn_fflags != 0);
3709 }
3710