sys/vfs/hammer/hammer_vnops.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vm/swap_pager.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         vop_stdpathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_markatime =        hammer_vop_markatime,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl,
        .vop_kqfilter =         hammer_vop_kqfilter
};

struct vop_ops hammer_spec_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             vop_stdnoread,
        .vop_write =            vop_stdnowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_close,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr,
        .vop_kqfilter =         hammer_vop_fifokqfilter
};

static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
        if (flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}
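
/*
 * Illustrative note (not in the original source): hammer_knote() is the
 * filesystem's hook into kqueue EVFILT_VNODE event delivery.  The VOPs
 * below accumulate kqueue flags and post them once, e.g. at the end of
 * hammer_vop_write():
 *
 *      kflags |= NOTE_WRITE;           data was written
 *      kflags |= NOTE_EXTEND;          the file grew
 *      hammer_knote(ap->a_vp, kflags);
 *
 * Passing flags == 0 is a no-op, so callers may post unconditionally.
 */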

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred,
                           int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *       a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *       operation.
 *
 *       Ultimately the combination of a REDO log and use of fast storage
 *       to front-end cluster caches will make fsync fast, but it ain't
 *       here yet.  And, in any case, we need real transactional
 *       all-or-nothing features which are not restricted to a single file.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);
        hammer_mount_t hmp = ip->hmp;
        int waitfor = ap->a_waitfor;
        int mode;

        lwkt_gettoken(&hmp->fs_token);

        /*
         * Fsync rule relaxation (default is either full synchronous flush
         * or REDO semantics with synchronous flush).
         */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
                switch(hammer_fsync_mode) {
                case 0:
mode0:
                        /* no REDO, full synchronous flush */
                        goto skip;
                case 1:
mode1:
                        /* no REDO, full asynchronous flush */
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        goto skip;
                case 2:
                        /* REDO semantics, synchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode0;
                        mode = HAMMER_FLUSH_UNDOS_AUTO;
                        break;
                case 3:
                        /* REDO semantics, relaxed asynchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode1;
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                case 4:
                        /* ignore the fsync() system call */
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                default:
                        /* we have to do something */
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                }

                /*
                 * Fast fsync only needs to flush the UNDO/REDO fifo if
                 * HAMMER_INODE_REDO is non-zero and the only modifications
                 * made to the file are write or write-extends.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) &&
                    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
                ) {
                        ++hammer_count_fsyncs;
                        hammer_flusher_flush_undos(hmp, mode);
                        ip->redo_count = 0;
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                }

                /*
                 * REDO is enabled by fsync(), the idea being we really only
                 * want to lay down REDO records when programs are using
                 * fsync() heavily.  The first fsync() on the file starts
                 * the gravy train going and later fsync()s keep it hot by
                 * resetting the redo_count.
                 *
                 * We weren't running REDOs before now so we have to fall
                 * through and do a full fsync of what we have.
                 */
                if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
                    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
                        ip->flags |= HAMMER_INODE_REDO;
                        ip->redo_count = 0;
                }
        }
skip:

        /*
         * Do a full flush sequence.
         *
         * Attempt to release the vnode while waiting for the inode to
         * finish flushing.  This can really mess up inactive->reclaim
         * sequences so only do it if the vnode is active.
         */
        ++hammer_count_fsyncs;
        vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (waitfor == MNT_WAIT) {
                if ((ap->a_vp->v_flag & VINACTIVE) == 0)
                        vn_unlock(ap->a_vp);
                hammer_wait_inode(ip);
                if ((ap->a_vp->v_flag & VINACTIVE) == 0)
                        vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (ip->error);
}
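
/*
 * Illustrative summary (not in the original source) of the
 * hammer_fsync_mode values handled above, assuming the variable is the
 * one exported via sysctl as vfs.hammer.fsync_mode:
 *
 *      0 - no REDO, full synchronous flush (most conservative)
 *      1 - no REDO, full asynchronous flush
 *      2 - REDO semantics, synchronous UNDO/REDO fifo flush
 *      3 - REDO semantics, relaxed asynchronous flush
 *      4 - fsync() system calls are ignored entirely
 *
 * Modes 2 and 3 silently fall back to modes 0 and 1 on volumes older
 * than HAMMER_VOL_VERSION_FOUR, which lack the REDO fifo.
 */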

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (the cache-safe path does not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        hammer_mount_t hmp;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;
        int ioseqcount;
        int blksize;
        int bigread;
        int got_fstoken;
        size_t resid;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        got_fstoken = 0;
        uio = ap->a_uio;

        /*
         * Attempt to shortcut directly to the VM object using lwbufs.
         * This is much faster than instantiating buffer cache buffers.
         */
        resid = uio->uio_resid;
        error = vop_helper_read_shortcut(ap);
        hammer_stats_file_read += resid - uio->uio_resid;
        if (error)
                return (error);
        if (uio->uio_resid == 0)
                goto finished;

        /*
         * Allow the UIO's size to override the sequential heuristic.
         */
        blksize = hammer_blocksize(uio->uio_offset);
        seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
        ioseqcount = (ap->a_ioflag >> 16);
        if (seqcount < ioseqcount)
                seqcount = ioseqcount;

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         */
        bigread = (uio->uio_resid > 100 * 1024 * 1024);

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         *
         * XXX Temporary hack, delay the start transaction while we remain
         *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
         *     locked-shared.
         */
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                int64_t base_offset;
                int64_t file_limit;

                blksize = hammer_blocksize(uio->uio_offset);
                offset = (int)uio->uio_offset & (blksize - 1);
                base_offset = uio->uio_offset - offset;

                if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
                        break;

                /*
                 * MPSAFE
                 */
                bp = getblk(ap->a_vp, base_offset, blksize, 0, 0);
                if ((bp->b_flags & (B_INVAL | B_CACHE | B_RAM)) == B_CACHE) {
                        bp->b_flags &= ~B_AGE;
                        error = 0;
                        goto skip;
                }
                if (ap->a_ioflag & IO_NRDELAY) {
                        bqrelse(bp);
                        return (EWOULDBLOCK);
                }

                /*
                 * MPUNSAFE
                 */
                if (got_fstoken == 0) {
                        lwkt_gettoken(&hmp->fs_token);
                        got_fstoken = 1;
                        hammer_start_transaction(&trans, ip->hmp);
                }

                /*
                 * NOTE: A valid bp has already been acquired, but was not
                 *       B_CACHE.
                 */
                if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
                         * creating buffers of the wrong block size past
                         * the demarc.
                         */
                        file_limit = ip->ino_data.size;
                        if (base_offset < HAMMER_XDEMARC &&
                            file_limit > HAMMER_XDEMARC) {
                                file_limit = HAMMER_XDEMARC;
                        }
                        error = cluster_readx(ap->a_vp,
                                             file_limit, base_offset,
                                             blksize, uio->uio_resid,
                                             seqcount * BKVASIZE, &bp);
                } else {
                        error = breadnx(ap->a_vp, base_offset, blksize,
                                        NULL, NULL, 0, &bp);
                }
                if (error) {
                        brelse(bp);
                        break;
                }
skip:
                if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
                        kprintf("doff %016jx read file %016jx@%016jx\n",
                                (intmax_t)bp->b_bio2.bio_offset,
                                (intmax_t)ip->obj_id,
                                (intmax_t)bp->b_loffset);
                }
                bp->b_flags &= ~B_IODEBUG;
                if (blksize == HAMMER_XBUFSIZE)
                        bp->b_flags |= B_CLUSTEROK;

                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                if (got_fstoken)
                        lwkt_reltoken(&hmp->fs_token);

                /*
                 * Set B_AGE, data has a lower priority than meta-data.
                 *
                 * Use a hold/unlock/drop sequence to run the uiomove
                 * with the buffer unlocked, avoiding deadlocks against
                 * read()s on mmap()'d spaces.
                 */
                bp->b_flags |= B_AGE;
                error = uiomovebp(bp, (char *)bp->b_data + offset, n, uio);
                bqrelse(bp);

                if (got_fstoken)
                        lwkt_gettoken(&hmp->fs_token);

                if (error)
                        break;
                hammer_stats_file_read += n;
        }

finished:

        /*
         * Try to update the atime with just the inode lock for maximum
         * concurrency.  If we can't shortcut it we have to get the full
         * blown transaction.
         */
        if (got_fstoken == 0 && hammer_update_atime_quick(ip) < 0) {
                lwkt_gettoken(&hmp->fs_token);
                got_fstoken = 1;
                hammer_start_transaction(&trans, ip->hmp);
        }

        if (got_fstoken) {
                if ((ip->flags & HAMMER_INODE_RO) == 0 &&
                    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                        ip->ino_data.atime = trans.time;
                        hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
                }
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
        }
        return (error);
}
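
/*
 * Illustrative sketch (not in the original source): the variable block
 * size referenced above comes from hammer_blocksize()/hammer_blockoff()
 * in hammer.h, which switch from small to large buffers at the
 * HAMMER_XDEMARC offset boundary, roughly:
 *
 *      static __inline int
 *      hammer_blocksize(int64_t file_offset)
 *      {
 *              if (file_offset < HAMMER_XDEMARC)
 *                      return(HAMMER_BUFSIZE);         small buffers
 *              else
 *                      return(HAMMER_XBUFSIZE);        large buffers
 *      }
 *
 * which is why the read loop recomputes blksize on every iteration and
 * why cluster_readx() is clamped with file_limit at the demarc.
 */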

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        thread_t td;
        struct uio *uio;
        int offset;
        off_t base_offset;
        int64_t cluster_eof;
        struct buf *bp;
        int kflags;
        int error;
        int n;
        int flags;
        int seqcount;
        int bigwrite;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        kflags = 0;
        seqcount = ap->a_ioflag >> 16;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_offset assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }

        if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
            base_offset > td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
                return (EFBIG);
        }

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         *
         * Preset redo_count so we stop generating REDOs earlier if the
         * limit is exceeded.
         */
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
        if ((ip->flags & HAMMER_INODE_REDO) &&
            ip->redo_count < hammer_limit_redo) {
                ip->redo_count += uio->uio_resid;
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;
                int blksize;
                int blkmask;
                int trivial;
                int endofblk;
                off_t nsize;

                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;
                if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
                        break;

                blksize = hammer_blocksize(uio->uio_offset);

                /*
                 * Do not allow HAMMER to blow out the buffer cache.  Very
                 * large UIOs can lockout other processes due to bwillwrite()
                 * mechanics.
                 *
                 * The hammer inode is not locked during these operations.
                 * The vnode is locked which can interfere with the pageout
                 * daemon for non-UIO_NOCOPY writes but should not interfere
                 * with the buffer cache.  Even so, we cannot afford to
                 * allow the pageout daemon to build up too many dirty buffer
                 * cache buffers.
                 *
                 * Only call this if we aren't being recursively called from
                 * a virtual disk device (vn), else we may deadlock.
                 */
                if ((ap->a_ioflag & IO_RECURSE) == 0)
                        bwillwrite(blksize);

                /*
                 * Control the number of pending records associated with
                 * this inode.  If too many have accumulated start a
                 * flush.  Try to maintain a pipeline with the flusher.
                 *
                 * NOTE: It is possible for other sources to grow the
                 *       records but not necessarily issue another flush,
                 *       so use a timeout and ensure that a re-flush occurs.
                 */
                if (ip->rsv_recs >= hammer_limit_inode_recs) {
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
                                ip->flags |= HAMMER_INODE_RECSW;
                                tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        }
                }

#if 0
                /*
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are so well
                 * decoupled from the buffer cache that it is possible
                 * for userland to push data out to the media via
                 * direct-write, but build up the records queued to the
                 * backend faster than the backend can flush them out.
                 * HAMMER has hit its write limit but the frontend has
                 * no pushback to slow it down.
                 */
                if (hmp->rsv_recs > hammer_limit_recs / 2) {
                        /*
                         * Get the inode on the flush list
                         */
                        if (ip->rsv_recs >= 64)
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        else if (ip->rsv_recs >= 16)
                                hammer_flush_inode(ip, 0);

                        /*
                         * Keep the flusher going if the system keeps
                         * queueing records.
                         */
                        delta = hmp->count_newrecords -
                                hmp->last_newrecords;
                        if (delta < 0 || delta > hammer_limit_recs / 2) {
                                hmp->last_newrecords = hmp->count_newrecords;
                                hammer_sync_hmp(hmp, MNT_NOWAIT);
                        }

                        /*
                         * If we have gotten behind start slowing
                         * down the writers.
                         */
                        delta = (hmp->rsv_recs - hammer_limit_recs) *
                                hz / hammer_limit_recs;
                        if (delta > 0)
                                tsleep(&trans, 0, "hmrslo", delta);
                }
#endif

                /*
                 * Calculate the blocksize at the current offset and figure
                 * out how much we can actually write.
                 */
                blkmask = blksize - 1;
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
                        endofblk = 0;
                } else {
                        endofblk = 1;
                }
                nsize = uio->uio_offset + n;
                if (nsize > ip->ino_data.size) {
                        if (uio->uio_offset > ip->ino_data.size)
                                trivial = 0;
                        else
                                trivial = 1;
                        nvextendbuf(ap->a_vp,
                                    ip->ino_data.size,
                                    nsize,
                                    hammer_blocksize(ip->ino_data.size),
                                    hammer_blocksize(nsize),
                                    hammer_blockoff(ip->ino_data.size),
                                    hammer_blockoff(nsize),
                                    trivial);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              blksize, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= blksize) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        lwkt_reltoken(&hmp->fs_token);
                        error = uiomovebp(bp, bp->b_data + offset, n, uio);
                        lwkt_gettoken(&hmp->fs_token);
                }

                /*
                 * Generate REDO records if enabled and redo_count will not
                 * exceed the limit.
                 *
                 * If redo_count exceeds the limit we stop generating records
                 * and clear HAMMER_INODE_REDO.  This will cause the next
                 * fsync() to do a full meta-data sync instead of just an
                 * UNDO/REDO fifo update.
                 *
                 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
                 * will still be tracked.  The tracks will be terminated
                 * when the related meta-data (including possible data
                 * modifications which are not tracked via REDO) is
                 * flushed.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
                        if (ip->redo_count < hammer_limit_redo) {
                                bp->b_flags |= B_VFSFLAG1;
                                error = hammer_generate_redo(&trans, ip,
                                                     base_offset + offset,
                                                     HAMMER_REDO_WRITE,
                                                     bp->b_data + offset,
                                                     (size_t)n);
                        } else {
                                ip->flags &= ~HAMMER_INODE_REDO;
                        }
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                nvtruncbuf(ap->a_vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size),
                                          hammer_blockoff(ip->ino_data.size),
                                          0);
                        }
                        break;
                }
                kflags |= NOTE_WRITE;
                hammer_stats_file_write += n;
                if (blksize == HAMMER_XBUFSIZE)
                        bp->b_flags |= B_CLUSTEROK;
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_SDIRTY;
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(&trans, ip, flags);

                /*
                 * Once we dirty the buffer any cached zone-X offset
                 * becomes invalid.  HAMMER NOTE: no-history mode cannot
                 * allow overwriting over the same data sector unless
                 * we provide UNDOs for the old data, which we don't.
                 */
                bp->b_bio2.bio_offset = NOOFFSET;

                /*
                 * Final buffer disposition.
                 *
                 * Because meta-data updates are deferred, HAMMER is
                 * especially sensitive to excessive bdwrite()s because
                 * the I/O stream is not broken up by disk reads.  So the
                 * buffer cache simply cannot keep up.
                 *
                 * WARNING!  blksize is variable.  cluster_write() is
                 *           expected to not blow up if it encounters
                 *           buffers that do not match the passed blksize.
                 *
                 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
                 *        The ip->rsv_recs check should burst-flush the data.
                 *        If we queue it immediately the buf could be left
                 *        locked on the device queue for a very long time.
                 *
                 *        However, failing to flush a dirty buffer out when
                 *        issued from the pageout daemon can result in a low
                 *        memory deadlock against bio_page_alloc(), so we
                 *        have to bawrite() on IO_ASYNC as well.
                 *
                 * NOTE!  To avoid degenerate stalls due to mismatched block
                 *        sizes we only honor IO_DIRECT on the write which
                 *        abuts the end of the buffer.  However, we must
                 *        honor IO_SYNC in case someone is silly enough to
                 *        configure a HAMMER file as swap, or when HAMMER
                 *        is serving NFS (for commits).  Ick ick.
                 */
                bp->b_flags |= B_AGE;
                if (blksize == HAMMER_XBUFSIZE)
                        bp->b_flags |= B_CLUSTEROK;

                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else if (ap->a_ioflag & IO_ASYNC) {
                        bawrite(bp);
                } else if (hammer_cluster_enable &&
                           !(ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
                        if (base_offset < HAMMER_XDEMARC)
                                cluster_eof = hammer_blockdemarc(base_offset,
                                                         ip->ino_data.size);
                        else
                                cluster_eof = ip->ino_data.size;
                        cluster_write(bp, cluster_eof, blksize, seqcount);
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}
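
/*
 * Worked example (not in the original source) of the per-iteration
 * block arithmetic in the write loop above, assuming a 16KB (0x4000)
 * blksize:
 *
 *      uio_offset  = 0x4100
 *      blkmask     = 0x3fff
 *      offset      = 0x4100 & 0x3fff   = 0x0100
 *      base_offset = 0x4100 & ~0x3fff  = 0x4000
 *      n           = 0x4000 - 0x0100   = 0x3f00 bytes max
 *
 * so the write lands in the buffer covering [0x4000, 0x8000) starting
 * 0x100 bytes in, and endofblk is set only when the write reaches the
 * end of that buffer.
 */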

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        ++hammer_stats_file_iopsr;
        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
        struct vnode *vp = ap->a_vp;
        hammer_inode_t ip = VTOI(vp);
        int waitfor;
        if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
                if (vn_islocked(vp) == LK_EXCLUSIVE &&
                    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
                        if (ip->flags & HAMMER_INODE_CLOSESYNC)
                                waitfor = MNT_WAIT;
                        else
                                waitfor = MNT_NOWAIT;
                        ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
                                       HAMMER_INODE_CLOSEASYNC);
                        VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
                }
        }
#endif
        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        hammer_mount_t hmp;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
                                    NULL, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                lwkt_reltoken(&hmp->fs_token);
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

        /*
         * We want the fsid to be different when accessing a filesystem
         * with different as-of's so programs like diff don't think
         * the files are the same.
         *
         * We also want the fsid to be the same when comparing snapshots,
         * or when comparing mirrors (which might be backed by different
         * physical devices).  HAMMER fsids are based on the PFS's
         * shared_uuid field.
         *
         * XXX there is a chance of collision here.  The va_fsid reported
         * by stat is different from the more involved fsid used in the
         * mount structure.
         */
        ++hammer_stats_file_iopsr;
        hammer_lock_sh(&ip->lock);
        vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
                       (u_int32_t)(ip->obj_asof >> 32);

        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;

        /*
         * Special case for @@PFS softlinks.  The actual size of the
         * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
         * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
         */
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
            ip->ino_data.size == 10 &&
            ip->obj_asof == HAMMER_MAX_TID &&
            ip->obj_localization == 0 &&
            strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
                    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
                            vap->va_size = 26;
                    else
                            vap->va_size = 10;
        }

        /*
         * We must provide a consistent atime and mtime for snapshots
         * so people can do a 'tar cf - ... | md5' on them and get
         * consistent results.
         */
        if (ip->flags & HAMMER_INODE_RO) {
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
        } else {
                hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        }
        hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
        if (ip->ino_data.size >= HAMMER_XDEMARC) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
                                ~HAMMER_XBUFMASK64;
        } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
                                ~HAMMER_BUFMASK64;
        } else {
                vap->va_bytes = (ip->ino_data.size + 15) & ~15;
        }

        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }
        hammer_unlock(&ip->lock);
        return(0);
}
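
/*
 * Worked example (not in the original source) of the va_bytes rounding
 * above, assuming HAMMER_BUFSIZE is 16KB and HAMMER_XBUFSIZE is 64KB:
 *
 *      size = 100 bytes       -> (100 + 15) & ~15     = 112
 *      size = 10000 bytes     -> rounded up to 16KB   = 16384
 *      size >= HAMMER_XDEMARC -> rounded up to a 64KB multiple
 *
 * i.e. small files report 16-byte granularity, mid-sized files report
 * whole small buffers, and files past the demarc report whole big
 * buffers.
 */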

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_mount_t hmp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        int ispfs;
        int64_t obj_id;
        u_int32_t localization;
        u_int32_t max_iterations;

        /*
         * Misc initialization, plus handle as-of name extensions.  Look for
         * the '@@' extension.  Note that as-of files and directories cannot
         * be modified.
         */
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        localization = dip->obj_localization;   /* for code consistency */
        nlen = ncp->nc_nlen;
        flags = dip->flags & HAMMER_INODE_RO;
        ispfs = 0;
        hmp = dip->hmp;

        lwkt_gettoken(&hmp->fs_token);
        hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;

        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        error = hammer_str_to_tid(ncp->nc_name + i + 2,
                                                  &ispfs, &asof, &localization);
                        if (error != 0) {
                                i = nlen;
                                break;
                        }
                        if (asof != HAMMER_MAX_TID)
                                flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;

        /*
         * If this is a PFS softlink we dive into the PFS
         */
        if (ispfs && nlen == 0) {
                ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
                                      asof, localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * If there is no path component the time extension is relative to dip.
         * e.g. "fubar/@@<snapshot>"
         *
         * "." is handled by the kernel, but ".@@<snapshot>" is not.
         * e.g. "fubar/.@@<snapshot>"
         *
         * ".." is handled by the kernel.  We do not currently handle
         * "..@@<snapshot>".
         */
        if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
                ip = hammer_get_inode(&trans, dip, dip->obj_id,
                                      asof, dip->obj_localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
                                           &max_iterations);

        error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
        cursor.key_beg.localization = dip->obj_localization +
                                      hammer_dir_localization(dip);
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key += max_iterations;
        cursor.asof = asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        obj_id = 0;
        localization = HAMMER_DEF_LOCALIZATION;

        if (error == 0) {
                error = hammer_ip_first(&cursor);
                while (error == 0) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error)
                                break;
                        if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
                            bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                                obj_id = cursor.data->entry.obj_id;
                                localization = cursor.data->entry.localization;
                                break;
                        }
                        error = hammer_ip_next(&cursor);
                }
        }
        hammer_done_cursor(&cursor);

        /*
         * Lookup the obj_id.  This should always succeed.  If it does not
         * the filesystem may be damaged and we return a dummy inode.
         */
        if (error == 0) {
                ip = hammer_get_inode(&trans, dip, obj_id,
                                      asof, localization,
                                      flags, &error);
                if (error == ENOENT) {
                        kprintf("HAMMER: WARNING: Missing "
                                "inode for dirent \"%s\"\n"
                                "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
                                ncp->nc_name,
                                (long long)obj_id, (long long)asof,
                                localization);
                        error = 0;
                        ip = hammer_get_dummy_inode(&trans, dip, obj_id,
                                                    asof, localization,
                                                    flags, &error);
                }
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
        } else if (error == ENOENT) {
                cache_setvp(ap->a_nch, NULL);
        }
done:
        hammer_done_transaction(&trans);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}
1340
1341 /*
1342  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1343  *
1344  * Locate the parent directory of a directory vnode.
1345  *
1346  * dvp is referenced but not locked.  *vpp must be returned referenced and
1347  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
1348          * at the root; instead it could indicate that the directory we were in was
1349  * removed.
1350  *
1351  * NOTE: as-of sequences are not linked into the directory structure.  If
1352          * we are at the root with a different asof than the mount point, reload
1353          * the same directory with the mount point's asof.  I'm not sure what this
1354  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
1355  * get confused, but it hasn't been tested.
1356  */
1357 static
1358 int
1359 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1360 {
1361         struct hammer_transaction trans;
1362         struct hammer_inode *dip;
1363         struct hammer_inode *ip;
1364         hammer_mount_t hmp;
1365         int64_t parent_obj_id;
1366         u_int32_t parent_obj_localization;
1367         hammer_tid_t asof;
1368         int error;
1369
1370         dip = VTOI(ap->a_dvp);
1371         asof = dip->obj_asof;
1372         hmp = dip->hmp;
1373
1374         /*
1375          * Who is our parent?  This could be the root of a pseudo-filesystem
1376          * whose parent is in another localization domain.
1377          */
1378         lwkt_gettoken(&hmp->fs_token);
1379         parent_obj_id = dip->ino_data.parent_obj_id;
1380         if (dip->obj_id == HAMMER_OBJID_ROOT)
1381                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1382         else
1383                 parent_obj_localization = dip->obj_localization;
1384
1385         if (parent_obj_id == 0) {
1386                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
1387                    asof != hmp->asof) {
1388                         parent_obj_id = dip->obj_id;
1389                         asof = hmp->asof;
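                             /* "0x" + 16 hex digits + NUL = 19 bytes */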
1390                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1391                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
1392                                   (long long)dip->obj_asof);
1393                 } else {
1394                         *ap->a_vpp = NULL;
1395                         lwkt_reltoken(&hmp->fs_token);
1396                         return ENOENT;
1397                 }
1398         }
1399
1400         hammer_simple_transaction(&trans, hmp);
1401         ++hammer_stats_file_iopsr;
1402
1403         ip = hammer_get_inode(&trans, dip, parent_obj_id,
1404                               asof, parent_obj_localization,
1405                               dip->flags, &error);
1406         if (ip) {
1407                 error = hammer_get_vnode(ip, ap->a_vpp);
1408                 hammer_rel_inode(ip, 0);
1409         } else {
1410                 *ap->a_vpp = NULL;
1411         }
1412         hammer_done_transaction(&trans);
1413         lwkt_reltoken(&hmp->fs_token);
1414         return (error);
1415 }
1416
1417 /*
1418  * hammer_vop_nlink { nch, dvp, vp, cred }
1419  */
1420 static
1421 int
1422 hammer_vop_nlink(struct vop_nlink_args *ap)
1423 {
1424         struct hammer_transaction trans;
1425         struct hammer_inode *dip;
1426         struct hammer_inode *ip;
1427         struct nchandle *nch;
1428         hammer_mount_t hmp;
1429         int error;
1430
1431         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1432                 return(EXDEV);
1433
1434         nch = ap->a_nch;
1435         dip = VTOI(ap->a_dvp);
1436         ip = VTOI(ap->a_vp);
1437         hmp = dip->hmp;
1438
1439         if (dip->obj_localization != ip->obj_localization)
1440                 return(EXDEV);
1441
1442         if (dip->flags & HAMMER_INODE_RO)
1443                 return (EROFS);
1444         if (ip->flags & HAMMER_INODE_RO)
1445                 return (EROFS);
1446         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1447                 return (error);
1448
1449         /*
1450          * Create a transaction to cover the operations we perform.
1451          */
1452         lwkt_gettoken(&hmp->fs_token);
1453         hammer_start_transaction(&trans, hmp);
1454         ++hammer_stats_file_iopsw;
1455
1456         /*
1457          * Add the filesystem object to the directory.  Note that neither
1458          * dip nor ip are referenced or locked, but their vnodes are
1459          * referenced.  This function will bump the inode's link count.
1460          */
1461         error = hammer_ip_add_directory(&trans, dip,
1462                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1463                                         ip);
1464
1465         /*
1466          * Finish up.
1467          */
1468         if (error == 0) {
1469                 cache_setunresolved(nch);
1470                 cache_setvp(nch, ap->a_vp);
1471         }
1472         hammer_done_transaction(&trans);
1473         hammer_knote(ap->a_vp, NOTE_LINK);
1474         hammer_knote(ap->a_dvp, NOTE_WRITE);
1475         lwkt_reltoken(&hmp->fs_token);
1476         return (error);
1477 }
1478
1479 /*
1480  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1481  *
1482  * The operating system has already ensured that the directory entry
1483  * does not exist and done all appropriate namespace locking.
1484  */
1485 static
1486 int
1487 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1488 {
1489         struct hammer_transaction trans;
1490         struct hammer_inode *dip;
1491         struct hammer_inode *nip;
1492         struct nchandle *nch;
1493         hammer_mount_t hmp;
1494         int error;
1495
1496         nch = ap->a_nch;
1497         dip = VTOI(ap->a_dvp);
1498         hmp = dip->hmp;
1499
1500         if (dip->flags & HAMMER_INODE_RO)
1501                 return (EROFS);
1502         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1503                 return (error);
1504
1505         /*
1506          * Create a transaction to cover the operations we perform.
1507          */
1508         lwkt_gettoken(&hmp->fs_token);
1509         hammer_start_transaction(&trans, hmp);
1510         ++hammer_stats_file_iopsw;
1511
1512         /*
1513          * Create a new filesystem object of the requested type.  The
1514          * returned inode will be referenced but not locked.
1515          */
1516         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1517                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1518                                     NULL, &nip);
1519         if (error) {
1520                 hkprintf("hammer_mkdir error %d\n", error);
1521                 hammer_done_transaction(&trans);
1522                 *ap->a_vpp = NULL;
1523                 lwkt_reltoken(&hmp->fs_token);
1524                 return (error);
1525         }
1526         /*
1527          * Add the new filesystem object to the directory.  This will also
1528          * bump the inode's link count.
1529          */
1530         error = hammer_ip_add_directory(&trans, dip,
1531                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1532                                         nip);
1533         if (error)
1534                 hkprintf("hammer_mkdir (add) error %d\n", error);
1535
1536         /*
1537          * Finish up.
1538          */
1539         if (error) {
1540                 hammer_rel_inode(nip, 0);
1541                 *ap->a_vpp = NULL;
1542         } else {
1543                 error = hammer_get_vnode(nip, ap->a_vpp);
1544                 hammer_rel_inode(nip, 0);
1545                 if (error == 0) {
1546                         cache_setunresolved(ap->a_nch);
1547                         cache_setvp(ap->a_nch, *ap->a_vpp);
1548                 }
1549         }
1550         hammer_done_transaction(&trans);
1551         if (error == 0)
1552                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1553         lwkt_reltoken(&hmp->fs_token);
1554         return (error);
1555 }
1556
1557 /*
1558  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1559  *
1560  * The operating system has already ensured that the directory entry
1561  * does not exist and done all appropriate namespace locking.
1562  */
1563 static
1564 int
1565 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1566 {
1567         struct hammer_transaction trans;
1568         struct hammer_inode *dip;
1569         struct hammer_inode *nip;
1570         struct nchandle *nch;
1571         hammer_mount_t hmp;
1572         int error;
1573
1574         nch = ap->a_nch;
1575         dip = VTOI(ap->a_dvp);
1576         hmp = dip->hmp;
1577
1578         if (dip->flags & HAMMER_INODE_RO)
1579                 return (EROFS);
1580         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1581                 return (error);
1582
1583         /*
1584          * Create a transaction to cover the operations we perform.
1585          */
1586         lwkt_gettoken(&hmp->fs_token);
1587         hammer_start_transaction(&trans, hmp);
1588         ++hammer_stats_file_iopsw;
1589
1590         /*
1591          * Create a new filesystem object of the requested type.  The
1592          * returned inode will be referenced but not locked.
1593          *
1594          * If mknod specifies a directory, a pseudo-fs is created.
1595          */
1596         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1597                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1598                                     NULL, &nip);
1599         if (error) {
1600                 hammer_done_transaction(&trans);
1601                 *ap->a_vpp = NULL;
1602                 lwkt_reltoken(&hmp->fs_token);
1603                 return (error);
1604         }
1605
1606         /*
1607          * Add the new filesystem object to the directory.  This will also
1608          * bump the inode's link count.
1609          */
1610         error = hammer_ip_add_directory(&trans, dip,
1611                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1612                                         nip);
1613
1614         /*
1615          * Finish up.
1616          */
1617         if (error) {
1618                 hammer_rel_inode(nip, 0);
1619                 *ap->a_vpp = NULL;
1620         } else {
1621                 error = hammer_get_vnode(nip, ap->a_vpp);
1622                 hammer_rel_inode(nip, 0);
1623                 if (error == 0) {
1624                         cache_setunresolved(ap->a_nch);
1625                         cache_setvp(ap->a_nch, *ap->a_vpp);
1626                 }
1627         }
1628         hammer_done_transaction(&trans);
1629         if (error == 0)
1630                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1631         lwkt_reltoken(&hmp->fs_token);
1632         return (error);
1633 }
1634
1635 /*
1636  * hammer_vop_open { vp, mode, cred, fp }
1637  *
1638  * MPSAFE (does not require fs_token)
1639  */
1640 static
1641 int
1642 hammer_vop_open(struct vop_open_args *ap)
1643 {
1644         hammer_inode_t ip;
1645
1646         ++hammer_stats_file_iopsr;
1647         ip = VTOI(ap->a_vp);
1648
1649         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1650                 return (EROFS);
1651         return(vop_stdopen(ap));
1652 }
1653
1654 /*
1655  * hammer_vop_print { vp }
1656  */
1657 static
1658 int
1659 hammer_vop_print(struct vop_print_args *ap)
1660 {
1661         return EOPNOTSUPP;
1662 }
1663
1664 /*
1665  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1666  */
1667 static
1668 int
1669 hammer_vop_readdir(struct vop_readdir_args *ap)
1670 {
1671         struct hammer_transaction trans;
1672         struct hammer_cursor cursor;
1673         struct hammer_inode *ip;
1674         hammer_mount_t hmp;
1675         struct uio *uio;
1676         hammer_base_elm_t base;
1677         int error;
1678         int cookie_index;
1679         int ncookies;
1680         off_t *cookies;
1681         off_t saveoff;
1682         int r;
1683         int dtype;
1684
1685         ++hammer_stats_file_iopsr;
1686         ip = VTOI(ap->a_vp);
1687         uio = ap->a_uio;
1688         saveoff = uio->uio_offset;
1689         hmp = ip->hmp;
1690
1691         if (ap->a_ncookies) {
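                     /*
                      * Sizing heuristic: assume roughly one directory
                      * entry per 16 bytes of user buffer, capped at
                      * 1024 cookies.
                      */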
1692                 ncookies = uio->uio_resid / 16 + 1;
1693                 if (ncookies > 1024)
1694                         ncookies = 1024;
1695                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1696                 cookie_index = 0;
1697         } else {
1698                 ncookies = -1;
1699                 cookies = NULL;
1700                 cookie_index = 0;
1701         }
1702
1703         lwkt_gettoken(&hmp->fs_token);
1704         hammer_simple_transaction(&trans, hmp);
1705
1706         /*
1707          * Handle artificial entries
1708          *
1709          * Note that the minimum value for a directory hash key
1710          * on-media is 0x0000000100000000, so we can use anything
1711          * less than that to represent our 'special' key space.
1712          */
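             /* Offsets 0 and 1 are reserved for "." and ".." below. */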
1713         error = 0;
1714         if (saveoff == 0) {
1715                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1716                 if (r)
1717                         goto done;
1718                 if (cookies)
1719                         cookies[cookie_index] = saveoff;
1720                 ++saveoff;
1721                 ++cookie_index;
1722                 if (cookie_index == ncookies)
1723                         goto done;
1724         }
1725         if (saveoff == 1) {
1726                 if (ip->ino_data.parent_obj_id) {
1727                         r = vop_write_dirent(&error, uio,
1728                                              ip->ino_data.parent_obj_id,
1729                                              DT_DIR, 2, "..");
1730                 } else {
1731                         r = vop_write_dirent(&error, uio,
1732                                              ip->obj_id, DT_DIR, 2, "..");
1733                 }
1734                 if (r)
1735                         goto done;
1736                 if (cookies)
1737                         cookies[cookie_index] = saveoff;
1738                 ++saveoff;
1739                 ++cookie_index;
1740                 if (cookie_index == ncookies)
1741                         goto done;
1742         }
1743
1744         /*
1745          * Key range (begin and end inclusive) to scan.  Directory keys
1746          * directly translate to a 64 bit 'seek' position.
1747          */
1748         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1749         cursor.key_beg.localization = ip->obj_localization +
1750                                       hammer_dir_localization(ip);
1751         cursor.key_beg.obj_id = ip->obj_id;
1752         cursor.key_beg.create_tid = 0;
1753         cursor.key_beg.delete_tid = 0;
1754         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1755         cursor.key_beg.obj_type = 0;
1756         cursor.key_beg.key = saveoff;
1757
1758         cursor.key_end = cursor.key_beg;
1759         cursor.key_end.key = HAMMER_MAX_KEY;
1760         cursor.asof = ip->obj_asof;
1761         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1762
1763         error = hammer_ip_first(&cursor);
1764
1765         while (error == 0) {
1766                 error = hammer_ip_resolve_data(&cursor);
1767                 if (error)
1768                         break;
1769                 base = &cursor.leaf->base;
1770                 saveoff = base->key;
1771                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1772
1773                 if (base->obj_id != ip->obj_id)
1774                         panic("readdir: bad record at %p", cursor.node);
1775
1776                 /*
1777                  * Convert pseudo-filesystems into softlinks
1778                  */
1779                 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1780                 r = vop_write_dirent(
1781                              &error, uio, cursor.data->entry.obj_id,
1782                              dtype,
1783                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
1784                              (void *)cursor.data->entry.name);
1785                 if (r)
1786                         break;
1787                 ++saveoff;
1788                 if (cookies)
1789                         cookies[cookie_index] = base->key;
1790                 ++cookie_index;
1791                 if (cookie_index == ncookies)
1792                         break;
1793                 error = hammer_ip_next(&cursor);
1794         }
1795         hammer_done_cursor(&cursor);
1796
1797 done:
1798         hammer_done_transaction(&trans);
1799
1800         if (ap->a_eofflag)
1801                 *ap->a_eofflag = (error == ENOENT);
1802         uio->uio_offset = saveoff;
1803         if (error && cookie_index == 0) {
1804                 if (error == ENOENT)
1805                         error = 0;
1806                 if (cookies) {
1807                         kfree(cookies, M_TEMP);
1808                         *ap->a_ncookies = 0;
1809                         *ap->a_cookies = NULL;
1810                 }
1811         } else {
1812                 if (error == ENOENT)
1813                         error = 0;
1814                 if (cookies) {
1815                         *ap->a_ncookies = cookie_index;
1816                         *ap->a_cookies = cookies;
1817                 }
1818         }
1819         lwkt_reltoken(&hmp->fs_token);
1820         return(error);
1821 }
1822
1823 /*
1824  * hammer_vop_readlink { vp, uio, cred }
1825  */
1826 static
1827 int
1828 hammer_vop_readlink(struct vop_readlink_args *ap)
1829 {
1830         struct hammer_transaction trans;
1831         struct hammer_cursor cursor;
1832         struct hammer_inode *ip;
1833         hammer_mount_t hmp;
1834         char buf[32];
1835         u_int32_t localization;
1836         hammer_pseudofs_inmem_t pfsm;
1837         int error;
1838
1839         ip = VTOI(ap->a_vp);
1840         hmp = ip->hmp;
1841
1842         lwkt_gettoken(&hmp->fs_token);
1843
1844         /*
1845          * Shortcut if the symlink data was stuffed into ino_data.
1846          *
1847          * Also expand special "@@PFS%05d" softlinks (expansion only
1848          * occurs for non-historical (current) accesses made from the
1849          * primary filesystem).
1850          */
1851         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1852                 char *ptr;
1853                 int bytes;
1854
1855                 ptr = ip->ino_data.ext.symlink;
1856                 bytes = (int)ip->ino_data.size;
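                     /*
                      * "@@PFS%05d" softlinks are exactly 10 bytes: the
                      * 5 byte "@@PFS" prefix plus a 5 digit PFS id.
                      */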
1857                 if (bytes == 10 &&
1858                     ip->obj_asof == HAMMER_MAX_TID &&
1859                     ip->obj_localization == 0 &&
1860                     strncmp(ptr, "@@PFS", 5) == 0) {
1861                         hammer_simple_transaction(&trans, hmp);
1862                         bcopy(ptr + 5, buf, 5);
1863                         buf[5] = 0;
1864                         localization = strtoul(buf, NULL, 10) << 16;
1865                         pfsm = hammer_load_pseudofs(&trans, localization,
1866                                                     &error);
1867                         if (error == 0) {
1868                                 if (pfsm->pfsd.mirror_flags &
1869                                     HAMMER_PFSD_SLAVE) {
1870                                         /* vap->va_size == 26 */
1871                                         ksnprintf(buf, sizeof(buf),
1872                                                   "@@0x%016llx:%05d",
1873                                                   (long long)pfsm->pfsd.sync_end_tid,
1874                                                   localization >> 16);
1875                                 } else {
1876                                         /* vap->va_size == 10 */
1877                                         ksnprintf(buf, sizeof(buf),
1878                                                   "@@-1:%05d",
1879                                                   localization >> 16);
1880 #if 0
1881                                         ksnprintf(buf, sizeof(buf),
1882                                                   "@@0x%016llx:%05d",
1883                                                   (long long)HAMMER_MAX_TID,
1884                                                   localization >> 16);
1885 #endif
1886                                 }
1887                                 ptr = buf;
1888                                 bytes = strlen(buf);
1889                         }
1890                         if (pfsm)
1891                                 hammer_rel_pseudofs(hmp, pfsm);
1892                         hammer_done_transaction(&trans);
1893                 }
1894                 error = uiomove(ptr, bytes, ap->a_uio);
1895                 lwkt_reltoken(&hmp->fs_token);
1896                 return(error);
1897         }
1898
1899         /*
1900          * Long version
1901          */
1902         hammer_simple_transaction(&trans, hmp);
1903         ++hammer_stats_file_iopsr;
1904         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1905
1906         /*
1907          * Lookup the symlink record.  The target is stored as a single
1908          * FIX record keyed by HAMMER_FIXKEY_SYMLINK.
1909          */
1910         cursor.key_beg.localization = ip->obj_localization +
1911                                       HAMMER_LOCALIZE_MISC;
1912         cursor.key_beg.obj_id = ip->obj_id;
1913         cursor.key_beg.create_tid = 0;
1914         cursor.key_beg.delete_tid = 0;
1915         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1916         cursor.key_beg.obj_type = 0;
1917         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1918         cursor.asof = ip->obj_asof;
1919         cursor.flags |= HAMMER_CURSOR_ASOF;
1920
1921         error = hammer_ip_lookup(&cursor);
1922         if (error == 0) {
1923                 error = hammer_ip_resolve_data(&cursor);
1924                 if (error == 0) {
1925                         KKASSERT(cursor.leaf->data_len >=
1926                                  HAMMER_SYMLINK_NAME_OFF);
1927                         error = uiomove(cursor.data->symlink.name,
1928                                         cursor.leaf->data_len -
1929                                                 HAMMER_SYMLINK_NAME_OFF,
1930                                         ap->a_uio);
1931                 }
1932         }
1933         hammer_done_cursor(&cursor);
1934         hammer_done_transaction(&trans);
1935         lwkt_reltoken(&hmp->fs_token);
1936         return(error);
1937 }
1938
1939 /*
1940  * hammer_vop_nremove { nch, dvp, cred }
1941  */
1942 static
1943 int
1944 hammer_vop_nremove(struct vop_nremove_args *ap)
1945 {
1946         struct hammer_transaction trans;
1947         struct hammer_inode *dip;
1948         hammer_mount_t hmp;
1949         int error;
1950
1951         dip = VTOI(ap->a_dvp);
1952         hmp = dip->hmp;
1953
1954         if (hammer_nohistory(dip) == 0 &&
1955             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1956                 return (error);
1957         }
1958
1959         lwkt_gettoken(&hmp->fs_token);
1960         hammer_start_transaction(&trans, hmp);
1961         ++hammer_stats_file_iopsw;
1962         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1963         hammer_done_transaction(&trans);
1964         if (error == 0)
1965                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1966         lwkt_reltoken(&hmp->fs_token);
1967         return (error);
1968 }
1969
1970 /*
1971  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1972  */
1973 static
1974 int
1975 hammer_vop_nrename(struct vop_nrename_args *ap)
1976 {
1977         struct hammer_transaction trans;
1978         struct namecache *fncp;
1979         struct namecache *tncp;
1980         struct hammer_inode *fdip;
1981         struct hammer_inode *tdip;
1982         struct hammer_inode *ip;
1983         hammer_mount_t hmp;
1984         struct hammer_cursor cursor;
1985         int64_t namekey;
1986         u_int32_t max_iterations;
1987         int nlen, error;
1988
1989         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1990                 return(EXDEV);
1991         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1992                 return(EXDEV);
1993
1994         fdip = VTOI(ap->a_fdvp);
1995         tdip = VTOI(ap->a_tdvp);
1996         fncp = ap->a_fnch->ncp;
1997         tncp = ap->a_tnch->ncp;
1998         ip = VTOI(fncp->nc_vp);
1999         KKASSERT(ip != NULL);
2000
2001         hmp = ip->hmp;
2002
2003         if (fdip->obj_localization != tdip->obj_localization)
2004                 return(EXDEV);
2005         if (fdip->obj_localization != ip->obj_localization)
2006                 return(EXDEV);
2007
2008         if (fdip->flags & HAMMER_INODE_RO)
2009                 return (EROFS);
2010         if (tdip->flags & HAMMER_INODE_RO)
2011                 return (EROFS);
2012         if (ip->flags & HAMMER_INODE_RO)
2013                 return (EROFS);
2014         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2015                 return (error);
2016
2017         lwkt_gettoken(&hmp->fs_token);
2018         hammer_start_transaction(&trans, hmp);
2019         ++hammer_stats_file_iopsw;
2020
2021         /*
2022          * Remove tncp from the target directory and then link ip as
2023          * tncp. XXX pass trans to dounlink
2024          *
2025          * Force the inode sync-time to match the transaction so it is
2026          * in-sync with the creation of the target directory entry.
2027          */
2028         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
2029                                 ap->a_cred, 0, -1);
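             /*
              * ENOENT from the unlink just means the target name did
              * not exist, which is not an error for rename.
              */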
2030         if (error == 0 || error == ENOENT) {
2031                 error = hammer_ip_add_directory(&trans, tdip,
2032                                                 tncp->nc_name, tncp->nc_nlen,
2033                                                 ip);
2034                 if (error == 0) {
2035                         ip->ino_data.parent_obj_id = tdip->obj_id;
2036                         ip->ino_data.ctime = trans.time;
2037                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
2038                 }
2039         }
2040         if (error)
2041                 goto failed; /* XXX */
2042
2043         /*
2044          * Locate the record in the originating directory and remove it.
2045          *
2046          * Calculate the namekey and setup the key range for the scan.  This
2047          * works somewhat like a chained hash table where the lower 32 bits
2048          * of the namekey synthesize the chain.
2049          *
2050          * The key range is inclusive of both key_beg and key_end.
2051          */
2052         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
2053                                            &max_iterations);
2054 retry:
2055         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
2056         cursor.key_beg.localization = fdip->obj_localization +
2057                                       hammer_dir_localization(fdip);
2058         cursor.key_beg.obj_id = fdip->obj_id;
2059         cursor.key_beg.key = namekey;
2060         cursor.key_beg.create_tid = 0;
2061         cursor.key_beg.delete_tid = 0;
2062         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2063         cursor.key_beg.obj_type = 0;
2064
2065         cursor.key_end = cursor.key_beg;
2066         cursor.key_end.key += max_iterations;
2067         cursor.asof = fdip->obj_asof;
2068         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2069
2070         /*
2071          * Scan all matching records (the chain), locate the one matching
2072          * the requested path component.
2073          *
2074          * The hammer_ip_*() functions merge in-memory records with on-disk
2075          * records for the purposes of the search.
2076          */
2077         error = hammer_ip_first(&cursor);
2078         while (error == 0) {
2079                 if (hammer_ip_resolve_data(&cursor) != 0)
2080                         break;
2081                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2082                 KKASSERT(nlen > 0);
2083                 if (fncp->nc_nlen == nlen &&
2084                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2085                         break;
2086                 }
2087                 error = hammer_ip_next(&cursor);
2088         }
2089
2090         /*
2091          * If all is ok we have to get the inode so we can adjust nlinks.
2092          *
2093          * WARNING: hammer_ip_del_directory() may have to terminate the
2094          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
2095          * twice.
2096          */
2097         if (error == 0)
2098                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
2099
2100         /*
2101          * XXX A deadlock here will break rename's atomicity for the purposes
2102          * of crash recovery.
2103          */
2104         if (error == EDEADLK) {
2105                 hammer_done_cursor(&cursor);
2106                 goto retry;
2107         }
2108
2109         /*
2110          * Cleanup and tell the kernel that the rename succeeded.
2111          *
2112          * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2113          *       without formally acquiring the vp since the vp might
2114          *       have zero refs on it, or in the middle of a reclaim,
2115          *       etc.
2116          */
2117         hammer_done_cursor(&cursor);
2118         if (error == 0) {
2119                 cache_rename(ap->a_fnch, ap->a_tnch);
2120                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
2121                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
2122                 while (ip->vp) {
2123                         struct vnode *vp;
2124
2125                         error = hammer_get_vnode(ip, &vp);
2126                         if (error == 0 && vp) {
2127                                 vn_unlock(vp);
2128                                 hammer_knote(ip->vp, NOTE_RENAME);
2129                                 vrele(vp);
2130                                 break;
2131                         }
2132                         kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2133                 }
2134         }
2135
2136 failed:
2137         hammer_done_transaction(&trans);
2138         lwkt_reltoken(&hmp->fs_token);
2139         return (error);
2140 }
2141
2142 /*
2143  * hammer_vop_nrmdir { nch, dvp, cred }
2144  */
2145 static
2146 int
2147 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2148 {
2149         struct hammer_transaction trans;
2150         struct hammer_inode *dip;
2151         hammer_mount_t hmp;
2152         int error;
2153
2154         dip = VTOI(ap->a_dvp);
2155         hmp = dip->hmp;
2156
2157         if (hammer_nohistory(dip) == 0 &&
2158             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2159                 return (error);
2160         }
2161
2162         lwkt_gettoken(&hmp->fs_token);
2163         hammer_start_transaction(&trans, hmp);
2164         ++hammer_stats_file_iopsw;
2165         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2166         hammer_done_transaction(&trans);
2167         if (error == 0)
2168                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2169         lwkt_reltoken(&hmp->fs_token);
2170         return (error);
2171 }
2172
2173 /*
2174  * hammer_vop_markatime { vp, cred }
2175  */
2176 static
2177 int
2178 hammer_vop_markatime(struct vop_markatime_args *ap)
2179 {
2180         struct hammer_transaction trans;
2181         struct hammer_inode *ip;
2182         hammer_mount_t hmp;
2183
2184         ip = VTOI(ap->a_vp);
2185         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2186                 return (EROFS);
2187         if (ip->flags & HAMMER_INODE_RO)
2188                 return (EROFS);
2189         hmp = ip->hmp;
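             /* Treat atime updates as a silent no-op on noatime mounts. */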
2190         if (hmp->mp->mnt_flag & MNT_NOATIME)
2191                 return (0);
2192         lwkt_gettoken(&hmp->fs_token);
2193         hammer_start_transaction(&trans, hmp);
2194         ++hammer_stats_file_iopsw;
2195
2196         ip->ino_data.atime = trans.time;
2197         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2198         hammer_done_transaction(&trans);
2199         hammer_knote(ap->a_vp, NOTE_ATTRIB);
2200         lwkt_reltoken(&hmp->fs_token);
2201         return (0);
2202 }
2203
2204 /*
2205  * hammer_vop_setattr { vp, vap, cred }
2206  */
2207 static
2208 int
2209 hammer_vop_setattr(struct vop_setattr_args *ap)
2210 {
2211         struct hammer_transaction trans;
2212         struct hammer_inode *ip;
2213         struct vattr *vap;
2214         hammer_mount_t hmp;
2215         int modflags;
2216         int error;
2217         int truncating;
2218         int blksize;
2219         int kflags;
2220 #if 0
2221         int64_t aligned_size;
2222 #endif
2223         u_int32_t flags;
2224
2225         vap = ap->a_vap;
2226         ip = ap->a_vp->v_data;
2227         modflags = 0;
2228         kflags = 0;
2229         hmp = ip->hmp;
2230
2231         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2232                 return(EROFS);
2233         if (ip->flags & HAMMER_INODE_RO)
2234                 return (EROFS);
2235         if (hammer_nohistory(ip) == 0 &&
2236             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2237                 return (error);
2238         }
2239
2240         lwkt_gettoken(&hmp->fs_token);
2241         hammer_start_transaction(&trans, hmp);
2242         ++hammer_stats_file_iopsw;
2243         error = 0;
2244
2245         if (vap->va_flags != VNOVAL) {
2246                 flags = ip->ino_data.uflags;
2247                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2248                                          hammer_to_unix_xid(&ip->ino_data.uid),
2249                                          ap->a_cred);
2250                 if (error == 0) {
2251                         if (ip->ino_data.uflags != flags) {
2252                                 ip->ino_data.uflags = flags;
2253                                 ip->ino_data.ctime = trans.time;
2254                                 modflags |= HAMMER_INODE_DDIRTY;
2255                                 kflags |= NOTE_ATTRIB;
2256                         }
2257                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2258                                 error = 0;
2259                                 goto done;
2260                         }
2261                 }
2262                 goto done;
2263         }
2264         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2265                 error = EPERM;
2266                 goto done;
2267         }
2268         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2269                 mode_t cur_mode = ip->ino_data.mode;
2270                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2271                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2272                 uuid_t uuid_uid;
2273                 uuid_t uuid_gid;
2274
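                     /*
                      * Ownership is stored as UUIDs on-media, so convert
                      * and compare in UUID space to detect a real change.
                      */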
2275                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2276                                          ap->a_cred,
2277                                          &cur_uid, &cur_gid, &cur_mode);
2278                 if (error == 0) {
2279                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
2280                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
2281                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
2282                                  sizeof(uuid_uid)) ||
2283                             bcmp(&uuid_gid, &ip->ino_data.gid,
2284                                  sizeof(uuid_gid)) ||
2285                             ip->ino_data.mode != cur_mode
2286                         ) {
2287                                 ip->ino_data.uid = uuid_uid;
2288                                 ip->ino_data.gid = uuid_gid;
2289                                 ip->ino_data.mode = cur_mode;
2290                                 ip->ino_data.ctime = trans.time;
2291                                 modflags |= HAMMER_INODE_DDIRTY;
2292                         }
2293                         kflags |= NOTE_ATTRIB;
2294                 }
2295         }
2296         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2297                 switch(ap->a_vp->v_type) {
2298                 case VREG:
2299                         if (vap->va_size == ip->ino_data.size)
2300                                 break;
2301
2302                         /*
2303                          * Log the operation if in fast-fsync mode or if
2304                          * there are unterminated redo write records present.
2305                          *
2306                          * The second check is needed so the recovery code
2307                          * properly truncates write redos even if nominal
2308          * REDO operations are turned off due to excessive
2309                          * writes, because the related records might be
2310                          * destroyed and never lay down a TERM_WRITE.
2311                          */
2312                         if ((ip->flags & HAMMER_INODE_REDO) ||
2313                             (ip->flags & HAMMER_INODE_RDIRTY)) {
2314                                 error = hammer_generate_redo(&trans, ip,
2315                                                              vap->va_size,
2316                                                              HAMMER_REDO_TRUNC,
2317                                                              NULL, 0);
2318                         }
2319                         blksize = hammer_blocksize(vap->va_size);
2320
2321                         /*
2322          * XXX breaks atomicity; we can deadlock the backend
2323                          * if we do not release the lock.  Probably not a
2324                          * big deal here.
2325                          */
2326                         if (vap->va_size < ip->ino_data.size) {
2327                                 nvtruncbuf(ap->a_vp, vap->va_size,
2328                                            blksize,
2329                                            hammer_blockoff(vap->va_size),
2330                                            0);
2331                                 truncating = 1;
2332                                 kflags |= NOTE_WRITE;
2333                         } else {
2334                                 nvextendbuf(ap->a_vp,
2335                                             ip->ino_data.size,
2336                                             vap->va_size,
2337                                             hammer_blocksize(ip->ino_data.size),
2338                                             hammer_blocksize(vap->va_size),
2339                                             hammer_blockoff(ip->ino_data.size),
2340                                             hammer_blockoff(vap->va_size),
2341                                             0);
2342                                 truncating = 0;
2343                                 kflags |= NOTE_WRITE | NOTE_EXTEND;
2344                         }
2345                         ip->ino_data.size = vap->va_size;
2346                         ip->ino_data.mtime = trans.time;
2347                         /* XXX safe to use SDIRTY instead of DDIRTY here? */
2348                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2349
2350                         /*
2351                          * On-media truncation is cached in the inode until
2352                          * the inode is synchronized.  We must immediately
2353                          * handle any frontend records.
2354                          */
2355                         if (truncating) {
2356                                 hammer_ip_frontend_trunc(ip, vap->va_size);
2357 #ifdef DEBUG_TRUNCATE
2358                                 if (HammerTruncIp == NULL)
2359                                         HammerTruncIp = ip;
2360 #endif
2361                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2362                                         ip->flags |= HAMMER_INODE_TRUNCATED;
2363                                         ip->trunc_off = vap->va_size;
2364 #ifdef DEBUG_TRUNCATE
2365                                         if (ip == HammerTruncIp)
2366                                         kprintf("truncate1 %016llx\n",
2367                                                 (long long)ip->trunc_off);
2368 #endif
2369                                 } else if (ip->trunc_off > vap->va_size) {
2370                                         ip->trunc_off = vap->va_size;
2371 #ifdef DEBUG_TRUNCATE
2372                                         if (ip == HammerTruncIp)
2373                                         kprintf("truncate2 %016llx\n",
2374                                                 (long long)ip->trunc_off);
2375 #endif
2376                                 } else {
2377 #ifdef DEBUG_TRUNCATE
2378                                         if (ip == HammerTruncIp)
2379                                         kprintf("truncate3 %016llx (ignored)\n",
2380                                                 (long long)vap->va_size);
2381 #endif
2382                                 }
2383                         }
2384
2385 #if 0
2386                         /*
2387                          * When truncating, nvtruncbuf() may have cleaned out
2388                          * a portion of the last block on-disk in the buffer
2389                          * cache.  We must clean out any frontend records
2390                          * for blocks beyond the new last block.
2391                          */
2392                         aligned_size = (vap->va_size + (blksize - 1)) &
2393                                        ~(int64_t)(blksize - 1);
2394                         if (truncating && vap->va_size < aligned_size) {
2395                                 aligned_size -= blksize;
2396                                 hammer_ip_frontend_trunc(ip, aligned_size);
2397                         }
2398 #endif
2399                         break;
2400                 case VDATABASE:
2401                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2402                                 ip->flags |= HAMMER_INODE_TRUNCATED;
2403                                 ip->trunc_off = vap->va_size;
2404                         } else if (ip->trunc_off > vap->va_size) {
2405                                 ip->trunc_off = vap->va_size;
2406                         }
2407                         hammer_ip_frontend_trunc(ip, vap->va_size);
2408                         ip->ino_data.size = vap->va_size;
2409                         ip->ino_data.mtime = trans.time;
2410                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2411                         kflags |= NOTE_ATTRIB;
2412                         break;
2413                 default:
2414                         error = EINVAL;
2415                         goto done;
2416                 }
2417                 break;
2418         }
2419         if (vap->va_atime.tv_sec != VNOVAL) {
2420                 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2421                 modflags |= HAMMER_INODE_ATIME;
2422                 kflags |= NOTE_ATTRIB;
2423         }
2424         if (vap->va_mtime.tv_sec != VNOVAL) {
2425                 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2426                 modflags |= HAMMER_INODE_MTIME;
2427                 kflags |= NOTE_ATTRIB;
2428         }
2429         if (vap->va_mode != (mode_t)VNOVAL) {
2430                 mode_t   cur_mode = ip->ino_data.mode;
2431                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2432                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2433
2434                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2435                                          cur_uid, cur_gid, &cur_mode);
2436                 if (error == 0 && ip->ino_data.mode != cur_mode) {
2437                         ip->ino_data.mode = cur_mode;
2438                         ip->ino_data.ctime = trans.time;
2439                         modflags |= HAMMER_INODE_DDIRTY;
2440                         kflags |= NOTE_ATTRIB;
2441                 }
2442         }
2443 done:
2444         if (error == 0)
2445                 hammer_modify_inode(&trans, ip, modflags);
2446         hammer_done_transaction(&trans);
2447         hammer_knote(ap->a_vp, kflags);
2448         lwkt_reltoken(&hmp->fs_token);
2449         return (error);
2450 }
2451
2452 /*
2453  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2454  */
2455 static
2456 int
2457 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2458 {
2459         struct hammer_transaction trans;
2460         struct hammer_inode *dip;
2461         struct hammer_inode *nip;
2462         hammer_record_t record;
2463         struct nchandle *nch;
2464         hammer_mount_t hmp;
2465         int error;
2466         int bytes;
2467
2468         ap->a_vap->va_type = VLNK;
2469
2470         nch = ap->a_nch;
2471         dip = VTOI(ap->a_dvp);
2472         hmp = dip->hmp;
2473
2474         if (dip->flags & HAMMER_INODE_RO)
2475                 return (EROFS);
2476         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2477                 return (error);
2478
2479         /*
2480          * Create a transaction to cover the operations we perform.
2481          */
2482         lwkt_gettoken(&hmp->fs_token);
2483         hammer_start_transaction(&trans, hmp);
2484         ++hammer_stats_file_iopsw;
2485
2486         /*
2487          * Create a new filesystem object of the requested type.  The
2488          * returned inode will be referenced but not locked.
2489          */
2490
2491         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2492                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2493                                     NULL, &nip);
2494         if (error) {
2495                 hammer_done_transaction(&trans);
2496                 *ap->a_vpp = NULL;
2497                 lwkt_reltoken(&hmp->fs_token);
2498                 return (error);
2499         }
2500
2501         /*
2502          * Add a record representing the symlink.  The link target is
2503          * stored as pure data, not a string, and is not \0-terminated.
2504          */
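             /*
              * Short targets are stored inline in the inode (up to
              * HAMMER_INODE_BASESYMLEN bytes); longer targets get a
              * separate HAMMER_RECTYPE_FIX record keyed by
              * HAMMER_FIXKEY_SYMLINK.
              */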
2505         if (error == 0) {
2506                 bytes = strlen(ap->a_target);
2507
2508                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2509                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2510                 } else {
2511                         record = hammer_alloc_mem_record(nip, bytes);
2512                         record->type = HAMMER_MEM_RECORD_GENERAL;
2513
2514                         record->leaf.base.localization = nip->obj_localization +
2515                                                          HAMMER_LOCALIZE_MISC;
2516                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2517                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2518                         record->leaf.data_len = bytes;
2519                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2520                         bcopy(ap->a_target, record->data->symlink.name, bytes);
2521                         error = hammer_ip_add_record(&trans, record);
2522                 }
2523
2524                 /*
2525                  * Set the file size to the length of the link.
2526                  */
2527                 if (error == 0) {
2528                         nip->ino_data.size = bytes;
2529                         hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2530                 }
2531         }
2532         if (error == 0)
2533                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2534                                                 nch->ncp->nc_nlen, nip);
2535
2536         /*
2537          * Finish up.
2538          */
2539         if (error) {
2540                 hammer_rel_inode(nip, 0);
2541                 *ap->a_vpp = NULL;
2542         } else {
2543                 error = hammer_get_vnode(nip, ap->a_vpp);
2544                 hammer_rel_inode(nip, 0);
2545                 if (error == 0) {
2546                         cache_setunresolved(ap->a_nch);
2547                         cache_setvp(ap->a_nch, *ap->a_vpp);
2548                         hammer_knote(ap->a_dvp, NOTE_WRITE);
2549                 }
2550         }
2551         hammer_done_transaction(&trans);
2552         lwkt_reltoken(&hmp->fs_token);
2553         return (error);
2554 }
2555
2556 /*
2557  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2558  */
2559 static
2560 int
2561 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2562 {
2563         struct hammer_transaction trans;
2564         struct hammer_inode *dip;
2565         hammer_mount_t hmp;
2566         int error;
2567
2568         dip = VTOI(ap->a_dvp);
2569         hmp = dip->hmp;
2570
2571         if (hammer_nohistory(dip) == 0 &&
2572             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2573                 return (error);
2574         }
2575
2576         lwkt_gettoken(&hmp->fs_token);
2577         hammer_start_transaction(&trans, hmp);
2578         ++hammer_stats_file_iopsw;
2579         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2580                                 ap->a_cred, ap->a_flags, -1);
2581         hammer_done_transaction(&trans);
2582         lwkt_reltoken(&hmp->fs_token);
2583
2584         return (error);
2585 }
2586
2587 /*
2588  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2589  */
2590 static
2591 int
2592 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2593 {
2594         struct hammer_inode *ip = ap->a_vp->v_data;
2595         hammer_mount_t hmp = ip->hmp;
2596         int error;
2597
2598         ++hammer_stats_file_iopsr;
2599         lwkt_gettoken(&hmp->fs_token);
2600         error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2601                              ap->a_fflag, ap->a_cred);
2602         lwkt_reltoken(&hmp->fs_token);
2603         return (error);
2604 }
2605
2606 static
2607 int
2608 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2609 {
2610         static const struct mountctl_opt extraopt[] = {
2611                 { HMNT_NOHISTORY,       "nohistory" },
2612                 { HMNT_MASTERID,        "master" },
2613                 { 0, NULL}
2615         };
2616         struct hammer_mount *hmp;
2617         struct mount *mp;
2618         int usedbytes;
2619         int error;
2620
2621         error = 0;
2622         usedbytes = 0;
2623         mp = ap->a_head.a_ops->head.vv_mount;
2624         KKASSERT(mp->mnt_data != NULL);
2625         hmp = (struct hammer_mount *)mp->mnt_data;
2626
2627         lwkt_gettoken(&hmp->fs_token);
2628
2629         switch(ap->a_op) {
2630         case MOUNTCTL_SET_EXPORT:
2631                 if (ap->a_ctllen != sizeof(struct export_args))
2632                         error = EINVAL;
2633                 else
2634                         error = hammer_vfs_export(mp, ap->a_op,
2635                                       (const struct export_args *)ap->a_ctl);
2636                 break;
2637         case MOUNTCTL_MOUNTFLAGS:
2638         {
2639                 /*
2640                  * Call standard mountctl VOP function
2641                  * so we get user mount flags.
2642                  */
2643                 error = vop_stdmountctl(ap);
2644                 if (error)
2645                         break;
2646
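                     /*
                      * Append HAMMER-specific flag strings (extraopt)
                      * after the standard flags filled in above.
                      */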
2647                 usedbytes = *ap->a_res;
2648
2649                 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2650                         usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2651                                                     ap->a_buf,
2652                                                     ap->a_buflen - usedbytes,
2653                                                     &error);
2654                 }
2655
2656                 *ap->a_res += usedbytes;
2657                 break;
2658         }
2659         default:
2660                 error = vop_stdmountctl(ap);
2661                 break;
2662         }
2663         lwkt_reltoken(&hmp->fs_token);
2664         return(error);
2665 }

/*
 * hammer_vop_strategy { vp, bio }
 *
 * Strategy call, used for regular file read & write only.  Note that the
 * bp may represent a cluster.
 *
 * To simplify operation and allow better optimizations in the future,
 * this code does not make any assumptions with regards to buffer alignment
 * or size.
 */
static
int
hammer_vop_strategy(struct vop_strategy_args *ap)
{
	struct buf *bp;
	int error;

	bp = ap->a_bio->bio_buf;

	switch(bp->b_cmd) {
	case BUF_CMD_READ:
		error = hammer_vop_strategy_read(ap);
		break;
	case BUF_CMD_WRITE:
		error = hammer_vop_strategy_write(ap);
		break;
	default:
		bp->b_error = error = EINVAL;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		break;
	}

	/* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */

	return (error);
}

/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_inode *dip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;
	int roff;
	int n;
	int isdedupable;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 *
	 * In cases where a lot of data duplication is present it may be
	 * more beneficial to drop through and double-buffer through the
	 * device.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		if (hammer_double_buffer == 0) {
			lwkt_gettoken(&hmp->fs_token);
			error = hammer_io_direct_read(hmp, nbio, NULL);
			lwkt_reltoken(&hmp->fs_token);
			return (error);
		}

		/*
		 * Try to shortcut requests for double_buffer mode too.
		 * Since this mode runs through the device buffer cache,
		 * only compatible buffer sizes (meaning those generated
		 * by normal filesystem buffers) are legal.
		 */
		if (hammer_live_dedup == 0 && (bp->b_flags & B_PAGING) == 0) {
			error = hammer_io_indirect_read(hmp, nbio, NULL);
			return (error);
		}
	}
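	/*
	 * Note on the zone test above: the high bits of a 64-bit HAMMER
	 * offset encode the zone id, so a single mask-and-compare detects
	 * a large-data address left behind by a prior BMAP.  NOOFFSET is
	 * all 1's and can never match, so an unset bio_offset simply
	 * falls through to the B-Tree scan below.
	 */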

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
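	/*
	 * Example: a 16KB data record covering file offsets [0, 16384)
	 * carries key 16384 (BASE+LEN).  A record keyed exactly at
	 * bio_offset ends at bio_offset and holds no data for it, hence
	 * the +1 when seeding key_beg.key below.
	 */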
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
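	/*
	 * Note on the tmp64 test above: if ran_end is close enough to the
	 * top of the positive 64-bit range that adding MAXPHYS + 1 wraps,
	 * the end key is clamped to the maximum positive key instead of
	 * going negative and truncating the scan.
	 */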

	/*
	 * Set NOSWAPCACHE for cursor data extraction if double buffering
	 * is disabled, or if the file is not marked cacheable via chflags
	 * and vm.swapcache_use_chflags is enabled.
	 */
	if (hammer_double_buffer == 0 ||
	    ((ap->a_vp->v_flag & VSWAPCACHE) == 0 &&
	     vm_swapcache_use_chflags)) {
		cursor.flags |= HAMMER_CURSOR_NOSWAPCACHE;
	}

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record versus our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Second, the data offset within
		 * the record may exceed the record's size.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}
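		/*
		 * Worked example: if our seek point lies 4096 bytes into
		 * the record, n was -4096 above, so roff becomes 4096 and
		 * the first 4096 bytes of the record's data are skipped;
		 * n is then clamped to the smaller of the remaining record
		 * bytes and the remaining buffer space.
		 */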

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)/* ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}
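		/*
		 * Example: after an ftruncate() to 1000 with a 16KB
		 * on-disk record at rec_offset 0, n is clamped to 1000
		 * here and the remainder of the buffer is zero-filled by
		 * the gap/EOF logic rather than read from stale media.
		 */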

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zeroed past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 *
		 * NOTE: disk_offset is only valid if the cursor data is
		 *	 on-disk.
		 */
		disk_offset = cursor.leaf->data_offset + roff;
		isdedupable = (boff == 0 && n == bp->b_bufsize &&
			       hammer_cursor_ondisk(&cursor) &&
			       ((int)disk_offset & HAMMER_BUFMASK) == 0);

		if (isdedupable && hammer_double_buffer == 0) {
			/*
			 * Direct read case
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
			if (hammer_live_dedup && error == 0)
				hammer_dedup_cache_add(ip, cursor.leaf);
			goto done;
		} else if (isdedupable) {
			/*
			 * Async I/O case for reading from backing store
			 * and copying the data to the filesystem buffer.
			 * live-dedup has to verify the data anyway if it
			 * gets a hit later, so we can just add the entry
			 * now.
			 */
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
			error = hammer_io_indirect_read(hmp, nbio, cursor.leaf);
			goto done;
		} else if (n) {
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				if (hammer_live_dedup && isdedupable)
					hammer_dedup_cache_add(ip, cursor.leaf);
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * We have to be sure that the only elements added to the
		 * dedup cache are those which are already on-media.
		 */
		if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
			hammer_dedup_cache_add(ip, cursor.leaf);

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record.
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled; the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Cleanup
	 */
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	/*
	 * Cache the b-tree node for the last data read in cache[1].
	 *
	 * If we hit the file EOF then also cache the node in the
	 * governing directory's cache[3]; it will be used to initialize
	 * the inode's cache[1] for any inodes looked up via the directory.
	 *
	 * This doesn't reduce disk accesses since the B-Tree chain is
	 * likely cached, but it does reduce cpu overhead when looking
	 * up file offsets for cpdup/tar/cpio style iterations.
	 */
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	if (ran_end >= ip->ino_data.size) {
		dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
					ip->obj_asof, ip->obj_localization);
		if (dip) {
			hammer_cache_node(&dip->cache[3], cursor.node);
			hammer_rel_inode(dip, 0);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * BMAP operation - used to support cluster_read() only.
 *
 * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
 *
 * This routine may return EOPNOTSUPP if the operation is not supported for
 * the specified offset.  The contents of the pointer arguments do not
 * need to be initialized in that case.
 *
 * If a disk address is available and properly aligned return 0 with
 * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
 * to the run-length relative to that offset.  Callers assume *doffsetp is
 * valid whenever 0 is returned and do not re-check the run length, so
 * return EOPNOTSUPP rather than 0 if the run is not sufficiently large.
 */
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int64_t base_offset;
	int64_t base_disk_offset;
	int64_t last_offset;
	hammer_off_t last_disk_offset;
	hammer_off_t disk_offset;
	int	rec_len;
	int	error;
	int	blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	lwkt_gettoken(&hmp->fs_token);
	hammer_simple_transaction(&trans, hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n",
		(long long)ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing the requested offset will have a key
	 * greater than that offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len    = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block, reset base_offset unless we are already beyond
		 * the requested offset.  If we are, that's it, we stop.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;

			if (hammer_live_dedup)
				hammer_dedup_cache_add(ip, cursor.leaf);
		}

		error = hammer_ip_next(&cursor);
	}
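	/*
	 * Example of the accumulation above: records that are adjacent in
	 * both file space and disk space simply extend the current run
	 * (last_offset/last_disk_offset advance).  A jump in either offset
	 * restarts the run at the new record, unless that record already
	 * lies past a_loffset, in which case the scan stops.
	 */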

#if 0
	kprintf("BMAP %016llx:  %016llx - %016llx\n",
		(long long)ap->a_loffset,
		(long long)base_offset,
		(long long)last_offset);
	kprintf("BMAP %16s:  %016llx - %016llx\n", "",
		(long long)base_disk_offset,
		(long long)last_disk_offset);
#endif

	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n",
			(long long)ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	lwkt_reltoken(&hmp->fs_token);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}
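	/*
	 * Background (assumed layout): HAMMER uses a small buffer size
	 * near the front of a file and a larger one past a demarcation
	 * point, so base_offset/last_offset may straddle a block-size
	 * boundary.  hammer_blockdemarc() trims the run to offsets
	 * compatible with blksize so cluster_read() never constructs a
	 * mis-sized buffer.
	 */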

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}

/*
 * Write to a regular file.  Because this is a strategy call, the OS is
 * trying to actually get data onto the media.
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize __debugvar;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	lwkt_gettoken(&hmp->fs_token);

	/*
	 * Disallow swapcache operation on the vnode buffer if double
	 * buffering is enabled; the swapcache will get the data via
	 * the block device buffer.
	 */
	if (hammer_double_buffer)
		bp->b_flags |= B_NOTMETA;

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		lwkt_reltoken(&hmp->fs_token);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end.
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffer's
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;
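	/*
	 * Example of the small-file case: a 100 byte file written at
	 * bio_offset 0 reserves (100 + 15) & ~15 = 112 bytes, i.e. a
	 * 16-byte aligned fragment instead of a full buffer.
	 */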

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);

	/*
	 * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
	 * in hammer_vop_write().  We must flag the record so the proper
	 * REDO_TERM_WRITE entry is generated during the flush.
	 */
	if (record) {
		if (bp->b_flags & B_VFSFLAG1) {
			record->flags |= HAMMER_RECF_REDO;
			bp->b_flags &= ~B_VFSFLAG1;
		}
		if (record->flags & HAMMER_RECF_DEDUPED) {
			bp->b_resid = 0;
			hammer_ip_replace_bulk(hmp, record);
			biodone(ap->a_bio);
		} else {
			hammer_io_direct_write(hmp, bio, record);
		}
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	lwkt_reltoken(&hmp->fs_token);
	return(error);
}

/*
 * dounlink - disconnect a directory entry
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred,
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_mount_t hmp;
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;
	hmp = dip->hmp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
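	/*
	 * Collision note: two names can hash to the same namekey base, in
	 * which case the colliding entry is stored at a nearby key within
	 * max_iterations.  This is why the scan below compares the actual
	 * name bytes rather than trusting the first record in the chain.
	 */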

	/*
	 * Scan all matching records (the chain) and locate the one matching
	 * the requested path component.  The error code on search
	 * termination may be 0, ENOENT, or something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Removing "
				"dirent w/missing inode \"%s\"\n"
				"\tobj_id = %016llx\n",
				ncp->nc_name,
				(long long)cursor.data->entry.obj_id);
			error = 0;
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0 && ip) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * The check directory code can loop and deadlock/retry.  Our
		 * own cursor's node locks must be released to avoid a 3-way
		 * deadlock with the flusher if the check directory code
		 * blocks.
		 *
		 * If any changes whatsoever have been made to the cursor
		 * set EDEADLK and retry.
		 *
		 * WARNING: See warnings in hammer_unlock_cursor()
		 *	    function.
		 */
		if (error == 0 && ip && ip->ino_data.obj_type ==
					HAMMER_OBJTYPE_DIRECTORY) {
			hammer_unlock_cursor(&cursor);
			error = hammer_ip_check_directory_empty(trans, ip);
			hammer_lock_cursor(&cursor);
			if (cursor.flags & HAMMER_CURSOR_RETEST) {
				kprintf("HAMMER: Warning: avoided deadlock "
					"on rmdir '%s'\n",
					ncp->nc_name);
				error = EDEADLK;
			}
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/*
			 * Tell the namecache that we are now unlinked.
			 */
			cache_unlink(nch);

			/*
			 * NOTE: ip->vp, if non-NULL, cannot be directly
			 *	 referenced without formally acquiring the
			 *	 vp since the vp might have zero refs on it,
			 *	 or in the middle of a reclaim, etc.
			 *
			 * NOTE: The cache_setunresolved() can rip the vp
			 *	 out from under us since the vp may not have
			 *	 any refs, in which case ip->vp will be NULL
			 *	 from the outset.
			 */
			while (ip && ip->vp) {
				struct vnode *vp;

				error = hammer_get_vnode(ip, &vp);
				if (error == 0 && vp) {
					vn_unlock(vp);
					hammer_knote(ip->vp, NOTE_DELETE);
					cache_inval_vp(ip->vp, CINV_DESTROY);
					vrele(vp);
					break;
				}
				kprintf("Debug: HAMMER ip/vp race1 avoided\n");
			}
		}
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	if (error == EDEADLK)
		goto retry;

	return (error);
}

/************************************************************************
 *			    FIFO AND SPECFS OPS				*
 ************************************************************************
 *
 */
static int
hammer_vop_fifoclose (struct vop_close_args *ap)
{
	/* XXX update itimes */
	return (VOCALL(&fifo_vnode_vops, &ap->a_head));
}

static int
hammer_vop_fiforead (struct vop_read_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

static int
hammer_vop_fifowrite (struct vop_write_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	/* XXX update access time */
	return (error);
}

static
int
hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
{
	int error;

	error = VOCALL(&fifo_vnode_vops, &ap->a_head);
	if (error)
		error = hammer_vop_kqfilter(ap);
	return(error);
}

/************************************************************************
 *			    KQFILTER OPS				*
 ************************************************************************
 *
 */
static void filt_hammerdetach(struct knote *kn);
static int filt_hammerread(struct knote *kn, long hint);
static int filt_hammerwrite(struct knote *kn, long hint);
static int filt_hammervnode(struct knote *kn, long hint);

static struct filterops hammerread_filtops =
	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
static struct filterops hammerwrite_filtops =
	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
static struct filterops hammervnode_filtops =
	{ FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };

static
int
hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
{
	struct vnode *vp = ap->a_vp;
	struct knote *kn = ap->a_kn;

	switch (kn->kn_filter) {
	case EVFILT_READ:
		kn->kn_fop = &hammerread_filtops;
		break;
	case EVFILT_WRITE:
		kn->kn_fop = &hammerwrite_filtops;
		break;
	case EVFILT_VNODE:
		kn->kn_fop = &hammervnode_filtops;
		break;
	default:
		return (EOPNOTSUPP);
	}

	kn->kn_hook = (caddr_t)vp;

	knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);

	return(0);
}

static void
filt_hammerdetach(struct knote *kn)
{
	struct vnode *vp = (void *)kn->kn_hook;

	knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
}

static int
filt_hammerread(struct knote *kn, long hint)
{
	struct vnode *vp = (void *)kn->kn_hook;
	hammer_inode_t ip = VTOI(vp);
	hammer_mount_t hmp = ip->hmp;
	off_t off;

	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
		return(1);
	}
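	/*
	 * Readable bytes = file size minus the descriptor's current file
	 * offset.  The clamp below presumably exists because kn_data is
	 * pointer-sized and the 64-bit difference may not fit on 32-bit
	 * platforms.
	 */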
	lwkt_gettoken(&hmp->fs_token);	/* XXX use per-ip-token */
	off = ip->ino_data.size - kn->kn_fp->f_offset;
	kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
	lwkt_reltoken(&hmp->fs_token);
	if (kn->kn_sfflags & NOTE_OLDAPI)
		return(1);
	return (kn->kn_data != 0);
}

static int
filt_hammerwrite(struct knote *kn, long hint)
{
	if (hint == NOTE_REVOKE)
		kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
	kn->kn_data = 0;
	return (1);
}

static int
filt_hammervnode(struct knote *kn, long hint)
{
	if (kn->kn_sfflags & hint)
		kn->kn_fflags |= hint;
	if (hint == NOTE_REVOKE) {
		kn->kn_flags |= (EV_EOF | EV_NODATA);
		return (1);
	}
	return (kn->kn_fflags != 0);
}