HAMMER - Fix long stalls when writing out core files
sys/vfs/hammer/hammer_vnops.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         vop_stdpathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_markatime =        hammer_vop_markatime,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl,
        .vop_kqfilter =         hammer_vop_kqfilter
};

struct vop_ops hammer_spec_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             vop_stdnoread,
        .vop_write =            vop_stdnowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_close,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr,
        .vop_kqfilter =         hammer_vop_fifokqfilter
};

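/*
 * Post a kqueue event against the vnode's klist.  A zero flags value
 * is a no-op, allowing callers to pass an accumulated kflags mask
 * unconditionally.
 */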
static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
        if (flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred,
                           int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred
 * after return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *       a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *       operation.
 *
 *       Ultimately the combination of a REDO log and use of fast storage
 *       to front-end cluster caches will make fsync fast, but it isn't
 *       here yet.  And, in any case, we need real transactional
 *       all-or-nothing features which are not restricted to a single file.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);
        hammer_mount_t hmp = ip->hmp;
        int waitfor = ap->a_waitfor;
        int mode;

        lwkt_gettoken(&hmp->fs_token);

        /*
         * Fsync rule relaxation (default is either full synchronous flush
         * or REDO semantics with synchronous flush).
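         *
         * hammer_fsync_mode summary (mirrors the cases below):
         *   0 - no REDO, full synchronous flush
         *   1 - no REDO, full asynchronous flush
         *   2 - REDO semantics with synchronous flush
         *   3 - REDO semantics with relaxed asynchronous flush
         *   4 - ignore the fsync() system call entirely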
         */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
                switch(hammer_fsync_mode) {
                case 0:
mode0:
                        /* no REDO, full synchronous flush */
                        goto skip;
                case 1:
mode1:
                        /* no REDO, full asynchronous flush */
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        goto skip;
                case 2:
                        /* REDO semantics, synchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode0;
                        mode = HAMMER_FLUSH_UNDOS_AUTO;
                        break;
                case 3:
                        /* REDO semantics, relaxed asynchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode1;
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                case 4:
                        /* ignore the fsync() system call */
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                default:
                        /* we have to do something */
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                }

                /*
                 * Fast fsync only needs to flush the UNDO/REDO fifo if
                 * HAMMER_INODE_REDO is non-zero and the only modifications
                 * made to the file are write or write-extends.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) &&
                    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
                ) {
                        ++hammer_count_fsyncs;
                        hammer_flusher_flush_undos(hmp, mode);
                        ip->redo_count = 0;
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                }

                /*
                 * REDO is enabled by fsync(), the idea being we really only
                 * want to lay down REDO records when programs are using
                 * fsync() heavily.  The first fsync() on the file starts
                 * the gravy train going and later fsync()s keep it hot by
                 * resetting the redo_count.
                 *
                 * We weren't running REDOs before now so we have to fall
                 * through and do a full fsync of what we have.
                 */
                if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
                    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
                        ip->flags |= HAMMER_INODE_REDO;
                        ip->redo_count = 0;
                }
        }
skip:

        /*
         * Do a full flush sequence.
         */
        ++hammer_count_fsyncs;
        vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (waitfor == MNT_WAIT) {
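                /*
                 * Drop the vnode lock while waiting so the flusher and
                 * other frontend operations are not stalled against this
                 * vnode for the duration of the flush.
                 */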
                vn_unlock(ap->a_vp);
                hammer_wait_inode(ip);
                vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (reads satisfied from the buffer cache do not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        hammer_mount_t hmp;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;
        int ioseqcount;
        int blksize;
        int bigread;
        int got_fstoken;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        uio = ap->a_uio;

        /*
         * Allow the UIO's size to override the sequential heuristic.
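         * The descriptor layer passes its own read-ahead hint in the
         * upper 16 bits of a_ioflag; use whichever count is larger.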
         */
        blksize = hammer_blocksize(uio->uio_offset);
        seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
        ioseqcount = (ap->a_ioflag >> 16);
        if (seqcount < ioseqcount)
                seqcount = ioseqcount;

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         */
        bigread = (uio->uio_resid > 100 * 1024 * 1024);
        got_fstoken = 0;

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
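         * The block size changes at HAMMER_XDEMARC; the file_limit clamp
         * below keeps cluster_read() from creating buffers of the wrong
         * size across that boundary.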
         *
         * XXX Temporary hack, delay the start transaction while we remain
         *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
         *     locked-shared.
         */
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                int64_t base_offset;
                int64_t file_limit;

                blksize = hammer_blocksize(uio->uio_offset);
                offset = (int)uio->uio_offset & (blksize - 1);
                base_offset = uio->uio_offset - offset;

                if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
                        break;

                /*
                 * MPSAFE
                 */
                bp = getcacheblk(ap->a_vp, base_offset, blksize);
                if (bp) {
                        error = 0;
                        goto skip;
                }

                /*
                 * MPUNSAFE
                 */
                if (got_fstoken == 0) {
                        lwkt_gettoken(&hmp->fs_token);
                        got_fstoken = 1;
                        hammer_start_transaction(&trans, ip->hmp);
                }

                if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
                         * creating buffers of the wrong block size past
                         * the demarc.
                         */
                        file_limit = ip->ino_data.size;
                        if (base_offset < HAMMER_XDEMARC &&
                            file_limit > HAMMER_XDEMARC) {
                                file_limit = HAMMER_XDEMARC;
                        }
                        error = cluster_read(ap->a_vp,
                                             file_limit, base_offset,
                                             blksize, uio->uio_resid,
                                             seqcount * BKVASIZE, &bp);
                } else {
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                }
                if (error) {
                        brelse(bp);
                        break;
                }
skip:
                if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
                        kprintf("doff %016jx read file %016jx@%016jx\n",
                                (intmax_t)bp->b_bio2.bio_offset,
                                (intmax_t)ip->obj_id,
                                (intmax_t)bp->b_loffset);
                }
                bp->b_flags &= ~B_IODEBUG;

                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                if (got_fstoken)
                        lwkt_reltoken(&hmp->fs_token);

                /*
                 * Set B_AGE, data has a lower priority than meta-data.
                 *
                 * Use a hold/unlock/drop sequence to run the uiomove
                 * with the buffer unlocked, avoiding deadlocks against
                 * read()s on mmap()'d spaces.
                 */
                bp->b_flags |= B_AGE;
                bqhold(bp);
                bqrelse(bp);
                error = uiomove((char *)bp->b_data + offset, n, uio);
                bqdrop(bp);

                if (got_fstoken)
                        lwkt_gettoken(&hmp->fs_token);

                if (error)
                        break;
                hammer_stats_file_read += n;
        }

        /*
         * XXX only update the atime if we had to get the MP lock.
         * XXX hack hack hack, fixme.
         */
        if (got_fstoken) {
                if ((ip->flags & HAMMER_INODE_RO) == 0 &&
                    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                        ip->ino_data.atime = trans.time;
                        hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
                }
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
        }
        return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        struct uio *uio;
        int offset;
        off_t base_offset;
        struct buf *bp;
        int kflags;
        int error;
        int n;
        int flags;
        int seqcount;
        int bigwrite;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        kflags = 0;
        seqcount = ap->a_ioflag >> 16;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_off assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
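        /*
         * A wrapped sum (base_offset <= uio_offset with resid > 0) means
         * the write would extend past 2^63-1.
         */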
        if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         *
         * Preset redo_count so we stop generating REDOs earlier if the
         * limit is exceeded.
         */
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
        if ((ip->flags & HAMMER_INODE_REDO) &&
            ip->redo_count < hammer_limit_redo) {
                ip->redo_count += uio->uio_resid;
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;
                int blksize;
                int blkmask;
                int trivial;
                int endofblk;
                off_t nsize;

                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;
                if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
                        break;

                blksize = hammer_blocksize(uio->uio_offset);

                /*
                 * Do not allow HAMMER to blow out the buffer cache.  Very
                 * large UIOs can lockout other processes due to bwillwrite()
                 * mechanics.
                 *
                 * The hammer inode is not locked during these operations.
                 * The vnode is locked which can interfere with the pageout
                 * daemon for non-UIO_NOCOPY writes but should not interfere
                 * with the buffer cache.  Even so, we cannot afford to
                 * allow the pageout daemon to build up too many dirty buffer
                 * cache buffers.
                 *
                 * Only call this if we aren't being recursively called from
                 * a virtual disk device (vn), else we may deadlock.
                 */
                if ((ap->a_ioflag & IO_RECURSE) == 0)
                        bwillwrite(blksize);

                /*
                 * Control the number of pending records associated with
                 * this inode.  If too many have accumulated start a
                 * flush.  Try to maintain a pipeline with the flusher.
                 *
                 * NOTE: It is possible for other sources to grow the
                 *       records but not necessarily issue another flush,
                 *       so use a timeout and ensure that a re-flush occurs.
                 */
                if (ip->rsv_recs >= hammer_limit_inode_recs) {
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        while (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
                                ip->flags |= HAMMER_INODE_RECSW;
                                tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        }
                }

#if 0
                /*
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are so well
                 * decoupled from the buffer cache that it is possible
                 * for userland to push data out to the media via
                 * direct-write, but build up the records queued to the
                 * backend faster than the backend can flush them out.
                 * HAMMER has hit its write limit but the frontend has
                 * no pushback to slow it down.
                 */
                if (hmp->rsv_recs > hammer_limit_recs / 2) {
                        /*
                         * Get the inode on the flush list
                         */
                        if (ip->rsv_recs >= 64)
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        else if (ip->rsv_recs >= 16)
                                hammer_flush_inode(ip, 0);

                        /*
                         * Keep the flusher going if the system keeps
                         * queueing records.
                         */
                        delta = hmp->count_newrecords -
                                hmp->last_newrecords;
                        if (delta < 0 || delta > hammer_limit_recs / 2) {
                                hmp->last_newrecords = hmp->count_newrecords;
                                hammer_sync_hmp(hmp, MNT_NOWAIT);
                        }

                        /*
                         * If we have gotten behind start slowing
                         * down the writers.
                         */
                        delta = (hmp->rsv_recs - hammer_limit_recs) *
                                hz / hammer_limit_recs;
                        if (delta > 0)
                                tsleep(&trans, 0, "hmrslo", delta);
                }
#endif

                /*
                 * Calculate the blocksize at the current offset and figure
                 * out how much we can actually write.
                 */
                blkmask = blksize - 1;
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
                        endofblk = 0;
                } else {
                        endofblk = 1;
                }
                nsize = uio->uio_offset + n;
                if (nsize > ip->ino_data.size) {
                        if (uio->uio_offset > ip->ino_data.size)
                                trivial = 0;
                        else
                                trivial = 1;
                        nvextendbuf(ap->a_vp,
                                    ip->ino_data.size,
                                    nsize,
                                    hammer_blocksize(ip->ino_data.size),
                                    hammer_blocksize(nsize),
                                    hammer_blockoff(ip->ino_data.size),
                                    hammer_blockoff(nsize),
                                    trivial);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              blksize, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= blksize) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        lwkt_reltoken(&hmp->fs_token);
                        error = uiomove(bp->b_data + offset, n, uio);
                        lwkt_gettoken(&hmp->fs_token);
                }

                /*
                 * Generate REDO records if enabled and redo_count will not
                 * exceed the limit.
                 *
                 * If redo_count exceeds the limit we stop generating records
                 * and clear HAMMER_INODE_REDO.  This will cause the next
                 * fsync() to do a full meta-data sync instead of just an
                 * UNDO/REDO fifo update.
                 *
                 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
                 * will still be tracked.  The tracks will be terminated
                 * when the related meta-data (including possible data
                 * modifications which are not tracked via REDO) is
                 * flushed.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
                        if (ip->redo_count < hammer_limit_redo) {
                                bp->b_flags |= B_VFSFLAG1;
                                error = hammer_generate_redo(&trans, ip,
                                                     base_offset + offset,
                                                     HAMMER_REDO_WRITE,
                                                     bp->b_data + offset,
                                                     (size_t)n);
                        } else {
                                ip->flags &= ~HAMMER_INODE_REDO;
                        }
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                nvtruncbuf(ap->a_vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size),
                                          hammer_blockoff(ip->ino_data.size));
                        }
                        break;
                }
                kflags |= NOTE_WRITE;
                hammer_stats_file_write += n;
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_SDIRTY;
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(&trans, ip, flags);

                /*
                 * Once we dirty the buffer any cached zone-X offset
                 * becomes invalid.  HAMMER NOTE: no-history mode cannot
                 * allow overwriting over the same data sector unless
                 * we provide UNDOs for the old data, which we don't.
                 */
                bp->b_bio2.bio_offset = NOOFFSET;

                /*
                 * Final buffer disposition.
                 *
                 * Because meta-data updates are deferred, HAMMER is
                 * especially sensitive to excessive bdwrite()s because
                 * the I/O stream is not broken up by disk reads.  So the
                 * buffer cache simply cannot keep up.
                 *
                 * WARNING!  blksize is variable.  cluster_write() is
                 *           expected to not blow up if it encounters
                 *           buffers that do not match the passed blksize.
                 *
                 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
                 *        The ip->rsv_recs check should burst-flush the data.
                 *        If we queue it immediately the buf could be left
                 *        locked on the device queue for a very long time.
                 *
                 * NOTE!  To avoid degenerate stalls due to mismatched block
                 *        sizes we only honor IO_DIRECT on the write which
                 *        abuts the end of the buffer.  However, we must
                 *        honor IO_SYNC in case someone is silly enough to
                 *        configure a HAMMER file as swap, or when HAMMER
                 *        is serving NFS (for commits).  Ick ick.
                 */
                bp->b_flags |= B_AGE;
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else {
#if 0
                if (offset + n == blksize) {
                        if (hammer_cluster_enable == 0 ||
                            (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
                                bawrite(bp);
                        } else {
                                cluster_write(bp, ip->ino_data.size,
                                              blksize, seqcount);
                        }
                } else {
#endif
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        ++hammer_stats_file_iopsr;
        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
        struct vnode *vp = ap->a_vp;
        hammer_inode_t ip = VTOI(vp);
        int waitfor;
        if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
                if (vn_islocked(vp) == LK_EXCLUSIVE &&
                    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
                        if (ip->flags & HAMMER_INODE_CLOSESYNC)
                                waitfor = MNT_WAIT;
                        else
                                waitfor = MNT_NOWAIT;
                        ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
                                       HAMMER_INODE_CLOSEASYNC);
                        VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
                }
        }
#endif
        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        hammer_mount_t hmp;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
                                    NULL, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                lwkt_reltoken(&hmp->fs_token);
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

        /*
         * We want the fsid to be different when accessing a filesystem
         * with different as-of's so programs like diff don't think
         * the files are the same.
         *
         * We also want the fsid to be the same when comparing snapshots,
         * or when comparing mirrors (which might be backed by different
         * physical devices).  HAMMER fsids are based on the PFS's
         * shared_uuid field.
         *
         * XXX there is a chance of collision here.  The va_fsid reported
         * by stat is different from the more involved fsid used in the
         * mount structure.
         */
        ++hammer_stats_file_iopsr;
        hammer_lock_sh(&ip->lock);
        vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
                       (u_int32_t)(ip->obj_asof >> 32);

        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;

        /*
         * Special case for @@PFS softlinks.  The actual size of the
         * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
         * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
         */
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
            ip->ino_data.size == 10 &&
            ip->obj_asof == HAMMER_MAX_TID &&
            ip->obj_localization == 0 &&
            strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
                    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
                            vap->va_size = 26;
                    else
                            vap->va_size = 10;
        }

        /*
         * We must provide a consistent atime and mtime for snapshots
         * so people can do a 'tar cf - ... | md5' on them and get
         * consistent results.
         */
        if (ip->flags & HAMMER_INODE_RO) {
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
        } else {
                hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        }
        hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
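        /*
         * Report byte usage rounded up to the block granularity in
         * effect for the file size: extended-buffer granularity past
         * the demarc, regular buffer granularity for mid-sized files,
         * and 16-byte granularity for very small files.
         */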
        if (ip->ino_data.size >= HAMMER_XDEMARC) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
                                ~HAMMER_XBUFMASK64;
        } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
                                ~HAMMER_BUFMASK64;
        } else {
                vap->va_bytes = (ip->ino_data.size + 15) & ~15;
        }

        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }
        hammer_unlock(&ip->lock);
        return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_mount_t hmp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        int ispfs;
        int64_t obj_id;
        u_int32_t localization;
        u_int32_t max_iterations;

        /*
         * Misc initialization, plus handle as-of name extensions.  Look for
         * the '@@' extension.  Note that as-of files and directories cannot
         * be modified.
         */
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        localization = dip->obj_localization;   /* for code consistency */
        nlen = ncp->nc_nlen;
        flags = dip->flags & HAMMER_INODE_RO;
        ispfs = 0;
        hmp = dip->hmp;

        lwkt_gettoken(&hmp->fs_token);
        hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;

        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        error = hammer_str_to_tid(ncp->nc_name + i + 2,
                                                  &ispfs, &asof, &localization);
                        if (error != 0) {
                                i = nlen;
                                break;
                        }
                        if (asof != HAMMER_MAX_TID)
                                flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;
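        /* nlen now covers only the name portion, excluding any @@ extension */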

        /*
         * If this is a PFS softlink we dive into the PFS
         */
        if (ispfs && nlen == 0) {
                ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
                                      asof, localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * If there is no path component the time extension is relative to dip.
         * e.g. "fubar/@@<snapshot>"
         *
         * "." is handled by the kernel, but ".@@<snapshot>" is not.
         * e.g. "fubar/.@@<snapshot>"
         *
         * ".." is handled by the kernel.  We do not currently handle
         * "..@@<snapshot>".
         */
        if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
                ip = hammer_get_inode(&trans, dip, dip->obj_id,
                                      asof, dip->obj_localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
                                           &max_iterations);

        error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
        cursor.key_beg.localization = dip->obj_localization +
                                      hammer_dir_localization(dip);
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key += max_iterations;
        cursor.asof = asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        obj_id = 0;
        localization = HAMMER_DEF_LOCALIZATION;

        if (error == 0) {
                error = hammer_ip_first(&cursor);
                while (error == 0) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error)
                                break;
                        if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
                            bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                                obj_id = cursor.data->entry.obj_id;
                                localization = cursor.data->entry.localization;
                                break;
                        }
                        error = hammer_ip_next(&cursor);
                }
        }
        hammer_done_cursor(&cursor);

        /*
         * Lookup the obj_id.  This should always succeed.  If it does not
         * the filesystem may be damaged and we return a dummy inode.
         */
        if (error == 0) {
                ip = hammer_get_inode(&trans, dip, obj_id,
                                      asof, localization,
                                      flags, &error);
                if (error == ENOENT) {
                        kprintf("HAMMER: WARNING: Missing "
                                "inode for dirent \"%s\"\n"
                                "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
                                ncp->nc_name,
                                (long long)obj_id, (long long)asof,
                                localization);
                        error = 0;
                        ip = hammer_get_dummy_inode(&trans, dip, obj_id,
                                                    asof, localization,
                                                    flags, &error);
                }
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
        } else if (error == ENOENT) {
                cache_setvp(ap->a_nch, NULL);
        }
done:
        hammer_done_transaction(&trans);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
1302 static
1303 int
1304 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1305 {
1306         struct hammer_transaction trans;
1307         struct hammer_inode *dip;
1308         struct hammer_inode *ip;
1309         hammer_mount_t hmp;
1310         int64_t parent_obj_id;
1311         u_int32_t parent_obj_localization;
1312         hammer_tid_t asof;
1313         int error;
1314
1315         dip = VTOI(ap->a_dvp);
1316         asof = dip->obj_asof;
1317         hmp = dip->hmp;
1318
1319         /*
1320          * Who is our parent?  This could be the root of a pseudo-filesystem
1321          * whose parent is in another localization domain.
1322          */
1323         lwkt_gettoken(&hmp->fs_token);
1324         parent_obj_id = dip->ino_data.parent_obj_id;
1325         if (dip->obj_id == HAMMER_OBJID_ROOT)
1326                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1327         else
1328                 parent_obj_localization = dip->obj_localization;
1329
1330         if (parent_obj_id == 0) {
1331                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
1332                    asof != hmp->asof) {
1333                         parent_obj_id = dip->obj_id;
1334                         asof = hmp->asof;
1335                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1336                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
1337                                   (long long)dip->obj_asof);
1338                 } else {
1339                         *ap->a_vpp = NULL;
1340                         lwkt_reltoken(&hmp->fs_token);
1341                         return ENOENT;
1342                 }
1343         }
1344
1345         hammer_simple_transaction(&trans, hmp);
1346         ++hammer_stats_file_iopsr;
1347
1348         ip = hammer_get_inode(&trans, dip, parent_obj_id,
1349                               asof, parent_obj_localization,
1350                               dip->flags, &error);
1351         if (ip) {
1352                 error = hammer_get_vnode(ip, ap->a_vpp);
1353                 hammer_rel_inode(ip, 0);
1354         } else {
1355                 *ap->a_vpp = NULL;
1356         }
1357         hammer_done_transaction(&trans);
1358         lwkt_reltoken(&hmp->fs_token);
1359         return (error);
1360 }
1361
1362 /*
1363  * hammer_vop_nlink { nch, dvp, vp, cred }
1364  */
1365 static
1366 int
1367 hammer_vop_nlink(struct vop_nlink_args *ap)
1368 {
1369         struct hammer_transaction trans;
1370         struct hammer_inode *dip;
1371         struct hammer_inode *ip;
1372         struct nchandle *nch;
1373         hammer_mount_t hmp;
1374         int error;
1375
1376         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1377                 return(EXDEV);
1378
1379         nch = ap->a_nch;
1380         dip = VTOI(ap->a_dvp);
1381         ip = VTOI(ap->a_vp);
1382         hmp = dip->hmp;
1383
1384         if (dip->obj_localization != ip->obj_localization)
1385                 return(EXDEV);
1386
1387         if (dip->flags & HAMMER_INODE_RO)
1388                 return (EROFS);
1389         if (ip->flags & HAMMER_INODE_RO)
1390                 return (EROFS);
1391         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1392                 return (error);
1393
1394         /*
1395          * Create a transaction to cover the operations we perform.
1396          */
1397         lwkt_gettoken(&hmp->fs_token);
1398         hammer_start_transaction(&trans, hmp);
1399         ++hammer_stats_file_iopsw;
1400
1401         /*
1402          * Add the filesystem object to the directory.  Note that neither
1403          * dip nor ip is referenced or locked, but their vnodes are
1404          * referenced.  This function will bump the inode's link count.
1405          */
1406         error = hammer_ip_add_directory(&trans, dip,
1407                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1408                                         ip);
1409
1410         /*
1411          * Finish up.
1412          */
1413         if (error == 0) {
1414                 cache_setunresolved(nch);
1415                 cache_setvp(nch, ap->a_vp);
1416         }
1417         hammer_done_transaction(&trans);
1418         hammer_knote(ap->a_vp, NOTE_LINK);
1419         hammer_knote(ap->a_dvp, NOTE_WRITE);
1420         lwkt_reltoken(&hmp->fs_token);
1421         return (error);
1422 }
1423
1424 /*
1425  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1426  *
1427  * The operating system has already ensured that the directory entry
1428  * does not exist and done all appropriate namespace locking.
1429  */
1430 static
1431 int
1432 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1433 {
1434         struct hammer_transaction trans;
1435         struct hammer_inode *dip;
1436         struct hammer_inode *nip;
1437         struct nchandle *nch;
1438         hammer_mount_t hmp;
1439         int error;
1440
1441         nch = ap->a_nch;
1442         dip = VTOI(ap->a_dvp);
1443         hmp = dip->hmp;
1444
1445         if (dip->flags & HAMMER_INODE_RO)
1446                 return (EROFS);
1447         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1448                 return (error);
1449
1450         /*
1451          * Create a transaction to cover the operations we perform.
1452          */
1453         lwkt_gettoken(&hmp->fs_token);
1454         hammer_start_transaction(&trans, hmp);
1455         ++hammer_stats_file_iopsw;
1456
1457         /*
1458          * Create a new filesystem object of the requested type.  The
1459          * returned inode will be referenced but not locked.
1460          */
1461         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1462                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1463                                     NULL, &nip);
1464         if (error) {
1465                 hkprintf("hammer_mkdir error %d\n", error);
1466                 hammer_done_transaction(&trans);
1467                 *ap->a_vpp = NULL;
1468                 lwkt_reltoken(&hmp->fs_token);
1469                 return (error);
1470         }
1471         /*
1472          * Add the new filesystem object to the directory.  This will also
1473          * bump the inode's link count.
1474          */
1475         error = hammer_ip_add_directory(&trans, dip,
1476                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1477                                         nip);
1478         if (error)
1479                 hkprintf("hammer_mkdir (add) error %d\n", error);
1480
1481         /*
1482          * Finish up.
1483          */
1484         if (error) {
1485                 hammer_rel_inode(nip, 0);
1486                 *ap->a_vpp = NULL;
1487         } else {
1488                 error = hammer_get_vnode(nip, ap->a_vpp);
1489                 hammer_rel_inode(nip, 0);
1490                 if (error == 0) {
1491                         cache_setunresolved(ap->a_nch);
1492                         cache_setvp(ap->a_nch, *ap->a_vpp);
1493                 }
1494         }
1495         hammer_done_transaction(&trans);
1496         if (error == 0)
1497                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1498         lwkt_reltoken(&hmp->fs_token);
1499         return (error);
1500 }
1501
1502 /*
1503  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1504  *
1505  * The operating system has already ensured that the directory entry
1506  * does not exist and done all appropriate namespace locking.
1507  */
1508 static
1509 int
1510 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1511 {
1512         struct hammer_transaction trans;
1513         struct hammer_inode *dip;
1514         struct hammer_inode *nip;
1515         struct nchandle *nch;
1516         hammer_mount_t hmp;
1517         int error;
1518
1519         nch = ap->a_nch;
1520         dip = VTOI(ap->a_dvp);
1521         hmp = dip->hmp;
1522
1523         if (dip->flags & HAMMER_INODE_RO)
1524                 return (EROFS);
1525         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1526                 return (error);
1527
1528         /*
1529          * Create a transaction to cover the operations we perform.
1530          */
1531         lwkt_gettoken(&hmp->fs_token);
1532         hammer_start_transaction(&trans, hmp);
1533         ++hammer_stats_file_iopsw;
1534
1535         /*
1536          * Create a new filesystem object of the requested type.  The
1537          * returned inode will be referenced but not locked.
1538          *
1539          * If mknod specifies a directory, a pseudo-fs is created.
1540          */
1541         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1542                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1543                                     NULL, &nip);
1544         if (error) {
1545                 hammer_done_transaction(&trans);
1546                 *ap->a_vpp = NULL;
1547                 lwkt_reltoken(&hmp->fs_token);
1548                 return (error);
1549         }
1550
1551         /*
1552          * Add the new filesystem object to the directory.  This will also
1553          * bump the inode's link count.
1554          */
1555         error = hammer_ip_add_directory(&trans, dip,
1556                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1557                                         nip);
1558
1559         /*
1560          * Finish up.
1561          */
1562         if (error) {
1563                 hammer_rel_inode(nip, 0);
1564                 *ap->a_vpp = NULL;
1565         } else {
1566                 error = hammer_get_vnode(nip, ap->a_vpp);
1567                 hammer_rel_inode(nip, 0);
1568                 if (error == 0) {
1569                         cache_setunresolved(ap->a_nch);
1570                         cache_setvp(ap->a_nch, *ap->a_vpp);
1571                 }
1572         }
1573         hammer_done_transaction(&trans);
1574         if (error == 0)
1575                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1576         lwkt_reltoken(&hmp->fs_token);
1577         return (error);
1578 }
1579
1580 /*
1581  * hammer_vop_open { vp, mode, cred, fp }
1582  *
1583  * MPSAFE (does not require fs_token)
1584  */
1585 static
1586 int
1587 hammer_vop_open(struct vop_open_args *ap)
1588 {
1589         hammer_inode_t ip;
1590
1591         ++hammer_stats_file_iopsr;
1592         ip = VTOI(ap->a_vp);
1593
1594         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1595                 return (EROFS);
1596         return(vop_stdopen(ap));
1597 }
1598
1599 /*
1600  * hammer_vop_print { vp }
1601  */
1602 static
1603 int
1604 hammer_vop_print(struct vop_print_args *ap)
1605 {
1606         return EOPNOTSUPP;
1607 }
1608
1609 /*
1610  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1611  */
1612 static
1613 int
1614 hammer_vop_readdir(struct vop_readdir_args *ap)
1615 {
1616         struct hammer_transaction trans;
1617         struct hammer_cursor cursor;
1618         struct hammer_inode *ip;
1619         hammer_mount_t hmp;
1620         struct uio *uio;
1621         hammer_base_elm_t base;
1622         int error;
1623         int cookie_index;
1624         int ncookies;
1625         off_t *cookies;
1626         off_t saveoff;
1627         int r;
1628         int dtype;
1629
1630         ++hammer_stats_file_iopsr;
1631         ip = VTOI(ap->a_vp);
1632         uio = ap->a_uio;
1633         saveoff = uio->uio_offset;
1634         hmp = ip->hmp;
1635
1636         if (ap->a_ncookies) {
1637                 ncookies = uio->uio_resid / 16 + 1;
1638                 if (ncookies > 1024)
1639                         ncookies = 1024;
1640                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1641                 cookie_index = 0;
1642         } else {
1643                 ncookies = -1;
1644                 cookies = NULL;
1645                 cookie_index = 0;
1646         }
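             /*
              * Sketch: the uio_resid / 16 estimate above assumes roughly
              * 16 bytes per returned dirent (about the practical minimum
              * per entry) and is capped at 1024 cookies, bounding the
              * kmalloc() at 8KB.
              */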
1647
1648         lwkt_gettoken(&hmp->fs_token);
1649         hammer_simple_transaction(&trans, hmp);
1650
1651         /*
1652          * Handle artificial entries
1653          *
1654          * It should be noted that the minimum value for a directory
1655          * hash key on-media is 0x0000000100000000, so we can use anything
1656          * less than that to represent our 'special' key space.
1657          */
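             /*
              * Illustrative layout of the readdir seek space:
              *
              *      saveoff 0       synthesized "."
              *      saveoff 1       synthesized ".."
              *      saveoff >= 2    real directory hash keys, which start
              *                      at 0x0000000100000000 on-media
              */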
1658         error = 0;
1659         if (saveoff == 0) {
1660                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1661                 if (r)
1662                         goto done;
1663                 if (cookies)
1664                         cookies[cookie_index] = saveoff;
1665                 ++saveoff;
1666                 ++cookie_index;
1667                 if (cookie_index == ncookies)
1668                         goto done;
1669         }
1670         if (saveoff == 1) {
1671                 if (ip->ino_data.parent_obj_id) {
1672                         r = vop_write_dirent(&error, uio,
1673                                              ip->ino_data.parent_obj_id,
1674                                              DT_DIR, 2, "..");
1675                 } else {
1676                         r = vop_write_dirent(&error, uio,
1677                                              ip->obj_id, DT_DIR, 2, "..");
1678                 }
1679                 if (r)
1680                         goto done;
1681                 if (cookies)
1682                         cookies[cookie_index] = saveoff;
1683                 ++saveoff;
1684                 ++cookie_index;
1685                 if (cookie_index == ncookies)
1686                         goto done;
1687         }
1688
1689         /*
1690          * Key range (begin and end inclusive) to scan.  Directory keys
1691          * directly translate to a 64 bit 'seek' position.
1692          */
1693         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1694         cursor.key_beg.localization = ip->obj_localization +
1695                                       hammer_dir_localization(ip);
1696         cursor.key_beg.obj_id = ip->obj_id;
1697         cursor.key_beg.create_tid = 0;
1698         cursor.key_beg.delete_tid = 0;
1699         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1700         cursor.key_beg.obj_type = 0;
1701         cursor.key_beg.key = saveoff;
1702
1703         cursor.key_end = cursor.key_beg;
1704         cursor.key_end.key = HAMMER_MAX_KEY;
1705         cursor.asof = ip->obj_asof;
1706         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1707
1708         error = hammer_ip_first(&cursor);
1709
1710         while (error == 0) {
1711                 error = hammer_ip_resolve_data(&cursor);
1712                 if (error)
1713                         break;
1714                 base = &cursor.leaf->base;
1715                 saveoff = base->key;
1716                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1717
1718                 if (base->obj_id != ip->obj_id)
1719                         panic("readdir: bad record at %p", cursor.node);
1720
1721                 /*
1722                  * Convert pseudo-filesystems into softlinks
1723                  */
1724                 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1725                 r = vop_write_dirent(
1726                              &error, uio, cursor.data->entry.obj_id,
1727                              dtype,
1728                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
1729                              (void *)cursor.data->entry.name);
1730                 if (r)
1731                         break;
1732                 ++saveoff;
1733                 if (cookies)
1734                         cookies[cookie_index] = base->key;
1735                 ++cookie_index;
1736                 if (cookie_index == ncookies)
1737                         break;
1738                 error = hammer_ip_next(&cursor);
1739         }
1740         hammer_done_cursor(&cursor);
1741
1742 done:
1743         hammer_done_transaction(&trans);
1744
1745         if (ap->a_eofflag)
1746                 *ap->a_eofflag = (error == ENOENT);
1747         uio->uio_offset = saveoff;
1748         if (error && cookie_index == 0) {
1749                 if (error == ENOENT)
1750                         error = 0;
1751                 if (cookies) {
1752                         kfree(cookies, M_TEMP);
1753                         *ap->a_ncookies = 0;
1754                         *ap->a_cookies = NULL;
1755                 }
1756         } else {
1757                 if (error == ENOENT)
1758                         error = 0;
1759                 if (cookies) {
1760                         *ap->a_ncookies = cookie_index;
1761                         *ap->a_cookies = cookies;
1762                 }
1763         }
1764         lwkt_reltoken(&hmp->fs_token);
1765         return(error);
1766 }
1767
1768 /*
1769  * hammer_vop_readlink { vp, uio, cred }
1770  */
1771 static
1772 int
1773 hammer_vop_readlink(struct vop_readlink_args *ap)
1774 {
1775         struct hammer_transaction trans;
1776         struct hammer_cursor cursor;
1777         struct hammer_inode *ip;
1778         hammer_mount_t hmp;
1779         char buf[32];
1780         u_int32_t localization;
1781         hammer_pseudofs_inmem_t pfsm;
1782         int error;
1783
1784         ip = VTOI(ap->a_vp);
1785         hmp = ip->hmp;
1786
1787         lwkt_gettoken(&hmp->fs_token);
1788
1789         /*
1790          * Shortcut if the symlink data was stuffed into ino_data.
1791          *
1792          * Also expand special "@@PFS%05d" softlinks (expansion only
1793          * occurs for non-historical (current) accesses made from the
1794          * primary filesystem).
1795          */
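             /*
              * Example (sketch): a link created as "@@PFS00003" reads back
              * as "@@-1:00003" when PFS #3 is a master, or as
              * "@@0x<sync_end_tid>:00003" when it is a slave, so a slave
              * is always entered as-of its last fully synchronized TID.
              */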
1796         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1797                 char *ptr;
1798                 int bytes;
1799
1800                 ptr = ip->ino_data.ext.symlink;
1801                 bytes = (int)ip->ino_data.size;
1802                 if (bytes == 10 &&
1803                     ip->obj_asof == HAMMER_MAX_TID &&
1804                     ip->obj_localization == 0 &&
1805                     strncmp(ptr, "@@PFS", 5) == 0) {
1806                         hammer_simple_transaction(&trans, hmp);
1807                         bcopy(ptr + 5, buf, 5);
1808                         buf[5] = 0;
1809                         localization = strtoul(buf, NULL, 10) << 16;
1810                         pfsm = hammer_load_pseudofs(&trans, localization,
1811                                                     &error);
1812                         if (error == 0) {
1813                                 if (pfsm->pfsd.mirror_flags &
1814                                     HAMMER_PFSD_SLAVE) {
1815                                         /* vap->va_size == 26 */
1816                                         ksnprintf(buf, sizeof(buf),
1817                                                   "@@0x%016llx:%05d",
1818                                                   (long long)pfsm->pfsd.sync_end_tid,
1819                                                   localization >> 16);
1820                                 } else {
1821                                         /* vap->va_size == 10 */
1822                                         ksnprintf(buf, sizeof(buf),
1823                                                   "@@-1:%05d",
1824                                                   localization >> 16);
1825 #if 0
1826                                         ksnprintf(buf, sizeof(buf),
1827                                                   "@@0x%016llx:%05d",
1828                                                   (long long)HAMMER_MAX_TID,
1829                                                   localization >> 16);
1830 #endif
1831                                 }
1832                                 ptr = buf;
1833                                 bytes = strlen(buf);
1834                         }
1835                         if (pfsm)
1836                                 hammer_rel_pseudofs(hmp, pfsm);
1837                         hammer_done_transaction(&trans);
1838                 }
1839                 error = uiomove(ptr, bytes, ap->a_uio);
1840                 lwkt_reltoken(&hmp->fs_token);
1841                 return(error);
1842         }
1843
1844         /*
1845          * Long version
1846          */
1847         hammer_simple_transaction(&trans, hmp);
1848         ++hammer_stats_file_iopsr;
1849         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1850
1851         /*
1852          * Key range (begin and end inclusive) to scan.  Directory keys
1853          * directly translate to a 64 bit 'seek' position.
1854          */
1855         cursor.key_beg.localization = ip->obj_localization +
1856                                       HAMMER_LOCALIZE_MISC;
1857         cursor.key_beg.obj_id = ip->obj_id;
1858         cursor.key_beg.create_tid = 0;
1859         cursor.key_beg.delete_tid = 0;
1860         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1861         cursor.key_beg.obj_type = 0;
1862         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1863         cursor.asof = ip->obj_asof;
1864         cursor.flags |= HAMMER_CURSOR_ASOF;
1865
1866         error = hammer_ip_lookup(&cursor);
1867         if (error == 0) {
1868                 error = hammer_ip_resolve_data(&cursor);
1869                 if (error == 0) {
1870                         KKASSERT(cursor.leaf->data_len >=
1871                                  HAMMER_SYMLINK_NAME_OFF);
1872                         error = uiomove(cursor.data->symlink.name,
1873                                         cursor.leaf->data_len -
1874                                                 HAMMER_SYMLINK_NAME_OFF,
1875                                         ap->a_uio);
1876                 }
1877         }
1878         hammer_done_cursor(&cursor);
1879         hammer_done_transaction(&trans);
1880         lwkt_reltoken(&hmp->fs_token);
1881         return(error);
1882 }
1883
1884 /*
1885  * hammer_vop_nremove { nch, dvp, cred }
1886  */
1887 static
1888 int
1889 hammer_vop_nremove(struct vop_nremove_args *ap)
1890 {
1891         struct hammer_transaction trans;
1892         struct hammer_inode *dip;
1893         hammer_mount_t hmp;
1894         int error;
1895
1896         dip = VTOI(ap->a_dvp);
1897         hmp = dip->hmp;
1898
1899         if (hammer_nohistory(dip) == 0 &&
1900             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1901                 return (error);
1902         }
1903
1904         lwkt_gettoken(&hmp->fs_token);
1905         hammer_start_transaction(&trans, hmp);
1906         ++hammer_stats_file_iopsw;
1907         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1908         hammer_done_transaction(&trans);
1909         if (error == 0)
1910                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1911         lwkt_reltoken(&hmp->fs_token);
1912         return (error);
1913 }
1914
1915 /*
1916  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1917  */
1918 static
1919 int
1920 hammer_vop_nrename(struct vop_nrename_args *ap)
1921 {
1922         struct hammer_transaction trans;
1923         struct namecache *fncp;
1924         struct namecache *tncp;
1925         struct hammer_inode *fdip;
1926         struct hammer_inode *tdip;
1927         struct hammer_inode *ip;
1928         hammer_mount_t hmp;
1929         struct hammer_cursor cursor;
1930         int64_t namekey;
1931         u_int32_t max_iterations;
1932         int nlen, error;
1933
1934         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1935                 return(EXDEV);
1936         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1937                 return(EXDEV);
1938
1939         fdip = VTOI(ap->a_fdvp);
1940         tdip = VTOI(ap->a_tdvp);
1941         fncp = ap->a_fnch->ncp;
1942         tncp = ap->a_tnch->ncp;
1943         ip = VTOI(fncp->nc_vp);
1944         KKASSERT(ip != NULL);
1945
1946         hmp = ip->hmp;
1947
1948         if (fdip->obj_localization != tdip->obj_localization)
1949                 return(EXDEV);
1950         if (fdip->obj_localization != ip->obj_localization)
1951                 return(EXDEV);
1952
1953         if (fdip->flags & HAMMER_INODE_RO)
1954                 return (EROFS);
1955         if (tdip->flags & HAMMER_INODE_RO)
1956                 return (EROFS);
1957         if (ip->flags & HAMMER_INODE_RO)
1958                 return (EROFS);
1959         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1960                 return (error);
1961
1962         lwkt_gettoken(&hmp->fs_token);
1963         hammer_start_transaction(&trans, hmp);
1964         ++hammer_stats_file_iopsw;
1965
1966         /*
1967          * Remove tncp from the target directory and then link ip as
1968          * tncp. XXX pass trans to dounlink
1969          *
1970          * Force the inode sync-time to match the transaction so it is
1971          * in-sync with the creation of the target directory entry.
1972          */
1973         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1974                                 ap->a_cred, 0, -1);
1975         if (error == 0 || error == ENOENT) {
1976                 error = hammer_ip_add_directory(&trans, tdip,
1977                                                 tncp->nc_name, tncp->nc_nlen,
1978                                                 ip);
1979                 if (error == 0) {
1980                         ip->ino_data.parent_obj_id = tdip->obj_id;
1981                         ip->ino_data.ctime = trans.time;
1982                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
1983                 }
1984         }
1985         if (error)
1986                 goto failed; /* XXX */
1987
1988         /*
1989          * Locate the record in the originating directory and remove it.
1990          *
1991          * Calculate the namekey and setup the key range for the scan.  This
1992          * works kinda like a chained hash table where the lower 32 bits
1993          * of the namekey synthesize the chain.
1994          *
1995          * The key range is inclusive of both key_beg and key_end.
1996          */
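             /*
              * Sketch: hammer_directory_namekey() hashes the name into the
              * key and reserves the low bits for collisions, so the scan
              * below walks [namekey, namekey + max_iterations] and bcmp()s
              * each candidate against the component being renamed.
              */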
1997         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1998                                            &max_iterations);
1999 retry:
2000         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
2001         cursor.key_beg.localization = fdip->obj_localization +
2002                                       hammer_dir_localization(fdip);
2003         cursor.key_beg.obj_id = fdip->obj_id;
2004         cursor.key_beg.key = namekey;
2005         cursor.key_beg.create_tid = 0;
2006         cursor.key_beg.delete_tid = 0;
2007         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2008         cursor.key_beg.obj_type = 0;
2009
2010         cursor.key_end = cursor.key_beg;
2011         cursor.key_end.key += max_iterations;
2012         cursor.asof = fdip->obj_asof;
2013         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2014
2015         /*
2016          * Scan all matching records (the chain) and locate the one matching
2017          * the requested path component.
2018          *
2019          * The hammer_ip_*() functions merge in-memory records with on-disk
2020          * records for the purposes of the search.
2021          */
2022         error = hammer_ip_first(&cursor);
2023         while (error == 0) {
2024                 if (hammer_ip_resolve_data(&cursor) != 0)
2025                         break;
2026                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2027                 KKASSERT(nlen > 0);
2028                 if (fncp->nc_nlen == nlen &&
2029                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2030                         break;
2031                 }
2032                 error = hammer_ip_next(&cursor);
2033         }
2034
2035         /*
2036          * If all is ok we have to get the inode so we can adjust nlinks.
2037          *
2038          * WARNING: hammer_ip_del_directory() may have to terminate the
2039          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
2040          * twice.
2041          */
2042         if (error == 0)
2043                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
2044
2045         /*
2046          * XXX A deadlock here will break rename's atomicity for the purposes
2047          * of crash recovery.
2048          */
2049         if (error == EDEADLK) {
2050                 hammer_done_cursor(&cursor);
2051                 goto retry;
2052         }
2053
2054         /*
2055          * Clean up and tell the kernel that the rename succeeded.
2056          *
2057          * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2058          *       without formally acquiring the vp since the vp might
2059          *       have zero refs on it, or in the middle of a reclaim,
2060          *       etc.
2061          */
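             /*
              * Sketch of the while loop below: formally acquire a vp via
              * hammer_get_vnode() before posting NOTE_RENAME and retry if
              * the vnode is reclaimed out from under us.
              */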
2062         hammer_done_cursor(&cursor);
2063         if (error == 0) {
2064                 cache_rename(ap->a_fnch, ap->a_tnch);
2065                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
2066                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
2067                 while (ip->vp) {
2068                         struct vnode *vp;
2069
2070                         error = hammer_get_vnode(ip, &vp);
2071                         if (error == 0 && vp) {
2072                                 vn_unlock(vp);
2073                                 hammer_knote(ip->vp, NOTE_RENAME);
2074                                 vrele(vp);
2075                                 break;
2076                         }
2077                         kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2078                 }
2079         }
2080
2081 failed:
2082         hammer_done_transaction(&trans);
2083         lwkt_reltoken(&hmp->fs_token);
2084         return (error);
2085 }
2086
2087 /*
2088  * hammer_vop_nrmdir { nch, dvp, cred }
2089  */
2090 static
2091 int
2092 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2093 {
2094         struct hammer_transaction trans;
2095         struct hammer_inode *dip;
2096         hammer_mount_t hmp;
2097         int error;
2098
2099         dip = VTOI(ap->a_dvp);
2100         hmp = dip->hmp;
2101
2102         if (hammer_nohistory(dip) == 0 &&
2103             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2104                 return (error);
2105         }
2106
2107         lwkt_gettoken(&hmp->fs_token);
2108         hammer_start_transaction(&trans, hmp);
2109         ++hammer_stats_file_iopsw;
2110         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2111         hammer_done_transaction(&trans);
2112         if (error == 0)
2113                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2114         lwkt_reltoken(&hmp->fs_token);
2115         return (error);
2116 }
2117
2118 /*
2119  * hammer_vop_markatime { vp, cred }
2120  */
2121 static
2122 int
2123 hammer_vop_markatime(struct vop_markatime_args *ap)
2124 {
2125         struct hammer_transaction trans;
2126         struct hammer_inode *ip;
2127         hammer_mount_t hmp;
2128
2129         ip = VTOI(ap->a_vp);
2130         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2131                 return (EROFS);
2132         if (ip->flags & HAMMER_INODE_RO)
2133                 return (EROFS);
2134         hmp = ip->hmp;
2135         if (hmp->mp->mnt_flag & MNT_NOATIME)
2136                 return (0);
2137         lwkt_gettoken(&hmp->fs_token);
2138         hammer_start_transaction(&trans, hmp);
2139         ++hammer_stats_file_iopsw;
2140
2141         ip->ino_data.atime = trans.time;
2142         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2143         hammer_done_transaction(&trans);
2144         hammer_knote(ap->a_vp, NOTE_ATTRIB);
2145         lwkt_reltoken(&hmp->fs_token);
2146         return (0);
2147 }
2148
2149 /*
2150  * hammer_vop_setattr { vp, vap, cred }
2151  */
2152 static
2153 int
2154 hammer_vop_setattr(struct vop_setattr_args *ap)
2155 {
2156         struct hammer_transaction trans;
2157         struct hammer_inode *ip;
2158         struct vattr *vap;
2159         hammer_mount_t hmp;
2160         int modflags;
2161         int error;
2162         int truncating;
2163         int blksize;
2164         int kflags;
2165 #if 0
2166         int64_t aligned_size;
2167 #endif
2168         u_int32_t flags;
2169
2170         vap = ap->a_vap;
2171         ip = ap->a_vp->v_data;
2172         modflags = 0;
2173         kflags = 0;
2174         hmp = ip->hmp;
2175
2176         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2177                 return(EROFS);
2178         if (ip->flags & HAMMER_INODE_RO)
2179                 return (EROFS);
2180         if (hammer_nohistory(ip) == 0 &&
2181             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2182                 return (error);
2183         }
2184
2185         lwkt_gettoken(&hmp->fs_token);
2186         hammer_start_transaction(&trans, hmp);
2187         ++hammer_stats_file_iopsw;
2188         error = 0;
2189
2190         if (vap->va_flags != VNOVAL) {
2191                 flags = ip->ino_data.uflags;
2192                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2193                                          hammer_to_unix_xid(&ip->ino_data.uid),
2194                                          ap->a_cred);
2195                 if (error == 0) {
2196                         if (ip->ino_data.uflags != flags) {
2197                                 ip->ino_data.uflags = flags;
2198                                 ip->ino_data.ctime = trans.time;
2199                                 modflags |= HAMMER_INODE_DDIRTY;
2200                                 kflags |= NOTE_ATTRIB;
2201                         }
2202                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2203                                 error = 0;
2204                                 goto done;
2205                         }
2206                 }
2207                 goto done;
2208         }
2209         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2210                 error = EPERM;
2211                 goto done;
2212         }
2213         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2214                 mode_t cur_mode = ip->ino_data.mode;
2215                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2216                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2217                 uuid_t uuid_uid;
2218                 uuid_t uuid_gid;
2219
2220                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2221                                          ap->a_cred,
2222                                          &cur_uid, &cur_gid, &cur_mode);
2223                 if (error == 0) {
2224                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
2225                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
2226                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
2227                                  sizeof(uuid_uid)) ||
2228                             bcmp(&uuid_gid, &ip->ino_data.gid,
2229                                  sizeof(uuid_gid)) ||
2230                             ip->ino_data.mode != cur_mode
2231                         ) {
2232                                 ip->ino_data.uid = uuid_uid;
2233                                 ip->ino_data.gid = uuid_gid;
2234                                 ip->ino_data.mode = cur_mode;
2235                                 ip->ino_data.ctime = trans.time;
2236                                 modflags |= HAMMER_INODE_DDIRTY;
2237                         }
2238                         kflags |= NOTE_ATTRIB;
2239                 }
2240         }
2241         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2242                 switch(ap->a_vp->v_type) {
2243                 case VREG:
2244                         if (vap->va_size == ip->ino_data.size)
2245                                 break;
2246
2247                         /*
2248                          * Log the operation if in fast-fsync mode or if
2249                          * there are unterminated redo write records present.
2250                          *
2251                          * The second check is needed so the recovery code
2252                          * properly truncates write redos even if nominal
2253                          * REDO operations are turned off due to excessive
2254                          * writes, because the related records might be
2255                          * destroyed and never lay down a TERM_WRITE.
2256                          */
2257                         if ((ip->flags & HAMMER_INODE_REDO) ||
2258                             (ip->flags & HAMMER_INODE_RDIRTY)) {
2259                                 error = hammer_generate_redo(&trans, ip,
2260                                                              vap->va_size,
2261                                                              HAMMER_REDO_TRUNC,
2262                                                              NULL, 0);
2263                         }
2264                         blksize = hammer_blocksize(vap->va_size);
2265
2266                         /*
2267                          * XXX break atomicity; we can deadlock the backend
2268                          * if we do not release the lock.  Probably not a
2269                          * big deal here.
2270                          */
2271                         if (vap->va_size < ip->ino_data.size) {
2272                                 nvtruncbuf(ap->a_vp, vap->va_size,
2273                                            blksize,
2274                                            hammer_blockoff(vap->va_size));
2275                                 truncating = 1;
2276                                 kflags |= NOTE_WRITE;
2277                         } else {
2278                                 nvextendbuf(ap->a_vp,
2279                                             ip->ino_data.size,
2280                                             vap->va_size,
2281                                             hammer_blocksize(ip->ino_data.size),
2282                                             hammer_blocksize(vap->va_size),
2283                                             hammer_blockoff(ip->ino_data.size),
2284                                             hammer_blockoff(vap->va_size),
2285                                             0);
2286                                 truncating = 0;
2287                                 kflags |= NOTE_WRITE | NOTE_EXTEND;
2288                         }
2289                         ip->ino_data.size = vap->va_size;
2290                         ip->ino_data.mtime = trans.time;
2291                         /* XXX safe to use SDIRTY instead of DDIRTY here? */
2292                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2293
2294                         /*
2295                          * On-media truncation is cached in the inode until
2296                          * the inode is synchronized.  We must immediately
2297                          * handle any frontend records.
2298                          */
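                             /*
                              * e.g. successive truncate() calls only ever
                              * lower the cached trunc_off; the single
                              * on-media truncation happens when the
                              * backend syncs the inode.
                              */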
2299                         if (truncating) {
2300                                 hammer_ip_frontend_trunc(ip, vap->va_size);
2301 #ifdef DEBUG_TRUNCATE
2302                                 if (HammerTruncIp == NULL)
2303                                         HammerTruncIp = ip;
2304 #endif
2305                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2306                                         ip->flags |= HAMMER_INODE_TRUNCATED;
2307                                         ip->trunc_off = vap->va_size;
2308 #ifdef DEBUG_TRUNCATE
2309                                         if (ip == HammerTruncIp)
2310                                         kprintf("truncate1 %016llx\n",
2311                                                 (long long)ip->trunc_off);
2312 #endif
2313                                 } else if (ip->trunc_off > vap->va_size) {
2314                                         ip->trunc_off = vap->va_size;
2315 #ifdef DEBUG_TRUNCATE
2316                                         if (ip == HammerTruncIp)
2317                                         kprintf("truncate2 %016llx\n",
2318                                                 (long long)ip->trunc_off);
2319 #endif
2320                                 } else {
2321 #ifdef DEBUG_TRUNCATE
2322                                         if (ip == HammerTruncIp)
2323                                         kprintf("truncate3 %016llx (ignored)\n",
2324                                                 (long long)vap->va_size);
2325 #endif
2326                                 }
2327                         }
2328
2329 #if 0
2330                         /*
2331                          * When truncating, nvtruncbuf() may have cleaned out
2332                          * a portion of the last block on-disk in the buffer
2333                          * cache.  We must clean out any frontend records
2334                          * for blocks beyond the new last block.
2335                          */
2336                         aligned_size = (vap->va_size + (blksize - 1)) &
2337                                        ~(int64_t)(blksize - 1);
2338                         if (truncating && vap->va_size < aligned_size) {
2339                                 aligned_size -= blksize;
2340                                 hammer_ip_frontend_trunc(ip, aligned_size);
2341                         }
2342 #endif
2343                         break;
2344                 case VDATABASE:
2345                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2346                                 ip->flags |= HAMMER_INODE_TRUNCATED;
2347                                 ip->trunc_off = vap->va_size;
2348                         } else if (ip->trunc_off > vap->va_size) {
2349                                 ip->trunc_off = vap->va_size;
2350                         }
2351                         hammer_ip_frontend_trunc(ip, vap->va_size);
2352                         ip->ino_data.size = vap->va_size;
2353                         ip->ino_data.mtime = trans.time;
2354                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2355                         kflags |= NOTE_ATTRIB;
2356                         break;
2357                 default:
2358                         error = EINVAL;
2359                         goto done;
2360                 }
2361                 break;
2362         }
2363         if (vap->va_atime.tv_sec != VNOVAL) {
2364                 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2365                 modflags |= HAMMER_INODE_ATIME;
2366                 kflags |= NOTE_ATTRIB;
2367         }
2368         if (vap->va_mtime.tv_sec != VNOVAL) {
2369                 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2370                 modflags |= HAMMER_INODE_MTIME;
2371                 kflags |= NOTE_ATTRIB;
2372         }
2373         if (vap->va_mode != (mode_t)VNOVAL) {
2374                 mode_t   cur_mode = ip->ino_data.mode;
2375                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2376                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2377
2378                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2379                                          cur_uid, cur_gid, &cur_mode);
2380                 if (error == 0 && ip->ino_data.mode != cur_mode) {
2381                         ip->ino_data.mode = cur_mode;
2382                         ip->ino_data.ctime = trans.time;
2383                         modflags |= HAMMER_INODE_DDIRTY;
2384                         kflags |= NOTE_ATTRIB;
2385                 }
2386         }
2387 done:
2388         if (error == 0)
2389                 hammer_modify_inode(&trans, ip, modflags);
2390         hammer_done_transaction(&trans);
2391         hammer_knote(ap->a_vp, kflags);
2392         lwkt_reltoken(&hmp->fs_token);
2393         return (error);
2394 }
2395
2396 /*
2397  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2398  */
2399 static
2400 int
2401 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2402 {
2403         struct hammer_transaction trans;
2404         struct hammer_inode *dip;
2405         struct hammer_inode *nip;
2406         hammer_record_t record;
2407         struct nchandle *nch;
2408         hammer_mount_t hmp;
2409         int error;
2410         int bytes;
2411
2412         ap->a_vap->va_type = VLNK;
2413
2414         nch = ap->a_nch;
2415         dip = VTOI(ap->a_dvp);
2416         hmp = dip->hmp;
2417
2418         if (dip->flags & HAMMER_INODE_RO)
2419                 return (EROFS);
2420         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2421                 return (error);
2422
2423         /*
2424          * Create a transaction to cover the operations we perform.
2425          */
2426         lwkt_gettoken(&hmp->fs_token);
2427         hammer_start_transaction(&trans, hmp);
2428         ++hammer_stats_file_iopsw;
2429
2430         /*
2431          * Create a new filesystem object of the requested type.  The
2432          * returned inode will be referenced but not locked.
2433          */
2434
2435         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2436                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2437                                     NULL, &nip);
2438         if (error) {
2439                 hammer_done_transaction(&trans);
2440                 *ap->a_vpp = NULL;
2441                 lwkt_reltoken(&hmp->fs_token);
2442                 return (error);
2443         }
2444
2445         /*
2446          * Add a record representing the symlink.  The symlink is stored
2447          * as pure data, not a string, and is not \0-terminated.
2448          */
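             /*
              * Sketch: a short target such as "usr/local" fits within the
              * HAMMER_INODE_BASESYMLEN bytes of ino_data.ext.symlink,
              * while a longer target gets its own HAMMER_RECTYPE_FIX
              * record keyed at HAMMER_FIXKEY_SYMLINK.
              */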
2449         if (error == 0) {
2450                 bytes = strlen(ap->a_target);
2451
2452                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2453                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2454                 } else {
2455                         record = hammer_alloc_mem_record(nip, bytes);
2456                         record->type = HAMMER_MEM_RECORD_GENERAL;
2457
2458                         record->leaf.base.localization = nip->obj_localization +
2459                                                          HAMMER_LOCALIZE_MISC;
2460                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2461                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2462                         record->leaf.data_len = bytes;
2463                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2464                         bcopy(ap->a_target, record->data->symlink.name, bytes);
2465                         error = hammer_ip_add_record(&trans, record);
2466                 }
2467
2468                 /*
2469                  * Set the file size to the length of the link.
2470                  */
2471                 if (error == 0) {
2472                         nip->ino_data.size = bytes;
2473                         hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2474                 }
2475         }
2476         if (error == 0)
2477                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2478                                                 nch->ncp->nc_nlen, nip);
2479
2480         /*
2481          * Finish up.
2482          */
2483         if (error) {
2484                 hammer_rel_inode(nip, 0);
2485                 *ap->a_vpp = NULL;
2486         } else {
2487                 error = hammer_get_vnode(nip, ap->a_vpp);
2488                 hammer_rel_inode(nip, 0);
2489                 if (error == 0) {
2490                         cache_setunresolved(ap->a_nch);
2491                         cache_setvp(ap->a_nch, *ap->a_vpp);
2492                         hammer_knote(ap->a_dvp, NOTE_WRITE);
2493                 }
2494         }
2495         hammer_done_transaction(&trans);
2496         lwkt_reltoken(&hmp->fs_token);
2497         return (error);
2498 }
2499
2500 /*
2501  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2502  */
2503 static
2504 int
2505 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2506 {
2507         struct hammer_transaction trans;
2508         struct hammer_inode *dip;
2509         hammer_mount_t hmp;
2510         int error;
2511
2512         dip = VTOI(ap->a_dvp);
2513         hmp = dip->hmp;
2514
2515         if (hammer_nohistory(dip) == 0 &&
2516             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2517                 return (error);
2518         }
2519
2520         lwkt_gettoken(&hmp->fs_token);
2521         hammer_start_transaction(&trans, hmp);
2522         ++hammer_stats_file_iopsw;
2523         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2524                                 ap->a_cred, ap->a_flags, -1);
2525         hammer_done_transaction(&trans);
2526         lwkt_reltoken(&hmp->fs_token);
2527
2528         return (error);
2529 }
2530
2531 /*
2532  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2533  */
2534 static
2535 int
2536 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2537 {
2538         struct hammer_inode *ip = ap->a_vp->v_data;
2539         hammer_mount_t hmp = ip->hmp;
2540         int error;
2541
2542         ++hammer_stats_file_iopsr;
2543         lwkt_gettoken(&hmp->fs_token);
2544         error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2545                              ap->a_fflag, ap->a_cred);
2546         lwkt_reltoken(&hmp->fs_token);
2547         return (error);
2548 }
2549
2550 static
2551 int
2552 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2553 {
2554         static const struct mountctl_opt extraopt[] = {
2555                 { HMNT_NOHISTORY,       "nohistory" },
2556                 { HMNT_MASTERID,        "master" },
2557                 { 0, NULL }
2558         };
2560         struct hammer_mount *hmp;
2561         struct mount *mp;
2562         int usedbytes;
2563         int error;
2564
2565         error = 0;
2566         usedbytes = 0;
2567         mp = ap->a_head.a_ops->head.vv_mount;
2568         KKASSERT(mp->mnt_data != NULL);
2569         hmp = (struct hammer_mount *)mp->mnt_data;
2570
2571         lwkt_gettoken(&hmp->fs_token);
2572
2573         switch(ap->a_op) {
2574         case MOUNTCTL_SET_EXPORT:
2575                 if (ap->a_ctllen != sizeof(struct export_args))
2576                         error = EINVAL;
2577                 else
2578                         error = hammer_vfs_export(mp, ap->a_op,
2579                                       (const struct export_args *)ap->a_ctl);
2580                 break;
2581         case MOUNTCTL_MOUNTFLAGS:
2582         {
2583                 /*
2584                  * Call standard mountctl VOP function
2585                  * so we get user mount flags.
2586                  */
2587                 error = vop_stdmountctl(ap);
2588                 if (error)
2589                         break;
2590
2591                 usedbytes = *ap->a_res;
2592
2593                 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2594                         usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2595                                                     ap->a_buf,
2596                                                     ap->a_buflen - usedbytes,
2597                                                     &error);
2598                 }
2599
2600                 *ap->a_res += usedbytes;
2601                 break;
2602         }
2603         default:
2604                 error = vop_stdmountctl(ap);
2605                 break;
2606         }
2607         lwkt_reltoken(&hmp->fs_token);
2608         return(error);
2609 }
2610
2611 /*
2612  * hammer_vop_strategy { vp, bio }
2613  *
2614  * Strategy call, used for regular file read & write only.  Note that the
2615  * bp may represent a cluster.
2616  *
2617  * To simplify operation and allow better optimizations in the future,
2618  * this code does not make any assumptions with regards to buffer alignment
2619  * or size.
2620  */
2621 static
2622 int
2623 hammer_vop_strategy(struct vop_strategy_args *ap)
2624 {
2625         struct buf *bp;
2626         int error;
2627
2628         bp = ap->a_bio->bio_buf;
2629
2630         switch(bp->b_cmd) {
2631         case BUF_CMD_READ:
2632                 error = hammer_vop_strategy_read(ap);
2633                 break;
2634         case BUF_CMD_WRITE:
2635                 error = hammer_vop_strategy_write(ap);
2636                 break;
2637         default:
2638                 bp->b_error = error = EINVAL;
2639                 bp->b_flags |= B_ERROR;
2640                 biodone(ap->a_bio);
2641                 break;
2642         }
2643
2644         /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2645
2646         return (error);
2647 }
2648
2649 /*
2650  * Read from a regular file.  Iterate the related records and fill in the
2651  * BIO/BUF.  Gaps are zero-filled.
2652  *
2653  * The support code in hammer_object.c should be used to deal with mixed
2654  * in-memory and on-disk records.
2655  *
2656  * NOTE: Can be called from the cluster code with an oversized buf.
2657  *
2658  * XXX atime update
2659  */
2660 static
2661 int
2662 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2663 {
2664         struct hammer_transaction trans;
2665         struct hammer_inode *ip;
2666         struct hammer_inode *dip;
2667         hammer_mount_t hmp;
2668         struct hammer_cursor cursor;
2669         hammer_base_elm_t base;
2670         hammer_off_t disk_offset;
2671         struct bio *bio;
2672         struct bio *nbio;
2673         struct buf *bp;
2674         int64_t rec_offset;
2675         int64_t ran_end;
2676         int64_t tmp64;
2677         int error;
2678         int boff;
2679         int roff;
2680         int n;
2681         int isdedupable;
2682
2683         bio = ap->a_bio;
2684         bp = bio->bio_buf;
2685         ip = ap->a_vp->v_data;
2686         hmp = ip->hmp;
2687
2688         /*
2689          * The zone-2 disk offset may have been set by the cluster code via
2690          * a BMAP operation, or else should be NOOFFSET.
2691          *
2692          * Checking the high bits for a match against zone-2 should suffice.
2693          *
2694          * In cases where a lot of data duplication is present, it may be
2695          * more beneficial to drop through and double-buffer through the
2696          * device.
2697          */
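             /*
              * Sketch: a bio_offset already translated by BMAP carries the
              * HAMMER_ZONE_LARGE_DATA bits in its zone field, so the mask
              * test below separates "already resolved" from an
              * untranslated (NOOFFSET) buffer and lets the read bypass the
              * B-Tree entirely.
              */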
2698         nbio = push_bio(bio);
2699         if (hammer_double_buffer == 0 &&
2700             (nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2701             HAMMER_ZONE_LARGE_DATA) {
2702                 lwkt_gettoken(&hmp->fs_token);
2703                 error = hammer_io_direct_read(hmp, nbio, NULL);
2704                 lwkt_reltoken(&hmp->fs_token);
2705                 return (error);
2706         }
2707
2708         /*
2709          * Well, that sucked.  Do it the hard way.  If all the stars are
2710          * aligned we may still be able to issue a direct-read.
2711          */
2712         lwkt_gettoken(&hmp->fs_token);
2713         hammer_simple_transaction(&trans, hmp);
2714         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2715
2716         /*
2717          * Key range (begin and end inclusive) to scan.  Note that the keys
2718          * stored in the actual records represent BASE+LEN, not BASE.  The
2719          * first record containing bio_offset will have a key > bio_offset.
2720          */
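             /*
              * e.g. a 16K data record covering file offsets
              * [0x0000, 0x4000) is keyed at 0x4000, which is why the scan
              * begins at bio_offset + 1 and the loop below recovers
              * rec_offset as base->key - data_len.
              */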
2721         cursor.key_beg.localization = ip->obj_localization +
2722                                       HAMMER_LOCALIZE_MISC;
2723         cursor.key_beg.obj_id = ip->obj_id;
2724         cursor.key_beg.create_tid = 0;
2725         cursor.key_beg.delete_tid = 0;
2726         cursor.key_beg.obj_type = 0;
2727         cursor.key_beg.key = bio->bio_offset + 1;
2728         cursor.asof = ip->obj_asof;
2729         cursor.flags |= HAMMER_CURSOR_ASOF;
2730
2731         cursor.key_end = cursor.key_beg;
2732         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2733 #if 0
2734         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2735                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2736                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2737                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2738         } else
2739 #endif
2740         {
2741                 ran_end = bio->bio_offset + bp->b_bufsize;
2742                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2743                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2744                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2745                 if (tmp64 < ran_end)
2746                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2747                 else
2748                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2749         }
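        /*
         * Note: the tmp64 comparison above detects 64-bit wrap-around.
         * If ran_end is within MAXPHYS of the top of the key space the
         * addition wraps negative and the end key is clamped to the
         * maximum positive key instead.
         */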
2750         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2751
2752         error = hammer_ip_first(&cursor);
2753         boff = 0;
2754
2755         while (error == 0) {
2756                 /*
2757                  * Get the base file offset of the record.  The key for
2758                  * data records is (base + bytes) rather than (base).
2759                  */
2760                 base = &cursor.leaf->base;
2761                 rec_offset = base->key - cursor.leaf->data_len;
2762
2763                 /*
2764                  * Calculate the gap, if any, and zero-fill it.
2765                  *
2766                  * n is the offset of the start of the record versus our
2767                  * current seek offset in the bio.
2768                  */
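                /*
                 * Worked example: if the bio covers [0, 16384), boff is 0
                 * and the first record starts at rec_offset 4096, then
                 * n = 4096 and bytes [0, 4096) of the buffer are zeroed
                 * before any record data is copied.
                 */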
2769                 n = (int)(rec_offset - (bio->bio_offset + boff));
2770                 if (n > 0) {
2771                         if (n > bp->b_bufsize - boff)
2772                                 n = bp->b_bufsize - boff;
2773                         bzero((char *)bp->b_data + boff, n);
2774                         boff += n;
2775                         n = 0;
2776                 }
2777
2778                 /*
2779                  * Calculate the data offset in the record and the number
2780                  * of bytes we can copy.
2781                  *
2782                  * There are two degenerate cases.  First, boff may already
2783                  * be at bp->b_bufsize.  Second, the data offset within
2784                  * the record may exceed the record's size.
2785                  */
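                /*
                 * If the gap calculation above left n negative, our seek
                 * point lies inside the record, and roff = -n below is the
                 * byte offset of the seek point within the record's data.
                 */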
2786                 roff = -n;
2787                 rec_offset += roff;
2788                 n = cursor.leaf->data_len - roff;
2789                 if (n <= 0) {
2790                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2791                         n = 0;
2792                 } else if (n > bp->b_bufsize - boff) {
2793                         n = bp->b_bufsize - boff;
2794                 }
2795
2796                 /*
2797                  * Deal with cached truncations.  This cool bit of code
2798                  * allows truncate()/ftruncate() to avoid having to sync
2799                  * the file.
2800                  *
2801                  * If the frontend is truncated then all backend records are
2802                  * subject to the frontend's truncation.
2803                  *
2804                  * If the backend is truncated then backend records on-disk
2805                  * (but not in-memory) are subject to the backend's
2806                  * truncation.  In-memory records owned by the backend
2807                  * represent data written after the truncation point on the
2808                  * backend and must not be truncated.
2809                  *
2810                  * Truncate operations deal with frontend buffer cache
2811                  * buffers and frontend-owned in-memory records synchronously.
2812                  */
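                /*
                 * Worked example: with a cached trunc_off of 8192 and an
                 * on-disk record starting at rec_offset 0 with n = 16384,
                 * n is clipped to 8192; the remainder of the buffer is
                 * zero-filled by the gap/EOF logic.
                 */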
2813                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2814                         if (hammer_cursor_ondisk(&cursor)/* ||
2815                             cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2816                                 if (ip->trunc_off <= rec_offset)
2817                                         n = 0;
2818                                 else if (ip->trunc_off < rec_offset + n)
2819                                         n = (int)(ip->trunc_off - rec_offset);
2820                         }
2821                 }
2822                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2823                         if (hammer_cursor_ondisk(&cursor)) {
2824                                 if (ip->sync_trunc_off <= rec_offset)
2825                                         n = 0;
2826                                 else if (ip->sync_trunc_off < rec_offset + n)
2827                                         n = (int)(ip->sync_trunc_off - rec_offset);
2828                         }
2829                 }
2830
2831                 /*
2832                  * Try to issue a direct read into our bio if possible,
2833                  * otherwise resolve the element data into a hammer_buffer
2834                  * and copy.
2835                  *
2836                  * The buffer on-disk should be zeroed past any real
2837                  * truncation point, but may not be for any synthesized
2838                  * truncation point from above.
2839                  */
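                /*
                 * A record is direct-read/dedup capable only when the
                 * request starts at the beginning of the buffer (boff == 0),
                 * spans the entire buffer, references on-media data, and
                 * the media offset is HAMMER_BUFMASK-aligned.
                 */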
2840                 disk_offset = cursor.leaf->data_offset + roff;
2841                 isdedupable = (boff == 0 && n == bp->b_bufsize &&
2842                                hammer_cursor_ondisk(&cursor) &&
2843                                ((int)disk_offset & HAMMER_BUFMASK) == 0);
2844
2845                 if (isdedupable && hammer_double_buffer == 0) {
2846                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2847                                  HAMMER_ZONE_LARGE_DATA);
2848                         nbio->bio_offset = disk_offset;
2849                         error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
2850                         if (hammer_live_dedup && error == 0)
2851                                 hammer_dedup_cache_add(ip, cursor.leaf);
2852                         goto done;
2853                 } else if (n) {
2854                         error = hammer_ip_resolve_data(&cursor);
2855                         if (error == 0) {
2856                                 if (hammer_live_dedup && isdedupable)
2857                                         hammer_dedup_cache_add(ip, cursor.leaf);
2858                                 bcopy((char *)cursor.data + roff,
2859                                       (char *)bp->b_data + boff, n);
2860                         }
2861                 }
2862                 if (error)
2863                         break;
2864
2865                 /*
2866                  * We have to be sure that the only elements added to the
2867                  * dedup cache are those which are already on-media.
2868                  */
2869                 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2870                         hammer_dedup_cache_add(ip, cursor.leaf);
2871
2872                 /*
2873                  * Iterate until we have filled the request.
2874                  */
2875                 boff += n;
2876                 if (boff == bp->b_bufsize)
2877                         break;
2878                 error = hammer_ip_next(&cursor);
2879         }
2880
2881         /*
2882          * There may have been a gap after the last record.
2883          */
2884         if (error == ENOENT)
2885                 error = 0;
2886         if (error == 0 && boff != bp->b_bufsize) {
2887                 KKASSERT(boff < bp->b_bufsize);
2888                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2889                 /* boff = bp->b_bufsize; */
2890         }
2891         bp->b_resid = 0;
2892         bp->b_error = error;
2893         if (error)
2894                 bp->b_flags |= B_ERROR;
2895         biodone(ap->a_bio);
2896
2897 done:
2898         /*
2899          * Cache the b-tree node for the last data read in cache[1].
2900          *
2901          * If we hit the file EOF then also cache the node in the
2902          * governing directory's cache[3]; it will be used to initialize
2903          * the inode's cache[1] for any inodes looked up via the directory.
2904          *
2905          * This doesn't reduce disk accesses since the B-Tree chain is
2906          * likely cached, but it does reduce cpu overhead when looking
2907          * up file offsets for cpdup/tar/cpio style iterations.
2908          */
2909         if (cursor.node)
2910                 hammer_cache_node(&ip->cache[1], cursor.node);
2911         if (ran_end >= ip->ino_data.size) {
2912                 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2913                                         ip->obj_asof, ip->obj_localization);
2914                 if (dip) {
2915                         hammer_cache_node(&dip->cache[3], cursor.node);
2916                         hammer_rel_inode(dip, 0);
2917                 }
2918         }
2919         hammer_done_cursor(&cursor);
2920         hammer_done_transaction(&trans);
2921         lwkt_reltoken(&hmp->fs_token);
2922         return(error);
2923 }
2924
2925 /*
2926  * BMAP operation - used to support cluster_read() only.
2927  *
2928  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb, buf_cmd_t cmd)
2929  *
2930  * This routine may return EOPNOTSUPP if the operation is not supported for
2931  * the specified offset.  The contents of the pointer arguments do not
2932  * need to be initialized in that case.
2933  *
2934  * If a disk address is available and properly aligned, return 0 with
2935  * *doffsetp set to the zone-2 address and *runp / *runb set to the
2936  * run-length relative to that offset.  Because callers may assume that
2937  * *doffsetp is valid whenever 0 is returned, even if *runp is small,
2938  * we must return EOPNOTSUPP when the run is not sufficiently large.
2939  */
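/*
 * Rough caller's-eye sketch (hypothetical snippet, not from this file;
 * it assumes the usual DragonFly VOP_BMAP calling convention):
 *
 *	off_t doff;
 *	int runp, runb;
 *
 *	if (VOP_BMAP(vp, loffset, &doff, &runp, &runb, BUF_CMD_READ) == 0) {
 *		// doff is the zone-2 address of loffset; the contiguous
 *		// run covers [loffset - runb, loffset + runp).
 *	}
 */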
2940 static
2941 int
2942 hammer_vop_bmap(struct vop_bmap_args *ap)
2943 {
2944         struct hammer_transaction trans;
2945         struct hammer_inode *ip;
2946         hammer_mount_t hmp;
2947         struct hammer_cursor cursor;
2948         hammer_base_elm_t base;
2949         int64_t rec_offset;
2950         int64_t ran_end;
2951         int64_t tmp64;
2952         int64_t base_offset;
2953         int64_t base_disk_offset;
2954         int64_t last_offset;
2955         hammer_off_t last_disk_offset;
2956         hammer_off_t disk_offset;
2957         int     rec_len;
2958         int     error;
2959         int     blksize;
2960
2961         ++hammer_stats_file_iopsr;
2962         ip = ap->a_vp->v_data;
2963         hmp = ip->hmp;
2964
2965         /*
2966          * We can only BMAP regular files.  We can't BMAP database files,
2967          * directories, etc.
2968          */
2969         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2970                 return(EOPNOTSUPP);
2971
2972         /*
2973          * bmap is typically called with runp/runb both NULL when used
2974          * for writing.  We do not support BMAP for writing atm.
2975          */
2976         if (ap->a_cmd != BUF_CMD_READ)
2977                 return(EOPNOTSUPP);
2978
2979         /*
2980          * Scan the B-Tree to acquire blockmap addresses, then translate
2981          * to raw addresses.
2982          */
2983         lwkt_gettoken(&hmp->fs_token);
2984         hammer_simple_transaction(&trans, hmp);
2985 #if 0
2986         kprintf("bmap_beg %016llx ip->cache %p\n",
2987                 (long long)ap->a_loffset, ip->cache[1]);
2988 #endif
2989         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2990
2991         /*
2992          * Key range (begin and end inclusive) to scan.  Note that the keys
2993          * stored in the actual records represent BASE+LEN, not BASE.  The
2994          * first record containing a_loffset will have a key > a_loffset.
2995          */
2996         cursor.key_beg.localization = ip->obj_localization +
2997                                       HAMMER_LOCALIZE_MISC;
2998         cursor.key_beg.obj_id = ip->obj_id;
2999         cursor.key_beg.create_tid = 0;
3000         cursor.key_beg.delete_tid = 0;
3001         cursor.key_beg.obj_type = 0;
3002         if (ap->a_runb)
3003                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
3004         else
3005                 cursor.key_beg.key = ap->a_loffset + 1;
3006         if (cursor.key_beg.key < 0)
3007                 cursor.key_beg.key = 0;
3008         cursor.asof = ip->obj_asof;
3009         cursor.flags |= HAMMER_CURSOR_ASOF;
3010
3011         cursor.key_end = cursor.key_beg;
3012         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
3013
3014         ran_end = ap->a_loffset + MAXPHYS;
3015         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
3016         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
3017         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
3018         if (tmp64 < ran_end)
3019                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
3020         else
3021                 cursor.key_end.key = ran_end + MAXPHYS + 1;
3022
3023         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
3024
3025         error = hammer_ip_first(&cursor);
3026         base_offset = last_offset = 0;
3027         base_disk_offset = last_disk_offset = 0;
3028
3029         while (error == 0) {
3030                 /*
3031                  * Get the base file offset of the record.  The key for
3032                  * data records is (base + bytes) rather than (base).
3033                  *
3034                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
3035                  * The extra bytes should be zero on-disk and the BMAP op
3036                  * should still be ok.
3037                  */
3038                 base = &cursor.leaf->base;
3039                 rec_offset = base->key - cursor.leaf->data_len;
3040                 rec_len    = cursor.leaf->data_len;
3041
3042                 /*
3043                  * Incorporate any cached truncation.
3044                  *
3045                  * NOTE: Modifications to rec_len based on synthesized
3046                  * truncation points remove the guarantee that any extended
3047                  * data on disk is zero (since the truncations may not have
3048                  * taken place on-media yet).
3049                  */
3050                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
3051                         if (hammer_cursor_ondisk(&cursor) ||
3052                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3053                                 if (ip->trunc_off <= rec_offset)
3054                                         rec_len = 0;
3055                                 else if (ip->trunc_off < rec_offset + rec_len)
3056                                         rec_len = (int)(ip->trunc_off - rec_offset);
3057                         }
3058                 }
3059                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3060                         if (hammer_cursor_ondisk(&cursor)) {
3061                                 if (ip->sync_trunc_off <= rec_offset)
3062                                         rec_len = 0;
3063                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
3064                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
3065                         }
3066                 }
3067
3068                 /*
3069                  * Accumulate information.  If we hit a discontiguous
3070                  * block, reset base_offset unless we are already beyond the
3071                  * requested offset.  If we are, we stop.
3072                  */
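                /*
                 * Contiguity here means both spaces continue exactly where
                 * the previous record left off: rec_offset == last_offset
                 * in the file and disk_offset == last_disk_offset on media.
                 * Either mismatch restarts the accumulated run.
                 */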
3073                 if (error)
3074                         break;
3075                 if (hammer_cursor_ondisk(&cursor)) {
3076                         disk_offset = cursor.leaf->data_offset;
3077                         if (rec_offset != last_offset ||
3078                             disk_offset != last_disk_offset) {
3079                                 if (rec_offset > ap->a_loffset)
3080                                         break;
3081                                 base_offset = rec_offset;
3082                                 base_disk_offset = disk_offset;
3083                         }
3084                         last_offset = rec_offset + rec_len;
3085                         last_disk_offset = disk_offset + rec_len;
3086
3087                         if (hammer_live_dedup)
3088                                 hammer_dedup_cache_add(ip, cursor.leaf);
3089                 }
3090                 
3091                 error = hammer_ip_next(&cursor);
3092         }
3093
3094 #if 0
3095         kprintf("BMAP %016llx:  %016llx - %016llx\n",
3096                 (long long)ap->a_loffset,
3097                 (long long)base_offset,
3098                 (long long)last_offset);
3099         kprintf("BMAP %16s:  %016llx - %016llx\n", "",
3100                 (long long)base_disk_offset,
3101                 (long long)last_disk_offset);
3102 #endif
3103
3104         if (cursor.node) {
3105                 hammer_cache_node(&ip->cache[1], cursor.node);
3106 #if 0
3107                 kprintf("bmap_end2 %016llx ip->cache %p\n",
3108                         (long long)ap->a_loffset, ip->cache[1]);
3109 #endif
3110         }
3111         hammer_done_cursor(&cursor);
3112         hammer_done_transaction(&trans);
3113         lwkt_reltoken(&hmp->fs_token);
3114
3115         /*
3116          * If we couldn't find any records or the records we did find were
3117          * all behind the requested offset, return failure.  A forward
3118          * truncation can leave a hole with no on-disk records.
3119          */
3120         if (last_offset == 0 || last_offset < ap->a_loffset)
3121                 return (EOPNOTSUPP);
3122
3123         /*
3124          * Figure out the block size at the requested offset and adjust
3125          * our limits so the cluster_read() does not create inappropriately
3126          * sized buffer cache buffers.
3127          */
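        /*
         * Sketch of why this matters: HAMMER uses two buffer sizes
         * (nominally 16KB below a fixed demarcation offset and 64KB
         * above it), so a run crossing the demarcation is clipped via
         * hammer_blockdemarc() to keep every buffer uniformly sized.
         */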
3128         blksize = hammer_blocksize(ap->a_loffset);
3129         if (hammer_blocksize(base_offset) != blksize) {
3130                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
3131         }
3132         if (last_offset != ap->a_loffset &&
3133             hammer_blocksize(last_offset - 1) != blksize) {
3134                 last_offset = hammer_blockdemarc(ap->a_loffset,
3135                                                  last_offset - 1);
3136         }
3137
3138         /*
3139          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
3140          * from occurring.
3141          */
3142         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
3143
3144         if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
3145                 /*
3146                  * Only large-data zones can be direct-IOd
3147                  */
3148                 error = EOPNOTSUPP;
3149         } else if ((disk_offset & HAMMER_BUFMASK) ||
3150                    (last_offset - ap->a_loffset) < blksize) {
3151                 /*
3152                  * doffsetp is not aligned or the forward run size does
3153                  * not cover a whole buffer, disallow the direct I/O.
3154                  */
3155                 error = EOPNOTSUPP;
3156         } else {
3157                 /*
3158                  * We're good.
3159                  */
3160                 *ap->a_doffsetp = disk_offset;
3161                 if (ap->a_runb) {
3162                         *ap->a_runb = ap->a_loffset - base_offset;
3163                         KKASSERT(*ap->a_runb >= 0);
3164                 }
3165                 if (ap->a_runp) {
3166                         *ap->a_runp = last_offset - ap->a_loffset;
3167                         KKASSERT(*ap->a_runp >= 0);
3168                 }
3169                 error = 0;
3170         }
3171         return(error);
3172 }
3173
3174 /*
3175  * Write to a regular file.  Because this is a strategy call, the OS is
3176  * actually trying to get data onto the media.
3177  */
3178 static
3179 int
3180 hammer_vop_strategy_write(struct vop_strategy_args *ap)
3181 {
3182         hammer_record_t record;
3183         hammer_mount_t hmp;
3184         hammer_inode_t ip;
3185         struct bio *bio;
3186         struct buf *bp;
3187         int blksize;
3188         int bytes;
3189         int error;
3190
3191         bio = ap->a_bio;
3192         bp = bio->bio_buf;
3193         ip = ap->a_vp->v_data;
3194         hmp = ip->hmp;
3195
3196         blksize = hammer_blocksize(bio->bio_offset);
3197         KKASSERT(bp->b_bufsize == blksize);
3198
3199         if (ip->flags & HAMMER_INODE_RO) {
3200                 bp->b_error = EROFS;
3201                 bp->b_flags |= B_ERROR;
3202                 biodone(ap->a_bio);
3203                 return(EROFS);
3204         }
3205
3206         lwkt_gettoken(&hmp->fs_token);
3207
3208         /*
3209          * Interlock with inode destruction (no in-kernel or directory
3210          * topology visibility).  If we queue new IO while trying to
3211          * destroy the inode we can deadlock the vtrunc call in
3212          * hammer_inode_unloadable_check().
3213          *
3214          * Besides, there's no point flushing a bp associated with an
3215          * inode that is being destroyed on-media and has no kernel
3216          * references.
3217          */
3218         if ((ip->flags | ip->sync_flags) &
3219             (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
3220                 bp->b_resid = 0;
3221                 biodone(ap->a_bio);
3222                 lwkt_reltoken(&hmp->fs_token);
3223                 return(0);
3224         }
3225
3226         /*
3227          * Reserve space and issue a direct-write from the front-end. 
3228          * NOTE: The direct_io code will hammer_bread/bcopy smaller
3229          * allocations.
3230          *
3231          * An in-memory record will be installed to reference the storage
3232          * until the flusher can get to it.
3233          *
3234          * Since we own the high level bio the front-end will not try to
3235          * do a direct-read until the write completes.
3236          *
3237          * NOTE: The only time we do not reserve a full-sized buffer's
3238          * worth of data is when the file is small.  We do not try to
3239          * allocate a fragment (from the small-data zone) at the end of
3240          * an otherwise large file as this can lead to wildly separated
3241          * data.
3242          */
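        /*
         * Worked example: a 1000-byte file written at offset 0 reserves
         * a ((1000 + 15) & ~15) = 1008 byte fragment rather than a full
         * buffer, since 1000 <= HAMMER_BUFSIZE / 2.
         */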
3243         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3244         KKASSERT(bio->bio_offset < ip->ino_data.size);
3245         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
3246                 bytes = bp->b_bufsize;
3247         else
3248                 bytes = ((int)ip->ino_data.size + 15) & ~15;
3249
3250         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3251                                     bytes, &error);
3252
3253         /*
3254          * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
3255          * in hammer_vop_write().  We must flag the record so the proper
3256          * REDO_TERM_WRITE entry is generated during the flush.
3257          */
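        /*
         * Sketch of the REDO handshake: hammer_vop_write() logs a
         * REDO_WRITE entry and marks the buffer with B_VFSFLAG1; we copy
         * that hint onto the record as HAMMER_RECF_REDO so the flusher
         * later emits the matching REDO_TERM_WRITE for the same extent.
         */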
3258         if (record) {
3259                 if (bp->b_flags & B_VFSFLAG1) {
3260                         record->flags |= HAMMER_RECF_REDO;
3261                         bp->b_flags &= ~B_VFSFLAG1;
3262                 }
3263                 if (record->flags & HAMMER_RECF_DEDUPED) {
3264                         bp->b_resid = 0;
3265                         hammer_ip_replace_bulk(hmp, record);
3266                         biodone(ap->a_bio);
3267                 } else {
3268                         hammer_io_direct_write(hmp, bio, record);
3269                 }
3270                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3271                         hammer_flush_inode(ip, 0);
3272         } else {
3273                 bp->b_bio2.bio_offset = NOOFFSET;
3274                 bp->b_error = error;
3275                 bp->b_flags |= B_ERROR;
3276                 biodone(ap->a_bio);
3277         }
3278         lwkt_reltoken(&hmp->fs_token);
3279         return(error);
3280 }
3281
3282 /*
3283  * dounlink - disconnect a directory entry
3284  *
3285  * XXX whiteout support not really in yet
3286  */
3287 static int
3288 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
3289                 struct vnode *dvp, struct ucred *cred, 
3290                 int flags, int isdir)
3291 {
3292         struct namecache *ncp;
3293         hammer_inode_t dip;
3294         hammer_inode_t ip;
3295         hammer_mount_t hmp;
3296         struct hammer_cursor cursor;
3297         int64_t namekey;
3298         u_int32_t max_iterations;
3299         int nlen, error;
3300
3301         /*
3302          * Calculate the namekey and setup the key range for the scan.  This
3303          * works somewhat like a chained hash table where the lower 32 bits
3304          * of the namekey synthesize the chain.
3305          *
3306          * The key range is inclusive of both key_beg and key_end.
3307          */
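        /*
         * Illustration: hammer_directory_namekey() hashes the component
         * name into a 64-bit key and returns the chain length bound in
         * max_iterations, so the inclusive range [namekey, namekey +
         * max_iterations] covers every collision in the chain.
         */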
3308         dip = VTOI(dvp);
3309         ncp = nch->ncp;
3310         hmp = dip->hmp;
3311
3312         if (dip->flags & HAMMER_INODE_RO)
3313                 return (EROFS);
3314
3315         namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3316                                            &max_iterations);
3317 retry:
3318         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3319         cursor.key_beg.localization = dip->obj_localization +
3320                                       hammer_dir_localization(dip);
3321         cursor.key_beg.obj_id = dip->obj_id;
3322         cursor.key_beg.key = namekey;
3323         cursor.key_beg.create_tid = 0;
3324         cursor.key_beg.delete_tid = 0;
3325         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3326         cursor.key_beg.obj_type = 0;
3327
3328         cursor.key_end = cursor.key_beg;
3329         cursor.key_end.key += max_iterations;
3330         cursor.asof = dip->obj_asof;
3331         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3332
3333         /*
3334          * Scan all matching records (the chain) and locate the one
3335          * matching the requested path component.  The error code left
3336          * in 'error' when the search terminates may be 0, ENOENT, or
3337          * something else.
3338          *
3339          * The hammer_ip_*() functions merge in-memory records with on-disk
3340          * records for the purposes of the search.
3341          */
3342         error = hammer_ip_first(&cursor);
3343
3344         while (error == 0) {
3345                 error = hammer_ip_resolve_data(&cursor);
3346                 if (error)
3347                         break;
3348                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3349                 KKASSERT(nlen > 0);
3350                 if (ncp->nc_nlen == nlen &&
3351                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3352                         break;
3353                 }
3354                 error = hammer_ip_next(&cursor);
3355         }
3356
3357         /*
3358          * If all is ok we have to get the inode so we can adjust nlinks.
3359          * To avoid a deadlock with the flusher we must release the inode
3360          * lock on the directory when acquiring the inode for the entry.
3361          *
3362          * If the target is a directory, it must be empty.
3363          */
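        /*
         * The unlock/get/relock pattern below is the deadlock avoidance
         * in action: cursor.ip's shared lock is dropped across
         * hammer_get_inode() and reacquired afterwards.
         */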
3364         if (error == 0) {
3365                 hammer_unlock(&cursor.ip->lock);
3366                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3367                                       hmp->asof,
3368                                       cursor.data->entry.localization,
3369                                       0, &error);
3370                 hammer_lock_sh(&cursor.ip->lock);
3371                 if (error == ENOENT) {
3372                         kprintf("HAMMER: WARNING: Removing "
3373                                 "dirent w/missing inode \"%s\"\n"
3374                                 "\tobj_id = %016llx\n",
3375                                 ncp->nc_name,
3376                                 (long long)cursor.data->entry.obj_id);
3377                         error = 0;
3378                 }
3379
3380                 /*
3381                  * If isdir >= 0 we validate that the entry is or is not a
3382                  * directory.  If isdir < 0 we don't care.
3383                  */
3384                 if (error == 0 && isdir >= 0 && ip) {
3385                         if (isdir &&
3386                             ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3387                                 error = ENOTDIR;
3388                         } else if (isdir == 0 &&
3389                             ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3390                                 error = EISDIR;
3391                         }
3392                 }
3393
3394                 /*
3395                  * If we are trying to remove a directory the directory must
3396                  * be empty.
3397                  *
3398                  * The check directory code can loop and deadlock/retry.  Our
3399                  * own cursor's node locks must be released to avoid a 3-way
3400                  * deadlock with the flusher if the check directory code
3401                  * blocks.
3402                  *
3403                  * If any changes whatsoever have been made to the cursor
3404                  * set EDEADLK and retry.
3405                  *
3406                  * WARNING: See warnings in hammer_unlock_cursor()
3407                  *          function.
3408                  */
3409                 if (error == 0 && ip && ip->ino_data.obj_type ==
3410                                         HAMMER_OBJTYPE_DIRECTORY) {
3411                         hammer_unlock_cursor(&cursor);
3412                         error = hammer_ip_check_directory_empty(trans, ip);
3413                         hammer_lock_cursor(&cursor);
3414                         if (cursor.flags & HAMMER_CURSOR_RETEST) {
3415                                 kprintf("HAMMER: Warning: avoided deadlock "
3416                                         "on rmdir '%s'\n",
3417                                         ncp->nc_name);
3418                                 error = EDEADLK;
3419                         }
3420                 }
3421
3422                 /*
3423                  * Delete the directory entry.
3424                  *
3425                  * WARNING: hammer_ip_del_directory() may have to terminate
3426                  * the cursor to avoid a deadlock.  It is ok to call
3427                  * hammer_done_cursor() twice.
3428                  */
3429                 if (error == 0) {
3430                         error = hammer_ip_del_directory(trans, &cursor,
3431                                                         dip, ip);
3432                 }
3433                 hammer_done_cursor(&cursor);
3434                 if (error == 0) {
3435                         cache_setunresolved(nch);
3436                         cache_setvp(nch, NULL);
3437
3438                         /*
3439                          * NOTE: ip->vp, if non-NULL, cannot be directly
3440                          *       referenced without formally acquiring the
3441                          *       vp since the vp might have zero refs on it,
3442                          *       or in the middle of a reclaim, etc.
3443                          *
3444                          * NOTE: The cache_setunresolved() can rip the vp
3445                          *       out from under us since the vp may not have
3446                          *       any refs, in which case ip->vp will be NULL
3447                          *       from the outset.
3448                          */
3449                         while (ip && ip->vp) {
3450                                 struct vnode *vp;
3451
3452                                 error = hammer_get_vnode(ip, &vp);
3453                                 if (error == 0 && vp) {
3454                                         vn_unlock(vp);
3455                                         hammer_knote(ip->vp, NOTE_DELETE);
3456                                         cache_inval_vp(ip->vp, CINV_DESTROY);
3457                                         vrele(vp);
3458                                         break;
3459                                 }
3460                                 kprintf("Debug: HAMMER ip/vp race1 avoided\n");
3461                         }
3462                 }
3463                 if (ip)
3464                         hammer_rel_inode(ip, 0);
3465         } else {
3466                 hammer_done_cursor(&cursor);
3467         }
3468         if (error == EDEADLK)
3469                 goto retry;
3470
3471         return (error);
3472 }
3473
3474 /************************************************************************
3475  *                          FIFO AND SPECFS OPS                         *
3476  ************************************************************************
3477  *
3478  */
3479 static int
3480 hammer_vop_fifoclose (struct vop_close_args *ap)
3481 {
3482         /* XXX update itimes */
3483         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3484 }
3485
3486 static int
3487 hammer_vop_fiforead (struct vop_read_args *ap)
3488 {
3489         int error;
3490
3491         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3492         /* XXX update access time */
3493         return (error);
3494 }
3495
3496 static int
3497 hammer_vop_fifowrite (struct vop_write_args *ap)
3498 {
3499         int error;
3500
3501         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3502         /* XXX update access time */
3503         return (error);
3504 }
3505
3506 static
3507 int
3508 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3509 {
3510         int error;
3511
3512         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3513         if (error)
3514                 error = hammer_vop_kqfilter(ap);
3515         return(error);
3516 }
3517
3518 /************************************************************************
3519  *                          KQFILTER OPS                                *
3520  ************************************************************************
3521  *
3522  */
3523 static void filt_hammerdetach(struct knote *kn);
3524 static int filt_hammerread(struct knote *kn, long hint);
3525 static int filt_hammerwrite(struct knote *kn, long hint);
3526 static int filt_hammervnode(struct knote *kn, long hint);
3527
3528 static struct filterops hammerread_filtops =
3529         { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
3530 static struct filterops hammerwrite_filtops =
3531         { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
3532 static struct filterops hammervnode_filtops =
3533         { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };
3534
3535 static
3536 int
3537 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3538 {
3539         struct vnode *vp = ap->a_vp;
3540         struct knote *kn = ap->a_kn;
3541
3542         switch (kn->kn_filter) {
3543         case EVFILT_READ:
3544                 kn->kn_fop = &hammerread_filtops;
3545                 break;
3546         case EVFILT_WRITE:
3547                 kn->kn_fop = &hammerwrite_filtops;
3548                 break;
3549         case EVFILT_VNODE:
3550                 kn->kn_fop = &hammervnode_filtops;
3551                 break;
3552         default:
3553                 return (EOPNOTSUPP);
3554         }
3555
3556         kn->kn_hook = (caddr_t)vp;
3557
3558         knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3559
3560         return(0);
3561 }
3562
3563 static void
3564 filt_hammerdetach(struct knote *kn)
3565 {
3566         struct vnode *vp = (void *)kn->kn_hook;
3567
3568         knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3569 }
3570
3571 static int
3572 filt_hammerread(struct knote *kn, long hint)
3573 {
3574         struct vnode *vp = (void *)kn->kn_hook;
3575         hammer_inode_t ip = VTOI(vp);
3576         hammer_mount_t hmp = ip->hmp;
3577         off_t off;
3578
3579         if (hint == NOTE_REVOKE) {
3580                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3581                 return(1);
3582         }
3583         lwkt_gettoken(&hmp->fs_token);  /* XXX use per-ip-token */
3584         off = ip->ino_data.size - kn->kn_fp->f_offset;
3585         kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
3586         lwkt_reltoken(&hmp->fs_token);
3587         if (kn->kn_sfflags & NOTE_OLDAPI)
3588                 return(1);
3589         return (kn->kn_data != 0);
3590 }
3591
3592 static int
3593 filt_hammerwrite(struct knote *kn, long hint)
3594 {
3595         if (hint == NOTE_REVOKE)
3596                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3597         kn->kn_data = 0;
3598         return (1);
3599 }
3600
3601 static int
3602 filt_hammervnode(struct knote *kn, long hint)
3603 {
3604         if (kn->kn_sfflags & hint)
3605                 kn->kn_fflags |= hint;
3606         if (hint == NOTE_REVOKE) {
3607                 kn->kn_flags |= EV_EOF;
3608                 return (1);
3609         }
3610         return (kn->kn_fflags != 0);
3611 }
3612