HAMMER VFS - Remove B-Tree allocation hints, add double_buffer option.
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <sys/file.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>

#include <sys/mplock2.h>

#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

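/*
 * Vnode operations for regular HAMMER files and directories.  Entries
 * not listed here fall through to vop_defaultop.
 */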
struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         vop_stdpathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_markatime =        hammer_vop_markatime,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl,
        .vop_kqfilter =         hammer_vop_kqfilter
};

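/*
 * Vnode operations for special files (block/character devices).  Plain
 * read/write is disallowed via vop_stdnoread/vop_stdnowrite; attribute
 * operations still go through HAMMER.
 */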
struct vop_ops hammer_spec_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             vop_stdnoread,
        .vop_write =            vop_stdnowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_close,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

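/*
 * Vnode operations for fifos.  Most operations default to
 * fifo_vnoperate; HAMMER supplies attribute handling and fsync, and
 * (presumably) wraps the fifo read/write/close/kqfilter paths.
 */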
struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr,
        .vop_kqfilter =         hammer_vop_fifokqfilter
};

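/*
 * Deliver a kqueue notification on the vnode, but only when at least
 * one event flag is actually set.
 */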
static __inline
void
hammer_knote(struct vnode *vp, int flags)
{
        if (flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred,
                           int flags, int isdir);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
 *       a REDO log.  A sysctl is provided to relax HAMMER's fsync()
 *       operation.
 *
 *       Ultimately the combination of a REDO log and use of fast storage
 *       to front-end cluster caches will make fsync fast, but it ain't
 *       here yet.  And, in any case, we need real transactional
 *       all-or-nothing features which are not restricted to a single file.
 */
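/*
 * Quick reference for the hammer_fsync_mode settings handled by the
 * switch below.  Modes 2 and 3 require a version 4+ volume and fall
 * back to modes 0 and 1 respectively on older volumes:
 *
 *      0 - no REDO, full synchronous flush
 *      1 - no REDO, full asynchronous flush
 *      2 - REDO semantics, synchronous flush
 *      3 - REDO semantics, relaxed asynchronous flush
 *      4 - ignore the fsync() system call entirely
 */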
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);
        hammer_mount_t hmp = ip->hmp;
        int waitfor = ap->a_waitfor;
        int mode;

        lwkt_gettoken(&hmp->fs_token);

        /*
         * Fsync rule relaxation (default is either full synchronous flush
         * or REDO semantics with synchronous flush).
         */
        if (ap->a_flags & VOP_FSYNC_SYSCALL) {
                switch(hammer_fsync_mode) {
                case 0:
mode0:
                        /* no REDO, full synchronous flush */
                        goto skip;
                case 1:
mode1:
                        /* no REDO, full asynchronous flush */
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        goto skip;
                case 2:
                        /* REDO semantics, synchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode0;
                        mode = HAMMER_FLUSH_UNDOS_AUTO;
                        break;
                case 3:
                        /* REDO semantics, relaxed asynchronous flush */
                        if (hmp->version < HAMMER_VOL_VERSION_FOUR)
                                goto mode1;
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                case 4:
                        /* ignore the fsync() system call */
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                default:
                        /* we have to do something */
                        mode = HAMMER_FLUSH_UNDOS_RELAXED;
                        if (waitfor == MNT_WAIT)
                                waitfor = MNT_NOWAIT;
                        break;
                }

                /*
                 * Fast fsync only needs to flush the UNDO/REDO fifo if
                 * HAMMER_INODE_REDO is non-zero and the only modifications
                 * made to the file are write or write-extends.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) &&
                    (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
                ) {
                        ++hammer_count_fsyncs;
                        hammer_flusher_flush_undos(hmp, mode);
                        ip->redo_count = 0;
                        lwkt_reltoken(&hmp->fs_token);
                        return(0);
                }

                /*
                 * REDO is enabled by fsync(), the idea being we really only
                 * want to lay down REDO records when programs are using
                 * fsync() heavily.  The first fsync() on the file starts
                 * the gravy train going and later fsync()s keep it hot by
                 * resetting the redo_count.
                 *
                 * We weren't running REDOs before now so we have to fall
                 * through and do a full fsync of what we have.
                 */
                if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
                    (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
                        ip->flags |= HAMMER_INODE_REDO;
                        ip->redo_count = 0;
                }
        }
skip:

        /*
         * Do a full flush sequence.
         */
        ++hammer_count_fsyncs;
        vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (waitfor == MNT_WAIT) {
                vn_unlock(ap->a_vp);
                hammer_wait_inode(ip);
                vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * MPSAFE (the cache-safe path does not require fs_token)
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        hammer_mount_t hmp;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;
        int ioseqcount;
        int blksize;
        int bigread;
        int got_fstoken;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        uio = ap->a_uio;

        /*
         * Allow the UIO's size to override the sequential heuristic.
         */
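        /*
         * By convention the upper 16 bits of a_ioflag carry the caller's
         * sequential-access heuristic; we take the larger of that value
         * and an estimate derived from the size of this UIO.
         */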
        blksize = hammer_blocksize(uio->uio_offset);
        seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
        ioseqcount = (ap->a_ioflag >> 16);
        if (seqcount < ioseqcount)
                seqcount = ioseqcount;

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         */
        bigread = (uio->uio_resid > 100 * 1024 * 1024);
        got_fstoken = 0;

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         *
         * XXX Temporary hack, delay the start transaction while we remain
         *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
         *     locked-shared.
         */
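        /*
         * Note that hammer_blocksize() is expected to return the small
         * buffer size below the large-block demarc (HAMMER_XDEMARC) and
         * the large buffer size at or above it, which is why blksize is
         * recomputed on every iteration of the loop below.
         */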
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                int64_t base_offset;
                int64_t file_limit;

                blksize = hammer_blocksize(uio->uio_offset);
                offset = (int)uio->uio_offset & (blksize - 1);
                base_offset = uio->uio_offset - offset;

                if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
                        break;

                /*
                 * MPSAFE
                 */
                bp = getcacheblk(ap->a_vp, base_offset, blksize);
                if (bp) {
                        error = 0;
                        goto skip;
                }

                /*
                 * MPUNSAFE
                 */
                if (got_fstoken == 0) {
                        lwkt_gettoken(&hmp->fs_token);
                        got_fstoken = 1;
                        hammer_start_transaction(&trans, ip->hmp);
                }

                if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
                         * creating buffers of the wrong block size past
                         * the demarc.
                         */
                        file_limit = ip->ino_data.size;
                        if (base_offset < HAMMER_XDEMARC &&
                            file_limit > HAMMER_XDEMARC) {
                                file_limit = HAMMER_XDEMARC;
                        }
                        error = cluster_read(ap->a_vp,
                                             file_limit, base_offset,
                                             blksize, uio->uio_resid,
                                             seqcount * BKVASIZE, &bp);
                } else {
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                }
                if (error) {
                        brelse(bp);
                        break;
                }
skip:
                if ((hammer_debug_io & 0x0001) && (bp->b_flags & B_IODEBUG)) {
                        kprintf("doff %016jx read file %016jx@%016jx\n",
                                (intmax_t)bp->b_bio2.bio_offset,
                                (intmax_t)ip->obj_id,
                                (intmax_t)bp->b_loffset);
                }
                bp->b_flags &= ~B_IODEBUG;

                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                if (got_fstoken)
                        lwkt_reltoken(&hmp->fs_token);
                error = uiomove((char *)bp->b_data + offset, n, uio);
                if (got_fstoken)
                        lwkt_gettoken(&hmp->fs_token);

                /* data has a lower priority than meta-data */
                bp->b_flags |= B_AGE;
                bqrelse(bp);
                if (error)
                        break;
                hammer_stats_file_read += n;
        }

        /*
         * XXX only update the atime if we had to get the MP lock.
         * XXX hack hack hack, fixme.
         */
        if (got_fstoken) {
                if ((ip->flags & HAMMER_INODE_RO) == 0 &&
                    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                        ip->ino_data.atime = trans.time;
                        hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
                }
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
        }
        return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        struct uio *uio;
        int offset;
        off_t base_offset;
        struct buf *bp;
        int kflags;
        int error;
        int n;
        int flags;
        int seqcount;
        int bigwrite;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        kflags = 0;
        seqcount = ap->a_ioflag >> 16;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_offset assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
                hammer_done_transaction(&trans);
                lwkt_reltoken(&hmp->fs_token);
                return (EFBIG);
        }

        /*
         * If reading or writing a huge amount of data we have to break
         * atomicity and allow the operation to be interrupted by a signal
         * or it can DOS the machine.
         *
         * Preset redo_count so we stop generating REDOs earlier if the
         * limit is exceeded.
         */
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
        if ((ip->flags & HAMMER_INODE_REDO) &&
            ip->redo_count < hammer_limit_redo) {
                ip->redo_count += uio->uio_resid;
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;
                int blksize;
                int blkmask;
                int trivial;
                int endofblk;
                off_t nsize;

                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;
                if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
                        break;

                blksize = hammer_blocksize(uio->uio_offset);

                /*
                 * Do not allow HAMMER to blow out the buffer cache.  Very
                 * large UIOs can lock out other processes due to bwillwrite()
                 * mechanics.
                 *
                 * The hammer inode is not locked during these operations.
                 * The vnode is locked which can interfere with the pageout
                 * daemon for non-UIO_NOCOPY writes but should not interfere
                 * with the buffer cache.  Even so, we cannot afford to
                 * allow the pageout daemon to build up too many dirty buffer
                 * cache buffers.
                 *
                 * Only call this if we aren't being recursively called from
                 * a virtual disk device (vn), else we may deadlock.
                 */
                if ((ap->a_ioflag & IO_RECURSE) == 0)
                        bwillwrite(blksize);

                /*
                 * Control the number of pending records associated with
                 * this inode.  If too many have accumulated start a
                 * flush.  Try to maintain a pipeline with the flusher.
                 */
                if (ip->rsv_recs >= hammer_limit_inode_recs) {
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                }
                if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
                        while (ip->rsv_recs >= hammer_limit_inode_recs) {
                                tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
                        }
                        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                }

#if 0
                /*
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are so well
                 * decoupled from the buffer cache that it is possible
                 * for userland to push data out to the media via
                 * direct-write, but build up the records queued to the
                 * backend faster than the backend can flush them out.
                 * HAMMER has hit its write limit but the frontend has
                 * no pushback to slow it down.
                 */
                if (hmp->rsv_recs > hammer_limit_recs / 2) {
                        /*
                         * Get the inode on the flush list
                         */
                        if (ip->rsv_recs >= 64)
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        else if (ip->rsv_recs >= 16)
                                hammer_flush_inode(ip, 0);

                        /*
                         * Keep the flusher going if the system keeps
                         * queueing records.
                         */
                        delta = hmp->count_newrecords -
                                hmp->last_newrecords;
                        if (delta < 0 || delta > hammer_limit_recs / 2) {
                                hmp->last_newrecords = hmp->count_newrecords;
                                hammer_sync_hmp(hmp, MNT_NOWAIT);
                        }

                        /*
                         * If we have gotten behind start slowing
                         * down the writers.
                         */
                        delta = (hmp->rsv_recs - hammer_limit_recs) *
                                hz / hammer_limit_recs;
                        if (delta > 0)
                                tsleep(&trans, 0, "hmrslo", delta);
                }
#endif

                /*
                 * Calculate the blocksize at the current offset and figure
                 * out how much we can actually write.
                 */
                blkmask = blksize - 1;
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
                        endofblk = 0;
                } else {
                        endofblk = 1;
                }
                nsize = uio->uio_offset + n;
                if (nsize > ip->ino_data.size) {
                        if (uio->uio_offset > ip->ino_data.size)
                                trivial = 0;
                        else
                                trivial = 1;
                        nvextendbuf(ap->a_vp,
                                    ip->ino_data.size,
                                    nsize,
                                    hammer_blocksize(ip->ino_data.size),
                                    hammer_blocksize(nsize),
                                    hammer_blockoff(ip->ino_data.size),
                                    hammer_blockoff(nsize),
                                    trivial);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }

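                /*
                 * Acquire the buffer using one of four strategies
                 * depending on how the write overlaps existing data:
                 * UIO_NOCOPY rewrites a buffer in place, a full-block
                 * overwrite can skip the read, a block lying entirely
                 * beyond EOF only needs to be zeroed, and a partial
                 * overwrite must read the existing block first.
                 */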
                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              blksize, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= blksize) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        lwkt_reltoken(&hmp->fs_token);
                        error = uiomove(bp->b_data + offset, n, uio);
                        lwkt_gettoken(&hmp->fs_token);
                }

                /*
                 * Generate REDO records if enabled and redo_count will not
                 * exceed the limit.
                 *
                 * If redo_count exceeds the limit we stop generating records
                 * and clear HAMMER_INODE_REDO.  This will cause the next
                 * fsync() to do a full meta-data sync instead of just an
                 * UNDO/REDO fifo update.
                 *
                 * When clearing HAMMER_INODE_REDO any pre-existing REDOs
                 * will still be tracked.  The tracks will be terminated
                 * when the related meta-data (including possible data
                 * modifications which are not tracked via REDO) is
                 * flushed.
                 */
                if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
                        if (ip->redo_count < hammer_limit_redo) {
                                bp->b_flags |= B_VFSFLAG1;
                                error = hammer_generate_redo(&trans, ip,
                                                     base_offset + offset,
                                                     HAMMER_REDO_WRITE,
                                                     bp->b_data + offset,
                                                     (size_t)n);
                        } else {
                                ip->flags &= ~HAMMER_INODE_REDO;
                        }
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                nvtruncbuf(ap->a_vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size),
                                          hammer_blockoff(ip->ino_data.size));
                        }
                        break;
                }
                kflags |= NOTE_WRITE;
                hammer_stats_file_write += n;
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_SDIRTY;
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(&trans, ip, flags);

                /*
                 * Once we dirty the buffer any cached zone-X offset
                 * becomes invalid.  HAMMER NOTE: no-history mode cannot
                 * allow overwriting over the same data sector unless
                 * we provide UNDOs for the old data, which we don't.
                 */
                bp->b_bio2.bio_offset = NOOFFSET;

                /*
                 * Final buffer disposition.
                 *
                 * Because meta-data updates are deferred, HAMMER is
                 * especially sensitive to excessive bdwrite()s because
                 * the I/O stream is not broken up by disk reads.  So the
                 * buffer cache simply cannot keep up.
                 *
                 * WARNING!  blksize is variable.  cluster_write() is
                 *           expected to not blow up if it encounters
                 *           buffers that do not match the passed blksize.
                 *
                 * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
                 *        The ip->rsv_recs check should burst-flush the data.
                 *        If we queue it immediately the buf could be left
                 *        locked on the device queue for a very long time.
                 *
                 * NOTE!  To avoid degenerate stalls due to mismatched block
                 *        sizes we only honor IO_DIRECT on the write which
                 *        abuts the end of the buffer.  However, we must
                 *        honor IO_SYNC in case someone is silly enough to
                 *        configure a HAMMER file as swap, or when HAMMER
                 *        is serving NFS (for commits).  Ick ick.
                 */
                bp->b_flags |= B_AGE;
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else {
#if 0
                if (offset + n == blksize) {
                        if (hammer_cluster_enable == 0 ||
                            (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
                                bawrite(bp);
                        } else {
                                cluster_write(bp, ip->ino_data.size,
                                              blksize, seqcount);
                        }
                } else {
#endif
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        ++hammer_stats_file_iopsr;
        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.  XXX disabled for now.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
        struct vnode *vp = ap->a_vp;
        hammer_inode_t ip = VTOI(vp);
        int waitfor;
        if (ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) {
                if (vn_islocked(vp) == LK_EXCLUSIVE &&
                    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
                        if (ip->flags & HAMMER_INODE_CLOSESYNC)
                                waitfor = MNT_WAIT;
                        else
                                waitfor = MNT_NOWAIT;
                        ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
                                       HAMMER_INODE_CLOSEASYNC);
                        VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
                }
        }
#endif
        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        hammer_mount_t hmp;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        hmp = dip->hmp;

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        lwkt_gettoken(&hmp->fs_token);
        hammer_start_transaction(&trans, hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
                                    NULL, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                lwkt_reltoken(&hmp->fs_token);
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 *
 * MPSAFE - does not require fs_token
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

        /*
         * We want the fsid to be different when accessing a filesystem
         * with different as-of's so programs like diff don't think
         * the files are the same.
         *
         * We also want the fsid to be the same when comparing snapshots,
         * or when comparing mirrors (which might be backed by different
         * physical devices).  HAMMER fsids are based on the PFS's
         * shared_uuid field.
         *
         * XXX there is a chance of collision here.  The va_fsid reported
         * by stat is different from the more involved fsid used in the
         * mount structure.
         */
        ++hammer_stats_file_iopsr;
        hammer_lock_sh(&ip->lock);
        vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
                       (u_int32_t)(ip->obj_asof >> 32);

        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;

        /*
         * Special case for @@PFS softlinks.  The actual size of the
         * expanded softlink is "@@0x%016llx:%05d" == 26 bytes,
         * or for MAX_TID it is "@@-1:%05d" == 10 bytes.
         */
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
            ip->ino_data.size == 10 &&
            ip->obj_asof == HAMMER_MAX_TID &&
            ip->obj_localization == 0 &&
            strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
                    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
                            vap->va_size = 26;
                    else
                            vap->va_size = 10;
        }

        /*
         * We must provide a consistent atime and mtime for snapshots
         * so people can do a 'tar cf - ... | md5' on them and get
         * consistent results.
         */
        if (ip->flags & HAMMER_INODE_RO) {
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
        } else {
                hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        }
        hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
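        /*
         * va_bytes is rounded up to the storage granularity in effect
         * for the file's size: large-buffer multiples at or above the
         * demarc, small-buffer multiples for mid-sized files, and
         * 16-byte granularity for very small files.
         */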
        if (ip->ino_data.size >= HAMMER_XDEMARC) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
                                ~HAMMER_XBUFMASK64;
        } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
                                ~HAMMER_BUFMASK64;
        } else {
                vap->va_bytes = (ip->ino_data.size + 15) & ~15;
        }

        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }
        hammer_unlock(&ip->lock);
        return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_mount_t hmp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        int ispfs;
        int64_t obj_id;
        u_int32_t localization;
        u_int32_t max_iterations;

        /*
         * Misc initialization, plus handle as-of name extensions.  Look for
         * the '@@' extension.  Note that as-of files and directories cannot
         * be modified.
         */
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        localization = dip->obj_localization;   /* for code consistency */
        nlen = ncp->nc_nlen;
        flags = dip->flags & HAMMER_INODE_RO;
        ispfs = 0;
        hmp = dip->hmp;

        lwkt_gettoken(&hmp->fs_token);
        hammer_simple_transaction(&trans, hmp);
        ++hammer_stats_file_iopsr;

        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        error = hammer_str_to_tid(ncp->nc_name + i + 2,
                                                  &ispfs, &asof, &localization);
                        if (error != 0) {
                                i = nlen;
                                break;
                        }
                        if (asof != HAMMER_MAX_TID)
                                flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;

        /*
         * If this is a PFS softlink we dive into the PFS
         */
        if (ispfs && nlen == 0) {
                ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
                                      asof, localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * If there is no path component the time extension is relative to dip.
         * e.g. "fubar/@@<snapshot>"
         *
         * "." is handled by the kernel, but ".@@<snapshot>" is not.
         * e.g. "fubar/.@@<snapshot>"
         *
         * ".." is handled by the kernel.  We do not currently handle
         * "..@<snapshot>".
         */
        if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
                ip = hammer_get_inode(&trans, dip, dip->obj_id,
                                      asof, dip->obj_localization,
                                      flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
                                           &max_iterations);
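        /*
         * max_iterations, filled in by hammer_directory_namekey(), bounds
         * the hash chain and therefore the size of the inclusive key
         * range scanned below (key_end.key = key_beg.key + max_iterations).
         */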

        error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
        cursor.key_beg.localization = dip->obj_localization +
                                      hammer_dir_localization(dip);
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key += max_iterations;
        cursor.asof = asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        obj_id = 0;
        localization = HAMMER_DEF_LOCALIZATION;

        if (error == 0) {
                error = hammer_ip_first(&cursor);
                while (error == 0) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error)
                                break;
                        if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
                            bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                                obj_id = cursor.data->entry.obj_id;
                                localization = cursor.data->entry.localization;
                                break;
                        }
                        error = hammer_ip_next(&cursor);
                }
        }
        hammer_done_cursor(&cursor);

        /*
         * Lookup the obj_id.  This should always succeed.  If it does not
         * the filesystem may be damaged and we return a dummy inode.
         */
        if (error == 0) {
                ip = hammer_get_inode(&trans, dip, obj_id,
                                      asof, localization,
                                      flags, &error);
                if (error == ENOENT) {
                        kprintf("HAMMER: WARNING: Missing "
                                "inode for dirent \"%s\"\n"
                                "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
                                ncp->nc_name,
                                (long long)obj_id, (long long)asof,
                                localization);
                        error = 0;
                        ip = hammer_get_dummy_inode(&trans, dip, obj_id,
                                                    asof, localization,
                                                    flags, &error);
                }
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
        } else if (error == ENOENT) {
                cache_setvp(ap->a_nch, NULL);
        }
done:
        hammer_done_transaction(&trans);
        lwkt_reltoken(&hmp->fs_token);
        return (error);
}
1274
1275 /*
1276  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1277  *
1278  * Locate the parent directory of a directory vnode.
1279  *
1280  * dvp is referenced but not locked.  *vpp must be returned referenced and
1281  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
1282  * at the root, instead it could indicate that the directory we were in was
1283  * removed.
1284  *
1285  * NOTE: as-of sequences are not linked into the directory structure.  If
1286  * we are at the root with a different asof then the mount point, reload
1287  * the same directory with the mount point's asof.   I'm not sure what this
1288  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
1289  * get confused, but it hasn't been tested.
1290  */
1291 static
1292 int
1293 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1294 {
1295         struct hammer_transaction trans;
1296         struct hammer_inode *dip;
1297         struct hammer_inode *ip;
1298         hammer_mount_t hmp;
1299         int64_t parent_obj_id;
1300         u_int32_t parent_obj_localization;
1301         hammer_tid_t asof;
1302         int error;
1303
1304         dip = VTOI(ap->a_dvp);
1305         asof = dip->obj_asof;
1306         hmp = dip->hmp;
1307
1308         /*
1309          * Whos are parent?  This could be the root of a pseudo-filesystem
1310          * whos parent is in another localization domain.
1311          */
1312         lwkt_gettoken(&hmp->fs_token);
1313         parent_obj_id = dip->ino_data.parent_obj_id;
1314         if (dip->obj_id == HAMMER_OBJID_ROOT)
1315                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1316         else
1317                 parent_obj_localization = dip->obj_localization;
1318
1319         if (parent_obj_id == 0) {
1320                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
1321                    asof != hmp->asof) {
1322                         parent_obj_id = dip->obj_id;
1323                         asof = hmp->asof;
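                        /*
                         * 19 bytes: "0x" + 16 hex digits + terminating NUL
                         * for the synthesized as-of component name.
                         */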
1324                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1325                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
1326                                   (long long)dip->obj_asof);
1327                 } else {
1328                         *ap->a_vpp = NULL;
1329                         lwkt_reltoken(&hmp->fs_token);
1330                         return ENOENT;
1331                 }
1332         }
1333
1334         hammer_simple_transaction(&trans, hmp);
1335         ++hammer_stats_file_iopsr;
1336
1337         ip = hammer_get_inode(&trans, dip, parent_obj_id,
1338                               asof, parent_obj_localization,
1339                               dip->flags, &error);
1340         if (ip) {
1341                 error = hammer_get_vnode(ip, ap->a_vpp);
1342                 hammer_rel_inode(ip, 0);
1343         } else {
1344                 *ap->a_vpp = NULL;
1345         }
1346         hammer_done_transaction(&trans);
1347         lwkt_reltoken(&hmp->fs_token);
1348         return (error);
1349 }
1350
1351 /*
1352  * hammer_vop_nlink { nch, dvp, vp, cred }
1353  */
1354 static
1355 int
1356 hammer_vop_nlink(struct vop_nlink_args *ap)
1357 {
1358         struct hammer_transaction trans;
1359         struct hammer_inode *dip;
1360         struct hammer_inode *ip;
1361         struct nchandle *nch;
1362         hammer_mount_t hmp;
1363         int error;
1364
1365         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
1366                 return(EXDEV);
1367
1368         nch = ap->a_nch;
1369         dip = VTOI(ap->a_dvp);
1370         ip = VTOI(ap->a_vp);
1371         hmp = dip->hmp;
1372
1373         if (dip->obj_localization != ip->obj_localization)
1374                 return(EXDEV);
1375
1376         if (dip->flags & HAMMER_INODE_RO)
1377                 return (EROFS);
1378         if (ip->flags & HAMMER_INODE_RO)
1379                 return (EROFS);
1380         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1381                 return (error);
1382
1383         /*
1384          * Create a transaction to cover the operations we perform.
1385          */
1386         lwkt_gettoken(&hmp->fs_token);
1387         hammer_start_transaction(&trans, hmp);
1388         ++hammer_stats_file_iopsw;
1389
1390         /*
1391          * Add the filesystem object to the directory.  Note that neither
1392          * dip nor ip is referenced or locked, but their vnodes are
1393          * referenced.  This function will bump the inode's link count.
1394          */
1395         error = hammer_ip_add_directory(&trans, dip,
1396                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1397                                         ip);
1398
1399         /*
1400          * Finish up.
1401          */
1402         if (error == 0) {
1403                 cache_setunresolved(nch);
1404                 cache_setvp(nch, ap->a_vp);
1405         }
1406         hammer_done_transaction(&trans);
1407         hammer_knote(ap->a_vp, NOTE_LINK);
1408         hammer_knote(ap->a_dvp, NOTE_WRITE);
1409         lwkt_reltoken(&hmp->fs_token);
1410         return (error);
1411 }
1412
1413 /*
1414  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1415  *
1416  * The operating system has already ensured that the directory entry
1417  * does not exist and done all appropriate namespace locking.
1418  */
1419 static
1420 int
1421 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1422 {
1423         struct hammer_transaction trans;
1424         struct hammer_inode *dip;
1425         struct hammer_inode *nip;
1426         struct nchandle *nch;
1427         hammer_mount_t hmp;
1428         int error;
1429
1430         nch = ap->a_nch;
1431         dip = VTOI(ap->a_dvp);
1432         hmp = dip->hmp;
1433
1434         if (dip->flags & HAMMER_INODE_RO)
1435                 return (EROFS);
1436         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1437                 return (error);
1438
1439         /*
1440          * Create a transaction to cover the operations we perform.
1441          */
1442         lwkt_gettoken(&hmp->fs_token);
1443         hammer_start_transaction(&trans, hmp);
1444         ++hammer_stats_file_iopsw;
1445
1446         /*
1447          * Create a new filesystem object of the requested type.  The
1448          * returned inode will be referenced but not locked.
1449          */
1450         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1451                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1452                                     NULL, &nip);
1453         if (error) {
1454                 hkprintf("hammer_mkdir error %d\n", error);
1455                 hammer_done_transaction(&trans);
1456                 *ap->a_vpp = NULL;
1457                 lwkt_reltoken(&hmp->fs_token);
1458                 return (error);
1459         }
1460         /*
1461          * Add the new filesystem object to the directory.  This will also
1462          * bump the inode's link count.
1463          */
1464         error = hammer_ip_add_directory(&trans, dip,
1465                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1466                                         nip);
1467         if (error)
1468                 hkprintf("hammer_mkdir (add) error %d\n", error);
1469
1470         /*
1471          * Finish up.
1472          */
1473         if (error) {
1474                 hammer_rel_inode(nip, 0);
1475                 *ap->a_vpp = NULL;
1476         } else {
1477                 error = hammer_get_vnode(nip, ap->a_vpp);
1478                 hammer_rel_inode(nip, 0);
1479                 if (error == 0) {
1480                         cache_setunresolved(ap->a_nch);
1481                         cache_setvp(ap->a_nch, *ap->a_vpp);
1482                 }
1483         }
1484         hammer_done_transaction(&trans);
1485         if (error == 0)
1486                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1487         lwkt_reltoken(&hmp->fs_token);
1488         return (error);
1489 }
1490
1491 /*
1492  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1493  *
1494  * The operating system has already ensured that the directory entry
1495  * does not exist and done all appropriate namespace locking.
1496  */
1497 static
1498 int
1499 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1500 {
1501         struct hammer_transaction trans;
1502         struct hammer_inode *dip;
1503         struct hammer_inode *nip;
1504         struct nchandle *nch;
1505         hammer_mount_t hmp;
1506         int error;
1507
1508         nch = ap->a_nch;
1509         dip = VTOI(ap->a_dvp);
1510         hmp = dip->hmp;
1511
1512         if (dip->flags & HAMMER_INODE_RO)
1513                 return (EROFS);
1514         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1515                 return (error);
1516
1517         /*
1518          * Create a transaction to cover the operations we perform.
1519          */
1520         lwkt_gettoken(&hmp->fs_token);
1521         hammer_start_transaction(&trans, hmp);
1522         ++hammer_stats_file_iopsw;
1523
1524         /*
1525          * Create a new filesystem object of the requested type.  The
1526          * returned inode will be referenced but not locked.
1527          *
1528          * If mknod specifies a directory, a pseudo-fs is created.
1529          */
1530         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1531                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1532                                     NULL, &nip);
1533         if (error) {
1534                 hammer_done_transaction(&trans);
1535                 *ap->a_vpp = NULL;
1536                 lwkt_reltoken(&hmp->fs_token);
1537                 return (error);
1538         }
1539
1540         /*
1541          * Add the new filesystem object to the directory.  This will also
1542          * bump the inode's link count.
1543          */
1544         error = hammer_ip_add_directory(&trans, dip,
1545                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1546                                         nip);
1547
1548         /*
1549          * Finish up.
1550          */
1551         if (error) {
1552                 hammer_rel_inode(nip, 0);
1553                 *ap->a_vpp = NULL;
1554         } else {
1555                 error = hammer_get_vnode(nip, ap->a_vpp);
1556                 hammer_rel_inode(nip, 0);
1557                 if (error == 0) {
1558                         cache_setunresolved(ap->a_nch);
1559                         cache_setvp(ap->a_nch, *ap->a_vpp);
1560                 }
1561         }
1562         hammer_done_transaction(&trans);
1563         if (error == 0)
1564                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1565         lwkt_reltoken(&hmp->fs_token);
1566         return (error);
1567 }
1568
1569 /*
1570  * hammer_vop_open { vp, mode, cred, fp }
1571  *
1572  * MPSAFE (does not require fs_token)
1573  */
1574 static
1575 int
1576 hammer_vop_open(struct vop_open_args *ap)
1577 {
1578         hammer_inode_t ip;
1579
1580         ++hammer_stats_file_iopsr;
1581         ip = VTOI(ap->a_vp);
1582
1583         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1584                 return (EROFS);
1585         return(vop_stdopen(ap));
1586 }
1587
1588 /*
1589  * hammer_vop_print { vp }
1590  */
1591 static
1592 int
1593 hammer_vop_print(struct vop_print_args *ap)
1594 {
1595         return EOPNOTSUPP;
1596 }
1597
1598 /*
1599  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1600  */
1601 static
1602 int
1603 hammer_vop_readdir(struct vop_readdir_args *ap)
1604 {
1605         struct hammer_transaction trans;
1606         struct hammer_cursor cursor;
1607         struct hammer_inode *ip;
1608         hammer_mount_t hmp;
1609         struct uio *uio;
1610         hammer_base_elm_t base;
1611         int error;
1612         int cookie_index;
1613         int ncookies;
1614         off_t *cookies;
1615         off_t saveoff;
1616         int r;
1617         int dtype;
1618
1619         ++hammer_stats_file_iopsr;
1620         ip = VTOI(ap->a_vp);
1621         uio = ap->a_uio;
1622         saveoff = uio->uio_offset;
1623         hmp = ip->hmp;
1624
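        /*
         * Estimate the cookie count: assume roughly 16 bytes of uio
         * space per directory entry (a heuristic, not a hard dirent
         * size) and clamp the allocation at 1024 cookies per pass.
         */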
1625         if (ap->a_ncookies) {
1626                 ncookies = uio->uio_resid / 16 + 1;
1627                 if (ncookies > 1024)
1628                         ncookies = 1024;
1629                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1630                 cookie_index = 0;
1631         } else {
1632                 ncookies = -1;
1633                 cookies = NULL;
1634                 cookie_index = 0;
1635         }
1636
1637         lwkt_gettoken(&hmp->fs_token);
1638         hammer_simple_transaction(&trans, hmp);
1639
1640         /*
1641          * Handle artificial entries
1642          *
1643          * It should be noted that the minimum value for a directory
1644          * hash key on-media is 0x0000000100000000, so we can use anything
1645          * less then that to represent our 'special' key space.
1646          */
1647         error = 0;
1648         if (saveoff == 0) {
1649                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1650                 if (r)
1651                         goto done;
1652                 if (cookies)
1653                         cookies[cookie_index] = saveoff;
1654                 ++saveoff;
1655                 ++cookie_index;
1656                 if (cookie_index == ncookies)
1657                         goto done;
1658         }
1659         if (saveoff == 1) {
1660                 if (ip->ino_data.parent_obj_id) {
1661                         r = vop_write_dirent(&error, uio,
1662                                              ip->ino_data.parent_obj_id,
1663                                              DT_DIR, 2, "..");
1664                 } else {
1665                         r = vop_write_dirent(&error, uio,
1666                                              ip->obj_id, DT_DIR, 2, "..");
1667                 }
1668                 if (r)
1669                         goto done;
1670                 if (cookies)
1671                         cookies[cookie_index] = saveoff;
1672                 ++saveoff;
1673                 ++cookie_index;
1674                 if (cookie_index == ncookies)
1675                         goto done;
1676         }
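#if 0
        /*
         * Illustrative sketch only (not compiled): offsets 0 and 1
         * were synthesized above for "." and "..".  Because the
         * minimum on-media directory hash key is 0x0000000100000000,
         * a cookie can be classified without ambiguity:
         */
        int is_artificial = (saveoff < 0x0000000100000000LL);
#endif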
1677
1678         /*
1679          * Key range (begin and end inclusive) to scan.  Directory keys
1680          * directly translate to a 64 bit 'seek' position.
1681          */
1682         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1683         cursor.key_beg.localization = ip->obj_localization +
1684                                       hammer_dir_localization(ip);
1685         cursor.key_beg.obj_id = ip->obj_id;
1686         cursor.key_beg.create_tid = 0;
1687         cursor.key_beg.delete_tid = 0;
1688         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1689         cursor.key_beg.obj_type = 0;
1690         cursor.key_beg.key = saveoff;
1691
1692         cursor.key_end = cursor.key_beg;
1693         cursor.key_end.key = HAMMER_MAX_KEY;
1694         cursor.asof = ip->obj_asof;
1695         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1696
1697         error = hammer_ip_first(&cursor);
1698
1699         while (error == 0) {
1700                 error = hammer_ip_resolve_data(&cursor);
1701                 if (error)
1702                         break;
1703                 base = &cursor.leaf->base;
1704                 saveoff = base->key;
1705                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1706
1707                 if (base->obj_id != ip->obj_id)
1708                         panic("readdir: bad record at %p", cursor.node);
1709
1710                 /*
1711                  * Convert pseudo-filesystems into softlinks
1712                  */
1713                 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1714                 r = vop_write_dirent(
1715                              &error, uio, cursor.data->entry.obj_id,
1716                              dtype,
1717                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
1718                              (void *)cursor.data->entry.name);
1719                 if (r)
1720                         break;
1721                 ++saveoff;
1722                 if (cookies)
1723                         cookies[cookie_index] = base->key;
1724                 ++cookie_index;
1725                 if (cookie_index == ncookies)
1726                         break;
1727                 error = hammer_ip_next(&cursor);
1728         }
1729         hammer_done_cursor(&cursor);
1730
1731 done:
1732         hammer_done_transaction(&trans);
1733
1734         if (ap->a_eofflag)
1735                 *ap->a_eofflag = (error == ENOENT);
1736         uio->uio_offset = saveoff;
1737         if (error && cookie_index == 0) {
1738                 if (error == ENOENT)
1739                         error = 0;
1740                 if (cookies) {
1741                         kfree(cookies, M_TEMP);
1742                         *ap->a_ncookies = 0;
1743                         *ap->a_cookies = NULL;
1744                 }
1745         } else {
1746                 if (error == ENOENT)
1747                         error = 0;
1748                 if (cookies) {
1749                         *ap->a_ncookies = cookie_index;
1750                         *ap->a_cookies = cookies;
1751                 }
1752         }
1753         lwkt_reltoken(&hmp->fs_token);
1754         return(error);
1755 }
1756
1757 /*
1758  * hammer_vop_readlink { vp, uio, cred }
1759  */
1760 static
1761 int
1762 hammer_vop_readlink(struct vop_readlink_args *ap)
1763 {
1764         struct hammer_transaction trans;
1765         struct hammer_cursor cursor;
1766         struct hammer_inode *ip;
1767         hammer_mount_t hmp;
1768         char buf[32];
1769         u_int32_t localization;
1770         hammer_pseudofs_inmem_t pfsm;
1771         int error;
1772
1773         ip = VTOI(ap->a_vp);
1774         hmp = ip->hmp;
1775
1776         lwkt_gettoken(&hmp->fs_token);
1777
1778         /*
1779          * Shortcut if the symlink data was stuffed into ino_data.
1780          *
1781          * Also expand special "@@PFS%05d" softlinks (expansion only
1782          * occurs for non-historical (current) accesses made from the
1783          * primary filesystem).
1784          */
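        /*
         * For example (illustrative): the softlink "@@PFS00005"
         * expands to "@@-1:00005" for a master PFS, or to
         * "@@0x<sync_end_tid>:00005" for a slave so that accesses
         * snap to the slave's last synchronized transaction id.
         */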
1785         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1786                 char *ptr;
1787                 int bytes;
1788
1789                 ptr = ip->ino_data.ext.symlink;
1790                 bytes = (int)ip->ino_data.size;
1791                 if (bytes == 10 &&
1792                     ip->obj_asof == HAMMER_MAX_TID &&
1793                     ip->obj_localization == 0 &&
1794                     strncmp(ptr, "@@PFS", 5) == 0) {
1795                         hammer_simple_transaction(&trans, hmp);
1796                         bcopy(ptr + 5, buf, 5);
1797                         buf[5] = 0;
1798                         localization = strtoul(buf, NULL, 10) << 16;
1799                         pfsm = hammer_load_pseudofs(&trans, localization,
1800                                                     &error);
1801                         if (error == 0) {
1802                                 if (pfsm->pfsd.mirror_flags &
1803                                     HAMMER_PFSD_SLAVE) {
1804                                         /* vap->va_size == 26 */
1805                                         ksnprintf(buf, sizeof(buf),
1806                                                   "@@0x%016llx:%05d",
1807                                                   (long long)pfsm->pfsd.sync_end_tid,
1808                                                   localization >> 16);
1809                                 } else {
1810                                         /* vap->va_size == 10 */
1811                                         ksnprintf(buf, sizeof(buf),
1812                                                   "@@-1:%05d",
1813                                                   localization >> 16);
1814 #if 0
1815                                         ksnprintf(buf, sizeof(buf),
1816                                                   "@@0x%016llx:%05d",
1817                                                   (long long)HAMMER_MAX_TID,
1818                                                   localization >> 16);
1819 #endif
1820                                 }
1821                                 ptr = buf;
1822                                 bytes = strlen(buf);
1823                         }
1824                         if (pfsm)
1825                                 hammer_rel_pseudofs(hmp, pfsm);
1826                         hammer_done_transaction(&trans);
1827                 }
1828                 error = uiomove(ptr, bytes, ap->a_uio);
1829                 lwkt_reltoken(&hmp->fs_token);
1830                 return(error);
1831         }
1832
1833         /*
1834          * Long version
1835          */
1836         hammer_simple_transaction(&trans, hmp);
1837         ++hammer_stats_file_iopsr;
1838         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1839
1840         /*
1841          * Key to look up: the symlink body is stored as a single
1842          * HAMMER_RECTYPE_FIX record keyed by HAMMER_FIXKEY_SYMLINK.
1843          */
1844         cursor.key_beg.localization = ip->obj_localization +
1845                                       HAMMER_LOCALIZE_MISC;
1846         cursor.key_beg.obj_id = ip->obj_id;
1847         cursor.key_beg.create_tid = 0;
1848         cursor.key_beg.delete_tid = 0;
1849         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1850         cursor.key_beg.obj_type = 0;
1851         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1852         cursor.asof = ip->obj_asof;
1853         cursor.flags |= HAMMER_CURSOR_ASOF;
1854
1855         error = hammer_ip_lookup(&cursor);
1856         if (error == 0) {
1857                 error = hammer_ip_resolve_data(&cursor);
1858                 if (error == 0) {
1859                         KKASSERT(cursor.leaf->data_len >=
1860                                  HAMMER_SYMLINK_NAME_OFF);
1861                         error = uiomove(cursor.data->symlink.name,
1862                                         cursor.leaf->data_len -
1863                                                 HAMMER_SYMLINK_NAME_OFF,
1864                                         ap->a_uio);
1865                 }
1866         }
1867         hammer_done_cursor(&cursor);
1868         hammer_done_transaction(&trans);
1869         lwkt_reltoken(&hmp->fs_token);
1870         return(error);
1871 }
1872
1873 /*
1874  * hammer_vop_nremove { nch, dvp, cred }
1875  */
1876 static
1877 int
1878 hammer_vop_nremove(struct vop_nremove_args *ap)
1879 {
1880         struct hammer_transaction trans;
1881         struct hammer_inode *dip;
1882         hammer_mount_t hmp;
1883         int error;
1884
1885         dip = VTOI(ap->a_dvp);
1886         hmp = dip->hmp;
1887
1888         if (hammer_nohistory(dip) == 0 &&
1889             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1890                 return (error);
1891         }
1892
1893         lwkt_gettoken(&hmp->fs_token);
1894         hammer_start_transaction(&trans, hmp);
1895         ++hammer_stats_file_iopsw;
1896         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1897         hammer_done_transaction(&trans);
1898         if (error == 0)
1899                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1900         lwkt_reltoken(&hmp->fs_token);
1901         return (error);
1902 }
1903
1904 /*
1905  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1906  */
1907 static
1908 int
1909 hammer_vop_nrename(struct vop_nrename_args *ap)
1910 {
1911         struct hammer_transaction trans;
1912         struct namecache *fncp;
1913         struct namecache *tncp;
1914         struct hammer_inode *fdip;
1915         struct hammer_inode *tdip;
1916         struct hammer_inode *ip;
1917         hammer_mount_t hmp;
1918         struct hammer_cursor cursor;
1919         int64_t namekey;
1920         u_int32_t max_iterations;
1921         int nlen, error;
1922
1923         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 
1924                 return(EXDEV);
1925         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1926                 return(EXDEV);
1927
1928         fdip = VTOI(ap->a_fdvp);
1929         tdip = VTOI(ap->a_tdvp);
1930         fncp = ap->a_fnch->ncp;
1931         tncp = ap->a_tnch->ncp;
1932         ip = VTOI(fncp->nc_vp);
1933         KKASSERT(ip != NULL);
1934
1935         hmp = ip->hmp;
1936
1937         if (fdip->obj_localization != tdip->obj_localization)
1938                 return(EXDEV);
1939         if (fdip->obj_localization != ip->obj_localization)
1940                 return(EXDEV);
1941
1942         if (fdip->flags & HAMMER_INODE_RO)
1943                 return (EROFS);
1944         if (tdip->flags & HAMMER_INODE_RO)
1945                 return (EROFS);
1946         if (ip->flags & HAMMER_INODE_RO)
1947                 return (EROFS);
1948         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
1949                 return (error);
1950
1951         lwkt_gettoken(&hmp->fs_token);
1952         hammer_start_transaction(&trans, hmp);
1953         ++hammer_stats_file_iopsw;
1954
1955         /*
1956          * Remove tncp from the target directory and then link ip as
1957          * tncp.
1958          *
1959          * Force the inode sync-time to match the transaction so it is
1960          * in-sync with the creation of the target directory entry.
1961          */
1962         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1963                                 ap->a_cred, 0, -1);
1964         if (error == 0 || error == ENOENT) {
1965                 error = hammer_ip_add_directory(&trans, tdip,
1966                                                 tncp->nc_name, tncp->nc_nlen,
1967                                                 ip);
1968                 if (error == 0) {
1969                         ip->ino_data.parent_obj_id = tdip->obj_id;
1970                         ip->ino_data.ctime = trans.time;
1971                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
1972                 }
1973         }
1974         if (error)
1975                 goto failed; /* XXX */
1976
1977         /*
1978          * Locate the record in the originating directory and remove it.
1979          *
1980          * Calculate the namekey and setup the key range for the scan.  This
1981          * works kinda like a chained hash table where the lower 32 bits
1982          * of the namekey synthesize the chain.
1983          *
1984          * The key range is inclusive of both key_beg and key_end.
1985          */
1986         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1987                                            &max_iterations);
1988 retry:
1989         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1990         cursor.key_beg.localization = fdip->obj_localization +
1991                                       hammer_dir_localization(fdip);
1992         cursor.key_beg.obj_id = fdip->obj_id;
1993         cursor.key_beg.key = namekey;
1994         cursor.key_beg.create_tid = 0;
1995         cursor.key_beg.delete_tid = 0;
1996         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1997         cursor.key_beg.obj_type = 0;
1998
1999         cursor.key_end = cursor.key_beg;
2000         cursor.key_end.key += max_iterations;
2001         cursor.asof = fdip->obj_asof;
2002         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
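        /*
         * Example (illustrative): if the component name hashes to H,
         * its collision chain is covered by the inclusive key range
         * [H, H + max_iterations], the low bits of the key acting as
         * the chain iterator.
         */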
2003
2004         /*
2005          * Scan all matching records (the chain), locate the one matching
2006          * the requested path component.
2007          *
2008          * The hammer_ip_*() functions merge in-memory records with on-disk
2009          * records for the purposes of the search.
2010          */
2011         error = hammer_ip_first(&cursor);
2012         while (error == 0) {
2013                 if (hammer_ip_resolve_data(&cursor) != 0)
2014                         break;
2015                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2016                 KKASSERT(nlen > 0);
2017                 if (fncp->nc_nlen == nlen &&
2018                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2019                         break;
2020                 }
2021                 error = hammer_ip_next(&cursor);
2022         }
2023
2024         /*
2025          * If all is ok we have to get the inode so we can adjust nlinks.
2026          *
2027          * WARNING: hammer_ip_del_directory() may have to terminate the
2028          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
2029          * twice.
2030          */
2031         if (error == 0)
2032                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
2033
2034         /*
2035          * XXX A deadlock here will break rename's atomicity for the purposes
2036          * of crash recovery.
2037          */
2038         if (error == EDEADLK) {
2039                 hammer_done_cursor(&cursor);
2040                 goto retry;
2041         }
2042
2043         /*
2044          * Cleanup and tell the kernel that the rename succeeded.
2045          *
2046          * NOTE: ip->vp, if non-NULL, cannot be directly referenced
2047          *       without formally acquiring the vp since the vp might
2048          *       have zero refs on it, or be in the middle of a reclaim,
2049          *       etc.
2050          */
2051         hammer_done_cursor(&cursor);
2052         if (error == 0) {
2053                 cache_rename(ap->a_fnch, ap->a_tnch);
2054                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
2055                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
2056                 while (ip->vp) {
2057                         struct vnode *vp;
2058
2059                         error = hammer_get_vnode(ip, &vp);
2060                         if (error == 0 && vp) {
2061                                 vn_unlock(vp);
2062                                 hammer_knote(ip->vp, NOTE_RENAME);
2063                                 vrele(vp);
2064                                 break;
2065                         }
2066                         kprintf("Debug: HAMMER ip/vp race2 avoided\n");
2067                 }
2068         }
2069
2070 failed:
2071         hammer_done_transaction(&trans);
2072         lwkt_reltoken(&hmp->fs_token);
2073         return (error);
2074 }
2075
2076 /*
2077  * hammer_vop_nrmdir { nch, dvp, cred }
2078  */
2079 static
2080 int
2081 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2082 {
2083         struct hammer_transaction trans;
2084         struct hammer_inode *dip;
2085         hammer_mount_t hmp;
2086         int error;
2087
2088         dip = VTOI(ap->a_dvp);
2089         hmp = dip->hmp;
2090
2091         if (hammer_nohistory(dip) == 0 &&
2092             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2093                 return (error);
2094         }
2095
2096         lwkt_gettoken(&hmp->fs_token);
2097         hammer_start_transaction(&trans, hmp);
2098         ++hammer_stats_file_iopsw;
2099         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2100         hammer_done_transaction(&trans);
2101         if (error == 0)
2102                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2103         lwkt_reltoken(&hmp->fs_token);
2104         return (error);
2105 }
2106
2107 /*
2108  * hammer_vop_markatime { vp, cred }
2109  */
2110 static
2111 int
2112 hammer_vop_markatime(struct vop_markatime_args *ap)
2113 {
2114         struct hammer_transaction trans;
2115         struct hammer_inode *ip;
2116         hammer_mount_t hmp;
2117
2118         ip = VTOI(ap->a_vp);
2119         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2120                 return (EROFS);
2121         if (ip->flags & HAMMER_INODE_RO)
2122                 return (EROFS);
2123         hmp = ip->hmp;
2124         if (hmp->mp->mnt_flag & MNT_NOATIME)
2125                 return (0);
2126         lwkt_gettoken(&hmp->fs_token);
2127         hammer_start_transaction(&trans, hmp);
2128         ++hammer_stats_file_iopsw;
2129
2130         ip->ino_data.atime = trans.time;
2131         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2132         hammer_done_transaction(&trans);
2133         hammer_knote(ap->a_vp, NOTE_ATTRIB);
2134         lwkt_reltoken(&hmp->fs_token);
2135         return (0);
2136 }
2137
2138 /*
2139  * hammer_vop_setattr { vp, vap, cred }
2140  */
2141 static
2142 int
2143 hammer_vop_setattr(struct vop_setattr_args *ap)
2144 {
2145         struct hammer_transaction trans;
2146         struct hammer_inode *ip;
2147         struct vattr *vap;
2148         hammer_mount_t hmp;
2149         int modflags;
2150         int error;
2151         int truncating;
2152         int blksize;
2153         int kflags;
2154 #if 0
2155         int64_t aligned_size;
2156 #endif
2157         u_int32_t flags;
2158
2159         vap = ap->a_vap;
2160         ip = ap->a_vp->v_data;
2161         modflags = 0;
2162         kflags = 0;
2163         hmp = ip->hmp;
2164
2165         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2166                 return(EROFS);
2167         if (ip->flags & HAMMER_INODE_RO)
2168                 return (EROFS);
2169         if (hammer_nohistory(ip) == 0 &&
2170             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2171                 return (error);
2172         }
2173
2174         lwkt_gettoken(&hmp->fs_token);
2175         hammer_start_transaction(&trans, hmp);
2176         ++hammer_stats_file_iopsw;
2177         error = 0;
2178
2179         if (vap->va_flags != VNOVAL) {
2180                 flags = ip->ino_data.uflags;
2181                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2182                                          hammer_to_unix_xid(&ip->ino_data.uid),
2183                                          ap->a_cred);
2184                 if (error == 0) {
2185                         if (ip->ino_data.uflags != flags) {
2186                                 ip->ino_data.uflags = flags;
2187                                 ip->ino_data.ctime = trans.time;
2188                                 modflags |= HAMMER_INODE_DDIRTY;
2189                                 kflags |= NOTE_ATTRIB;
2190                         }
2191                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2192                                 error = 0;
2193                                 goto done;
2194                         }
2195                 }
2196                 goto done;
2197         }
2198         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2199                 error = EPERM;
2200                 goto done;
2201         }
2202         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2203                 mode_t cur_mode = ip->ino_data.mode;
2204                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2205                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2206                 uuid_t uuid_uid;
2207                 uuid_t uuid_gid;
2208
2209                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2210                                          ap->a_cred,
2211                                          &cur_uid, &cur_gid, &cur_mode);
2212                 if (error == 0) {
2213                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
2214                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
2215                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
2216                                  sizeof(uuid_uid)) ||
2217                             bcmp(&uuid_gid, &ip->ino_data.gid,
2218                                  sizeof(uuid_gid)) ||
2219                             ip->ino_data.mode != cur_mode
2220                         ) {
2221                                 ip->ino_data.uid = uuid_uid;
2222                                 ip->ino_data.gid = uuid_gid;
2223                                 ip->ino_data.mode = cur_mode;
2224                                 ip->ino_data.ctime = trans.time;
2225                                 modflags |= HAMMER_INODE_DDIRTY;
2226                         }
2227                         kflags |= NOTE_ATTRIB;
2228                 }
2229         }
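        /*
         * NOTE: this while() executes at most once; the loop construct
         * only exists so the size-change code below can break out of
         * the block early.
         */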
2230         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2231                 switch(ap->a_vp->v_type) {
2232                 case VREG:
2233                         if (vap->va_size == ip->ino_data.size)
2234                                 break;
2235
2236                         /*
2237                          * Log the operation if in fast-fsync mode or if
2238                          * there are unterminated redo write records present.
2239                          *
2240                          * The second check is needed so the recovery code
2241                          * properly truncates write redos even if nominal
2242          * REDO operation is turned off due to excessive
2243                          * writes, because the related records might be
2244                          * destroyed and never lay down a TERM_WRITE.
2245                          */
2246                         if ((ip->flags & HAMMER_INODE_REDO) ||
2247                             (ip->flags & HAMMER_INODE_RDIRTY)) {
2248                                 error = hammer_generate_redo(&trans, ip,
2249                                                              vap->va_size,
2250                                                              HAMMER_REDO_TRUNC,
2251                                                              NULL, 0);
2252                         }
2253                         blksize = hammer_blocksize(vap->va_size);
2254
2255                         /*
2256          * XXX break atomicity, we can deadlock the backend
2257                          * if we do not release the lock.  Probably not a
2258                          * big deal here.
2259                          */
2260                         if (vap->va_size < ip->ino_data.size) {
2261                                 nvtruncbuf(ap->a_vp, vap->va_size,
2262                                            blksize,
2263                                            hammer_blockoff(vap->va_size));
2264                                 truncating = 1;
2265                                 kflags |= NOTE_WRITE;
2266                         } else {
2267                                 nvextendbuf(ap->a_vp,
2268                                             ip->ino_data.size,
2269                                             vap->va_size,
2270                                             hammer_blocksize(ip->ino_data.size),
2271                                             hammer_blocksize(vap->va_size),
2272                                             hammer_blockoff(ip->ino_data.size),
2273                                             hammer_blockoff(vap->va_size),
2274                                             0);
2275                                 truncating = 0;
2276                                 kflags |= NOTE_WRITE | NOTE_EXTEND;
2277                         }
2278                         ip->ino_data.size = vap->va_size;
2279                         ip->ino_data.mtime = trans.time;
2280                         /* XXX safe to use SDIRTY instead of DDIRTY here? */
2281                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2282
2283                         /*
2284                          * On-media truncation is cached in the inode until
2285                          * the inode is synchronized.  We must immediately
2286                          * handle any frontend records.
2287                          */
2288                         if (truncating) {
2289                                 hammer_ip_frontend_trunc(ip, vap->va_size);
2290 #ifdef DEBUG_TRUNCATE
2291                                 if (HammerTruncIp == NULL)
2292                                         HammerTruncIp = ip;
2293 #endif
2294                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2295                                         ip->flags |= HAMMER_INODE_TRUNCATED;
2296                                         ip->trunc_off = vap->va_size;
2297 #ifdef DEBUG_TRUNCATE
2298                                         if (ip == HammerTruncIp)
2299                                         kprintf("truncate1 %016llx\n",
2300                                                 (long long)ip->trunc_off);
2301 #endif
2302                                 } else if (ip->trunc_off > vap->va_size) {
2303                                         ip->trunc_off = vap->va_size;
2304 #ifdef DEBUG_TRUNCATE
2305                                         if (ip == HammerTruncIp)
2306                                         kprintf("truncate2 %016llx\n",
2307                                                 (long long)ip->trunc_off);
2308 #endif
2309                                 } else {
2310 #ifdef DEBUG_TRUNCATE
2311                                         if (ip == HammerTruncIp)
2312                                         kprintf("truncate3 %016llx (ignored)\n",
2313                                                 (long long)vap->va_size);
2314 #endif
2315                                 }
2316                         }
2317
2318 #if 0
2319                         /*
2320                          * When truncating, nvtruncbuf() may have cleaned out
2321                          * a portion of the last block on-disk in the buffer
2322                          * cache.  We must clean out any frontend records
2323                          * for blocks beyond the new last block.
2324                          */
2325                         aligned_size = (vap->va_size + (blksize - 1)) &
2326                                        ~(int64_t)(blksize - 1);
2327                         if (truncating && vap->va_size < aligned_size) {
2328                                 aligned_size -= blksize;
2329                                 hammer_ip_frontend_trunc(ip, aligned_size);
2330                         }
2331 #endif
2332                         break;
2333                 case VDATABASE:
2334                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2335                                 ip->flags |= HAMMER_INODE_TRUNCATED;
2336                                 ip->trunc_off = vap->va_size;
2337                         } else if (ip->trunc_off > vap->va_size) {
2338                                 ip->trunc_off = vap->va_size;
2339                         }
2340                         hammer_ip_frontend_trunc(ip, vap->va_size);
2341                         ip->ino_data.size = vap->va_size;
2342                         ip->ino_data.mtime = trans.time;
2343                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2344                         kflags |= NOTE_ATTRIB;
2345                         break;
2346                 default:
2347                         error = EINVAL;
2348                         goto done;
2349                 }
2350                 break;
2351         }
2352         if (vap->va_atime.tv_sec != VNOVAL) {
2353                 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2354                 modflags |= HAMMER_INODE_ATIME;
2355                 kflags |= NOTE_ATTRIB;
2356         }
2357         if (vap->va_mtime.tv_sec != VNOVAL) {
2358                 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2359                 modflags |= HAMMER_INODE_MTIME;
2360                 kflags |= NOTE_ATTRIB;
2361         }
2362         if (vap->va_mode != (mode_t)VNOVAL) {
2363                 mode_t   cur_mode = ip->ino_data.mode;
2364                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2365                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2366
2367                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2368                                          cur_uid, cur_gid, &cur_mode);
2369                 if (error == 0 && ip->ino_data.mode != cur_mode) {
2370                         ip->ino_data.mode = cur_mode;
2371                         ip->ino_data.ctime = trans.time;
2372                         modflags |= HAMMER_INODE_DDIRTY;
2373                         kflags |= NOTE_ATTRIB;
2374                 }
2375         }
2376 done:
2377         if (error == 0)
2378                 hammer_modify_inode(&trans, ip, modflags);
2379         hammer_done_transaction(&trans);
2380         hammer_knote(ap->a_vp, kflags);
2381         lwkt_reltoken(&hmp->fs_token);
2382         return (error);
2383 }
2384
2385 /*
2386  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2387  */
2388 static
2389 int
2390 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2391 {
2392         struct hammer_transaction trans;
2393         struct hammer_inode *dip;
2394         struct hammer_inode *nip;
2395         hammer_record_t record;
2396         struct nchandle *nch;
2397         hammer_mount_t hmp;
2398         int error;
2399         int bytes;
2400
2401         ap->a_vap->va_type = VLNK;
2402
2403         nch = ap->a_nch;
2404         dip = VTOI(ap->a_dvp);
2405         hmp = dip->hmp;
2406
2407         if (dip->flags & HAMMER_INODE_RO)
2408                 return (EROFS);
2409         if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0)
2410                 return (error);
2411
2412         /*
2413          * Create a transaction to cover the operations we perform.
2414          */
2415         lwkt_gettoken(&hmp->fs_token);
2416         hammer_start_transaction(&trans, hmp);
2417         ++hammer_stats_file_iopsw;
2418
2419         /*
2420          * Create a new filesystem object of the requested type.  The
2421          * returned inode will be referenced but not locked.
2422          */
2423
2424         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2425                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2426                                     NULL, &nip);
2427         if (error) {
2428                 hammer_done_transaction(&trans);
2429                 *ap->a_vpp = NULL;
2430                 lwkt_reltoken(&hmp->fs_token);
2431                 return (error);
2432         }
2433
2434         /*
2435          * Add a record representing the symlink.  The symlink stores the
2436          * link as pure data, not a string, and is not \0-terminated.
2437          */
2438         if (error == 0) {
2439                 bytes = strlen(ap->a_target);
2440
2441                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2442                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2443                 } else {
2444                         record = hammer_alloc_mem_record(nip, bytes);
2445                         record->type = HAMMER_MEM_RECORD_GENERAL;
2446
2447                         record->leaf.base.localization = nip->obj_localization +
2448                                                          HAMMER_LOCALIZE_MISC;
2449                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2450                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2451                         record->leaf.data_len = bytes;
2452                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2453                         bcopy(ap->a_target, record->data->symlink.name, bytes);
2454                         error = hammer_ip_add_record(&trans, record);
2455                 }
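                /*
                 * Illustrative note: targets of up to
                 * HAMMER_INODE_BASESYMLEN bytes were stored inline in
                 * ino_data.ext.symlink above and are served by the
                 * readlink shortcut; longer targets require the
                 * HAMMER_RECTYPE_FIX record laid down here.
                 */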
2456
2457                 /*
2458                  * Set the file size to the length of the link.
2459                  */
2460                 if (error == 0) {
2461                         nip->ino_data.size = bytes;
2462                         hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2463                 }
2464         }
2465         if (error == 0)
2466                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2467                                                 nch->ncp->nc_nlen, nip);
2468
2469         /*
2470          * Finish up.
2471          */
2472         if (error) {
2473                 hammer_rel_inode(nip, 0);
2474                 *ap->a_vpp = NULL;
2475         } else {
2476                 error = hammer_get_vnode(nip, ap->a_vpp);
2477                 hammer_rel_inode(nip, 0);
2478                 if (error == 0) {
2479                         cache_setunresolved(ap->a_nch);
2480                         cache_setvp(ap->a_nch, *ap->a_vpp);
2481                         hammer_knote(ap->a_dvp, NOTE_WRITE);
2482                 }
2483         }
2484         hammer_done_transaction(&trans);
2485         lwkt_reltoken(&hmp->fs_token);
2486         return (error);
2487 }
2488
2489 /*
2490  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2491  */
2492 static
2493 int
2494 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2495 {
2496         struct hammer_transaction trans;
2497         struct hammer_inode *dip;
2498         hammer_mount_t hmp;
2499         int error;
2500
2501         dip = VTOI(ap->a_dvp);
2502         hmp = dip->hmp;
2503
2504         if (hammer_nohistory(dip) == 0 &&
2505             (error = hammer_checkspace(hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2506                 return (error);
2507         }
2508
2509         lwkt_gettoken(&hmp->fs_token);
2510         hammer_start_transaction(&trans, hmp);
2511         ++hammer_stats_file_iopsw;
2512         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2513                                 ap->a_cred, ap->a_flags, -1);
2514         hammer_done_transaction(&trans);
2515         lwkt_reltoken(&hmp->fs_token);
2516
2517         return (error);
2518 }
2519
2520 /*
2521  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2522  */
2523 static
2524 int
2525 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2526 {
2527         struct hammer_inode *ip = ap->a_vp->v_data;
2528         hammer_mount_t hmp = ip->hmp;
2529         int error;
2530
2531         ++hammer_stats_file_iopsr;
2532         lwkt_gettoken(&hmp->fs_token);
2533         error = hammer_ioctl(ip, ap->a_command, ap->a_data,
2534                              ap->a_fflag, ap->a_cred);
2535         lwkt_reltoken(&hmp->fs_token);
2536         return (error);
2537 }
2538
2539 static
2540 int
2541 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2542 {
2543         static const struct mountctl_opt extraopt[] = {
2544                 { HMNT_NOHISTORY,       "nohistory" },
2545                 { HMNT_MASTERID,        "master" },
2546                 { 0, NULL}
2547
2548         };
2549         struct hammer_mount *hmp;
2550         struct mount *mp;
2551         int usedbytes;
2552         int error;
2553
2554         error = 0;
2555         usedbytes = 0;
2556         mp = ap->a_head.a_ops->head.vv_mount;
2557         KKASSERT(mp->mnt_data != NULL);
2558         hmp = (struct hammer_mount *)mp->mnt_data;
2559
2560         lwkt_gettoken(&hmp->fs_token);
2561
2562         switch(ap->a_op) {
2563         case MOUNTCTL_SET_EXPORT:
2564                 if (ap->a_ctllen != sizeof(struct export_args))
2565                         error = EINVAL;
2566                 else
2567                         error = hammer_vfs_export(mp, ap->a_op,
2568                                       (const struct export_args *)ap->a_ctl);
2569                 break;
2570         case MOUNTCTL_MOUNTFLAGS:
2571         {
2572                 /*
2573                  * Call standard mountctl VOP function
2574                  * so we get user mount flags.
2575                  */
2576                 error = vop_stdmountctl(ap);
2577                 if (error)
2578                         break;
2579
2580                 usedbytes = *ap->a_res;
2581
2582                 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2583                         usedbytes += vfs_flagstostr(hmp->hflags, extraopt,
2584                                                     ap->a_buf,
2585                                                     ap->a_buflen - usedbytes,
2586                                                     &error);
2587                 }
2588
2589                 *ap->a_res += usedbytes;
2590                 break;
2591         }
2592         default:
2593                 error = vop_stdmountctl(ap);
2594                 break;
2595         }
2596         lwkt_reltoken(&hmp->fs_token);
2597         return(error);
2598 }
2599
2600 /*
2601  * hammer_vop_strategy { vp, bio }
2602  *
2603  * Strategy call, used for regular file read & write only.  Note that the
2604  * bp may represent a cluster.
2605  *
2606  * To simplify operation and allow better optimizations in the future,
2607  * this code does not make any assumptions with regards to buffer alignment
2608  * or size.
2609  */
2610 static
2611 int
2612 hammer_vop_strategy(struct vop_strategy_args *ap)
2613 {
2614         struct buf *bp;
2615         int error;
2616
2617         bp = ap->a_bio->bio_buf;
2618
2619         switch(bp->b_cmd) {
2620         case BUF_CMD_READ:
2621                 error = hammer_vop_strategy_read(ap);
2622                 break;
2623         case BUF_CMD_WRITE:
2624                 error = hammer_vop_strategy_write(ap);
2625                 break;
2626         default:
2627                 bp->b_error = error = EINVAL;
2628                 bp->b_flags |= B_ERROR;
2629                 biodone(ap->a_bio);
2630                 break;
2631         }
2632
2633         /* hammer_dump_dedup_cache(((hammer_inode_t)ap->a_vp->v_data)->hmp); */
2634
2635         return (error);
2636 }
2637
2638 /*
2639  * Read from a regular file.  Iterate the related records and fill in the
2640  * BIO/BUF.  Gaps are zero-filled.
2641  *
2642  * The support code in hammer_object.c should be used to deal with mixed
2643  * in-memory and on-disk records.
2644  *
2645  * NOTE: Can be called from the cluster code with an oversized buf.
2646  *
2647  * XXX atime update
2648  */
2649 static
2650 int
2651 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2652 {
2653         struct hammer_transaction trans;
2654         struct hammer_inode *ip;
2655         struct hammer_inode *dip;
2656         hammer_mount_t hmp;
2657         struct hammer_cursor cursor;
2658         hammer_base_elm_t base;
2659         hammer_off_t disk_offset;
2660         struct bio *bio;
2661         struct bio *nbio;
2662         struct buf *bp;
2663         int64_t rec_offset;
2664         int64_t ran_end;
2665         int64_t tmp64;
2666         int error;
2667         int boff;
2668         int roff;
2669         int n;
2670         int isdedupable;
2671
2672         bio = ap->a_bio;
2673         bp = bio->bio_buf;
2674         ip = ap->a_vp->v_data;
2675         hmp = ip->hmp;
2676
2677         /*
2678          * The zone-2 disk offset may have been set by the cluster code via
2679          * a BMAP operation, or else should be NOOFFSET.
2680          *
2681          * Checking the high bits for a match against zone-2 should suffice.
2682          *
2683          * In cases where a lot of data duplication is present it may be
2684          * more beneficial to drop through and double-buffer through the
2685          * device.
2686          */
2687         nbio = push_bio(bio);
2688         if (hammer_double_buffer == 0 &&
2689             (nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2690             HAMMER_ZONE_LARGE_DATA) {
2691                 lwkt_gettoken(&hmp->fs_token);
2692                 error = hammer_io_direct_read(hmp, nbio, NULL);
2693                 lwkt_reltoken(&hmp->fs_token);
2694                 return (error);
2695         }
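#if 0
        /*
         * Illustrative sketch only (not compiled): the zone test above
         * as a stand-alone expression.  A bio whose offset was
         * resolved to zone-2 by a prior BMAP lands in the large-data
         * zone; anything else (typically NOOFFSET) must take the
         * cursor-based path below.
         */
        int is_zone2 = ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
                        HAMMER_ZONE_LARGE_DATA);
#endif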
2696
2697         /*
2698          * Well, that sucked.  Do it the hard way.  If all the stars are
2699          * aligned we may still be able to issue a direct-read.
2700          */
2701         lwkt_gettoken(&hmp->fs_token);
2702         hammer_simple_transaction(&trans, hmp);
2703         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2704
2705         /*
2706          * Key range (begin and end inclusive) to scan.  Note that the keys
2707          * stored in the actual records represent BASE+LEN, not BASE.  The
2708          * first record containing bio_offset will have a key > bio_offset.
2709          */
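        /*
         * Example (illustrative): a record covering file range
         * [0,4096) is keyed at 4096 (BASE+LEN), so a bio at offset 0
         * must start its scan at key 0 + 1 in order to find it.
         */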
2710         cursor.key_beg.localization = ip->obj_localization +
2711                                       HAMMER_LOCALIZE_MISC;
2712         cursor.key_beg.obj_id = ip->obj_id;
2713         cursor.key_beg.create_tid = 0;
2714         cursor.key_beg.delete_tid = 0;
2715         cursor.key_beg.obj_type = 0;
2716         cursor.key_beg.key = bio->bio_offset + 1;
2717         cursor.asof = ip->obj_asof;
2718         cursor.flags |= HAMMER_CURSOR_ASOF;
2719
2720         cursor.key_end = cursor.key_beg;
2721         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2722 #if 0
2723         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2724                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2725                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2726                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2727         } else
2728 #endif
2729         {
2730                 ran_end = bio->bio_offset + bp->b_bufsize;
2731                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2732                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2733                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2734                 if (tmp64 < ran_end)
2735                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2736                 else
2737                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2738         }
2739         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2740
2741         error = hammer_ip_first(&cursor);
2742         boff = 0;
2743
2744         while (error == 0) {
2745                 /*
2746                  * Get the base file offset of the record.  The key for
2747          * data records is (base + bytes) rather than (base).
2748                  */
2749                 base = &cursor.leaf->base;
2750                 rec_offset = base->key - cursor.leaf->data_len;
2751
2752                 /*
2753                  * Calculate the gap, if any, and zero-fill it.
2754                  *
2755          * n is the offset of the start of the record versus our
2756                  * current seek offset in the bio.
2757                  */
2758                 n = (int)(rec_offset - (bio->bio_offset + boff));
2759                 if (n > 0) {
2760                         if (n > bp->b_bufsize - boff)
2761                                 n = bp->b_bufsize - boff;
2762                         bzero((char *)bp->b_data + boff, n);
2763                         boff += n;
2764                         n = 0;
2765                 }
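                /*
                 * Worked example (hypothetical numbers): with bio_offset 0,
                 * boff 0 and the first record starting at file offset 8192,
                 * n computes to 8192 and that many bytes are zero-filled
                 * before any record data is copied, so a sparse region
                 * reads back as zeros without on-media backing.
                 */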
2766
2767                 /*
2768                  * Calculate the data offset in the record and the number
2769                  * of bytes we can copy.
2770                  *
2771                  * There are two degenerate cases.  First, boff may already
2772          * be at bp->b_bufsize.  Second, the data offset within
2773                  * the record may exceed the record's size.
2774                  */
2775                 roff = -n;
2776                 rec_offset += roff;
2777                 n = cursor.leaf->data_len - roff;
2778                 if (n <= 0) {
2779                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2780                         n = 0;
2781                 } else if (n > bp->b_bufsize - boff) {
2782                         n = bp->b_bufsize - boff;
2783                 }
2784
2785                 /*
2786                  * Deal with cached truncations.  This cool bit of code
2787                  * allows truncate()/ftruncate() to avoid having to sync
2788                  * the file.
2789                  *
2790                  * If the frontend is truncated then all backend records are
2791                  * subject to the frontend's truncation.
2792                  *
2793                  * If the backend is truncated then backend records on-disk
2794                  * (but not in-memory) are subject to the backend's
2795                  * truncation.  In-memory records owned by the backend
2796                  * represent data written after the truncation point on the
2797                  * backend and must not be truncated.
2798                  *
2799                  * Truncate operations deal with frontend buffer cache
2800                  * buffers and frontend-owned in-memory records synchronously.
2801                  */
2802                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2803                         if (hammer_cursor_ondisk(&cursor)/* ||
2804                             cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2805                                 if (ip->trunc_off <= rec_offset)
2806                                         n = 0;
2807                                 else if (ip->trunc_off < rec_offset + n)
2808                                         n = (int)(ip->trunc_off - rec_offset);
2809                         }
2810                 }
2811                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2812                         if (hammer_cursor_ondisk(&cursor)) {
2813                                 if (ip->sync_trunc_off <= rec_offset)
2814                                         n = 0;
2815                                 else if (ip->sync_trunc_off < rec_offset + n)
2816                                         n = (int)(ip->sync_trunc_off - rec_offset);
2817                         }
2818                 }
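                /*
                 * Worked example (hypothetical numbers): an on-disk record
                 * covering [32768, 49152) with a cached trunc_off of 40960
                 * clamps n to 40960 - 32768 = 8192, so only pre-truncation
                 * bytes are copied even though the full record still exists
                 * on-media.
                 */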
2819
2820                 /*
2821                  * Try to issue a direct read into our bio if possible,
2822                  * otherwise resolve the element data into a hammer_buffer
2823                  * and copy.
2824                  *
2825          * The buffer on-disk should be zeroed past any real
2826                  * truncation point, but may not be for any synthesized
2827                  * truncation point from above.
2828                  */
2829                 disk_offset = cursor.leaf->data_offset + roff;
2830                 isdedupable = (boff == 0 && n == bp->b_bufsize &&
2831                                hammer_cursor_ondisk(&cursor) &&
2832                                ((int)disk_offset & HAMMER_BUFMASK) == 0);
2833
2834                 if (isdedupable && hammer_double_buffer == 0) {
2835                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2836                                  HAMMER_ZONE_LARGE_DATA);
2837                         nbio->bio_offset = disk_offset;
2838                         error = hammer_io_direct_read(hmp, nbio, cursor.leaf);
2839                         if (hammer_live_dedup && error == 0)
2840                                 hammer_dedup_cache_add(ip, cursor.leaf);
2841                         goto done;
2842                 } else if (n) {
2843                         error = hammer_ip_resolve_data(&cursor);
2844                         if (error == 0) {
2845                                 if (hammer_live_dedup && isdedupable)
2846                                         hammer_dedup_cache_add(ip, cursor.leaf);
2847                                 bcopy((char *)cursor.data + roff,
2848                                       (char *)bp->b_data + boff, n);
2849                         }
2850                 }
2851                 if (error)
2852                         break;
2853
2854                 /*
2855                  * We have to be sure that the only elements added to the
2856                  * dedup cache are those which are already on-media.
2857                  */
2858                 if (hammer_live_dedup && hammer_cursor_ondisk(&cursor))
2859                         hammer_dedup_cache_add(ip, cursor.leaf);
2860
2861                 /*
2862                  * Iterate until we have filled the request.
2863                  */
2864                 boff += n;
2865                 if (boff == bp->b_bufsize)
2866                         break;
2867                 error = hammer_ip_next(&cursor);
2868         }
2869
2870         /*
2871          * There may have been a gap after the last record.
2872          */
2873         if (error == ENOENT)
2874                 error = 0;
2875         if (error == 0 && boff != bp->b_bufsize) {
2876                 KKASSERT(boff < bp->b_bufsize);
2877                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2878                 /* boff = bp->b_bufsize; */
2879         }
2880         bp->b_resid = 0;
2881         bp->b_error = error;
2882         if (error)
2883                 bp->b_flags |= B_ERROR;
2884         biodone(ap->a_bio);
2885
2886 done:
2887         /*
2888          * Cache the b-tree node for the last data read in cache[1].
2889          *
2890          * If we hit the file EOF then also cache the node in the
2891          * governing directory's cache[3]; it will be used to initialize
2892          * the inode's cache[1] for any inodes looked up via the directory.
2893          *
2894          * This doesn't reduce disk accesses since the B-Tree chain is
2895          * likely cached, but it does reduce cpu overhead when looking
2896          * up file offsets for cpdup/tar/cpio style iterations.
2897          */
2898         if (cursor.node)
2899                 hammer_cache_node(&ip->cache[1], cursor.node);
2900         if (ran_end >= ip->ino_data.size) {
2901                 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2902                                         ip->obj_asof, ip->obj_localization);
2903                 if (dip) {
2904                         hammer_cache_node(&dip->cache[3], cursor.node);
2905                         hammer_rel_inode(dip, 0);
2906                 }
2907         }
2908         hammer_done_cursor(&cursor);
2909         hammer_done_transaction(&trans);
2910         lwkt_reltoken(&hmp->fs_token);
2911         return(error);
2912 }
2913
2914 /*
2915  * BMAP operation - used to support cluster_read() only.
2916  *
2917  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2918  *
2919  * This routine may return EOPNOTSUPP if the operation is not supported for
2920  * the specified offset.  The contents of the pointer arguments do not
2921  * need to be initialized in that case. 
2922  *
2923  * If a disk address is available and properly aligned, return 0 with
2924  * *doffsetp set to the zone-2 address and *runp / *runb set appropriately
2925  * to the run-length relative to that offset.  Callers will assume that
2926  * *doffsetp is valid if 0 is returned, so when *runp is not sufficiently
2927  * large we must return EOPNOTSUPP rather than 0.
2928  */
2929 static
2930 int
2931 hammer_vop_bmap(struct vop_bmap_args *ap)
2932 {
2933         struct hammer_transaction trans;
2934         struct hammer_inode *ip;
2935         hammer_mount_t hmp;
2936         struct hammer_cursor cursor;
2937         hammer_base_elm_t base;
2938         int64_t rec_offset;
2939         int64_t ran_end;
2940         int64_t tmp64;
2941         int64_t base_offset;
2942         int64_t base_disk_offset;
2943         int64_t last_offset;
2944         hammer_off_t last_disk_offset;
2945         hammer_off_t disk_offset;
2946         int     rec_len;
2947         int     error;
2948         int     blksize;
2949
2950         ++hammer_stats_file_iopsr;
2951         ip = ap->a_vp->v_data;
2952         hmp = ip->hmp;
2953
2954         /*
2955          * We can only BMAP regular files.  We can't BMAP database files,
2956          * directories, etc.
2957          */
2958         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2959                 return(EOPNOTSUPP);
2960
2961         /*
2962          * bmap is typically called with runp/runb both NULL when used
2963          * for writing.  We do not support BMAP for writing at this time.
2964          */
2965         if (ap->a_cmd != BUF_CMD_READ)
2966                 return(EOPNOTSUPP);
2967
2968         /*
2969          * Scan the B-Tree to acquire blockmap addresses, then translate
2970          * to raw addresses.
2971          */
2972         lwkt_gettoken(&hmp->fs_token);
2973         hammer_simple_transaction(&trans, hmp);
2974 #if 0
2975         kprintf("bmap_beg %016llx ip->cache %p\n",
2976                 (long long)ap->a_loffset, ip->cache[1]);
2977 #endif
2978         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2979
2980         /*
2981          * Key range (begin and end inclusive) to scan.  Note that the keys
2982          * stored in the actual records represent BASE+LEN, not BASE.  The
2983          * first record containing ap->a_loffset will have a key > ap->a_loffset.
2984          */
2985         cursor.key_beg.localization = ip->obj_localization +
2986                                       HAMMER_LOCALIZE_MISC;
2987         cursor.key_beg.obj_id = ip->obj_id;
2988         cursor.key_beg.create_tid = 0;
2989         cursor.key_beg.delete_tid = 0;
2990         cursor.key_beg.obj_type = 0;
2991         if (ap->a_runb)
2992                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2993         else
2994                 cursor.key_beg.key = ap->a_loffset + 1;
2995         if (cursor.key_beg.key < 0)
2996                 cursor.key_beg.key = 0;
2997         cursor.asof = ip->obj_asof;
2998         cursor.flags |= HAMMER_CURSOR_ASOF;
2999
3000         cursor.key_end = cursor.key_beg;
3001         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
3002
3003         ran_end = ap->a_loffset + MAXPHYS;
3004         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
3005         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
3006         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
3007         if (tmp64 < ran_end)
3008                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
3009         else
3010                 cursor.key_end.key = ran_end + MAXPHYS + 1;
3011
3012         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
3013
3014         error = hammer_ip_first(&cursor);
3015         base_offset = last_offset = 0;
3016         base_disk_offset = last_disk_offset = 0;
3017
3018         while (error == 0) {
3019                 /*
3020                  * Get the base file offset of the record.  The key for
3021          * data records is (base + bytes) rather than (base).
3022                  *
3023                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
3024                  * The extra bytes should be zero on-disk and the BMAP op
3025                  * should still be ok.
3026                  */
3027                 base = &cursor.leaf->base;
3028                 rec_offset = base->key - cursor.leaf->data_len;
3029                 rec_len    = cursor.leaf->data_len;
3030
3031                 /*
3032                  * Incorporate any cached truncation.
3033                  *
3034                  * NOTE: Modifications to rec_len based on synthesized
3035                  * truncation points remove the guarantee that any extended
3036                  * data on disk is zero (since the truncations may not have
3037                  * taken place on-media yet).
3038                  */
3039                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
3040                         if (hammer_cursor_ondisk(&cursor) ||
3041                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
3042                                 if (ip->trunc_off <= rec_offset)
3043                                         rec_len = 0;
3044                                 else if (ip->trunc_off < rec_offset + rec_len)
3045                                         rec_len = (int)(ip->trunc_off - rec_offset);
3046                         }
3047                 }
3048                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
3049                         if (hammer_cursor_ondisk(&cursor)) {
3050                                 if (ip->sync_trunc_off <= rec_offset)
3051                                         rec_len = 0;
3052                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
3053                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
3054                         }
3055                 }
3056
3057                 /*
3058                  * Accumulate information.  If we have hit a discontiguous
3059                  * block reset base_offset unless we are already beyond the
3060                  * requested offset.  If we are, that's it, we stop.
3061                  */
3062                 if (error)
3063                         break;
3064                 if (hammer_cursor_ondisk(&cursor)) {
3065                         disk_offset = cursor.leaf->data_offset;
3066                         if (rec_offset != last_offset ||
3067                             disk_offset != last_disk_offset) {
3068                                 if (rec_offset > ap->a_loffset)
3069                                         break;
3070                                 base_offset = rec_offset;
3071                                 base_disk_offset = disk_offset;
3072                         }
3073                         last_offset = rec_offset + rec_len;
3074                         last_disk_offset = disk_offset + rec_len;
3075
3076                         if (hammer_live_dedup)
3077                                 hammer_dedup_cache_add(ip, cursor.leaf);
3078                 }
3079                 
3080                 error = hammer_ip_next(&cursor);
3081         }
3082
3083 #if 0
3084         kprintf("BMAP %016llx:  %016llx - %016llx\n",
3085                 (long long)ap->a_loffset,
3086                 (long long)base_offset,
3087                 (long long)last_offset);
3088         kprintf("BMAP %16s:  %016llx - %016llx\n", "",
3089                 (long long)base_disk_offset,
3090                 (long long)last_disk_offset);
3091 #endif
3092
3093         if (cursor.node) {
3094                 hammer_cache_node(&ip->cache[1], cursor.node);
3095 #if 0
3096                 kprintf("bmap_end2 %016llx ip->cache %p\n",
3097                         (long long)ap->a_loffset, ip->cache[1]);
3098 #endif
3099         }
3100         hammer_done_cursor(&cursor);
3101         hammer_done_transaction(&trans);
3102         lwkt_reltoken(&hmp->fs_token);
3103
3104         /*
3105          * If we couldn't find any records or the records we did find were
3106          * all behind the requested offset, return failure.  A forward
3107          * truncation can leave a hole w/ no on-disk records.
3108          */
3109         if (last_offset == 0 || last_offset < ap->a_loffset)
3110                 return (EOPNOTSUPP);
3111
3112         /*
3113          * Figure out the block size at the requested offset and adjust
3114          * our limits so that cluster_read() does not create inappropriately
3115          * sized buffer cache buffers.
3116          */
3117         blksize = hammer_blocksize(ap->a_loffset);
3118         if (hammer_blocksize(base_offset) != blksize) {
3119                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
3120         }
3121         if (last_offset != ap->a_loffset &&
3122             hammer_blocksize(last_offset - 1) != blksize) {
3123                 last_offset = hammer_blockdemarc(ap->a_loffset,
3124                                                  last_offset - 1);
3125         }
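        /*
         * Background (constants assumed from hammer.h): hammer_blocksize()
         * returns 16KB buffers below the 1MB file-offset demarcation and
         * 64KB buffers above it, so a run straddling the demarcation must
         * be clipped here or cluster_read() could build a buffer whose
         * size does not match the offset.
         */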
3126
3127         /*
3128          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
3129          * from occurring.
3130          */
3131         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
3132
3133         if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
3134                 /*
3135                  * Only large-data zones can be direct-IOd
3136                  */
3137                 error = EOPNOTSUPP;
3138         } else if ((disk_offset & HAMMER_BUFMASK) ||
3139                    (last_offset - ap->a_loffset) < blksize) {
3140                 /*
3141                  * doffsetp is not aligned or the forward run size does
3142                  * not cover a whole buffer, disallow the direct I/O.
3143                  */
3144                 error = EOPNOTSUPP;
3145         } else {
3146                 /*
3147                  * We're good.
3148                  */
3149                 *ap->a_doffsetp = disk_offset;
3150                 if (ap->a_runb) {
3151                         *ap->a_runb = ap->a_loffset - base_offset;
3152                         KKASSERT(*ap->a_runb >= 0);
3153                 }
3154                 if (ap->a_runp) {
3155                         *ap->a_runp = last_offset - ap->a_loffset;
3156                         KKASSERT(*ap->a_runp >= 0);
3157                 }
3158                 error = 0;
3159         }
3160         return(error);
3161 }
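/*
 * Worked example of the BMAP contract (hypothetical numbers): a BMAP of
 * loffset 65536 that resolves into a contiguous large-data run covering
 * [49152, 131072) returns 0 with *a_doffsetp set to the zone-2 address
 * corresponding to 65536, *a_runb = 16384 and *a_runp = 65536.
 */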
3162
3163 /*
3164  * Write to a regular file.  Because this is a strategy call the OS is
3165  * actually trying to get data onto the media.
3166  */
3167 static
3168 int
3169 hammer_vop_strategy_write(struct vop_strategy_args *ap)
3170 {
3171         hammer_record_t record;
3172         hammer_mount_t hmp;
3173         hammer_inode_t ip;
3174         struct bio *bio;
3175         struct buf *bp;
3176         int blksize;
3177         int bytes;
3178         int error;
3179
3180         bio = ap->a_bio;
3181         bp = bio->bio_buf;
3182         ip = ap->a_vp->v_data;
3183         hmp = ip->hmp;
3184
3185         blksize = hammer_blocksize(bio->bio_offset);
3186         KKASSERT(bp->b_bufsize == blksize);
3187
3188         if (ip->flags & HAMMER_INODE_RO) {
3189                 bp->b_error = EROFS;
3190                 bp->b_flags |= B_ERROR;
3191                 biodone(ap->a_bio);
3192                 return(EROFS);
3193         }
3194
3195         lwkt_gettoken(&hmp->fs_token);
3196
3197         /*
3198          * Interlock with inode destruction (no in-kernel or directory
3199          * topology visibility).  If we queue new IO while trying to
3200          * destroy the inode we can deadlock the vtrunc call in
3201          * hammer_inode_unloadable_check().
3202          *
3203          * Besides, there's no point flushing a bp associated with an
3204          * inode that is being destroyed on-media and has no kernel
3205          * references.
3206          */
3207         if ((ip->flags | ip->sync_flags) &
3208             (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
3209                 bp->b_resid = 0;
3210                 biodone(ap->a_bio);
3211                 lwkt_reltoken(&hmp->fs_token);
3212                 return(0);
3213         }
3214
3215         /*
3216          * Reserve space and issue a direct-write from the front-end. 
3217          * NOTE: The direct_io code will hammer_bread/bcopy smaller
3218          * allocations.
3219          *
3220          * An in-memory record will be installed to reference the storage
3221          * until the flusher can get to it.
3222          *
3223          * Since we own the high level bio the front-end will not try to
3224          * do a direct-read until the write completes.
3225          *
3226          * NOTE: The only time we do not reserve a full-sized buffer's
3227          * worth of data is if the file is small.  We do not try to
3228          * allocate a fragment (from the small-data zone) at the end of
3229          * an otherwise large file as this can lead to wildly separated
3230          * data.
3231          */
3232         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3233         KKASSERT(bio->bio_offset < ip->ino_data.size);
3234         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
3235                 bytes = bp->b_bufsize;
3236         else
3237                 bytes = ((int)ip->ino_data.size + 15) & ~15;
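        /*
         * Worked example (hypothetical size): a 100-byte file written at
         * bio_offset 0 reserves ((100 + 15) & ~15) = 112 bytes from the
         * small-data zone instead of a full buffer.
         */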
3238
3239         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3240                                     bytes, &error);
3241
3242         /*
3243          * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
3244          * in hammer_vop_write().  We must flag the record so the proper
3245          * REDO_TERM_WRITE entry is generated during the flush.
3246          */
3247         if (record) {
3248                 if (bp->b_flags & B_VFSFLAG1) {
3249                         record->flags |= HAMMER_RECF_REDO;
3250                         bp->b_flags &= ~B_VFSFLAG1;
3251                 }
3252                 if (record->flags & HAMMER_RECF_DEDUPED) {
3253                         bp->b_resid = 0;
3254                         hammer_ip_replace_bulk(hmp, record);
3255                         biodone(ap->a_bio);
3256                 } else {
3257                         hammer_io_direct_write(hmp, bio, record);
3258                 }
3259                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3260                         hammer_flush_inode(ip, 0);
3261         } else {
3262                 bp->b_bio2.bio_offset = NOOFFSET;
3263                 bp->b_error = error;
3264                 bp->b_flags |= B_ERROR;
3265                 biodone(ap->a_bio);
3266         }
3267         lwkt_reltoken(&hmp->fs_token);
3268         return(error);
3269 }
3270
3271 /*
3272  * dounlink - disconnect a directory entry
3273  *
3274  * XXX whiteout support not really in yet
3275  */
3276 static int
3277 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
3278                 struct vnode *dvp, struct ucred *cred, 
3279                 int flags, int isdir)
3280 {
3281         struct namecache *ncp;
3282         hammer_inode_t dip;
3283         hammer_inode_t ip;
3284         hammer_mount_t hmp;
3285         struct hammer_cursor cursor;
3286         int64_t namekey;
3287         u_int32_t max_iterations;
3288         int nlen, error;
3289
3290         /*
3291          * Calculate the namekey and setup the key range for the scan.  This
3292          * works somewhat like a chained hash table where the lower 32 bits
3293          * of the namekey synthesize the chain.
3294          *
3295          * The key range is inclusive of both key_beg and key_end.
3296          */
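        /*
         * Worked example (hypothetical hash): if a name hashes to namekey
         * 0xABCD000000000000, a colliding entry is stored at the next free
         * iterator slot (0x...0001, 0x...0002, ...), so the scan below
         * covers [namekey, namekey + max_iterations] to walk the chain.
         */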
3297         dip = VTOI(dvp);
3298         ncp = nch->ncp;
3299         hmp = dip->hmp;
3300
3301         if (dip->flags & HAMMER_INODE_RO)
3302                 return (EROFS);
3303
3304         namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3305                                            &max_iterations);
3306 retry:
3307         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3308         cursor.key_beg.localization = dip->obj_localization +
3309                                       hammer_dir_localization(dip);
3310         cursor.key_beg.obj_id = dip->obj_id;
3311         cursor.key_beg.key = namekey;
3312         cursor.key_beg.create_tid = 0;
3313         cursor.key_beg.delete_tid = 0;
3314         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3315         cursor.key_beg.obj_type = 0;
3316
3317         cursor.key_end = cursor.key_beg;
3318         cursor.key_end.key += max_iterations;
3319         cursor.asof = dip->obj_asof;
3320         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3321
3322         /*
3323          * Scan all matching records (the chain), locate the one matching
3324          * the requested path component.  The error code on search
3325          * termination could be 0 (a match was found), ENOENT, or
3326          * something else.
3327          *
3328          * The hammer_ip_*() functions merge in-memory records with on-disk
3329          * records for the purposes of the search.
3330          */
3331         error = hammer_ip_first(&cursor);
3332
3333         while (error == 0) {
3334                 error = hammer_ip_resolve_data(&cursor);
3335                 if (error)
3336                         break;
3337                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3338                 KKASSERT(nlen > 0);
3339                 if (ncp->nc_nlen == nlen &&
3340                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3341                         break;
3342                 }
3343                 error = hammer_ip_next(&cursor);
3344         }
3345
3346         /*
3347          * If all is ok we have to get the inode so we can adjust nlinks.
3348          * To avoid a deadlock with the flusher we must release the inode
3349          * lock on the directory when acquiring the inode for the entry.
3350          *
3351          * If the target is a directory, it must be empty.
3352          */
3353         if (error == 0) {
3354                 hammer_unlock(&cursor.ip->lock);
3355                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3356                                       hmp->asof,
3357                                       cursor.data->entry.localization,
3358                                       0, &error);
3359                 hammer_lock_sh(&cursor.ip->lock);
3360                 if (error == ENOENT) {
3361                         kprintf("HAMMER: WARNING: Removing "
3362                                 "dirent w/missing inode \"%s\"\n"
3363                                 "\tobj_id = %016llx\n",
3364                                 ncp->nc_name,
3365                                 (long long)cursor.data->entry.obj_id);
3366                         error = 0;
3367                 }
3368
3369                 /*
3370                  * If isdir >= 0 we validate that the entry is or is not a
3371                  * directory.  If isdir < 0 we don't care.
3372                  */
3373                 if (error == 0 && isdir >= 0 && ip) {
3374                         if (isdir &&
3375                             ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3376                                 error = ENOTDIR;
3377                         } else if (isdir == 0 &&
3378                             ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3379                                 error = EISDIR;
3380                         }
3381                 }
3382
3383                 /*
3384                  * If we are trying to remove a directory the directory must
3385                  * be empty.
3386                  *
3387                  * The check directory code can loop and deadlock/retry.  Our
3388                  * own cursor's node locks must be released to avoid a 3-way
3389                  * deadlock with the flusher if the check directory code
3390                  * blocks.
3391                  *
3392                  * If any changes whatsoever have been made to the cursor
3393                  * set EDEADLK and retry.
3394                  *
3395                  * WARNING: See warnings in hammer_unlock_cursor()
3396                  *          function.
3397                  */
3398                 if (error == 0 && ip && ip->ino_data.obj_type ==
3399                                         HAMMER_OBJTYPE_DIRECTORY) {
3400                         hammer_unlock_cursor(&cursor);
3401                         error = hammer_ip_check_directory_empty(trans, ip);
3402                         hammer_lock_cursor(&cursor);
3403                         if (cursor.flags & HAMMER_CURSOR_RETEST) {
3404                                 kprintf("HAMMER: Warning: avoided deadlock "
3405                                         "on rmdir '%s'\n",
3406                                         ncp->nc_name);
3407                                 error = EDEADLK;
3408                         }
3409                 }
3410
3411                 /*
3412                  * Delete the directory entry.
3413                  *
3414                  * WARNING: hammer_ip_del_directory() may have to terminate
3415                  * the cursor to avoid a deadlock.  It is ok to call
3416                  * hammer_done_cursor() twice.
3417                  */
3418                 if (error == 0) {
3419                         error = hammer_ip_del_directory(trans, &cursor,
3420                                                         dip, ip);
3421                 }
3422                 hammer_done_cursor(&cursor);
3423                 if (error == 0) {
3424                         cache_setunresolved(nch);
3425                         cache_setvp(nch, NULL);
3426
3427                         /*
3428                          * NOTE: ip->vp, if non-NULL, cannot be directly
3429                          *       referenced without formally acquiring the
3430                          *       vp since the vp might have zero refs on it,
3431                          *       or in the middle of a reclaim, etc.
3432                          *
3433                          * NOTE: The cache_setunresolved() can rip the vp
3434                          *       out from under us since the vp may not have
3435                          *       any refs, in which case ip->vp will be NULL
3436                          *       from the outset.
3437                          */
3438                         while (ip && ip->vp) {
3439                                 struct vnode *vp;
3440
3441                                 error = hammer_get_vnode(ip, &vp);
3442                                 if (error == 0 && vp) {
3443                                         vn_unlock(vp);
3444                                         hammer_knote(ip->vp, NOTE_DELETE);
3445                                         cache_inval_vp(ip->vp, CINV_DESTROY);
3446                                         vrele(vp);
3447                                         break;
3448                                 }
3449                                 kprintf("Debug: HAMMER ip/vp race1 avoided\n");
3450                         }
3451                 }
3452                 if (ip)
3453                         hammer_rel_inode(ip, 0);
3454         } else {
3455                 hammer_done_cursor(&cursor);
3456         }
3457         if (error == EDEADLK)
3458                 goto retry;
3459
3460         return (error);
3461 }
3462
3463 /************************************************************************
3464  *                          FIFO AND SPECFS OPS                         *
3465  ************************************************************************
3466  *
3467  */
3468 static int
3469 hammer_vop_fifoclose (struct vop_close_args *ap)
3470 {
3471         /* XXX update itimes */
3472         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3473 }
3474
3475 static int
3476 hammer_vop_fiforead (struct vop_read_args *ap)
3477 {
3478         int error;
3479
3480         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3481         /* XXX update access time */
3482         return (error);
3483 }
3484
3485 static int
3486 hammer_vop_fifowrite (struct vop_write_args *ap)
3487 {
3488         int error;
3489
3490         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3491         /* XXX update access time */
3492         return (error);
3493 }
3494
3495 static
3496 int
3497 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3498 {
3499         int error;
3500
3501         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3502         if (error)
3503                 error = hammer_vop_kqfilter(ap);
3504         return(error);
3505 }
3506
3507 /************************************************************************
3508  *                          KQFILTER OPS                                *
3509  ************************************************************************
3510  *
3511  */
3512 static void filt_hammerdetach(struct knote *kn);
3513 static int filt_hammerread(struct knote *kn, long hint);
3514 static int filt_hammerwrite(struct knote *kn, long hint);
3515 static int filt_hammervnode(struct knote *kn, long hint);
3516
3517 static struct filterops hammerread_filtops =
3518         { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerread };
3519 static struct filterops hammerwrite_filtops =
3520         { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammerwrite };
3521 static struct filterops hammervnode_filtops =
3522         { FILTEROP_ISFD, NULL, filt_hammerdetach, filt_hammervnode };
3523
3524 static
3525 int
3526 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3527 {
3528         struct vnode *vp = ap->a_vp;
3529         struct knote *kn = ap->a_kn;
3530
3531         switch (kn->kn_filter) {
3532         case EVFILT_READ:
3533                 kn->kn_fop = &hammerread_filtops;
3534                 break;
3535         case EVFILT_WRITE:
3536                 kn->kn_fop = &hammerwrite_filtops;
3537                 break;
3538         case EVFILT_VNODE:
3539                 kn->kn_fop = &hammervnode_filtops;
3540                 break;
3541         default:
3542                 return (EOPNOTSUPP);
3543         }
3544
3545         kn->kn_hook = (caddr_t)vp;
3546
3547         knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3548
3549         return(0);
3550 }
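/*
 * Hypothetical userland sketch (not part of this file): how the
 * EVFILT_VNODE filter registered above is typically armed.  "fd" is an
 * assumed open descriptor on a HAMMER file.
 */
#if 0
	struct kevent kev;
	int kq = kqueue();

	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
	       NOTE_DELETE | NOTE_WRITE, 0, NULL);
	kevent(kq, &kev, 1, NULL, 0, NULL);	/* arm; events read later */
#endif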
3551
3552 static void
3553 filt_hammerdetach(struct knote *kn)
3554 {
3555         struct vnode *vp = (void *)kn->kn_hook;
3556
3557         knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
3558 }
3559
3560 static int
3561 filt_hammerread(struct knote *kn, long hint)
3562 {
3563         struct vnode *vp = (void *)kn->kn_hook;
3564         hammer_inode_t ip = VTOI(vp);
3565         hammer_mount_t hmp = ip->hmp;
3566         off_t off;
3567
3568         if (hint == NOTE_REVOKE) {
3569                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3570                 return(1);
3571         }
3572         lwkt_gettoken(&hmp->fs_token);  /* XXX use per-ip-token */
3573         off = ip->ino_data.size - kn->kn_fp->f_offset;
3574         kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
3575         lwkt_reltoken(&hmp->fs_token);
3576         if (kn->kn_sfflags & NOTE_OLDAPI)
3577                 return(1);
3578         return (kn->kn_data != 0);
3579 }
3580
3581 static int
3582 filt_hammerwrite(struct knote *kn, long hint)
3583 {
3584         if (hint == NOTE_REVOKE)
3585                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3586         kn->kn_data = 0;
3587         return (1);
3588 }
3589
3590 static int
3591 filt_hammervnode(struct knote *kn, long hint)
3592 {
3593         if (kn->kn_sfflags & hint)
3594                 kn->kn_fflags |= hint;
3595         if (hint == NOTE_REVOKE) {
3596                 kn->kn_flags |= EV_EOF;
3597                 return (1);
3598         }
3599         return (kn->kn_fflags != 0);
3600 }
3601