kernel - Fix kqfilter error return codes
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vfs/fifofs/fifo.h>
50
51 #include <sys/mplock2.h>
52
53 #include "hammer.h"
54
/*
 * Forward declarations for the file-local (static) vnode operations
 * referenced by the vop_ops tables below.
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *ap);*/
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_markatime(struct vop_markatime_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);
static int hammer_vop_kqfilter(struct vop_kqfilter_args *ap);

/*
 * FIFO variants, referenced by hammer_fifo_vops.
 */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);
static int hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap);

93
94 struct vop_ops hammer_vnode_vops = {
95         .vop_default =          vop_defaultop,
96         .vop_fsync =            hammer_vop_fsync,
97         .vop_getpages =         vop_stdgetpages,
98         .vop_putpages =         vop_stdputpages,
99         .vop_read =             hammer_vop_read,
100         .vop_write =            hammer_vop_write,
101         .vop_access =           hammer_vop_access,
102         .vop_advlock =          hammer_vop_advlock,
103         .vop_close =            hammer_vop_close,
104         .vop_ncreate =          hammer_vop_ncreate,
105         .vop_getattr =          hammer_vop_getattr,
106         .vop_inactive =         hammer_vop_inactive,
107         .vop_reclaim =          hammer_vop_reclaim,
108         .vop_nresolve =         hammer_vop_nresolve,
109         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
110         .vop_nlink =            hammer_vop_nlink,
111         .vop_nmkdir =           hammer_vop_nmkdir,
112         .vop_nmknod =           hammer_vop_nmknod,
113         .vop_open =             hammer_vop_open,
114         .vop_pathconf =         vop_stdpathconf,
115         .vop_print =            hammer_vop_print,
116         .vop_readdir =          hammer_vop_readdir,
117         .vop_readlink =         hammer_vop_readlink,
118         .vop_nremove =          hammer_vop_nremove,
119         .vop_nrename =          hammer_vop_nrename,
120         .vop_nrmdir =           hammer_vop_nrmdir,
121         .vop_markatime =        hammer_vop_markatime,
122         .vop_setattr =          hammer_vop_setattr,
123         .vop_bmap =             hammer_vop_bmap,
124         .vop_strategy =         hammer_vop_strategy,
125         .vop_nsymlink =         hammer_vop_nsymlink,
126         .vop_nwhiteout =        hammer_vop_nwhiteout,
127         .vop_ioctl =            hammer_vop_ioctl,
128         .vop_mountctl =         hammer_vop_mountctl,
129         .vop_kqfilter =         hammer_vop_kqfilter
130 };
131
132 struct vop_ops hammer_spec_vops = {
133         .vop_default =          vop_defaultop,
134         .vop_fsync =            hammer_vop_fsync,
135         .vop_read =             vop_stdnoread,
136         .vop_write =            vop_stdnowrite,
137         .vop_access =           hammer_vop_access,
138         .vop_close =            hammer_vop_close,
139         .vop_markatime =        hammer_vop_markatime,
140         .vop_getattr =          hammer_vop_getattr,
141         .vop_inactive =         hammer_vop_inactive,
142         .vop_reclaim =          hammer_vop_reclaim,
143         .vop_setattr =          hammer_vop_setattr
144 };
145
146 struct vop_ops hammer_fifo_vops = {
147         .vop_default =          fifo_vnoperate,
148         .vop_fsync =            hammer_vop_fsync,
149         .vop_read =             hammer_vop_fiforead,
150         .vop_write =            hammer_vop_fifowrite,
151         .vop_access =           hammer_vop_access,
152         .vop_close =            hammer_vop_fifoclose,
153         .vop_markatime =        hammer_vop_markatime,
154         .vop_getattr =          hammer_vop_getattr,
155         .vop_inactive =         hammer_vop_inactive,
156         .vop_reclaim =          hammer_vop_reclaim,
157         .vop_setattr =          hammer_vop_setattr,
158         .vop_kqfilter =         hammer_vop_fifokqfilter
159 };
160
161 static __inline
162 void
163 hammer_knote(struct vnode *vp, int flags)
164 {
165         if (flags)
166                 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
167 }
168
169 #ifdef DEBUG_TRUNCATE
170 struct hammer_inode *HammerTruncIp;
171 #endif
172
173 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
174                            struct vnode *dvp, struct ucred *cred,
175                            int flags, int isdir);
176 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
177 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
178
#if 0
/*
 * hammer_vop_vnoperate { ap }
 *
 * Compiled-out dispatcher that passes the generic argument block to
 * VOCALL() against hammer_vnode_vops and returns its result.
 *
 * The parameter must be named 'ap' because the body references it; the
 * previous unnamed parameter would not compile if this block were ever
 * enabled.  The matching prototype near the top of the file is commented
 * out and has to be restored along with this block.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
187
188 /*
189  * hammer_vop_fsync { vp, waitfor }
190  *
191  * fsync() an inode to disk and wait for it to be completely committed
192  * such that the information would not be undone if a crash occurred after
193  * return.
194  *
195  * NOTE: HAMMER's fsync()'s are going to remain expensive until we implement
196  *       a REDO log.  A sysctl is provided to relax HAMMER's fsync()
197  *       operation.
198  *
199  *       Ultimately the combination of a REDO log and use of fast storage
200  *       to front-end cluster caches will make fsync fast, but it isn't
201  *       here yet.  And, in any case, we need real transactional
202  *       all-or-nothing features which are not restricted to a single file.
203  */
    /*
     * Policy selection (only applied when VOP_FSYNC_SYSCALL is set in
     * ap->a_flags; otherwise the full flush sequence is always run with
     * the caller's waitfor):
     *
     *   hammer_fsync_mode
     *   0      full flush sequence, waitfor unchanged
     *   1      full flush sequence, MNT_WAIT downgraded to MNT_NOWAIT
     *   2      mode = HAMMER_FLUSH_UNDOS_AUTO; handled as 0 when
     *          hmp->version < HAMMER_VOL_VERSION_FOUR
     *   3      mode = HAMMER_FLUSH_UNDOS_RELAXED, MNT_WAIT downgraded to
     *          MNT_NOWAIT; handled as 1 when
     *          hmp->version < HAMMER_VOL_VERSION_FOUR
     *   4      return 0 immediately
     *   other  same settings as 3, without the version check
     *
     * For 2, 3 and 'other' the UNDO/REDO-only flush is used if the inode
     * qualifies (see below); otherwise the code falls through to the full
     * flush sequence using the possibly downgraded waitfor.
     *
     * Returns 0 from the mode-4 path and from the UNDO/REDO-only path,
     * otherwise ip->error as it stands after the full flush sequence.
     */
204 static
205 int
206 hammer_vop_fsync(struct vop_fsync_args *ap)
207 {
208         hammer_inode_t ip = VTOI(ap->a_vp);
209         hammer_mount_t hmp = ip->hmp;
210         int waitfor = ap->a_waitfor;
211         int mode;       /* assigned only on switch paths that 'break' */
212
213         /*
214          * Fsync rule relaxation (default is either full synchronous flush
215          * or REDO semantics with synchronous flush).
216          */
217         if (ap->a_flags & VOP_FSYNC_SYSCALL) {
218                 switch(hammer_fsync_mode) {
219                 case 0:
220 mode0:
221                         /* no REDO, full synchronous flush */
222                         goto skip;
223                 case 1:
224 mode1:
225                         /* no REDO, full asynchronous flush */
226                         if (waitfor == MNT_WAIT)
227                                 waitfor = MNT_NOWAIT;
228                         goto skip;
229                 case 2:
230                         /* REDO semantics, synchronous flush */
231                         if (hmp->version < HAMMER_VOL_VERSION_FOUR)
232                                 goto mode0;
233                         mode = HAMMER_FLUSH_UNDOS_AUTO;
234                         break;
235                 case 3:
236                         /* REDO semantics, relaxed asynchronous flush */
237                         if (hmp->version < HAMMER_VOL_VERSION_FOUR)
238                                 goto mode1;
239                         mode = HAMMER_FLUSH_UNDOS_RELAXED;
240                         if (waitfor == MNT_WAIT)
241                                 waitfor = MNT_NOWAIT;
242                         break;
243                 case 4:
244                         /* ignore the fsync() system call */
245                         return(0);
246                 default:
247                         /* we have to do something */
248                         mode = HAMMER_FLUSH_UNDOS_RELAXED;
249                         if (waitfor == MNT_WAIT)
250                                 waitfor = MNT_NOWAIT;
251                         break;
252                 }
253
254                 /*
255                  * Fast fsync only needs to flush the UNDO/REDO fifo if
256                  * HAMMER_INODE_REDO is non-zero and the only modifications
257                  * made to the file are write or write-extends.
258                  */
                    /*
                     * NOTE: this path returns 0 without looking at
                     * ip->error, and it resets redo_count after the flush
                     * request has been issued.
                     */
259                 if ((ip->flags & HAMMER_INODE_REDO) &&
260                     (ip->flags & HAMMER_INODE_MODMASK_NOREDO) == 0
261                 ) {
262                         ++hammer_count_fsyncs;
263                         hammer_flusher_flush_undos(hmp, mode);
264                         ip->redo_count = 0;
265                         return(0);
266                 }
267
268                 /*
269                  * REDO is enabled by fsync(), the idea being we really only
270                  * want to lay down REDO records when programs are using
271                  * fsync() heavily.  The first fsync() on the file starts
272                  * the gravy train going and later fsync()s keep it hot by
273                  * resetting the redo_count.
274                  *
275                  * We weren't running REDOs before now so we have to fall
276                  * through and do a full fsync of what we have.
277                  */
                    /*
                     * REDO is not switched on while
                     * HAMMER_MOUNT_REDO_RECOVERY_RUN is set in hmp->flags.
                     */
278                 if (hmp->version >= HAMMER_VOL_VERSION_FOUR &&
279                     (hmp->flags & HAMMER_MOUNT_REDO_RECOVERY_RUN) == 0) {
280                         ip->flags |= HAMMER_INODE_REDO;
281                         ip->redo_count = 0;
282                 }
283         }
284 skip:
285
286         /*
287          * Do a full flush sequence.
288          */
            /*
             * For MNT_WAIT the vnode lock is dropped around
             * hammer_wait_inode() and re-taken exclusively afterwards.
             * NOTE(review): presumably hammer_wait_inode() blocks until the
             * flush signalled above has completed - confirm against its
             * definition.
             */
289         ++hammer_count_fsyncs;
290         vfsync(ap->a_vp, waitfor, 1, NULL, NULL);
291         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
292         if (waitfor == MNT_WAIT) {
293                 vn_unlock(ap->a_vp);
294                 hammer_wait_inode(ip);
295                 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
296         }
297         return (ip->error);
298 }
299
299
300 /*
301  * hammer_vop_read { vp, uio, ioflag, cred }
302  *
303  * MPALMOSTSAFE
     *
     * Copy file data into the caller's uio through the buffer cache.  The
     * loop runs until uio_resid reaches zero or uio_offset reaches
     * ip->ino_data.size, so a read starting at or beyond EOF copies nothing
     * and returns 0.
     *
     * Only VREG vnodes are accepted; anything else returns EINVAL.
     * Otherwise the return value is 0 or the first error reported by
     * hammer_signal_check(), cluster_read()/bread() or uiomove().
     *
     * a_ioflag >> 16 is taken as the caller's sequential-access count; the
     * number of BKVASIZE chunks in the request is used instead when larger.
304  */
305 static
306 int
307 hammer_vop_read(struct vop_read_args *ap)
308 {
309         struct hammer_transaction trans;
310         hammer_inode_t ip;
311         off_t offset;
312         struct buf *bp;
313         struct uio *uio;
314         int error;
315         int n;
316         int seqcount;
317         int ioseqcount;
318         int blksize;
319         int got_mplock;
320         int bigread;
321
322         if (ap->a_vp->v_type != VREG)
323                 return (EINVAL);
324         ip = VTOI(ap->a_vp);
325         error = 0;
326         uio = ap->a_uio;
327
328         /*
329          * Allow the UIO's size to override the sequential heuristic.
330          */
            /*
             * NOTE(review): the blksize computed here is never read; it is
             * recomputed at the top of the loop before its first use.
             */
331         blksize = hammer_blocksize(uio->uio_offset);
332         seqcount = (uio->uio_resid + (BKVASIZE - 1)) / BKVASIZE;
333         ioseqcount = (ap->a_ioflag >> 16);
334         if (seqcount < ioseqcount)
335                 seqcount = ioseqcount;
336
337         /*
338          * Temporary hack until more of HAMMER can be made MPSAFE.
             *
             * got_mplock states:
             *   -1  transaction started here without taking the MP lock
             *       (td_mpcount was already non-zero, or non-SMP build)
             *    0  no transaction started and no MP lock taken yet
             *    1  MP lock taken and transaction started inside the loop;
             *       the lock is released before returning
339          */
340 #ifdef SMP
341         if (curthread->td_mpcount) {
342                 got_mplock = -1;
343                 hammer_start_transaction(&trans, ip->hmp);
344         } else {
345                 got_mplock = 0;
346         }
347 #else
348         hammer_start_transaction(&trans, ip->hmp);
349         got_mplock = -1;
350 #endif
351
352         /*
353          * If reading or writing a huge amount of data we have to break
354          * atomicity and allow the operation to be interrupted by a signal
355          * or it can DOS the machine.
356          */
357         bigread = (uio->uio_resid > 100 * 1024 * 1024);
358
359         /*
360          * Access the data typically in HAMMER_BUFSIZE blocks via the
361          * buffer cache, but HAMMER may use a variable block size based
362          * on the offset.
363          *
364          * XXX Temporary hack, delay the start transaction while we remain
365          *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
366          *     locked-shared.
367          */
368         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
369                 int64_t base_offset;
370                 int64_t file_limit;
371
                    /*
                     * offset is the position inside the current block and
                     * base_offset the file offset of the block start (the
                     * mask assumes blksize is a power of two).
                     */
372                 blksize = hammer_blocksize(uio->uio_offset);
373                 offset = (int)uio->uio_offset & (blksize - 1);
374                 base_offset = uio->uio_offset - offset;
375
376                 if (bigread && (error = hammer_signal_check(ip->hmp)) != 0)
377                         break;
378
379                 /*
380                  * MPSAFE
381                  */
                    /*
                     * When getcacheblk() returns a buffer the read path
                     * below is bypassed entirely, so the MP lock and the
                     * transaction are not needed for this block.
                     */
382                 bp = getcacheblk(ap->a_vp, base_offset);
383                 if (bp) {
384                         error = 0;
385                         goto skip;
386                 }
387
388                 /*
389                  * MPUNSAFE
390                  */
391                 if (got_mplock == 0) {
392                         got_mplock = 1;
393                         get_mplock();
394                         hammer_start_transaction(&trans, ip->hmp);
395                 }
396
397                 if (hammer_cluster_enable) {
398                         /*
399                          * Use file_limit to prevent cluster_read() from
400                          * creating buffers of the wrong block size past
401                          * the demarc.
402                          */
403                         file_limit = ip->ino_data.size;
404                         if (base_offset < HAMMER_XDEMARC &&
405                             file_limit > HAMMER_XDEMARC) {
406                                 file_limit = HAMMER_XDEMARC;
407                         }
408                         error = cluster_read(ap->a_vp,
409                                              file_limit, base_offset,
410                                              blksize, MAXPHYS,
411                                              seqcount, &bp);
412                 } else {
413                         error = bread(ap->a_vp, base_offset, blksize, &bp);
414                 }
                    /*
                     * NOTE(review): the brelse() below relies on
                     * cluster_read()/bread() leaving a valid buffer in bp
                     * when they fail - confirm against the buffer cache API.
                     */
415                 if (error) {
416                         brelse(bp);
417                         break;
418                 }
419 skip:
420
421                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                    /*
                     * Clamp the copy length to the remainder of the block,
                     * the remainder of the request and the remainder of the
                     * file, in that order.
                     */
422                 n = blksize - offset;
423                 if (n > uio->uio_resid)
424                         n = uio->uio_resid;
425                 if (n > ip->ino_data.size - uio->uio_offset)
426                         n = (int)(ip->ino_data.size - uio->uio_offset);
427                 error = uiomove((char *)bp->b_data + offset, n, uio);
428
429                 /* data has a lower priority than meta-data */
430                 bp->b_flags |= B_AGE;
431                 bqrelse(bp);
432                 if (error)
433                         break;
434                 hammer_stats_file_read += n;
435         }
436
437         /*
438          * XXX only update the atime if we had to get the MP lock.
439          * XXX hack hack hack, fixme.
440          */
            /*
             * got_mplock != 0 also means 'trans' was started, so trans.time
             * is only read on paths where the transaction exists.  The atime
             * update is skipped for HAMMER_INODE_RO inodes and MNT_NOATIME
             * mounts.
             */
441         if (got_mplock) {
442                 if ((ip->flags & HAMMER_INODE_RO) == 0 &&
443                     (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
444                         ip->ino_data.atime = trans.time;
445                         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
446                 }
447                 hammer_done_transaction(&trans);
448                 if (got_mplock > 0)
449                         rel_mplock();
450         }
451         return (error);
452 }
453
453
454 /*
455  * hammer_vop_write { vp, uio, ioflag, cred }
456  */
457 static
458 int
459 hammer_vop_write(struct vop_write_args *ap)
460 {
461         struct hammer_transaction trans;
462         struct hammer_inode *ip;
463         hammer_mount_t hmp;
464         struct uio *uio;
465         int offset;
466         off_t base_offset;
467         struct buf *bp;
468         int kflags;
469         int error;
470         int n;
471         int flags;
472         int seqcount;
473         int bigwrite;
474
475         if (ap->a_vp->v_type != VREG)
476                 return (EINVAL);
477         ip = VTOI(ap->a_vp);
478         hmp = ip->hmp;
479         error = 0;
480         kflags = 0;
481         seqcount = ap->a_ioflag >> 16;
482
483         if (ip->flags & HAMMER_INODE_RO)
484                 return (EROFS);
485
486         /*
487          * Create a transaction to cover the operations we perform.
488          */
489         hammer_start_transaction(&trans, hmp);
490         uio = ap->a_uio;
491
492         /*
493          * Check append mode
494          */
495         if (ap->a_ioflag & IO_APPEND)
496                 uio->uio_offset = ip->ino_data.size;
497
498         /*
499          * Check for illegal write offsets.  Valid range is 0...2^63-1.
500          *
501          * NOTE: the base_off assignment is required to work around what
502          * I consider to be a GCC-4 optimization bug.
503          */
504         if (uio->uio_offset < 0) {
505                 hammer_done_transaction(&trans);
506                 return (EFBIG);
507         }
508         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
509         if (uio->uio_resid > 0 && base_offset <= uio->uio_offset) {
510                 hammer_done_transaction(&trans);
511                 return (EFBIG);
512         }
513
514         /*
515          * If reading or writing a huge amount of data we have to break
516          * atomicy and allow the operation to be interrupted by a signal
517          * or it can DOS the machine.
518          *
519          * Preset redo_count so we stop generating REDOs earlier if the
520          * limit is exceeded.
521          */
522         bigwrite = (uio->uio_resid > 100 * 1024 * 1024);
523         if ((ip->flags & HAMMER_INODE_REDO) &&
524             ip->redo_count < hammer_limit_redo) {
525                 ip->redo_count += uio->uio_resid;
526         }
527
528         /*
529          * Access the data typically in HAMMER_BUFSIZE blocks via the
530          * buffer cache, but HAMMER may use a variable block size based
531          * on the offset.
532          */
533         while (uio->uio_resid > 0) {
534                 int fixsize = 0;
535                 int blksize;
536                 int blkmask;
537                 int trivial;
538                 int endofblk;
539                 off_t nsize;
540
541                 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
542                         break;
543                 if (bigwrite && (error = hammer_signal_check(hmp)) != 0)
544                         break;
545
546                 blksize = hammer_blocksize(uio->uio_offset);
547
548                 /*
549                  * Do not allow HAMMER to blow out the buffer cache.  Very
550                  * large UIOs can lockout other processes due to bwillwrite()
551                  * mechanics.
552                  *
553                  * The hammer inode is not locked during these operations.
554                  * The vnode is locked which can interfere with the pageout
555                  * daemon for non-UIO_NOCOPY writes but should not interfere
556                  * with the buffer cache.  Even so, we cannot afford to
557                  * allow the pageout daemon to build up too many dirty buffer
558                  * cache buffers.
559                  *
560                  * Only call this if we aren't being recursively called from
561                  * a virtual disk device (vn), else we may deadlock.
562                  */
563                 if ((ap->a_ioflag & IO_RECURSE) == 0)
564                         bwillwrite(blksize);
565
566                 /*
567                  * Control the number of pending records associated with
568                  * this inode.  If too many have accumulated start a
569                  * flush.  Try to maintain a pipeline with the flusher.
570                  */
571                 if (ip->rsv_recs >= hammer_limit_inode_recs) {
572                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
573                 }
574                 if (ip->rsv_recs >= hammer_limit_inode_recs * 2) {
575                         while (ip->rsv_recs >= hammer_limit_inode_recs) {
576                                 tsleep(&ip->rsv_recs, 0, "hmrwww", hz);
577                         }
578                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
579                 }
580
581 #if 0
582                 /*
583                  * Do not allow HAMMER to blow out system memory by
584                  * accumulating too many records.   Records are so well
585                  * decoupled from the buffer cache that it is possible
586                  * for userland to push data out to the media via
587                  * direct-write, but build up the records queued to the
588                  * backend faster then the backend can flush them out.
589                  * HAMMER has hit its write limit but the frontend has
590                  * no pushback to slow it down.
591                  */
592                 if (hmp->rsv_recs > hammer_limit_recs / 2) {
593                         /*
594                          * Get the inode on the flush list
595                          */
596                         if (ip->rsv_recs >= 64)
597                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
598                         else if (ip->rsv_recs >= 16)
599                                 hammer_flush_inode(ip, 0);
600
601                         /*
602                          * Keep the flusher going if the system keeps
603                          * queueing records.
604                          */
605                         delta = hmp->count_newrecords -
606                                 hmp->last_newrecords;
607                         if (delta < 0 || delta > hammer_limit_recs / 2) {
608                                 hmp->last_newrecords = hmp->count_newrecords;
609                                 hammer_sync_hmp(hmp, MNT_NOWAIT);
610                         }
611
612                         /*
613                          * If we have gotten behind start slowing
614                          * down the writers.
615                          */
616                         delta = (hmp->rsv_recs - hammer_limit_recs) *
617                                 hz / hammer_limit_recs;
618                         if (delta > 0)
619                                 tsleep(&trans, 0, "hmrslo", delta);
620                 }
621 #endif
622
623                 /*
624                  * Calculate the blocksize at the current offset and figure
625                  * out how much we can actually write.
626                  */
627                 blkmask = blksize - 1;
628                 offset = (int)uio->uio_offset & blkmask;
629                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
630                 n = blksize - offset;
631                 if (n > uio->uio_resid) {
632                         n = uio->uio_resid;
633                         endofblk = 0;
634                 } else {
635                         endofblk = 1;
636                 }
637                 nsize = uio->uio_offset + n;
638                 if (nsize > ip->ino_data.size) {
639                         if (uio->uio_offset > ip->ino_data.size)
640                                 trivial = 0;
641                         else
642                                 trivial = 1;
643                         nvextendbuf(ap->a_vp,
644                                     ip->ino_data.size,
645                                     nsize,
646                                     hammer_blocksize(ip->ino_data.size),
647                                     hammer_blocksize(nsize),
648                                     hammer_blockoff(ip->ino_data.size),
649                                     hammer_blockoff(nsize),
650                                     trivial);
651                         fixsize = 1;
652                         kflags |= NOTE_EXTEND;
653                 }
654
655                 if (uio->uio_segflg == UIO_NOCOPY) {
656                         /*
657                          * Issuing a write with the same data backing the
658                          * buffer.  Instantiate the buffer to collect the
659                          * backing vm pages, then read-in any missing bits.
660                          *
661                          * This case is used by vop_stdputpages().
662                          */
663                         bp = getblk(ap->a_vp, base_offset,
664                                     blksize, GETBLK_BHEAVY, 0);
665                         if ((bp->b_flags & B_CACHE) == 0) {
666                                 bqrelse(bp);
667                                 error = bread(ap->a_vp, base_offset,
668                                               blksize, &bp);
669                         }
670                 } else if (offset == 0 && uio->uio_resid >= blksize) {
671                         /*
672                          * Even though we are entirely overwriting the buffer
673                          * we may still have to zero it out to avoid a 
674                          * mmap/write visibility issue.
675                          */
676                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
677                         if ((bp->b_flags & B_CACHE) == 0)
678                                 vfs_bio_clrbuf(bp);
679                 } else if (base_offset >= ip->ino_data.size) {
680                         /*
681                          * If the base offset of the buffer is beyond the
682                          * file EOF, we don't have to issue a read.
683                          */
684                         bp = getblk(ap->a_vp, base_offset,
685                                     blksize, GETBLK_BHEAVY, 0);
686                         vfs_bio_clrbuf(bp);
687                 } else {
688                         /*
689                          * Partial overwrite, read in any missing bits then
690                          * replace the portion being written.
691                          */
692                         error = bread(ap->a_vp, base_offset, blksize, &bp);
693                         if (error == 0)
694                                 bheavy(bp);
695                 }
696                 if (error == 0)
697                         error = uiomove(bp->b_data + offset, n, uio);
698
699                 /*
700                  * Generate REDO records if enabled and redo_count will not
701                  * exceeded the limit.
702                  *
703                  * If redo_count exceeds the limit we stop generating records
704                  * and clear HAMMER_INODE_REDO.  This will cause the next
705                  * fsync() to do a full meta-data sync instead of just an
706                  * UNDO/REDO fifo update.
707                  *
708                  * When clearing HAMMER_INODE_REDO any pre-existing REDOs
709                  * will still be tracked.  The tracks will be terminated
710                  * when the related meta-data (including possible data
711                  * modifications which are not tracked via REDO) is
712                  * flushed.
713                  */
714                 if ((ip->flags & HAMMER_INODE_REDO) && error == 0) {
715                         if (ip->redo_count < hammer_limit_redo) {
716                                 bp->b_flags |= B_VFSFLAG1;
717                                 error = hammer_generate_redo(&trans, ip,
718                                                      base_offset + offset,
719                                                      HAMMER_REDO_WRITE,
720                                                      bp->b_data + offset,
721                                                      (size_t)n);
722                         } else {
723                                 ip->flags &= ~HAMMER_INODE_REDO;
724                         }
725                 }
726
727                 /*
728                  * If we screwed up we have to undo any VM size changes we
729                  * made.
730                  */
731                 if (error) {
732                         brelse(bp);
733                         if (fixsize) {
734                                 nvtruncbuf(ap->a_vp, ip->ino_data.size,
735                                           hammer_blocksize(ip->ino_data.size),
736                                           hammer_blockoff(ip->ino_data.size));
737                         }
738                         break;
739                 }
740                 kflags |= NOTE_WRITE;
741                 hammer_stats_file_write += n;
742                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
743                 if (ip->ino_data.size < uio->uio_offset) {
744                         ip->ino_data.size = uio->uio_offset;
745                         flags = HAMMER_INODE_SDIRTY;
746                 } else {
747                         flags = 0;
748                 }
749                 ip->ino_data.mtime = trans.time;
750                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
751                 hammer_modify_inode(&trans, ip, flags);
752
753                 /*
754                  * Once we dirty the buffer any cached zone-X offset
755                  * becomes invalid.  HAMMER NOTE: no-history mode cannot 
756                  * allow overwriting over the same data sector unless
757                  * we provide UNDOs for the old data, which we don't.
758                  */
759                 bp->b_bio2.bio_offset = NOOFFSET;
760
761                 /*
762                  * Final buffer disposition.
763                  *
764                  * Because meta-data updates are deferred, HAMMER is
765                  * especially sensitive to excessive bdwrite()s because
766                  * the I/O stream is not broken up by disk reads.  So the
767                  * buffer cache simply cannot keep up.
768                  *
769                  * WARNING!  blksize is variable.  cluster_write() is
770                  *           expected to not blow up if it encounters
771                  *           buffers that do not match the passed blksize.
772                  *
773                  * NOTE!  Hammer shouldn't need to bawrite()/cluster_write().
774                  *        The ip->rsv_recs check should burst-flush the data.
775                  *        If we queue it immediately the buf could be left
776                  *        locked on the device queue for a very long time.
777                  *
778                  * NOTE!  To avoid degenerate stalls due to mismatched block
779                  *        sizes we only honor IO_DIRECT on the write which
780                  *        abuts the end of the buffer.  However, we must
781                  *        honor IO_SYNC in case someone is silly enough to
782                  *        configure a HAMMER file as swap, or when HAMMER
783                  *        is serving NFS (for commits).  Ick ick.
784                  */
785                 bp->b_flags |= B_AGE;
786                 if (ap->a_ioflag & IO_SYNC) {
787                         bwrite(bp);
788                 } else if ((ap->a_ioflag & IO_DIRECT) && endofblk) {
789                         bawrite(bp);
790                 } else {
791 #if 0
792                 if (offset + n == blksize) {
793                         if (hammer_cluster_enable == 0 ||
794                             (ap->a_vp->v_mount->mnt_flag & MNT_NOCLUSTERW)) {
795                                 bawrite(bp);
796                         } else {
797                                 cluster_write(bp, ip->ino_data.size,
798                                               blksize, seqcount);
799                         }
800                 } else {
801 #endif
802                         bdwrite(bp);
803                 }
804         }
805         hammer_done_transaction(&trans);
806         hammer_knote(ap->a_vp, kflags);
807         return (error);
808 }
809
810 /*
811  * hammer_vop_access { vp, mode, cred }
812  */
813 static
814 int
815 hammer_vop_access(struct vop_access_args *ap)
816 {
817         struct hammer_inode *ip = VTOI(ap->a_vp);
818         uid_t uid;
819         gid_t gid;
820         int error;
821
822         ++hammer_stats_file_iopsr;
823         uid = hammer_to_unix_xid(&ip->ino_data.uid);
824         gid = hammer_to_unix_xid(&ip->ino_data.gid);
825
826         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
827                                   ip->ino_data.uflags);
828         return (error);
829 }
830
831 /*
832  * hammer_vop_advlock { vp, id, op, fl, flags }
833  */
834 static
835 int
836 hammer_vop_advlock(struct vop_advlock_args *ap)
837 {
838         hammer_inode_t ip = VTOI(ap->a_vp);
839
840         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
841 }
842
/*
 * hammer_vop_close { vp, fflag }
 *
 * We can only sync-on-close for normal closes.
 *
 * The sync-on-close logic below is currently compiled out (#if 0), so
 * this function simply defers to vop_stdclose().
 */
static int
hammer_vop_close(struct vop_close_args *ap)
{
#if 0
	struct vnode *vp = ap->a_vp;
	hammer_inode_t ip = VTOI(vp);
	int waitfor;

	if ((ip->flags & (HAMMER_INODE_CLOSESYNC|HAMMER_INODE_CLOSEASYNC)) &&
	    vn_islocked(vp) == LK_EXCLUSIVE &&
	    (vp->v_flag & (VINACTIVE|VRECLAIMED)) == 0) {
		waitfor = (ip->flags & HAMMER_INODE_CLOSESYNC) ?
			  MNT_WAIT : MNT_NOWAIT;
		ip->flags &= ~(HAMMER_INODE_CLOSESYNC |
			       HAMMER_INODE_CLOSEASYNC);
		VOP_FSYNC(vp, MNT_NOWAIT, waitfor);
	}
#endif
	return (vop_stdclose(ap));
}
871
872 /*
873  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
874  *
875  * The operating system has already ensured that the directory entry
876  * does not exist and done all appropriate namespace locking.
877  */
878 static
879 int
880 hammer_vop_ncreate(struct vop_ncreate_args *ap)
881 {
882         struct hammer_transaction trans;
883         struct hammer_inode *dip;
884         struct hammer_inode *nip;
885         struct nchandle *nch;
886         int error;
887
888         nch = ap->a_nch;
889         dip = VTOI(ap->a_dvp);
890
891         if (dip->flags & HAMMER_INODE_RO)
892                 return (EROFS);
893         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
894                 return (error);
895
896         /*
897          * Create a transaction to cover the operations we perform.
898          */
899         hammer_start_transaction(&trans, dip->hmp);
900         ++hammer_stats_file_iopsw;
901
902         /*
903          * Create a new filesystem object of the requested type.  The
904          * returned inode will be referenced and shared-locked to prevent
905          * it from being moved to the flusher.
906          */
907         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
908                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
909                                     NULL, &nip);
910         if (error) {
911                 hkprintf("hammer_create_inode error %d\n", error);
912                 hammer_done_transaction(&trans);
913                 *ap->a_vpp = NULL;
914                 return (error);
915         }
916
917         /*
918          * Add the new filesystem object to the directory.  This will also
919          * bump the inode's link count.
920          */
921         error = hammer_ip_add_directory(&trans, dip,
922                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
923                                         nip);
924         if (error)
925                 hkprintf("hammer_ip_add_directory error %d\n", error);
926
927         /*
928          * Finish up.
929          */
930         if (error) {
931                 hammer_rel_inode(nip, 0);
932                 hammer_done_transaction(&trans);
933                 *ap->a_vpp = NULL;
934         } else {
935                 error = hammer_get_vnode(nip, ap->a_vpp);
936                 hammer_done_transaction(&trans);
937                 hammer_rel_inode(nip, 0);
938                 if (error == 0) {
939                         cache_setunresolved(ap->a_nch);
940                         cache_setvp(ap->a_nch, *ap->a_vpp);
941                 }
942                 hammer_knote(ap->a_dvp, NOTE_WRITE);
943         }
944         return (error);
945 }
946
947 /*
948  * hammer_vop_getattr { vp, vap }
949  *
950  * Retrieve an inode's attribute information.  When accessing inodes
951  * historically we fake the atime field to ensure consistent results.
952  * The atime field is stored in the B-Tree element and allowed to be
953  * updated without cycling the element.
954  *
955  * MPSAFE
956  */
957 static
958 int
959 hammer_vop_getattr(struct vop_getattr_args *ap)
960 {
961         struct hammer_inode *ip = VTOI(ap->a_vp);
962         struct vattr *vap = ap->a_vap;
963
964         /*
965          * We want the fsid to be different when accessing a filesystem
966          * with different as-of's so programs like diff don't think
967          * the files are the same.
968          *
969          * We also want the fsid to be the same when comparing snapshots,
970          * or when comparing mirrors (which might be backed by different
971          * physical devices).  HAMMER fsids are based on the PFS's
972          * shared_uuid field.
973          *
974          * XXX there is a chance of collision here.  The va_fsid reported
975          * by stat is different from the more involved fsid used in the
976          * mount structure.
977          */
978         ++hammer_stats_file_iopsr;
979         hammer_lock_sh(&ip->lock);
980         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
981                        (u_int32_t)(ip->obj_asof >> 32);
982
983         vap->va_fileid = ip->ino_leaf.base.obj_id;
984         vap->va_mode = ip->ino_data.mode;
985         vap->va_nlink = ip->ino_data.nlinks;
986         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
987         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
988         vap->va_rmajor = 0;
989         vap->va_rminor = 0;
990         vap->va_size = ip->ino_data.size;
991
992         /*
993          * Special case for @@PFS softlinks.  The actual size of the
994          * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
995          * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
996          */
997         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
998             ip->ino_data.size == 10 &&
999             ip->obj_asof == HAMMER_MAX_TID &&
1000             ip->obj_localization == 0 &&
1001             strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
1002                     if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
1003                             vap->va_size = 26;
1004                     else
1005                             vap->va_size = 10;
1006         }
1007
1008         /*
1009          * We must provide a consistent atime and mtime for snapshots
1010          * so people can do a 'tar cf - ... | md5' on them and get
1011          * consistent results.
1012          */
1013         if (ip->flags & HAMMER_INODE_RO) {
1014                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
1015                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
1016         } else {
1017                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
1018                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
1019         }
1020         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
1021         vap->va_flags = ip->ino_data.uflags;
1022         vap->va_gen = 1;        /* hammer inums are unique for all time */
1023         vap->va_blocksize = HAMMER_BUFSIZE;
1024         if (ip->ino_data.size >= HAMMER_XDEMARC) {
1025                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
1026                                 ~HAMMER_XBUFMASK64;
1027         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
1028                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
1029                                 ~HAMMER_BUFMASK64;
1030         } else {
1031                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
1032         }
1033
1034         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
1035         vap->va_filerev = 0;    /* XXX */
1036         vap->va_uid_uuid = ip->ino_data.uid;
1037         vap->va_gid_uuid = ip->ino_data.gid;
1038         vap->va_fsid_uuid = ip->hmp->fsid;
1039         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
1040                           VA_FSID_UUID_VALID;
1041
1042         switch (ip->ino_data.obj_type) {
1043         case HAMMER_OBJTYPE_CDEV:
1044         case HAMMER_OBJTYPE_BDEV:
1045                 vap->va_rmajor = ip->ino_data.rmajor;
1046                 vap->va_rminor = ip->ino_data.rminor;
1047                 break;
1048         default:
1049                 break;
1050         }
1051         hammer_unlock(&ip->lock);
1052         return(0);
1053 }
1054
1055 /*
1056  * hammer_vop_nresolve { nch, dvp, cred }
1057  *
1058  * Locate the requested directory entry.
1059  */
1060 static
1061 int
1062 hammer_vop_nresolve(struct vop_nresolve_args *ap)
1063 {
1064         struct hammer_transaction trans;
1065         struct namecache *ncp;
1066         hammer_inode_t dip;
1067         hammer_inode_t ip;
1068         hammer_tid_t asof;
1069         struct hammer_cursor cursor;
1070         struct vnode *vp;
1071         int64_t namekey;
1072         int error;
1073         int i;
1074         int nlen;
1075         int flags;
1076         int ispfs;
1077         int64_t obj_id;
1078         u_int32_t localization;
1079         u_int32_t max_iterations;
1080
1081         /*
1082          * Misc initialization, plus handle as-of name extensions.  Look for
1083          * the '@@' extension.  Note that as-of files and directories cannot
1084          * be modified.
1085          */
1086         dip = VTOI(ap->a_dvp);
1087         ncp = ap->a_nch->ncp;
1088         asof = dip->obj_asof;
1089         localization = dip->obj_localization;   /* for code consistency */
1090         nlen = ncp->nc_nlen;
1091         flags = dip->flags & HAMMER_INODE_RO;
1092         ispfs = 0;
1093
1094         hammer_simple_transaction(&trans, dip->hmp);
1095         ++hammer_stats_file_iopsr;
1096
1097         for (i = 0; i < nlen; ++i) {
1098                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
1099                         error = hammer_str_to_tid(ncp->nc_name + i + 2,
1100                                                   &ispfs, &asof, &localization);
1101                         if (error != 0) {
1102                                 i = nlen;
1103                                 break;
1104                         }
1105                         if (asof != HAMMER_MAX_TID)
1106                                 flags |= HAMMER_INODE_RO;
1107                         break;
1108                 }
1109         }
1110         nlen = i;
1111
1112         /*
1113          * If this is a PFS softlink we dive into the PFS
1114          */
1115         if (ispfs && nlen == 0) {
1116                 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
1117                                       asof, localization,
1118                                       flags, &error);
1119                 if (error == 0) {
1120                         error = hammer_get_vnode(ip, &vp);
1121                         hammer_rel_inode(ip, 0);
1122                 } else {
1123                         vp = NULL;
1124                 }
1125                 if (error == 0) {
1126                         vn_unlock(vp);
1127                         cache_setvp(ap->a_nch, vp);
1128                         vrele(vp);
1129                 }
1130                 goto done;
1131         }
1132
1133         /*
1134          * If there is no path component the time extension is relative to dip.
1135          * e.g. "fubar/@@<snapshot>"
1136          *
1137          * "." is handled by the kernel, but ".@@<snapshot>" is not.
1138          * e.g. "fubar/.@@<snapshot>"
1139          *
1140          * ".." is handled by the kernel.  We do not currently handle
1141          * "..@<snapshot>".
1142          */
1143         if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
1144                 ip = hammer_get_inode(&trans, dip, dip->obj_id,
1145                                       asof, dip->obj_localization,
1146                                       flags, &error);
1147                 if (error == 0) {
1148                         error = hammer_get_vnode(ip, &vp);
1149                         hammer_rel_inode(ip, 0);
1150                 } else {
1151                         vp = NULL;
1152                 }
1153                 if (error == 0) {
1154                         vn_unlock(vp);
1155                         cache_setvp(ap->a_nch, vp);
1156                         vrele(vp);
1157                 }
1158                 goto done;
1159         }
1160
1161         /*
1162          * Calculate the namekey and setup the key range for the scan.  This
1163          * works kinda like a chained hash table where the lower 32 bits
1164          * of the namekey synthesize the chain.
1165          *
1166          * The key range is inclusive of both key_beg and key_end.
1167          */
1168         namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
1169                                            &max_iterations);
1170
1171         error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
1172         cursor.key_beg.localization = dip->obj_localization +
1173                                       hammer_dir_localization(dip);
1174         cursor.key_beg.obj_id = dip->obj_id;
1175         cursor.key_beg.key = namekey;
1176         cursor.key_beg.create_tid = 0;
1177         cursor.key_beg.delete_tid = 0;
1178         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1179         cursor.key_beg.obj_type = 0;
1180
1181         cursor.key_end = cursor.key_beg;
1182         cursor.key_end.key += max_iterations;
1183         cursor.asof = asof;
1184         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1185
1186         /*
1187          * Scan all matching records (the chain), locate the one matching
1188          * the requested path component.
1189          *
1190          * The hammer_ip_*() functions merge in-memory records with on-disk
1191          * records for the purposes of the search.
1192          */
1193         obj_id = 0;
1194         localization = HAMMER_DEF_LOCALIZATION;
1195
1196         if (error == 0) {
1197                 error = hammer_ip_first(&cursor);
1198                 while (error == 0) {
1199                         error = hammer_ip_resolve_data(&cursor);
1200                         if (error)
1201                                 break;
1202                         if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
1203                             bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1204                                 obj_id = cursor.data->entry.obj_id;
1205                                 localization = cursor.data->entry.localization;
1206                                 break;
1207                         }
1208                         error = hammer_ip_next(&cursor);
1209                 }
1210         }
1211         hammer_done_cursor(&cursor);
1212
1213         /*
1214          * Lookup the obj_id.  This should always succeed.  If it does not
1215          * the filesystem may be damaged and we return a dummy inode.
1216          */
1217         if (error == 0) {
1218                 ip = hammer_get_inode(&trans, dip, obj_id,
1219                                       asof, localization,
1220                                       flags, &error);
1221                 if (error == ENOENT) {
1222                         kprintf("HAMMER: WARNING: Missing "
1223                                 "inode for dirent \"%s\"\n"
1224                                 "\tobj_id = %016llx, asof=%016llx, lo=%08x\n",
1225                                 ncp->nc_name,
1226                                 (long long)obj_id, (long long)asof,
1227                                 localization);
1228                         error = 0;
1229                         ip = hammer_get_dummy_inode(&trans, dip, obj_id,
1230                                                     asof, localization,
1231                                                     flags, &error);
1232                 }
1233                 if (error == 0) {
1234                         error = hammer_get_vnode(ip, &vp);
1235                         hammer_rel_inode(ip, 0);
1236                 } else {
1237                         vp = NULL;
1238                 }
1239                 if (error == 0) {
1240                         vn_unlock(vp);
1241                         cache_setvp(ap->a_nch, vp);
1242                         vrele(vp);
1243                 }
1244         } else if (error == ENOENT) {
1245                 cache_setvp(ap->a_nch, NULL);
1246         }
1247 done:
1248         hammer_done_transaction(&trans);
1249         return (error);
1250 }
1251
1252 /*
1253  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1254  *
1255  * Locate the parent directory of a directory vnode.
1256  *
1257  * dvp is referenced but not locked.  *vpp must be returned referenced and
1258  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
1259  * at the root, instead it could indicate that the directory we were in was
1260  * removed.
1261  *
1262  * NOTE: as-of sequences are not linked into the directory structure.  If
1263  * we are at the root with a different asof then the mount point, reload
1264  * the same directory with the mount point's asof.   I'm not sure what this
1265  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
1266  * get confused, but it hasn't been tested.
1267  */
1268 static
1269 int
1270 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1271 {
1272         struct hammer_transaction trans;
1273         struct hammer_inode *dip;
1274         struct hammer_inode *ip;
1275         int64_t parent_obj_id;
1276         u_int32_t parent_obj_localization;
1277         hammer_tid_t asof;
1278         int error;
1279
1280         dip = VTOI(ap->a_dvp);
1281         asof = dip->obj_asof;
1282
1283         /*
1284          * Whos are parent?  This could be the root of a pseudo-filesystem
1285          * whos parent is in another localization domain.
1286          */
1287         parent_obj_id = dip->ino_data.parent_obj_id;
1288         if (dip->obj_id == HAMMER_OBJID_ROOT)
1289                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
1290         else
1291                 parent_obj_localization = dip->obj_localization;
1292
1293         if (parent_obj_id == 0) {
1294                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
1295                    asof != dip->hmp->asof) {
1296                         parent_obj_id = dip->obj_id;
1297                         asof = dip->hmp->asof;
1298                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
1299                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
1300                                   (long long)dip->obj_asof);
1301                 } else {
1302                         *ap->a_vpp = NULL;
1303                         return ENOENT;
1304                 }
1305         }
1306
1307         hammer_simple_transaction(&trans, dip->hmp);
1308         ++hammer_stats_file_iopsr;
1309
1310         ip = hammer_get_inode(&trans, dip, parent_obj_id,
1311                               asof, parent_obj_localization,
1312                               dip->flags, &error);
1313         if (ip) {
1314                 error = hammer_get_vnode(ip, ap->a_vpp);
1315                 hammer_rel_inode(ip, 0);
1316         } else {
1317                 *ap->a_vpp = NULL;
1318         }
1319         hammer_done_transaction(&trans);
1320         return (error);
1321 }
1322
1323 /*
1324  * hammer_vop_nlink { nch, dvp, vp, cred }
1325  */
1326 static
1327 int
1328 hammer_vop_nlink(struct vop_nlink_args *ap)
1329 {
1330         struct hammer_transaction trans;
1331         struct hammer_inode *dip;
1332         struct hammer_inode *ip;
1333         struct nchandle *nch;
1334         int error;
1335
1336         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
1337                 return(EXDEV);
1338
1339         nch = ap->a_nch;
1340         dip = VTOI(ap->a_dvp);
1341         ip = VTOI(ap->a_vp);
1342
1343         if (dip->obj_localization != ip->obj_localization)
1344                 return(EXDEV);
1345
1346         if (dip->flags & HAMMER_INODE_RO)
1347                 return (EROFS);
1348         if (ip->flags & HAMMER_INODE_RO)
1349                 return (EROFS);
1350         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1351                 return (error);
1352
1353         /*
1354          * Create a transaction to cover the operations we perform.
1355          */
1356         hammer_start_transaction(&trans, dip->hmp);
1357         ++hammer_stats_file_iopsw;
1358
1359         /*
1360          * Add the filesystem object to the directory.  Note that neither
1361          * dip nor ip are referenced or locked, but their vnodes are
1362          * referenced.  This function will bump the inode's link count.
1363          */
1364         error = hammer_ip_add_directory(&trans, dip,
1365                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1366                                         ip);
1367
1368         /*
1369          * Finish up.
1370          */
1371         if (error == 0) {
1372                 cache_setunresolved(nch);
1373                 cache_setvp(nch, ap->a_vp);
1374         }
1375         hammer_done_transaction(&trans);
1376         hammer_knote(ap->a_vp, NOTE_LINK);
1377         hammer_knote(ap->a_dvp, NOTE_WRITE);
1378         return (error);
1379 }
1380
1381 /*
1382  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1383  *
1384  * The operating system has already ensured that the directory entry
1385  * does not exist and done all appropriate namespace locking.
1386  */
1387 static
1388 int
1389 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1390 {
1391         struct hammer_transaction trans;
1392         struct hammer_inode *dip;
1393         struct hammer_inode *nip;
1394         struct nchandle *nch;
1395         int error;
1396
1397         nch = ap->a_nch;
1398         dip = VTOI(ap->a_dvp);
1399
1400         if (dip->flags & HAMMER_INODE_RO)
1401                 return (EROFS);
1402         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1403                 return (error);
1404
1405         /*
1406          * Create a transaction to cover the operations we perform.
1407          */
1408         hammer_start_transaction(&trans, dip->hmp);
1409         ++hammer_stats_file_iopsw;
1410
1411         /*
1412          * Create a new filesystem object of the requested type.  The
1413          * returned inode will be referenced but not locked.
1414          */
1415         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1416                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1417                                     NULL, &nip);
1418         if (error) {
1419                 hkprintf("hammer_mkdir error %d\n", error);
1420                 hammer_done_transaction(&trans);
1421                 *ap->a_vpp = NULL;
1422                 return (error);
1423         }
1424         /*
1425          * Add the new filesystem object to the directory.  This will also
1426          * bump the inode's link count.
1427          */
1428         error = hammer_ip_add_directory(&trans, dip,
1429                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1430                                         nip);
1431         if (error)
1432                 hkprintf("hammer_mkdir (add) error %d\n", error);
1433
1434         /*
1435          * Finish up.
1436          */
1437         if (error) {
1438                 hammer_rel_inode(nip, 0);
1439                 *ap->a_vpp = NULL;
1440         } else {
1441                 error = hammer_get_vnode(nip, ap->a_vpp);
1442                 hammer_rel_inode(nip, 0);
1443                 if (error == 0) {
1444                         cache_setunresolved(ap->a_nch);
1445                         cache_setvp(ap->a_nch, *ap->a_vpp);
1446                 }
1447         }
1448         hammer_done_transaction(&trans);
1449         if (error == 0)
1450                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1451         return (error);
1452 }
1453
1454 /*
1455  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1456  *
1457  * The operating system has already ensured that the directory entry
1458  * does not exist and done all appropriate namespace locking.
1459  */
1460 static
1461 int
1462 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1463 {
1464         struct hammer_transaction trans;
1465         struct hammer_inode *dip;
1466         struct hammer_inode *nip;
1467         struct nchandle *nch;
1468         int error;
1469
1470         nch = ap->a_nch;
1471         dip = VTOI(ap->a_dvp);
1472
1473         if (dip->flags & HAMMER_INODE_RO)
1474                 return (EROFS);
1475         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1476                 return (error);
1477
1478         /*
1479          * Create a transaction to cover the operations we perform.
1480          */
1481         hammer_start_transaction(&trans, dip->hmp);
1482         ++hammer_stats_file_iopsw;
1483
1484         /*
1485          * Create a new filesystem object of the requested type.  The
1486          * returned inode will be referenced but not locked.
1487          *
1488          * If mknod specifies a directory a pseudo-fs is created.
1489          */
1490         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1491                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
1492                                     NULL, &nip);
1493         if (error) {
1494                 hammer_done_transaction(&trans);
1495                 *ap->a_vpp = NULL;
1496                 return (error);
1497         }
1498
1499         /*
1500          * Add the new filesystem object to the directory.  This will also
1501          * bump the inode's link count.
1502          */
1503         error = hammer_ip_add_directory(&trans, dip,
1504                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1505                                         nip);
1506
1507         /*
1508          * Finish up.
1509          */
1510         if (error) {
1511                 hammer_rel_inode(nip, 0);
1512                 *ap->a_vpp = NULL;
1513         } else {
1514                 error = hammer_get_vnode(nip, ap->a_vpp);
1515                 hammer_rel_inode(nip, 0);
1516                 if (error == 0) {
1517                         cache_setunresolved(ap->a_nch);
1518                         cache_setvp(ap->a_nch, *ap->a_vpp);
1519                 }
1520         }
1521         hammer_done_transaction(&trans);
1522         if (error == 0)
1523                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1524         return (error);
1525 }
1526
1527 /*
1528  * hammer_vop_open { vp, mode, cred, fp }
1529  */
1530 static
1531 int
1532 hammer_vop_open(struct vop_open_args *ap)
1533 {
1534         hammer_inode_t ip;
1535
1536         ++hammer_stats_file_iopsr;
1537         ip = VTOI(ap->a_vp);
1538
1539         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1540                 return (EROFS);
1541         return(vop_stdopen(ap));
1542 }
1543
/*
 * hammer_vop_print { vp }
 *
 * Not implemented: the argument is ignored and EOPNOTSUPP is always
 * returned.
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return (EOPNOTSUPP);
}
1553
1554 /*
1555  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1556  */
1557 static
1558 int
1559 hammer_vop_readdir(struct vop_readdir_args *ap)
1560 {
1561         struct hammer_transaction trans;
1562         struct hammer_cursor cursor;
1563         struct hammer_inode *ip;
1564         struct uio *uio;
1565         hammer_base_elm_t base;
1566         int error;
1567         int cookie_index;
1568         int ncookies;
1569         off_t *cookies;
1570         off_t saveoff;
1571         int r;
1572         int dtype;
1573
1574         ++hammer_stats_file_iopsr;
1575         ip = VTOI(ap->a_vp);
1576         uio = ap->a_uio;
1577         saveoff = uio->uio_offset;
1578
1579         if (ap->a_ncookies) {
1580                 ncookies = uio->uio_resid / 16 + 1;
1581                 if (ncookies > 1024)
1582                         ncookies = 1024;
1583                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1584                 cookie_index = 0;
1585         } else {
1586                 ncookies = -1;
1587                 cookies = NULL;
1588                 cookie_index = 0;
1589         }
1590
1591         hammer_simple_transaction(&trans, ip->hmp);
1592
1593         /*
1594          * Handle artificial entries
1595          *
1596          * It should be noted that the minimum value for a directory
1597          * hash key on-media is 0x0000000100000000, so we can use anything
1598          * less then that to represent our 'special' key space.
1599          */
1600         error = 0;
1601         if (saveoff == 0) {
1602                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1603                 if (r)
1604                         goto done;
1605                 if (cookies)
1606                         cookies[cookie_index] = saveoff;
1607                 ++saveoff;
1608                 ++cookie_index;
1609                 if (cookie_index == ncookies)
1610                         goto done;
1611         }
1612         if (saveoff == 1) {
1613                 if (ip->ino_data.parent_obj_id) {
1614                         r = vop_write_dirent(&error, uio,
1615                                              ip->ino_data.parent_obj_id,
1616                                              DT_DIR, 2, "..");
1617                 } else {
1618                         r = vop_write_dirent(&error, uio,
1619                                              ip->obj_id, DT_DIR, 2, "..");
1620                 }
1621                 if (r)
1622                         goto done;
1623                 if (cookies)
1624                         cookies[cookie_index] = saveoff;
1625                 ++saveoff;
1626                 ++cookie_index;
1627                 if (cookie_index == ncookies)
1628                         goto done;
1629         }
1630
1631         /*
1632          * Key range (begin and end inclusive) to scan.  Directory keys
1633          * directly translate to a 64 bit 'seek' position.
1634          */
1635         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1636         cursor.key_beg.localization = ip->obj_localization +
1637                                       hammer_dir_localization(ip);
1638         cursor.key_beg.obj_id = ip->obj_id;
1639         cursor.key_beg.create_tid = 0;
1640         cursor.key_beg.delete_tid = 0;
1641         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1642         cursor.key_beg.obj_type = 0;
1643         cursor.key_beg.key = saveoff;
1644
1645         cursor.key_end = cursor.key_beg;
1646         cursor.key_end.key = HAMMER_MAX_KEY;
1647         cursor.asof = ip->obj_asof;
1648         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1649
1650         error = hammer_ip_first(&cursor);
1651
1652         while (error == 0) {
1653                 error = hammer_ip_resolve_data(&cursor);
1654                 if (error)
1655                         break;
1656                 base = &cursor.leaf->base;
1657                 saveoff = base->key;
1658                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1659
1660                 if (base->obj_id != ip->obj_id)
1661                         panic("readdir: bad record at %p", cursor.node);
1662
1663                 /*
1664                  * Convert pseudo-filesystems into softlinks
1665                  */
1666                 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1667                 r = vop_write_dirent(
1668                              &error, uio, cursor.data->entry.obj_id,
1669                              dtype,
1670                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1671                              (void *)cursor.data->entry.name);
1672                 if (r)
1673                         break;
1674                 ++saveoff;
1675                 if (cookies)
1676                         cookies[cookie_index] = base->key;
1677                 ++cookie_index;
1678                 if (cookie_index == ncookies)
1679                         break;
1680                 error = hammer_ip_next(&cursor);
1681         }
1682         hammer_done_cursor(&cursor);
1683
1684 done:
1685         hammer_done_transaction(&trans);
1686
1687         if (ap->a_eofflag)
1688                 *ap->a_eofflag = (error == ENOENT);
1689         uio->uio_offset = saveoff;
1690         if (error && cookie_index == 0) {
1691                 if (error == ENOENT)
1692                         error = 0;
1693                 if (cookies) {
1694                         kfree(cookies, M_TEMP);
1695                         *ap->a_ncookies = 0;
1696                         *ap->a_cookies = NULL;
1697                 }
1698         } else {
1699                 if (error == ENOENT)
1700                         error = 0;
1701                 if (cookies) {
1702                         *ap->a_ncookies = cookie_index;
1703                         *ap->a_cookies = cookies;
1704                 }
1705         }
1706         return(error);
1707 }
1708
1709 /*
1710  * hammer_vop_readlink { vp, uio, cred }
1711  */
1712 static
1713 int
1714 hammer_vop_readlink(struct vop_readlink_args *ap)
1715 {
1716         struct hammer_transaction trans;
1717         struct hammer_cursor cursor;
1718         struct hammer_inode *ip;
1719         char buf[32];
1720         u_int32_t localization;
1721         hammer_pseudofs_inmem_t pfsm;
1722         int error;
1723
1724         ip = VTOI(ap->a_vp);
1725
1726         /*
1727          * Shortcut if the symlink data was stuffed into ino_data.
1728          *
1729          * Also expand special "@@PFS%05d" softlinks (expansion only
1730          * occurs for non-historical (current) accesses made from the
1731          * primary filesystem).
1732          */
1733         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1734                 char *ptr;
1735                 int bytes;
1736
1737                 ptr = ip->ino_data.ext.symlink;
1738                 bytes = (int)ip->ino_data.size;
1739                 if (bytes == 10 &&
1740                     ip->obj_asof == HAMMER_MAX_TID &&
1741                     ip->obj_localization == 0 &&
1742                     strncmp(ptr, "@@PFS", 5) == 0) {
1743                         hammer_simple_transaction(&trans, ip->hmp);
1744                         bcopy(ptr + 5, buf, 5);
1745                         buf[5] = 0;
1746                         localization = strtoul(buf, NULL, 10) << 16;
1747                         pfsm = hammer_load_pseudofs(&trans, localization,
1748                                                     &error);
1749                         if (error == 0) {
1750                                 if (pfsm->pfsd.mirror_flags &
1751                                     HAMMER_PFSD_SLAVE) {
1752                                         /* vap->va_size == 26 */
1753                                         ksnprintf(buf, sizeof(buf),
1754                                                   "@@0x%016llx:%05d",
1755                                                   (long long)pfsm->pfsd.sync_end_tid,
1756                                                   localization >> 16);
1757                                 } else {
1758                                         /* vap->va_size == 10 */
1759                                         ksnprintf(buf, sizeof(buf),
1760                                                   "@@-1:%05d",
1761                                                   localization >> 16);
1762 #if 0
1763                                         ksnprintf(buf, sizeof(buf),
1764                                                   "@@0x%016llx:%05d",
1765                                                   (long long)HAMMER_MAX_TID,
1766                                                   localization >> 16);
1767 #endif
1768                                 }
1769                                 ptr = buf;
1770                                 bytes = strlen(buf);
1771                         }
1772                         if (pfsm)
1773                                 hammer_rel_pseudofs(trans.hmp, pfsm);
1774                         hammer_done_transaction(&trans);
1775                 }
1776                 error = uiomove(ptr, bytes, ap->a_uio);
1777                 return(error);
1778         }
1779
1780         /*
1781          * Long version
1782          */
1783         hammer_simple_transaction(&trans, ip->hmp);
1784         ++hammer_stats_file_iopsr;
1785         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1786
1787         /*
1788          * Key range (begin and end inclusive) to scan.  Directory keys
1789          * directly translate to a 64 bit 'seek' position.
1790          */
1791         cursor.key_beg.localization = ip->obj_localization +
1792                                       HAMMER_LOCALIZE_MISC;
1793         cursor.key_beg.obj_id = ip->obj_id;
1794         cursor.key_beg.create_tid = 0;
1795         cursor.key_beg.delete_tid = 0;
1796         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1797         cursor.key_beg.obj_type = 0;
1798         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1799         cursor.asof = ip->obj_asof;
1800         cursor.flags |= HAMMER_CURSOR_ASOF;
1801
1802         error = hammer_ip_lookup(&cursor);
1803         if (error == 0) {
1804                 error = hammer_ip_resolve_data(&cursor);
1805                 if (error == 0) {
1806                         KKASSERT(cursor.leaf->data_len >=
1807                                  HAMMER_SYMLINK_NAME_OFF);
1808                         error = uiomove(cursor.data->symlink.name,
1809                                         cursor.leaf->data_len -
1810                                                 HAMMER_SYMLINK_NAME_OFF,
1811                                         ap->a_uio);
1812                 }
1813         }
1814         hammer_done_cursor(&cursor);
1815         hammer_done_transaction(&trans);
1816         return(error);
1817 }
1818
1819 /*
1820  * hammer_vop_nremove { nch, dvp, cred }
1821  */
1822 static
1823 int
1824 hammer_vop_nremove(struct vop_nremove_args *ap)
1825 {
1826         struct hammer_transaction trans;
1827         struct hammer_inode *dip;
1828         int error;
1829
1830         dip = VTOI(ap->a_dvp);
1831
1832         if (hammer_nohistory(dip) == 0 &&
1833             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1834                 return (error);
1835         }
1836
1837         hammer_start_transaction(&trans, dip->hmp);
1838         ++hammer_stats_file_iopsw;
1839         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1840         hammer_done_transaction(&trans);
1841         if (error == 0)
1842                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1843         return (error);
1844 }
1845
1846 /*
1847  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1848  */
1849 static
1850 int
1851 hammer_vop_nrename(struct vop_nrename_args *ap)
1852 {
1853         struct hammer_transaction trans;
1854         struct namecache *fncp;
1855         struct namecache *tncp;
1856         struct hammer_inode *fdip;
1857         struct hammer_inode *tdip;
1858         struct hammer_inode *ip;
1859         struct hammer_cursor cursor;
1860         int64_t namekey;
1861         u_int32_t max_iterations;
1862         int nlen, error;
1863
1864         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 
1865                 return(EXDEV);
1866         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1867                 return(EXDEV);
1868
1869         fdip = VTOI(ap->a_fdvp);
1870         tdip = VTOI(ap->a_tdvp);
1871         fncp = ap->a_fnch->ncp;
1872         tncp = ap->a_tnch->ncp;
1873         ip = VTOI(fncp->nc_vp);
1874         KKASSERT(ip != NULL);
1875
1876         if (fdip->obj_localization != tdip->obj_localization)
1877                 return(EXDEV);
1878         if (fdip->obj_localization != ip->obj_localization)
1879                 return(EXDEV);
1880
1881         if (fdip->flags & HAMMER_INODE_RO)
1882                 return (EROFS);
1883         if (tdip->flags & HAMMER_INODE_RO)
1884                 return (EROFS);
1885         if (ip->flags & HAMMER_INODE_RO)
1886                 return (EROFS);
1887         if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1888                 return (error);
1889
1890         hammer_start_transaction(&trans, fdip->hmp);
1891         ++hammer_stats_file_iopsw;
1892
1893         /*
1894          * Remove tncp from the target directory and then link ip as
1895          * tncp. XXX pass trans to dounlink
1896          *
1897          * Force the inode sync-time to match the transaction so it is
1898          * in-sync with the creation of the target directory entry.
1899          */
1900         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1901                                 ap->a_cred, 0, -1);
1902         if (error == 0 || error == ENOENT) {
1903                 error = hammer_ip_add_directory(&trans, tdip,
1904                                                 tncp->nc_name, tncp->nc_nlen,
1905                                                 ip);
1906                 if (error == 0) {
1907                         ip->ino_data.parent_obj_id = tdip->obj_id;
1908                         ip->ino_data.ctime = trans.time;
1909                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
1910                 }
1911         }
1912         if (error)
1913                 goto failed; /* XXX */
1914
1915         /*
1916          * Locate the record in the originating directory and remove it.
1917          *
1918          * Calculate the namekey and setup the key range for the scan.  This
1919          * works kinda like a chained hash table where the lower 32 bits
1920          * of the namekey synthesize the chain.
1921          *
1922          * The key range is inclusive of both key_beg and key_end.
1923          */
1924         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1925                                            &max_iterations);
1926 retry:
1927         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1928         cursor.key_beg.localization = fdip->obj_localization +
1929                                       hammer_dir_localization(fdip);
1930         cursor.key_beg.obj_id = fdip->obj_id;
1931         cursor.key_beg.key = namekey;
1932         cursor.key_beg.create_tid = 0;
1933         cursor.key_beg.delete_tid = 0;
1934         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1935         cursor.key_beg.obj_type = 0;
1936
1937         cursor.key_end = cursor.key_beg;
1938         cursor.key_end.key += max_iterations;
1939         cursor.asof = fdip->obj_asof;
1940         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1941
1942         /*
1943          * Scan all matching records (the chain), locate the one matching
1944          * the requested path component.
1945          *
1946          * The hammer_ip_*() functions merge in-memory records with on-disk
1947          * records for the purposes of the search.
1948          */
1949         error = hammer_ip_first(&cursor);
1950         while (error == 0) {
1951                 if (hammer_ip_resolve_data(&cursor) != 0)
1952                         break;
1953                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1954                 KKASSERT(nlen > 0);
1955                 if (fncp->nc_nlen == nlen &&
1956                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1957                         break;
1958                 }
1959                 error = hammer_ip_next(&cursor);
1960         }
1961
1962         /*
1963          * If all is ok we have to get the inode so we can adjust nlinks.
1964          *
1965          * WARNING: hammer_ip_del_directory() may have to terminate the
1966          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1967          * twice.
1968          */
1969         if (error == 0)
1970                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1971
1972         /*
1973          * XXX A deadlock here will break rename's atomicy for the purposes
1974          * of crash recovery.
1975          */
1976         if (error == EDEADLK) {
1977                 hammer_done_cursor(&cursor);
1978                 goto retry;
1979         }
1980
1981         /*
1982          * Cleanup and tell the kernel that the rename succeeded.
1983          */
1984         hammer_done_cursor(&cursor);
1985         if (error == 0) {
1986                 cache_rename(ap->a_fnch, ap->a_tnch);
1987                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
1988                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
1989                 if (ip->vp)
1990                         hammer_knote(ip->vp, NOTE_RENAME);
1991         }
1992
1993 failed:
1994         hammer_done_transaction(&trans);
1995         return (error);
1996 }
1997
1998 /*
1999  * hammer_vop_nrmdir { nch, dvp, cred }
2000  */
2001 static
2002 int
2003 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
2004 {
2005         struct hammer_transaction trans;
2006         struct hammer_inode *dip;
2007         int error;
2008
2009         dip = VTOI(ap->a_dvp);
2010
2011         if (hammer_nohistory(dip) == 0 &&
2012             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2013                 return (error);
2014         }
2015
2016         hammer_start_transaction(&trans, dip->hmp);
2017         ++hammer_stats_file_iopsw;
2018         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
2019         hammer_done_transaction(&trans);
2020         if (error == 0)
2021                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2022         return (error);
2023 }
2024
2025 /*
2026  * hammer_vop_markatime { vp, cred }
2027  */
2028 static
2029 int
2030 hammer_vop_markatime(struct vop_markatime_args *ap)
2031 {
2032         struct hammer_transaction trans;
2033         struct hammer_inode *ip;
2034
2035         ip = VTOI(ap->a_vp);
2036         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2037                 return (EROFS);
2038         if (ip->flags & HAMMER_INODE_RO)
2039                 return (EROFS);
2040         if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
2041                 return (0);
2042         hammer_start_transaction(&trans, ip->hmp);
2043         ++hammer_stats_file_iopsw;
2044
2045         ip->ino_data.atime = trans.time;
2046         hammer_modify_inode(&trans, ip, HAMMER_INODE_ATIME);
2047         hammer_done_transaction(&trans);
2048         hammer_knote(ap->a_vp, NOTE_ATTRIB);
2049         return (0);
2050 }
2051
2052 /*
2053  * hammer_vop_setattr { vp, vap, cred }
2054  */
2055 static
2056 int
2057 hammer_vop_setattr(struct vop_setattr_args *ap)
2058 {
2059         struct hammer_transaction trans;
2060         struct vattr *vap;
2061         struct hammer_inode *ip;
2062         int modflags;
2063         int error;
2064         int truncating;
2065         int blksize;
2066         int kflags;
2067 #if 0
2068         int64_t aligned_size;
2069 #endif
2070         u_int32_t flags;
2071
2072         vap = ap->a_vap;
2073         ip = ap->a_vp->v_data;
2074         modflags = 0;
2075         kflags = 0;
2076
2077         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
2078                 return(EROFS);
2079         if (ip->flags & HAMMER_INODE_RO)
2080                 return (EROFS);
2081         if (hammer_nohistory(ip) == 0 &&
2082             (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
2083                 return (error);
2084         }
2085
2086         hammer_start_transaction(&trans, ip->hmp);
2087         ++hammer_stats_file_iopsw;
2088         error = 0;
2089
2090         if (vap->va_flags != VNOVAL) {
2091                 flags = ip->ino_data.uflags;
2092                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
2093                                          hammer_to_unix_xid(&ip->ino_data.uid),
2094                                          ap->a_cred);
2095                 if (error == 0) {
2096                         if (ip->ino_data.uflags != flags) {
2097                                 ip->ino_data.uflags = flags;
2098                                 ip->ino_data.ctime = trans.time;
2099                                 modflags |= HAMMER_INODE_DDIRTY;
2100                                 kflags |= NOTE_ATTRIB;
2101                         }
2102                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2103                                 error = 0;
2104                                 goto done;
2105                         }
2106                 }
2107                 goto done;
2108         }
2109         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
2110                 error = EPERM;
2111                 goto done;
2112         }
2113         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
2114                 mode_t cur_mode = ip->ino_data.mode;
2115                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2116                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2117                 uuid_t uuid_uid;
2118                 uuid_t uuid_gid;
2119
2120                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
2121                                          ap->a_cred,
2122                                          &cur_uid, &cur_gid, &cur_mode);
2123                 if (error == 0) {
2124                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
2125                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
2126                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
2127                                  sizeof(uuid_uid)) ||
2128                             bcmp(&uuid_gid, &ip->ino_data.gid,
2129                                  sizeof(uuid_gid)) ||
2130                             ip->ino_data.mode != cur_mode
2131                         ) {
2132                                 ip->ino_data.uid = uuid_uid;
2133                                 ip->ino_data.gid = uuid_gid;
2134                                 ip->ino_data.mode = cur_mode;
2135                                 ip->ino_data.ctime = trans.time;
2136                                 modflags |= HAMMER_INODE_DDIRTY;
2137                         }
2138                         kflags |= NOTE_ATTRIB;
2139                 }
2140         }
2141         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
2142                 switch(ap->a_vp->v_type) {
2143                 case VREG:
2144                         if (vap->va_size == ip->ino_data.size)
2145                                 break;
2146
2147                         /*
2148                          * Log the operation if in fast-fsync mode or if
2149                          * there are unterminated redo write records present.
2150                          *
2151                          * The second check is needed so the recovery code
2152                          * properly truncates write redos even if nominal
2153                          * REDO operations is turned off due to excessive
2154                          * writes, because the related records might be
2155                          * destroyed and never lay down a TERM_WRITE.
2156                          */
2157                         if ((ip->flags & HAMMER_INODE_REDO) ||
2158                             (ip->flags & HAMMER_INODE_RDIRTY)) {
2159                                 error = hammer_generate_redo(&trans, ip,
2160                                                              vap->va_size,
2161                                                              HAMMER_REDO_TRUNC,
2162                                                              NULL, 0);
2163                         }
2164                         blksize = hammer_blocksize(vap->va_size);
2165
2166                         /*
2167                          * XXX break atomicy, we can deadlock the backend
2168                          * if we do not release the lock.  Probably not a
2169                          * big deal here.
2170                          */
2171                         if (vap->va_size < ip->ino_data.size) {
2172                                 nvtruncbuf(ap->a_vp, vap->va_size,
2173                                            blksize,
2174                                            hammer_blockoff(vap->va_size));
2175                                 truncating = 1;
2176                                 kflags |= NOTE_WRITE;
2177                         } else {
2178                                 nvextendbuf(ap->a_vp,
2179                                             ip->ino_data.size,
2180                                             vap->va_size,
2181                                             hammer_blocksize(ip->ino_data.size),
2182                                             hammer_blocksize(vap->va_size),
2183                                             hammer_blockoff(ip->ino_data.size),
2184                                             hammer_blockoff(vap->va_size),
2185                                             0);
2186                                 truncating = 0;
2187                                 kflags |= NOTE_WRITE | NOTE_EXTEND;
2188                         }
2189                         ip->ino_data.size = vap->va_size;
2190                         ip->ino_data.mtime = trans.time;
2191                         /* XXX safe to use SDIRTY instead of DDIRTY here? */
2192                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2193
2194                         /*
2195                          * On-media truncation is cached in the inode until
2196                          * the inode is synchronized.  We must immediately
2197                          * handle any frontend records.
2198                          */
2199                         if (truncating) {
2200                                 hammer_ip_frontend_trunc(ip, vap->va_size);
2201 #ifdef DEBUG_TRUNCATE
2202                                 if (HammerTruncIp == NULL)
2203                                         HammerTruncIp = ip;
2204 #endif
2205                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2206                                         ip->flags |= HAMMER_INODE_TRUNCATED;
2207                                         ip->trunc_off = vap->va_size;
2208 #ifdef DEBUG_TRUNCATE
2209                                         if (ip == HammerTruncIp)
2210                                         kprintf("truncate1 %016llx\n",
2211                                                 (long long)ip->trunc_off);
2212 #endif
2213                                 } else if (ip->trunc_off > vap->va_size) {
2214                                         ip->trunc_off = vap->va_size;
2215 #ifdef DEBUG_TRUNCATE
2216                                         if (ip == HammerTruncIp)
2217                                         kprintf("truncate2 %016llx\n",
2218                                                 (long long)ip->trunc_off);
2219 #endif
2220                                 } else {
2221 #ifdef DEBUG_TRUNCATE
2222                                         if (ip == HammerTruncIp)
2223                                         kprintf("truncate3 %016llx (ignored)\n",
2224                                                 (long long)vap->va_size);
2225 #endif
2226                                 }
2227                         }
2228
2229 #if 0
2230                         /*
2231                          * When truncating, nvtruncbuf() may have cleaned out
2232                          * a portion of the last block on-disk in the buffer
2233                          * cache.  We must clean out any frontend records
2234                          * for blocks beyond the new last block.
2235                          */
2236                         aligned_size = (vap->va_size + (blksize - 1)) &
2237                                        ~(int64_t)(blksize - 1);
2238                         if (truncating && vap->va_size < aligned_size) {
2239                                 aligned_size -= blksize;
2240                                 hammer_ip_frontend_trunc(ip, aligned_size);
2241                         }
2242 #endif
2243                         break;
2244                 case VDATABASE:
2245                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
2246                                 ip->flags |= HAMMER_INODE_TRUNCATED;
2247                                 ip->trunc_off = vap->va_size;
2248                         } else if (ip->trunc_off > vap->va_size) {
2249                                 ip->trunc_off = vap->va_size;
2250                         }
2251                         hammer_ip_frontend_trunc(ip, vap->va_size);
2252                         ip->ino_data.size = vap->va_size;
2253                         ip->ino_data.mtime = trans.time;
2254                         modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
2255                         kflags |= NOTE_ATTRIB;
2256                         break;
2257                 default:
2258                         error = EINVAL;
2259                         goto done;
2260                 }
2261                 break;
2262         }
2263         if (vap->va_atime.tv_sec != VNOVAL) {
2264                 ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
2265                 modflags |= HAMMER_INODE_ATIME;
2266                 kflags |= NOTE_ATTRIB;
2267         }
2268         if (vap->va_mtime.tv_sec != VNOVAL) {
2269                 ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
2270                 modflags |= HAMMER_INODE_MTIME;
2271                 kflags |= NOTE_ATTRIB;
2272         }
2273         if (vap->va_mode != (mode_t)VNOVAL) {
2274                 mode_t   cur_mode = ip->ino_data.mode;
2275                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
2276                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
2277
2278                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
2279                                          cur_uid, cur_gid, &cur_mode);
2280                 if (error == 0 && ip->ino_data.mode != cur_mode) {
2281                         ip->ino_data.mode = cur_mode;
2282                         ip->ino_data.ctime = trans.time;
2283                         modflags |= HAMMER_INODE_DDIRTY;
2284                         kflags |= NOTE_ATTRIB;
2285                 }
2286         }
2287 done:
2288         if (error == 0)
2289                 hammer_modify_inode(&trans, ip, modflags);
2290         hammer_done_transaction(&trans);
2291         hammer_knote(ap->a_vp, kflags);
2292         return (error);
2293 }
2294
2295 /*
2296  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
2297  */
2298 static
2299 int
2300 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
2301 {
2302         struct hammer_transaction trans;
2303         struct hammer_inode *dip;
2304         struct hammer_inode *nip;
2305         struct nchandle *nch;
2306         hammer_record_t record;
2307         int error;
2308         int bytes;
2309
2310         ap->a_vap->va_type = VLNK;
2311
2312         nch = ap->a_nch;
2313         dip = VTOI(ap->a_dvp);
2314
2315         if (dip->flags & HAMMER_INODE_RO)
2316                 return (EROFS);
2317         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
2318                 return (error);
2319
2320         /*
2321          * Create a transaction to cover the operations we perform.
2322          */
2323         hammer_start_transaction(&trans, dip->hmp);
2324         ++hammer_stats_file_iopsw;
2325
2326         /*
2327          * Create a new filesystem object of the requested type.  The
2328          * returned inode will be referenced but not locked.
2329          */
2330
2331         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
2332                                     dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
2333                                     NULL, &nip);
2334         if (error) {
2335                 hammer_done_transaction(&trans);
2336                 *ap->a_vpp = NULL;
2337                 return (error);
2338         }
2339
2340         /*
2341          * Add a record representing the symlink.  symlink stores the link
2342          * as pure data, not a string, and is no \0 terminated.
2343          */
2344         if (error == 0) {
2345                 bytes = strlen(ap->a_target);
2346
2347                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
2348                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
2349                 } else {
2350                         record = hammer_alloc_mem_record(nip, bytes);
2351                         record->type = HAMMER_MEM_RECORD_GENERAL;
2352
2353                         record->leaf.base.localization = nip->obj_localization +
2354                                                          HAMMER_LOCALIZE_MISC;
2355                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
2356                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
2357                         record->leaf.data_len = bytes;
2358                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
2359                         bcopy(ap->a_target, record->data->symlink.name, bytes);
2360                         error = hammer_ip_add_record(&trans, record);
2361                 }
2362
2363                 /*
2364                  * Set the file size to the length of the link.
2365                  */
2366                 if (error == 0) {
2367                         nip->ino_data.size = bytes;
2368                         hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
2369                 }
2370         }
2371         if (error == 0)
2372                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
2373                                                 nch->ncp->nc_nlen, nip);
2374
2375         /*
2376          * Finish up.
2377          */
2378         if (error) {
2379                 hammer_rel_inode(nip, 0);
2380                 *ap->a_vpp = NULL;
2381         } else {
2382                 error = hammer_get_vnode(nip, ap->a_vpp);
2383                 hammer_rel_inode(nip, 0);
2384                 if (error == 0) {
2385                         cache_setunresolved(ap->a_nch);
2386                         cache_setvp(ap->a_nch, *ap->a_vpp);
2387                         hammer_knote(ap->a_dvp, NOTE_WRITE);
2388                 }
2389         }
2390         hammer_done_transaction(&trans);
2391         return (error);
2392 }
2393
2394 /*
2395  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2396  */
2397 static
2398 int
2399 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2400 {
2401         struct hammer_transaction trans;
2402         struct hammer_inode *dip;
2403         int error;
2404
2405         dip = VTOI(ap->a_dvp);
2406
2407         if (hammer_nohistory(dip) == 0 &&
2408             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2409                 return (error);
2410         }
2411
2412         hammer_start_transaction(&trans, dip->hmp);
2413         ++hammer_stats_file_iopsw;
2414         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2415                                 ap->a_cred, ap->a_flags, -1);
2416         hammer_done_transaction(&trans);
2417
2418         return (error);
2419 }
2420
2421 /*
2422  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2423  */
2424 static
2425 int
2426 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2427 {
2428         struct hammer_inode *ip = ap->a_vp->v_data;
2429
2430         ++hammer_stats_file_iopsr;
2431         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2432                             ap->a_fflag, ap->a_cred));
2433 }
2434
2435 static
2436 int
2437 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2438 {
2439         static const struct mountctl_opt extraopt[] = {
2440                 { HMNT_NOHISTORY,       "nohistory" },
2441                 { HMNT_MASTERID,        "master" },
2442                 { 0, NULL}
2443
2444         };
2445         struct hammer_mount *hmp;
2446         struct mount *mp;
2447         int usedbytes;
2448         int error;
2449
2450         error = 0;
2451         usedbytes = 0;
2452         mp = ap->a_head.a_ops->head.vv_mount;
2453         KKASSERT(mp->mnt_data != NULL);
2454         hmp = (struct hammer_mount *)mp->mnt_data;
2455
2456         switch(ap->a_op) {
2457
2458         case MOUNTCTL_SET_EXPORT:
2459                 if (ap->a_ctllen != sizeof(struct export_args))
2460                         error = EINVAL;
2461                 else
2462                         error = hammer_vfs_export(mp, ap->a_op,
2463                                       (const struct export_args *)ap->a_ctl);
2464                 break;
2465         case MOUNTCTL_MOUNTFLAGS:
2466         {
2467                 /*
2468                  * Call standard mountctl VOP function
2469                  * so we get user mount flags.
2470                  */
2471                 error = vop_stdmountctl(ap);
2472                 if (error)
2473                         break;
2474
2475                 usedbytes = *ap->a_res;
2476
2477                 if (usedbytes > 0 && usedbytes < ap->a_buflen) {
2478                         usedbytes += vfs_flagstostr(hmp->hflags, extraopt, ap->a_buf,
2479                                                     ap->a_buflen - usedbytes,
2480                                                     &error);
2481                 }
2482
2483                 *ap->a_res += usedbytes;
2484                 break;
2485         }
2486         default:
2487                 error = vop_stdmountctl(ap);
2488                 break;
2489         }
2490         return(error);
2491 }
2492
2493 /*
2494  * hammer_vop_strategy { vp, bio }
2495  *
2496  * Strategy call, used for regular file read & write only.  Note that the
2497  * bp may represent a cluster.
2498  *
2499  * To simplify operation and allow better optimizations in the future,
2500  * this code does not make any assumptions with regards to buffer alignment
2501  * or size.
2502  */
2503 static
2504 int
2505 hammer_vop_strategy(struct vop_strategy_args *ap)
2506 {
2507         struct buf *bp;
2508         int error;
2509
2510         bp = ap->a_bio->bio_buf;
2511
2512         switch(bp->b_cmd) {
2513         case BUF_CMD_READ:
2514                 error = hammer_vop_strategy_read(ap);
2515                 break;
2516         case BUF_CMD_WRITE:
2517                 error = hammer_vop_strategy_write(ap);
2518                 break;
2519         default:
2520                 bp->b_error = error = EINVAL;
2521                 bp->b_flags |= B_ERROR;
2522                 biodone(ap->a_bio);
2523                 break;
2524         }
2525         return (error);
2526 }
2527
2528 /*
2529  * Read from a regular file.  Iterate the related records and fill in the
2530  * BIO/BUF.  Gaps are zero-filled.
2531  *
2532  * The support code in hammer_object.c should be used to deal with mixed
2533  * in-memory and on-disk records.
2534  *
2535  * NOTE: Can be called from the cluster code with an oversized buf.
2536  *
2537  * XXX atime update
2538  */
2539 static
2540 int
2541 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2542 {
2543         struct hammer_transaction trans;
2544         struct hammer_inode *ip;
2545         struct hammer_inode *dip;
2546         struct hammer_cursor cursor;
2547         hammer_base_elm_t base;
2548         hammer_off_t disk_offset;
2549         struct bio *bio;
2550         struct bio *nbio;
2551         struct buf *bp;
2552         int64_t rec_offset;
2553         int64_t ran_end;
2554         int64_t tmp64;
2555         int error;
2556         int boff;
2557         int roff;
2558         int n;
2559
2560         bio = ap->a_bio;
2561         bp = bio->bio_buf;
2562         ip = ap->a_vp->v_data;
2563
2564         /*
2565          * The zone-2 disk offset may have been set by the cluster code via
2566          * a BMAP operation, or else should be NOOFFSET.
2567          *
2568          * Checking the high bits for a match against zone-2 should suffice.
2569          */
2570         nbio = push_bio(bio);
2571         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2572             HAMMER_ZONE_LARGE_DATA) {
2573                 error = hammer_io_direct_read(ip->hmp, nbio, NULL);
2574                 return (error);
2575         }
2576
2577         /*
2578          * Well, that sucked.  Do it the hard way.  If all the stars are
2579          * aligned we may still be able to issue a direct-read.
2580          */
2581         hammer_simple_transaction(&trans, ip->hmp);
2582         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2583
2584         /*
2585          * Key range (begin and end inclusive) to scan.  Note that the key's
2586          * stored in the actual records represent BASE+LEN, not BASE.  The
2587          * first record containing bio_offset will have a key > bio_offset.
2588          */
2589         cursor.key_beg.localization = ip->obj_localization +
2590                                       HAMMER_LOCALIZE_MISC;
2591         cursor.key_beg.obj_id = ip->obj_id;
2592         cursor.key_beg.create_tid = 0;
2593         cursor.key_beg.delete_tid = 0;
2594         cursor.key_beg.obj_type = 0;
2595         cursor.key_beg.key = bio->bio_offset + 1;
2596         cursor.asof = ip->obj_asof;
2597         cursor.flags |= HAMMER_CURSOR_ASOF;
2598
2599         cursor.key_end = cursor.key_beg;
2600         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2601 #if 0
2602         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2603                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2604                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2605                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2606         } else
2607 #endif
2608         {
2609                 ran_end = bio->bio_offset + bp->b_bufsize;
2610                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2611                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2612                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2613                 if (tmp64 < ran_end)
2614                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2615                 else
2616                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2617         }
2618         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2619
2620         error = hammer_ip_first(&cursor);
2621         boff = 0;
2622
2623         while (error == 0) {
2624                 /*
2625                  * Get the base file offset of the record.  The key for
2626                  * data records is (base + bytes) rather then (base).
2627                  */
2628                 base = &cursor.leaf->base;
2629                 rec_offset = base->key - cursor.leaf->data_len;
2630
2631                 /*
2632                  * Calculate the gap, if any, and zero-fill it.
2633                  *
2634                  * n is the offset of the start of the record verses our
2635                  * current seek offset in the bio.
2636                  */
2637                 n = (int)(rec_offset - (bio->bio_offset + boff));
2638                 if (n > 0) {
2639                         if (n > bp->b_bufsize - boff)
2640                                 n = bp->b_bufsize - boff;
2641                         bzero((char *)bp->b_data + boff, n);
2642                         boff += n;
2643                         n = 0;
2644                 }
2645
2646                 /*
2647                  * Calculate the data offset in the record and the number
2648                  * of bytes we can copy.
2649                  *
2650                  * There are two degenerate cases.  First, boff may already
2651                  * be at bp->b_bufsize.  Secondly, the data offset within
2652                  * the record may exceed the record's size.
2653                  */
2654                 roff = -n;
2655                 rec_offset += roff;
2656                 n = cursor.leaf->data_len - roff;
2657                 if (n <= 0) {
2658                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2659                         n = 0;
2660                 } else if (n > bp->b_bufsize - boff) {
2661                         n = bp->b_bufsize - boff;
2662                 }
2663
2664                 /*
2665                  * Deal with cached truncations.  This cool bit of code
2666                  * allows truncate()/ftruncate() to avoid having to sync
2667                  * the file.
2668                  *
2669                  * If the frontend is truncated then all backend records are
2670                  * subject to the frontend's truncation.
2671                  *
2672                  * If the backend is truncated then backend records on-disk
2673                  * (but not in-memory) are subject to the backend's
2674                  * truncation.  In-memory records owned by the backend
2675                  * represent data written after the truncation point on the
2676                  * backend and must not be truncated.
2677                  *
2678                  * Truncate operations deal with frontend buffer cache
2679                  * buffers and frontend-owned in-memory records synchronously.
2680                  */
2681                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2682                         if (hammer_cursor_ondisk(&cursor)/* ||
2683                             cursor.iprec->flush_state == HAMMER_FST_FLUSH*/) {
2684                                 if (ip->trunc_off <= rec_offset)
2685                                         n = 0;
2686                                 else if (ip->trunc_off < rec_offset + n)
2687                                         n = (int)(ip->trunc_off - rec_offset);
2688                         }
2689                 }
2690                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2691                         if (hammer_cursor_ondisk(&cursor)) {
2692                                 if (ip->sync_trunc_off <= rec_offset)
2693                                         n = 0;
2694                                 else if (ip->sync_trunc_off < rec_offset + n)
2695                                         n = (int)(ip->sync_trunc_off - rec_offset);
2696                         }
2697                 }
2698
2699                 /*
2700                  * Try to issue a direct read into our bio if possible,
2701                  * otherwise resolve the element data into a hammer_buffer
2702                  * and copy.
2703                  *
2704                  * The buffer on-disk should be zerod past any real
2705                  * truncation point, but may not be for any synthesized
2706                  * truncation point from above.
2707                  */
2708                 disk_offset = cursor.leaf->data_offset + roff;
2709                 if (boff == 0 && n == bp->b_bufsize &&
2710                     hammer_cursor_ondisk(&cursor) &&
2711                     (disk_offset & HAMMER_BUFMASK) == 0) {
2712                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2713                                  HAMMER_ZONE_LARGE_DATA);
2714                         nbio->bio_offset = disk_offset;
2715                         error = hammer_io_direct_read(trans.hmp, nbio,
2716                                                       cursor.leaf);
2717                         goto done;
2718                 } else if (n) {
2719                         error = hammer_ip_resolve_data(&cursor);
2720                         if (error == 0) {
2721                                 bcopy((char *)cursor.data + roff,
2722                                       (char *)bp->b_data + boff, n);
2723                         }
2724                 }
2725                 if (error)
2726                         break;
2727
2728                 /*
2729                  * Iterate until we have filled the request.
2730                  */
2731                 boff += n;
2732                 if (boff == bp->b_bufsize)
2733                         break;
2734                 error = hammer_ip_next(&cursor);
2735         }
2736
2737         /*
2738          * There may have been a gap after the last record
2739          */
2740         if (error == ENOENT)
2741                 error = 0;
2742         if (error == 0 && boff != bp->b_bufsize) {
2743                 KKASSERT(boff < bp->b_bufsize);
2744                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2745                 /* boff = bp->b_bufsize; */
2746         }
2747         bp->b_resid = 0;
2748         bp->b_error = error;
2749         if (error)
2750                 bp->b_flags |= B_ERROR;
2751         biodone(ap->a_bio);
2752
2753 done:
2754         /*
2755          * Cache the b-tree node for the last data read in cache[1].
2756          *
2757          * If we hit the file EOF then also cache the node in the
2758          * governing director's cache[3], it will be used to initialize
2759          * the inode's cache[1] for any inodes looked up via the directory.
2760          *
2761          * This doesn't reduce disk accesses since the B-Tree chain is
2762          * likely cached, but it does reduce cpu overhead when looking
2763          * up file offsets for cpdup/tar/cpio style iterations.
2764          */
2765         if (cursor.node)
2766                 hammer_cache_node(&ip->cache[1], cursor.node);
2767         if (ran_end >= ip->ino_data.size) {
2768                 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2769                                         ip->obj_asof, ip->obj_localization);
2770                 if (dip) {
2771                         hammer_cache_node(&dip->cache[3], cursor.node);
2772                         hammer_rel_inode(dip, 0);
2773                 }
2774         }
2775         hammer_done_cursor(&cursor);
2776         hammer_done_transaction(&trans);
2777         return(error);
2778 }
2779
2780 /*
2781  * BMAP operation - used to support cluster_read() only.
2782  *
2783  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2784  *
2785  * This routine may return EOPNOTSUPP if the opration is not supported for
2786  * the specified offset.  The contents of the pointer arguments do not
2787  * need to be initialized in that case. 
2788  *
2789  * If a disk address is available and properly aligned return 0 with 
2790  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2791  * to the run-length relative to that offset.  Callers may assume that
2792  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2793  * large, so return EOPNOTSUPP if it is not sufficiently large.
2794  */
2795 static
2796 int
2797 hammer_vop_bmap(struct vop_bmap_args *ap)
2798 {
2799         struct hammer_transaction trans;
2800         struct hammer_inode *ip;
2801         struct hammer_cursor cursor;
2802         hammer_base_elm_t base;
2803         int64_t rec_offset;
2804         int64_t ran_end;
2805         int64_t tmp64;
2806         int64_t base_offset;
2807         int64_t base_disk_offset;
2808         int64_t last_offset;
2809         hammer_off_t last_disk_offset;
2810         hammer_off_t disk_offset;
2811         int     rec_len;
2812         int     error;
2813         int     blksize;
2814
2815         ++hammer_stats_file_iopsr;
2816         ip = ap->a_vp->v_data;
2817
2818         /*
2819          * We can only BMAP regular files.  We can't BMAP database files,
2820          * directories, etc.
2821          */
2822         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2823                 return(EOPNOTSUPP);
2824
2825         /*
2826          * bmap is typically called with runp/runb both NULL when used
2827          * for writing.  We do not support BMAP for writing atm.
2828          */
2829         if (ap->a_cmd != BUF_CMD_READ)
2830                 return(EOPNOTSUPP);
2831
2832         /*
2833          * Scan the B-Tree to acquire blockmap addresses, then translate
2834          * to raw addresses.
2835          */
2836         hammer_simple_transaction(&trans, ip->hmp);
2837 #if 0
2838         kprintf("bmap_beg %016llx ip->cache %p\n",
2839                 (long long)ap->a_loffset, ip->cache[1]);
2840 #endif
2841         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2842
2843         /*
2844          * Key range (begin and end inclusive) to scan.  Note that the key's
2845          * stored in the actual records represent BASE+LEN, not BASE.  The
2846          * first record containing bio_offset will have a key > bio_offset.
2847          */
2848         cursor.key_beg.localization = ip->obj_localization +
2849                                       HAMMER_LOCALIZE_MISC;
2850         cursor.key_beg.obj_id = ip->obj_id;
2851         cursor.key_beg.create_tid = 0;
2852         cursor.key_beg.delete_tid = 0;
2853         cursor.key_beg.obj_type = 0;
2854         if (ap->a_runb)
2855                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2856         else
2857                 cursor.key_beg.key = ap->a_loffset + 1;
2858         if (cursor.key_beg.key < 0)
2859                 cursor.key_beg.key = 0;
2860         cursor.asof = ip->obj_asof;
2861         cursor.flags |= HAMMER_CURSOR_ASOF;
2862
2863         cursor.key_end = cursor.key_beg;
2864         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2865
2866         ran_end = ap->a_loffset + MAXPHYS;
2867         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2868         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2869         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2870         if (tmp64 < ran_end)
2871                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2872         else
2873                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2874
2875         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2876
2877         error = hammer_ip_first(&cursor);
2878         base_offset = last_offset = 0;
2879         base_disk_offset = last_disk_offset = 0;
2880
2881         while (error == 0) {
2882                 /*
2883                  * Get the base file offset of the record.  The key for
2884                  * data records is (base + bytes) rather then (base).
2885                  *
2886                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
2887                  * The extra bytes should be zero on-disk and the BMAP op
2888                  * should still be ok.
2889                  */
2890                 base = &cursor.leaf->base;
2891                 rec_offset = base->key - cursor.leaf->data_len;
2892                 rec_len    = cursor.leaf->data_len;
2893
2894                 /*
2895                  * Incorporate any cached truncation.
2896                  *
2897                  * NOTE: Modifications to rec_len based on synthesized
2898                  * truncation points remove the guarantee that any extended
2899                  * data on disk is zero (since the truncations may not have
2900                  * taken place on-media yet).
2901                  */
2902                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2903                         if (hammer_cursor_ondisk(&cursor) ||
2904                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2905                                 if (ip->trunc_off <= rec_offset)
2906                                         rec_len = 0;
2907                                 else if (ip->trunc_off < rec_offset + rec_len)
2908                                         rec_len = (int)(ip->trunc_off - rec_offset);
2909                         }
2910                 }
2911                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2912                         if (hammer_cursor_ondisk(&cursor)) {
2913                                 if (ip->sync_trunc_off <= rec_offset)
2914                                         rec_len = 0;
2915                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2916                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2917                         }
2918                 }
2919
2920                 /*
2921                  * Accumulate information.  If we have hit a discontiguous
2922                  * block reset base_offset unless we are already beyond the
2923                  * requested offset.  If we are, that's it, we stop.
2924                  */
2925                 if (error)
2926                         break;
2927                 if (hammer_cursor_ondisk(&cursor)) {
2928                         disk_offset = cursor.leaf->data_offset;
2929                         if (rec_offset != last_offset ||
2930                             disk_offset != last_disk_offset) {
2931                                 if (rec_offset > ap->a_loffset)
2932                                         break;
2933                                 base_offset = rec_offset;
2934                                 base_disk_offset = disk_offset;
2935                         }
2936                         last_offset = rec_offset + rec_len;
2937                         last_disk_offset = disk_offset + rec_len;
2938                 }
2939                 error = hammer_ip_next(&cursor);
2940         }
2941
2942 #if 0
2943         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2944                 (long long)ap->a_loffset,
2945                 (long long)base_offset,
2946                 (long long)last_offset);
2947         kprintf("BMAP %16s:  %016llx - %016llx\n", "",
2948                 (long long)base_disk_offset,
2949                 (long long)last_disk_offset);
2950 #endif
2951
2952         if (cursor.node) {
2953                 hammer_cache_node(&ip->cache[1], cursor.node);
2954 #if 0
2955                 kprintf("bmap_end2 %016llx ip->cache %p\n",
2956                         (long long)ap->a_loffset, ip->cache[1]);
2957 #endif
2958         }
2959         hammer_done_cursor(&cursor);
2960         hammer_done_transaction(&trans);
2961
2962         /*
2963          * If we couldn't find any records or the records we did find were
2964          * all behind the requested offset, return failure.  A forward
2965          * truncation can leave a hole w/ no on-disk records.
2966          */
2967         if (last_offset == 0 || last_offset < ap->a_loffset)
2968                 return (EOPNOTSUPP);
2969
2970         /*
2971          * Figure out the block size at the requested offset and adjust
2972          * our limits so the cluster_read() does not create inappropriately
2973          * sized buffer cache buffers.
2974          */
2975         blksize = hammer_blocksize(ap->a_loffset);
2976         if (hammer_blocksize(base_offset) != blksize) {
2977                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2978         }
2979         if (last_offset != ap->a_loffset &&
2980             hammer_blocksize(last_offset - 1) != blksize) {
2981                 last_offset = hammer_blockdemarc(ap->a_loffset,
2982                                                  last_offset - 1);
2983         }
2984
2985         /*
2986          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2987          * from occuring.
2988          */
2989         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2990
2991         if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2992                 /*
2993                  * Only large-data zones can be direct-IOd
2994                  */
2995                 error = EOPNOTSUPP;
2996         } else if ((disk_offset & HAMMER_BUFMASK) ||
2997                    (last_offset - ap->a_loffset) < blksize) {
2998                 /*
2999                  * doffsetp is not aligned or the forward run size does
3000                  * not cover a whole buffer, disallow the direct I/O.
3001                  */
3002                 error = EOPNOTSUPP;
3003         } else {
3004                 /*
3005                  * We're good.
3006                  */
3007                 *ap->a_doffsetp = disk_offset;
3008                 if (ap->a_runb) {
3009                         *ap->a_runb = ap->a_loffset - base_offset;
3010                         KKASSERT(*ap->a_runb >= 0);
3011                 }
3012                 if (ap->a_runp) {
3013                         *ap->a_runp = last_offset - ap->a_loffset;
3014                         KKASSERT(*ap->a_runp >= 0);
3015                 }
3016                 error = 0;
3017         }
3018         return(error);
3019 }
3020
3021 /*
3022  * Write to a regular file.   Because this is a strategy call the OS is
3023  * trying to actually get data onto the media.
3024  */
3025 static
3026 int
3027 hammer_vop_strategy_write(struct vop_strategy_args *ap)
3028 {
3029         hammer_record_t record;
3030         hammer_mount_t hmp;
3031         hammer_inode_t ip;
3032         struct bio *bio;
3033         struct buf *bp;
3034         int blksize;
3035         int bytes;
3036         int error;
3037
3038         bio = ap->a_bio;
3039         bp = bio->bio_buf;
3040         ip = ap->a_vp->v_data;
3041         hmp = ip->hmp;
3042
3043         blksize = hammer_blocksize(bio->bio_offset);
3044         KKASSERT(bp->b_bufsize == blksize);
3045
3046         if (ip->flags & HAMMER_INODE_RO) {
3047                 bp->b_error = EROFS;
3048                 bp->b_flags |= B_ERROR;
3049                 biodone(ap->a_bio);
3050                 return(EROFS);
3051         }
3052
3053         /*
3054          * Interlock with inode destruction (no in-kernel or directory
3055          * topology visibility).  If we queue new IO while trying to
3056          * destroy the inode we can deadlock the vtrunc call in
3057          * hammer_inode_unloadable_check().
3058          *
3059          * Besides, there's no point flushing a bp associated with an
3060          * inode that is being destroyed on-media and has no kernel
3061          * references.
3062          */
3063         if ((ip->flags | ip->sync_flags) &
3064             (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
3065                 bp->b_resid = 0;
3066                 biodone(ap->a_bio);
3067                 return(0);
3068         }
3069
3070         /*
3071          * Reserve space and issue a direct-write from the front-end. 
3072          * NOTE: The direct_io code will hammer_bread/bcopy smaller
3073          * allocations.
3074          *
3075          * An in-memory record will be installed to reference the storage
3076          * until the flusher can get to it.
3077          *
3078          * Since we own the high level bio the front-end will not try to
3079          * do a direct-read until the write completes.
3080          *
3081          * NOTE: The only time we do not reserve a full-sized buffers
3082          * worth of data is if the file is small.  We do not try to
3083          * allocate a fragment (from the small-data zone) at the end of
3084          * an otherwise large file as this can lead to wildly separated
3085          * data.
3086          */
3087         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
3088         KKASSERT(bio->bio_offset < ip->ino_data.size);
3089         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
3090                 bytes = bp->b_bufsize;
3091         else
3092                 bytes = ((int)ip->ino_data.size + 15) & ~15;
3093
3094         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
3095                                     bytes, &error);
3096
3097         /*
3098          * B_VFSFLAG1 indicates that a REDO_WRITE entry was generated
3099          * in hammer_vop_write().  We must flag the record so the proper
3100          * REDO_TERM_WRITE entry is generated during the flush.
3101          */
3102         if (record) {
3103                 if (bp->b_flags & B_VFSFLAG1) {
3104                         record->flags |= HAMMER_RECF_REDO;
3105                         bp->b_flags &= ~B_VFSFLAG1;
3106                 }
3107                 hammer_io_direct_write(hmp, bio, record);
3108                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
3109                         hammer_flush_inode(ip, 0);
3110         } else {
3111                 bp->b_bio2.bio_offset = NOOFFSET;
3112                 bp->b_error = error;
3113                 bp->b_flags |= B_ERROR;
3114                 biodone(ap->a_bio);
3115         }
3116         return(error);
3117 }
3118
3119 /*
3120  * dounlink - disconnect a directory entry
3121  *
3122  * XXX whiteout support not really in yet
3123  */
3124 static int
3125 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
3126                 struct vnode *dvp, struct ucred *cred, 
3127                 int flags, int isdir)
3128 {
3129         struct namecache *ncp;
3130         hammer_inode_t dip;
3131         hammer_inode_t ip;
3132         struct hammer_cursor cursor;
3133         int64_t namekey;
3134         u_int32_t max_iterations;
3135         int nlen, error;
3136
3137         /*
3138          * Calculate the namekey and setup the key range for the scan.  This
3139          * works kinda like a chained hash table where the lower 32 bits
3140          * of the namekey synthesize the chain.
3141          *
3142          * The key range is inclusive of both key_beg and key_end.
3143          */
3144         dip = VTOI(dvp);
3145         ncp = nch->ncp;
3146
3147         if (dip->flags & HAMMER_INODE_RO)
3148                 return (EROFS);
3149
3150         namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
3151                                            &max_iterations);
3152 retry:
3153         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
3154         cursor.key_beg.localization = dip->obj_localization +
3155                                       hammer_dir_localization(dip);
3156         cursor.key_beg.obj_id = dip->obj_id;
3157         cursor.key_beg.key = namekey;
3158         cursor.key_beg.create_tid = 0;
3159         cursor.key_beg.delete_tid = 0;
3160         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
3161         cursor.key_beg.obj_type = 0;
3162
3163         cursor.key_end = cursor.key_beg;
3164         cursor.key_end.key += max_iterations;
3165         cursor.asof = dip->obj_asof;
3166         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
3167
3168         /*
3169          * Scan all matching records (the chain), locate the one matching
3170          * the requested path component.  info->last_error contains the
3171          * error code on search termination and could be 0, ENOENT, or
3172          * something else.
3173          *
3174          * The hammer_ip_*() functions merge in-memory records with on-disk
3175          * records for the purposes of the search.
3176          */
3177         error = hammer_ip_first(&cursor);
3178
3179         while (error == 0) {
3180                 error = hammer_ip_resolve_data(&cursor);
3181                 if (error)
3182                         break;
3183                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
3184                 KKASSERT(nlen > 0);
3185                 if (ncp->nc_nlen == nlen &&
3186                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
3187                         break;
3188                 }
3189                 error = hammer_ip_next(&cursor);
3190         }
3191
3192         /*
3193          * If all is ok we have to get the inode so we can adjust nlinks.
3194          * To avoid a deadlock with the flusher we must release the inode
3195          * lock on the directory when acquiring the inode for the entry.
3196          *
3197          * If the target is a directory, it must be empty.
3198          */
3199         if (error == 0) {
3200                 hammer_unlock(&cursor.ip->lock);
3201                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
3202                                       dip->hmp->asof,
3203                                       cursor.data->entry.localization,
3204                                       0, &error);
3205                 hammer_lock_sh(&cursor.ip->lock);
3206                 if (error == ENOENT) {
3207                         kprintf("HAMMER: WARNING: Removing "
3208                                 "dirent w/missing inode \"%s\"\n"
3209                                 "\tobj_id = %016llx\n",
3210                                 ncp->nc_name,
3211                                 (long long)cursor.data->entry.obj_id);
3212                         error = 0;
3213                 }
3214
3215                 /*
3216                  * If isdir >= 0 we validate that the entry is or is not a
3217                  * directory.  If isdir < 0 we don't care.
3218                  */
3219                 if (error == 0 && isdir >= 0 && ip) {
3220                         if (isdir &&
3221                             ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
3222                                 error = ENOTDIR;
3223                         } else if (isdir == 0 &&
3224                             ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
3225                                 error = EISDIR;
3226                         }
3227                 }
3228
3229                 /*
3230                  * If we are trying to remove a directory the directory must
3231                  * be empty.
3232                  *
3233                  * The check directory code can loop and deadlock/retry.  Our
3234                  * own cursor's node locks must be released to avoid a 3-way
3235                  * deadlock with the flusher if the check directory code
3236                  * blocks.
3237                  *
3238                  * If any changes whatsoever have been made to the cursor
3239                  * set EDEADLK and retry.
3240                  *
3241                  * WARNING: See warnings in hammer_unlock_cursor()
3242                  *          function.
3243                  */
3244                 if (error == 0 && ip && ip->ino_data.obj_type ==
3245                                         HAMMER_OBJTYPE_DIRECTORY) {
3246                         hammer_unlock_cursor(&cursor);
3247                         error = hammer_ip_check_directory_empty(trans, ip);
3248                         hammer_lock_cursor(&cursor);
3249                         if (cursor.flags & HAMMER_CURSOR_RETEST) {
3250                                 kprintf("HAMMER: Warning: avoided deadlock "
3251                                         "on rmdir '%s'\n",
3252                                         ncp->nc_name);
3253                                 error = EDEADLK;
3254                         }
3255                 }
3256
3257                 /*
3258                  * Delete the directory entry.
3259                  *
3260                  * WARNING: hammer_ip_del_directory() may have to terminate
3261                  * the cursor to avoid a deadlock.  It is ok to call
3262                  * hammer_done_cursor() twice.
3263                  */
3264                 if (error == 0) {
3265                         error = hammer_ip_del_directory(trans, &cursor,
3266                                                         dip, ip);
3267                 }
3268                 hammer_done_cursor(&cursor);
3269                 if (error == 0) {
3270                         cache_setunresolved(nch);
3271                         cache_setvp(nch, NULL);
3272
3273                         /*
3274                          * XXX locking.  Note: ip->vp might get ripped out
3275                          * when we setunresolved() the nch since we had
3276                          * no other reference to it.  In that case ip->vp
3277                          * will be NULL.
3278                          */
3279                         if (ip && ip->vp) {
3280                                 hammer_knote(ip->vp, NOTE_DELETE);
3281                                 cache_inval_vp(ip->vp, CINV_DESTROY);
3282                         }
3283                 }
3284                 if (ip)
3285                         hammer_rel_inode(ip, 0);
3286         } else {
3287                 hammer_done_cursor(&cursor);
3288         }
3289         if (error == EDEADLK)
3290                 goto retry;
3291
3292         return (error);
3293 }
3294
3295 /************************************************************************
3296  *                          FIFO AND SPECFS OPS                         *
3297  ************************************************************************
3298  *
3299  */
3300
3301 static int
3302 hammer_vop_fifoclose (struct vop_close_args *ap)
3303 {
3304         /* XXX update itimes */
3305         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3306 }
3307
3308 static int
3309 hammer_vop_fiforead (struct vop_read_args *ap)
3310 {
3311         int error;
3312
3313         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3314         /* XXX update access time */
3315         return (error);
3316 }
3317
3318 static int
3319 hammer_vop_fifowrite (struct vop_write_args *ap)
3320 {
3321         int error;
3322
3323         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3324         /* XXX update access time */
3325         return (error);
3326 }
3327
3328 static
3329 int
3330 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3331 {
3332         int error;
3333
3334         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3335         if (error)
3336                 error = hammer_vop_kqfilter(ap);
3337         return(error);
3338 }
3339
3340 /************************************************************************
3341  *                          KQFILTER OPS                                *
3342  ************************************************************************
3343  *
3344  */
/*
 * Knote filter callbacks, defined after hammer_vop_kqfilter().
 */
static void filt_hammerdetach(struct knote *kn);
static int filt_hammerread(struct knote *kn, long hint);
static int filt_hammerwrite(struct knote *kn, long hint);
static int filt_hammervnode(struct knote *kn, long hint);

/*
 * Filter operation tables, one per filter type accepted by
 * hammer_vop_kqfilter() (read, write, vnode).  All three share
 * filt_hammerdetach and differ only in the last entry, the event routine.
 *
 * NOTE(review): the leading 1 and the NULL slot are presumably the
 * "is a file descriptor" flag and an unused attach hook of
 * struct filterops -- confirm against the struct definition.
 */
static struct filterops hammerread_filtops =
        { 1, NULL, filt_hammerdetach, filt_hammerread };
static struct filterops hammerwrite_filtops =
        { 1, NULL, filt_hammerdetach, filt_hammerwrite };
static struct filterops hammervnode_filtops =
        { 1, NULL, filt_hammerdetach, filt_hammervnode };
3356
3357 static
3358 int
3359 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3360 {
3361         struct vnode *vp = ap->a_vp;
3362         struct knote *kn = ap->a_kn;
3363
3364         switch (kn->kn_filter) {
3365         case EVFILT_READ:
3366                 kn->kn_fop = &hammerread_filtops;
3367                 break;
3368         case EVFILT_WRITE:
3369                 kn->kn_fop = &hammerwrite_filtops;
3370                 break;
3371         case EVFILT_VNODE:
3372                 kn->kn_fop = &hammervnode_filtops;
3373                 break;
3374         default:
3375                 return (EOPNOTSUPP);
3376         }
3377
3378         kn->kn_hook = (caddr_t)vp;
3379
3380         lwkt_gettoken(&vp->v_token);
3381         SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
3382         lwkt_reltoken(&vp->v_token);
3383
3384         return(0);
3385 }
3386
3387 static void
3388 filt_hammerdetach(struct knote *kn)
3389 {
3390         struct vnode *vp = (void *)kn->kn_hook;
3391
3392         lwkt_gettoken(&vp->v_token);
3393         SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
3394                      kn, knote, kn_selnext);
3395         lwkt_reltoken(&vp->v_token);
3396 }
3397
3398 static int
3399 filt_hammerread(struct knote *kn, long hint)
3400 {
3401         struct vnode *vp = (void *)kn->kn_hook;
3402         hammer_inode_t ip = VTOI(vp);
3403
3404         if (hint == NOTE_REVOKE) {
3405                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3406                 return(1);
3407         }
3408         kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
3409         return (kn->kn_data != 0);
3410 }
3411
3412 static int
3413 filt_hammerwrite(struct knote *kn, long hint)
3414 {
3415         if (hint == NOTE_REVOKE)
3416                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3417         kn->kn_data = 0;
3418         return (1);
3419 }
3420
3421 static int
3422 filt_hammervnode(struct knote *kn, long hint)
3423 {
3424         if (kn->kn_sfflags & hint)
3425                 kn->kn_fflags |= hint;
3426         if (hint == NOTE_REVOKE) {
3427                 kn->kn_flags |= EV_EOF;
3428                 return (1);
3429         }
3430         return (kn->kn_fflags != 0);
3431 }
3432