8b4eb35c3d5c840fbdaf2f310983b333df95c053
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.87 2008/07/12 02:47:39 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
51 /*
52  * USERFS VNOPS
53  */
54 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
55 static int hammer_vop_fsync(struct vop_fsync_args *);
56 static int hammer_vop_read(struct vop_read_args *);
57 static int hammer_vop_write(struct vop_write_args *);
58 static int hammer_vop_access(struct vop_access_args *);
59 static int hammer_vop_advlock(struct vop_advlock_args *);
60 static int hammer_vop_close(struct vop_close_args *);
61 static int hammer_vop_ncreate(struct vop_ncreate_args *);
62 static int hammer_vop_getattr(struct vop_getattr_args *);
63 static int hammer_vop_nresolve(struct vop_nresolve_args *);
64 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
65 static int hammer_vop_nlink(struct vop_nlink_args *);
66 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
67 static int hammer_vop_nmknod(struct vop_nmknod_args *);
68 static int hammer_vop_open(struct vop_open_args *);
69 static int hammer_vop_pathconf(struct vop_pathconf_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_setattr(struct vop_setattr_args *);
77 static int hammer_vop_strategy(struct vop_strategy_args *);
78 static int hammer_vop_bmap(struct vop_bmap_args *ap);
79 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
80 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
81 static int hammer_vop_ioctl(struct vop_ioctl_args *);
82 static int hammer_vop_mountctl(struct vop_mountctl_args *);
83
84 static int hammer_vop_fifoclose (struct vop_close_args *);
85 static int hammer_vop_fiforead (struct vop_read_args *);
86 static int hammer_vop_fifowrite (struct vop_write_args *);
87
88 static int hammer_vop_specclose (struct vop_close_args *);
89 static int hammer_vop_specread (struct vop_read_args *);
90 static int hammer_vop_specwrite (struct vop_write_args *);
91
/*
 * VOP dispatch table for regular HAMMER vnodes (regular files,
 * directories, symlinks).  Any operation not listed here falls
 * through to vop_defaultop.
 */
92 struct vop_ops hammer_vnode_vops = {
93         .vop_default =          vop_defaultop,
94         .vop_fsync =            hammer_vop_fsync,
95         .vop_getpages =         vop_stdgetpages,
96         .vop_putpages =         vop_stdputpages,
97         .vop_read =             hammer_vop_read,
98         .vop_write =            hammer_vop_write,
99         .vop_access =           hammer_vop_access,
100         .vop_advlock =          hammer_vop_advlock,
101         .vop_close =            hammer_vop_close,
102         .vop_ncreate =          hammer_vop_ncreate,
103         .vop_getattr =          hammer_vop_getattr,
104         .vop_inactive =         hammer_vop_inactive,
105         .vop_reclaim =          hammer_vop_reclaim,
106         .vop_nresolve =         hammer_vop_nresolve,
107         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
108         .vop_nlink =            hammer_vop_nlink,
109         .vop_nmkdir =           hammer_vop_nmkdir,
110         .vop_nmknod =           hammer_vop_nmknod,
111         .vop_open =             hammer_vop_open,
112         .vop_pathconf =         hammer_vop_pathconf,
113         .vop_print =            hammer_vop_print,
114         .vop_readdir =          hammer_vop_readdir,
115         .vop_readlink =         hammer_vop_readlink,
116         .vop_nremove =          hammer_vop_nremove,
117         .vop_nrename =          hammer_vop_nrename,
118         .vop_nrmdir =           hammer_vop_nrmdir,
119         .vop_setattr =          hammer_vop_setattr,
120         .vop_bmap =             hammer_vop_bmap,
121         .vop_strategy =         hammer_vop_strategy,
122         .vop_nsymlink =         hammer_vop_nsymlink,
123         .vop_nwhiteout =        hammer_vop_nwhiteout,
124         .vop_ioctl =            hammer_vop_ioctl,
125         .vop_mountctl =         hammer_vop_mountctl
126 };
127
/*
 * VOP dispatch table for device-special HAMMER vnodes.  Device I/O is
 * delegated to the spec layer (spec_vnoperate / hammer_vop_spec*);
 * attribute handling stays with HAMMER.
 */
128 struct vop_ops hammer_spec_vops = {
129         .vop_default =          spec_vnoperate,
130         .vop_fsync =            hammer_vop_fsync,
131         .vop_read =             hammer_vop_specread,
132         .vop_write =            hammer_vop_specwrite,
133         .vop_access =           hammer_vop_access,
134         .vop_close =            hammer_vop_specclose,
135         .vop_getattr =          hammer_vop_getattr,
136         .vop_inactive =         hammer_vop_inactive,
137         .vop_reclaim =          hammer_vop_reclaim,
138         .vop_setattr =          hammer_vop_setattr
139 };
140
/*
 * VOP dispatch table for FIFO HAMMER vnodes.  FIFO I/O is delegated to
 * the fifofs layer (fifo_vnoperate / hammer_vop_fifo*); attribute
 * handling stays with HAMMER.
 */
141 struct vop_ops hammer_fifo_vops = {
142         .vop_default =          fifo_vnoperate,
143         .vop_fsync =            hammer_vop_fsync,
144         .vop_read =             hammer_vop_fiforead,
145         .vop_write =            hammer_vop_fifowrite,
146         .vop_access =           hammer_vop_access,
147         .vop_close =            hammer_vop_fifoclose,
148         .vop_getattr =          hammer_vop_getattr,
149         .vop_inactive =         hammer_vop_inactive,
150         .vop_reclaim =          hammer_vop_reclaim,
151         .vop_setattr =          hammer_vop_setattr
152 };
153
154 #ifdef DEBUG_TRUNCATE
155 struct hammer_inode *HammerTruncIp;
156 #endif
157
158 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
159                            struct vnode *dvp, struct ucred *cred, int flags);
160 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
161 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
162
#if 0
/*
 * Pass a generic VOP through the regular HAMMER vnode ops table.
 *
 * Currently unused (kept under #if 0).  Fixed here: the original
 * declared the parameter as an anonymous "struct vop_generic_args *"
 * while the body referenced "ap", so this block would not have
 * compiled had it ever been enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
171
172 /*
173  * hammer_vop_fsync { vp, waitfor }
174  *
175  * fsync() an inode to disk and wait for it to be completely committed
176  * such that the information would not be undone if a crash occured after
177  * return.
178  */
179 static
180 int
181 hammer_vop_fsync(struct vop_fsync_args *ap)
182 {
183         hammer_inode_t ip = VTOI(ap->a_vp);
184
185         vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
186         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
187         if (ap->a_waitfor == MNT_WAIT)
188                 hammer_wait_inode(ip);
189         return (ip->error);
190 }
191
192 /*
193  * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read from a regular file through the buffer cache.  Returns EINVAL
 * for non-VREG vnodes.  On success the inode's atime is updated
 * in-memory unless the inode is read-only or the mount is noatime.
194  */
195 static
196 int
197 hammer_vop_read(struct vop_read_args *ap)
198 {
199         struct hammer_transaction trans;
200         hammer_inode_t ip;
201         off_t offset;
202         struct buf *bp;
203         struct uio *uio;
204         int error;
205         int n;
206         int seqcount;
207         int ioseqcount;
208         int blksize;
209
210         if (ap->a_vp->v_type != VREG)
211                 return (EINVAL);
212         ip = VTOI(ap->a_vp);
213         error = 0;
214         uio = ap->a_uio;
215
216         /*
217          * Allow the UIO's size to override the sequential heuristic.
 *
 * The read-ahead hint is the larger of the caller-supplied hint
 * (encoded in the upper 16 bits of a_ioflag) and the number of
 * blocks covered by the UIO itself.
218          */
219         blksize = hammer_blocksize(uio->uio_offset);
220         seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
221         ioseqcount = ap->a_ioflag >> 16;
222         if (seqcount < ioseqcount)
223                 seqcount = ioseqcount;
224
225         hammer_start_transaction(&trans, ip->hmp);
226
227         /*
228          * Access the data typically in HAMMER_BUFSIZE blocks via the
229          * buffer cache, but HAMMER may use a variable block size based
230          * on the offset.
231          */
232         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
233                 int64_t base_offset;
234                 int64_t file_limit;
235
236                 blksize = hammer_blocksize(uio->uio_offset);
237                 offset = (int)uio->uio_offset & (blksize - 1);
238                 base_offset = uio->uio_offset - offset;
239
240                 if (hammer_debug_cluster_enable) {
241                         /*
242                          * Use file_limit to prevent cluster_read() from
243                          * creating buffers of the wrong block size past
244                          * the demarc.
245                          */
246                         file_limit = ip->ino_data.size;
247                         if (base_offset < HAMMER_XDEMARC &&
248                             file_limit > HAMMER_XDEMARC) {
249                                 file_limit = HAMMER_XDEMARC;
250                         }
251                         error = cluster_read(ap->a_vp,
252                                              file_limit, base_offset,
253                                              blksize, MAXPHYS,
254                                              seqcount, &bp);
255                 } else {
256                         error = bread(ap->a_vp, base_offset, blksize, &bp);
257                 }
 /* A buffer is returned even on error and must be released. */
258                 if (error) {
259                         kprintf("error %d\n", error);
260                         brelse(bp);
261                         break;
262                 }
263
264                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
 /* Clip the copy to the block, the UIO, and the file EOF. */
265                 n = blksize - offset;
266                 if (n > uio->uio_resid)
267                         n = uio->uio_resid;
268                 if (n > ip->ino_data.size - uio->uio_offset)
269                         n = (int)(ip->ino_data.size - uio->uio_offset);
270                 error = uiomove((char *)bp->b_data + offset, n, uio);
271
272                 /* data has a lower priority then meta-data */
273                 bp->b_flags |= B_AGE;
274                 bqrelse(bp);
275                 if (error)
276                         break;
277         }
 /*
  * Lazily update atime via the inode modification flag; skipped for
  * read-only (as-of/snapshot) inodes and noatime mounts.
  */
278         if ((ip->flags & HAMMER_INODE_RO) == 0 &&
279             (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
280                 ip->ino_data.atime = trans.time;
281                 hammer_modify_inode(ip, HAMMER_INODE_ATIME);
282         }
283         hammer_done_transaction(&trans);
284         return (error);
285 }
286
287 /*
288  * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Write to a regular file through the buffer cache.  Returns EINVAL
 * for non-VREG vnodes and EROFS for read-only inodes.  Handles
 * IO_APPEND, enforces the 0...2^63-1 offset range (EFBIG), applies
 * flusher backpressure, and updates the inode size/mtime as needed.
289  */
290 static
291 int
292 hammer_vop_write(struct vop_write_args *ap)
293 {
294         struct hammer_transaction trans;
295         struct hammer_inode *ip;
296         hammer_mount_t hmp;
297         struct uio *uio;
298         int offset;
299         off_t base_offset;
300         struct buf *bp;
301         int error;
302         int n;
303         int flags;
304         int delta;
305         int seqcount;
306
307         if (ap->a_vp->v_type != VREG)
308                 return (EINVAL);
309         ip = VTOI(ap->a_vp);
310         hmp = ip->hmp;
311         error = 0;
 /* NOTE(review): seqcount is computed but not referenced below -- confirm */
312         seqcount = ap->a_ioflag >> 16;
313
314         if (ip->flags & HAMMER_INODE_RO)
315                 return (EROFS);
316
317         /*
318          * Create a transaction to cover the operations we perform.
319          */
320         hammer_start_transaction(&trans, hmp);
321         uio = ap->a_uio;
322
323         /*
324          * Check append mode
325          */
326         if (ap->a_ioflag & IO_APPEND)
327                 uio->uio_offset = ip->ino_data.size;
328
329         /*
330          * Check for illegal write offsets.  Valid range is 0...2^63-1.
331          *
332          * NOTE: the base_off assignment is required to work around what
333          * I consider to be a GCC-4 optimization bug.
334          */
335         if (uio->uio_offset < 0) {
336                 hammer_done_transaction(&trans);
337                 return (EFBIG);
338         }
339         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
340         if (uio->uio_resid > 0 && base_offset <= 0) {
341                 hammer_done_transaction(&trans);
342                 return (EFBIG);
343         }
344
345         /*
346          * Access the data typically in HAMMER_BUFSIZE blocks via the
347          * buffer cache, but HAMMER may use a variable block size based
348          * on the offset.
349          */
350         while (uio->uio_resid > 0) {
351                 int fixsize = 0;
352                 int blksize;
353                 int blkmask;
354
 /* Abort the whole write if the filesystem is out of space. */
355                 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
356                         break;
357
358                 blksize = hammer_blocksize(uio->uio_offset);
359
360                 /*
361                  * Do not allow HAMMER to blow out the buffer cache.  Very
362                  * large UIOs can lockout other processes due to bwillwrite()
363                  * mechanics.
364                  *
365                  * The hammer inode is not locked during these operations.
366                  * The vnode is locked which can interfere with the pageout
367                  * daemon for non-UIO_NOCOPY writes but should not interfere
368                  * with the buffer cache.  Even so, we cannot afford to
369                  * allow the pageout daemon to build up too many dirty buffer
370                  * cache buffers.
371                  */
372                 /*if (((int)uio->uio_offset & (blksize - 1)) == 0)*/
373                 bwillwrite(blksize);
374
375                 /*
376                  * Do not allow HAMMER to blow out system memory by
377                  * accumulating too many records.   Records are so well
378                  * decoupled from the buffer cache that it is possible
379                  * for userland to push data out to the media via
380                  * direct-write, but build up the records queued to the
381                  * backend faster then the backend can flush them out.
382                  * HAMMER has hit its write limit but the frontend has
383                  * no pushback to slow it down.
384                  */
385                 if (hmp->rsv_recs > hammer_limit_recs / 2) {
386                         /*
387                          * Get the inode on the flush list
388                          */
389                         if (ip->rsv_recs >= 64)
390                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
391                         else if (ip->rsv_recs >= 16)
392                                 hammer_flush_inode(ip, 0);
393
394                         /*
395                          * Keep the flusher going if the system keeps
396                          * queueing records.
397                          */
398                         delta = hmp->count_newrecords -
399                                 hmp->last_newrecords;
400                         if (delta < 0 || delta > hammer_limit_recs / 2) {
401                                 hmp->last_newrecords = hmp->count_newrecords;
402                                 hammer_sync_hmp(hmp, MNT_NOWAIT);
403                         }
404
405                         /*
406                          * If we have gotten behind start slowing
407                          * down the writers.
 *
 * The sleep duration scales with how far rsv_recs exceeds
 * the limit, up to roughly one second (hz ticks).
408                          */
409                         delta = (hmp->rsv_recs - hammer_limit_recs) *
410                                 hz / hammer_limit_recs;
411                         if (delta > 0)
412                                 tsleep(&trans, 0, "hmrslo", delta);
413                 }
414
415                 /*
416                  * Calculate the blocksize at the current offset and figure
417                  * out how much we can actually write.
418                  */
419                 blkmask = blksize - 1;
420                 offset = (int)uio->uio_offset & blkmask;
421                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
422                 n = blksize - offset;
423                 if (n > uio->uio_resid)
424                         n = uio->uio_resid;
 /*
  * Extend the VM object up-front so uiomove can fault pages in;
  * fixsize lets the error path roll this back with vtruncbuf.
  */
425                 if (uio->uio_offset + n > ip->ino_data.size) {
426                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
427                         fixsize = 1;
428                 }
429
430                 if (uio->uio_segflg == UIO_NOCOPY) {
431                         /*
432                          * Issuing a write with the same data backing the
433                          * buffer.  Instantiate the buffer to collect the
434                          * backing vm pages, then read-in any missing bits.
435                          *
436                          * This case is used by vop_stdputpages().
437                          */
438                         bp = getblk(ap->a_vp, base_offset,
439                                     blksize, GETBLK_BHEAVY, 0);
440                         if ((bp->b_flags & B_CACHE) == 0) {
441                                 bqrelse(bp);
442                                 error = bread(ap->a_vp, base_offset,
443                                               blksize, &bp);
444                         }
445                 } else if (offset == 0 && uio->uio_resid >= blksize) {
446                         /*
447                          * Even though we are entirely overwriting the buffer
448                          * we may still have to zero it out to avoid a 
449                          * mmap/write visibility issue.
450                          */
451                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
452                         if ((bp->b_flags & B_CACHE) == 0)
453                                 vfs_bio_clrbuf(bp);
454                 } else if (base_offset >= ip->ino_data.size) {
455                         /*
456                          * If the base offset of the buffer is beyond the
457                          * file EOF, we don't have to issue a read.
458                          */
459                         bp = getblk(ap->a_vp, base_offset,
460                                     blksize, GETBLK_BHEAVY, 0);
461                         vfs_bio_clrbuf(bp);
462                 } else {
463                         /*
464                          * Partial overwrite, read in any missing bits then
465                          * replace the portion being written.
466                          */
467                         error = bread(ap->a_vp, base_offset, blksize, &bp);
468                         if (error == 0)
469                                 bheavy(bp);
470                 }
471                 if (error == 0) {
472                         error = uiomove((char *)bp->b_data + offset,
473                                         n, uio);
474                 }
475
476                 /*
477                  * If we screwed up we have to undo any VM size changes we
478                  * made.
479                  */
480                 if (error) {
481                         brelse(bp);
482                         if (fixsize) {
483                                 vtruncbuf(ap->a_vp, ip->ino_data.size,
484                                           hammer_blocksize(ip->ino_data.size));
485                         }
486                         break;
487                 }
488                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
 /* Record the new file size if the write extended the file. */
489                 if (ip->ino_data.size < uio->uio_offset) {
490                         ip->ino_data.size = uio->uio_offset;
491                         flags = HAMMER_INODE_DDIRTY;
492                         vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
493                 } else {
494                         flags = 0;
495                 }
496                 ip->ino_data.mtime = trans.time;
497                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
498                 hammer_modify_inode(ip, flags);
499
500                 /*
501                  * Final buffer disposition.
 *
 * IO_SYNC writes synchronously, IO_DIRECT starts the write
 * asynchronously, otherwise the buffer is delayed-written.
502                  */
503                 bp->b_flags |= B_AGE;
504                 if (ap->a_ioflag & IO_SYNC) {
505                         bwrite(bp);
506                 } else if (ap->a_ioflag & IO_DIRECT) {
507                         bawrite(bp);
508                 } else {
509                         bdwrite(bp);
510                 }
511         }
512         hammer_done_transaction(&trans);
513         return (error);
514 }
515
516 /*
517  * hammer_vop_access { vp, mode, cred }
518  */
519 static
520 int
521 hammer_vop_access(struct vop_access_args *ap)
522 {
523         struct hammer_inode *ip = VTOI(ap->a_vp);
524         uid_t uid;
525         gid_t gid;
526         int error;
527
528         uid = hammer_to_unix_xid(&ip->ino_data.uid);
529         gid = hammer_to_unix_xid(&ip->ino_data.gid);
530
531         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
532                                   ip->ino_data.uflags);
533         return (error);
534 }
535
536 /*
537  * hammer_vop_advlock { vp, id, op, fl, flags }
538  */
539 static
540 int
541 hammer_vop_advlock(struct vop_advlock_args *ap)
542 {
543         hammer_inode_t ip = VTOI(ap->a_vp);
544
545         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
546 }
547
548 /*
549  * hammer_vop_close { vp, fflag }
550  */
551 static
552 int
553 hammer_vop_close(struct vop_close_args *ap)
554 {
555         hammer_inode_t ip = VTOI(ap->a_vp);
556
557         if ((ip->flags | ip->sync_flags) & HAMMER_INODE_MODMASK)
558                 hammer_inode_waitreclaims(ip->hmp);
559         return (vop_stdclose(ap));
560 }
561
562 /*
563  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
564  *
565  * The operating system has already ensured that the directory entry
566  * does not exist and done all appropriate namespace locking.
 *
 * Creates a new inode, links it into the parent directory, and on
 * success returns the referenced vnode in *vpp and resolves the
 * namecache entry.  *vpp is set to NULL on any failure.
567  */
568 static
569 int
570 hammer_vop_ncreate(struct vop_ncreate_args *ap)
571 {
572         struct hammer_transaction trans;
573         struct hammer_inode *dip;
574         struct hammer_inode *nip;
575         struct nchandle *nch;
576         int error;
577
578         nch = ap->a_nch;
579         dip = VTOI(ap->a_dvp);
580
581         if (dip->flags & HAMMER_INODE_RO)
582                 return (EROFS);
583         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
584                 return (error);
585
586         /*
587          * Create a transaction to cover the operations we perform.
588          */
589         hammer_start_transaction(&trans, dip->hmp);
590
591         /*
592          * Create a new filesystem object of the requested type.  The
593          * returned inode will be referenced and shared-locked to prevent
594          * it from being moved to the flusher.
595          */
596
597         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
598                                     dip, NULL, &nip);
599         if (error) {
600                 hkprintf("hammer_create_inode error %d\n", error);
601                 hammer_done_transaction(&trans);
602                 *ap->a_vpp = NULL;
603                 return (error);
604         }
605
606         /*
607          * Add the new filesystem object to the directory.  This will also
608          * bump the inode's link count.
609          */
610         error = hammer_ip_add_directory(&trans, dip,
611                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
612                                         nip);
613         if (error)
614                 hkprintf("hammer_ip_add_directory error %d\n", error);
615
616         /*
617          * Finish up.
 *
 * Ordering matters in the success path: the vnode is obtained
 * (adding its own hold) before the inode reference is released.
618          */
619         if (error) {
620                 hammer_rel_inode(nip, 0);
621                 hammer_done_transaction(&trans);
622                 *ap->a_vpp = NULL;
623         } else {
624                 error = hammer_get_vnode(nip, ap->a_vpp);
625                 hammer_done_transaction(&trans);
626                 hammer_rel_inode(nip, 0);
627                 if (error == 0) {
 /* Resolve the namecache entry to the newly created vnode. */
628                         cache_setunresolved(ap->a_nch);
629                         cache_setvp(ap->a_nch, *ap->a_vpp);
630                 }
631         }
632         return (error);
633 }
634
635 /*
636  * hammer_vop_getattr { vp, vap }
637  *
638  * Retrieve an inode's attribute information.  When accessing inodes
639  * historically we fake the atime field to ensure consistent results.
640  * The atime field is stored in the B-Tree element and allowed to be
641  * updated without cycling the element.
642  */
643 static
644 int
645 hammer_vop_getattr(struct vop_getattr_args *ap)
646 {
647         struct hammer_inode *ip = VTOI(ap->a_vp);
648         struct vattr *vap = ap->a_vap;
649
650         /*
651          * We want the fsid to be different when accessing a filesystem
652          * with different as-of's so programs like diff don't think
653          * the files are the same.
654          *
655          * We also want the fsid to be the same when comparing snapshots,
656          * or when comparing mirrors (which might be backed by different
657          * physical devices).  HAMMER fsids are based on the PFS's
658          * shared_uuid field.
659          *
660          * XXX there is a chance of collision here.  The va_fsid reported
661          * by stat is different from the more involved fsid used in the
662          * mount structure.
663          */
664         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
665                        (u_int32_t)(ip->obj_asof >> 32);
666
667         vap->va_fileid = ip->ino_leaf.base.obj_id;
668         vap->va_mode = ip->ino_data.mode;
669         vap->va_nlink = ip->ino_data.nlinks;
670         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
671         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
672         vap->va_rmajor = 0;
673         vap->va_rminor = 0;
674         vap->va_size = ip->ino_data.size;
675
676         /*
677          * We must provide a consistent atime and mtime for snapshots
678          * so people can do a 'tar cf - ... | md5' on them and get
679          * consistent results.
 *
 * Read-only (as-of) inodes report ctime for both atime and mtime.
680          */
681         if (ip->flags & HAMMER_INODE_RO) {
682                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
683                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
684         } else {
685                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
686                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
687         }
688         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
689         vap->va_flags = ip->ino_data.uflags;
690         vap->va_gen = 1;        /* hammer inums are unique for all time */
691         vap->va_blocksize = HAMMER_BUFSIZE;
 /*
  * Approximate on-media usage: round up to the extended buffer size
  * past the demarc, to the regular buffer size for mid-sized files,
  * and to 16 bytes for tiny files.
  */
692         if (ip->ino_data.size >= HAMMER_XDEMARC) {
693                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
694                                 ~HAMMER_XBUFMASK64;
695         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
696                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
697                                 ~HAMMER_BUFMASK64;
698         } else {
699                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
700         }
701         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
702         vap->va_filerev = 0;    /* XXX */
703         /* mtime uniquely identifies any adjustments made to the file XXX */
704         vap->va_fsmid = ip->ino_data.mtime;
705         vap->va_uid_uuid = ip->ino_data.uid;
706         vap->va_gid_uuid = ip->ino_data.gid;
707         vap->va_fsid_uuid = ip->hmp->fsid;
708         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
709                           VA_FSID_UUID_VALID;
710
 /* Device nodes additionally expose their major/minor numbers. */
711         switch (ip->ino_data.obj_type) {
712         case HAMMER_OBJTYPE_CDEV:
713         case HAMMER_OBJTYPE_BDEV:
714                 vap->va_rmajor = ip->ino_data.rmajor;
715                 vap->va_rminor = ip->ino_data.rminor;
716                 break;
717         default:
718                 break;
719         }
720         return(0);
721 }
722
/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.  Also handles '@@<tid>' as-of
 * name extensions and '@@PFS<nnnnn>' pseudo-filesystem softlinks.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;	/* inherit RO from the dir */
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);

	/*
	 * Scan the name for '@@'.  The text following it encodes either a
	 * transaction id or a PFS id; hammer_str_to_tid() distinguishes the
	 * two (ispfs) and extracts the localization.  Any non-current asof
	 * forces the resulting vnode read-only.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			asof = hammer_str_to_tid(ncp->nc_name + i + 2,
						 &ispfs, &localization);
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;	/* effective name length excludes the extension */

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(ncp->nc_name, nlen);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;	/* cover the whole hash chain */
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);
	if (error == 0) {
		/*
		 * Found it - instantiate the inode/vnode and record a
		 * positive hit in the namecache.
		 */
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* record a negative namecache entry */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}
888
/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof then the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != dip->hmp->asof) {
			/*
			 * As-of root: re-lookup the root itself at the
			 * mount's asof and fabricate a "0x%016llx" name
			 * component for the kernel (19 bytes incl. NUL).
			 */
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				   dip->obj_asof);
		} else {
			/* true root, or the directory was removed */
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);

	/*
	 * Get the parent inode and resolve it to a vnode.  The inode ref
	 * is dropped either way; the returned vnode holds its own ref.
	 */
	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}
958
959 /*
960  * hammer_vop_nlink { nch, dvp, vp, cred }
961  */
962 static
963 int
964 hammer_vop_nlink(struct vop_nlink_args *ap)
965 {
966         struct hammer_transaction trans;
967         struct hammer_inode *dip;
968         struct hammer_inode *ip;
969         struct nchandle *nch;
970         int error;
971
972         nch = ap->a_nch;
973         dip = VTOI(ap->a_dvp);
974         ip = VTOI(ap->a_vp);
975
976         if (dip->flags & HAMMER_INODE_RO)
977                 return (EROFS);
978         if (ip->flags & HAMMER_INODE_RO)
979                 return (EROFS);
980         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
981                 return (error);
982
983         /*
984          * Create a transaction to cover the operations we perform.
985          */
986         hammer_start_transaction(&trans, dip->hmp);
987
988         /*
989          * Add the filesystem object to the directory.  Note that neither
990          * dip nor ip are referenced or locked, but their vnodes are
991          * referenced.  This function will bump the inode's link count.
992          */
993         error = hammer_ip_add_directory(&trans, dip,
994                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
995                                         ip);
996
997         /*
998          * Finish up.
999          */
1000         if (error == 0) {
1001                 cache_setunresolved(nch);
1002                 cache_setvp(nch, ap->a_vp);
1003         }
1004         hammer_done_transaction(&trans);
1005         return (error);
1006 }
1007
1008 /*
1009  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1010  *
1011  * The operating system has already ensured that the directory entry
1012  * does not exist and done all appropriate namespace locking.
1013  */
1014 static
1015 int
1016 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1017 {
1018         struct hammer_transaction trans;
1019         struct hammer_inode *dip;
1020         struct hammer_inode *nip;
1021         struct nchandle *nch;
1022         int error;
1023
1024         nch = ap->a_nch;
1025         dip = VTOI(ap->a_dvp);
1026
1027         if (dip->flags & HAMMER_INODE_RO)
1028                 return (EROFS);
1029         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1030                 return (error);
1031
1032         /*
1033          * Create a transaction to cover the operations we perform.
1034          */
1035         hammer_start_transaction(&trans, dip->hmp);
1036
1037         /*
1038          * Create a new filesystem object of the requested type.  The
1039          * returned inode will be referenced but not locked.
1040          */
1041         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1042                                     dip, NULL, &nip);
1043         if (error) {
1044                 hkprintf("hammer_mkdir error %d\n", error);
1045                 hammer_done_transaction(&trans);
1046                 *ap->a_vpp = NULL;
1047                 return (error);
1048         }
1049         /*
1050          * Add the new filesystem object to the directory.  This will also
1051          * bump the inode's link count.
1052          */
1053         error = hammer_ip_add_directory(&trans, dip,
1054                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1055                                         nip);
1056         if (error)
1057                 hkprintf("hammer_mkdir (add) error %d\n", error);
1058
1059         /*
1060          * Finish up.
1061          */
1062         if (error) {
1063                 hammer_rel_inode(nip, 0);
1064                 *ap->a_vpp = NULL;
1065         } else {
1066                 error = hammer_get_vnode(nip, ap->a_vpp);
1067                 hammer_rel_inode(nip, 0);
1068                 if (error == 0) {
1069                         cache_setunresolved(ap->a_nch);
1070                         cache_setvp(ap->a_nch, *ap->a_vpp);
1071                 }
1072         }
1073         hammer_done_transaction(&trans);
1074         return (error);
1075 }
1076
1077 /*
1078  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1079  *
1080  * The operating system has already ensured that the directory entry
1081  * does not exist and done all appropriate namespace locking.
1082  */
1083 static
1084 int
1085 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1086 {
1087         struct hammer_transaction trans;
1088         struct hammer_inode *dip;
1089         struct hammer_inode *nip;
1090         struct nchandle *nch;
1091         int error;
1092
1093         nch = ap->a_nch;
1094         dip = VTOI(ap->a_dvp);
1095
1096         if (dip->flags & HAMMER_INODE_RO)
1097                 return (EROFS);
1098         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1099                 return (error);
1100
1101         /*
1102          * Create a transaction to cover the operations we perform.
1103          */
1104         hammer_start_transaction(&trans, dip->hmp);
1105
1106         /*
1107          * Create a new filesystem object of the requested type.  The
1108          * returned inode will be referenced but not locked.
1109          *
1110          * If mknod specifies a directory a pseudo-fs is created.
1111          */
1112         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1113                                     dip, NULL, &nip);
1114         if (error) {
1115                 hammer_done_transaction(&trans);
1116                 *ap->a_vpp = NULL;
1117                 return (error);
1118         }
1119
1120         /*
1121          * Add the new filesystem object to the directory.  This will also
1122          * bump the inode's link count.
1123          */
1124         error = hammer_ip_add_directory(&trans, dip,
1125                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1126                                         nip);
1127
1128         /*
1129          * Finish up.
1130          */
1131         if (error) {
1132                 hammer_rel_inode(nip, 0);
1133                 *ap->a_vpp = NULL;
1134         } else {
1135                 error = hammer_get_vnode(nip, ap->a_vpp);
1136                 hammer_rel_inode(nip, 0);
1137                 if (error == 0) {
1138                         cache_setunresolved(ap->a_nch);
1139                         cache_setvp(ap->a_nch, *ap->a_vpp);
1140                 }
1141         }
1142         hammer_done_transaction(&trans);
1143         return (error);
1144 }
1145
1146 /*
1147  * hammer_vop_open { vp, mode, cred, fp }
1148  */
1149 static
1150 int
1151 hammer_vop_open(struct vop_open_args *ap)
1152 {
1153         hammer_inode_t ip;
1154
1155         ip = VTOI(ap->a_vp);
1156
1157         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1158                 return (EROFS);
1159         return(vop_stdopen(ap));
1160 }
1161
1162 /*
1163  * hammer_vop_pathconf { vp, name, retval }
1164  */
1165 static
1166 int
1167 hammer_vop_pathconf(struct vop_pathconf_args *ap)
1168 {
1169         return EOPNOTSUPP;
1170 }
1171
1172 /*
1173  * hammer_vop_print { vp }
1174  */
1175 static
1176 int
1177 hammer_vop_print(struct vop_print_args *ap)
1178 {
1179         return EOPNOTSUPP;
1180 }
1181
/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 *
 * Directory keys double as 64 bit seek positions, so uio_offset (saveoff)
 * directly selects where the B-tree scan resumes.  Offsets 0 and 1 are
 * reserved for the artificial "." and ".." entries.
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * If the caller wants seek cookies, size the array from the
	 * buffer size (rough 16-bytes-per-dirent estimate), capped at
	 * 1024 entries per call.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/* a parent_obj_id of 0 means ".." is the directory itself */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;	/* user buffer full */
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/*
	 * ENOENT from the scan simply means we ran off the end of the
	 * directory: report EOF, not an error.  Hand the cookie array to
	 * the caller (who frees it) unless nothing was returned at all.
	 */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
1331
/*
 * hammer_vop_readlink { vp, uio, cred }
 *
 * Copy the symlink target into the caller's uio.  Short targets live
 * inline in the inode; long targets are stored in a separate FIX record.
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			/* extract the 5-digit decimal PFS id */
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				/*
				 * Slaves expand to their last synchronized
				 * TID, masters to the current (max) TID.
				 */
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  HAMMER_MAX_TID,
						  localization >> 16);
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version: the target is stored in a HAMMER_RECTYPE_FIX
	 * record keyed by HAMMER_FIXKEY_SYMLINK.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
1433
1434 /*
1435  * hammer_vop_nremove { nch, dvp, cred }
1436  */
1437 static
1438 int
1439 hammer_vop_nremove(struct vop_nremove_args *ap)
1440 {
1441         struct hammer_transaction trans;
1442         struct hammer_inode *dip;
1443         int error;
1444
1445         dip = VTOI(ap->a_dvp);
1446
1447         if (hammer_nohistory(dip) == 0 &&
1448             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1449                 return (error);
1450         }
1451
1452         hammer_start_transaction(&trans, dip->hmp);
1453         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1454         hammer_done_transaction(&trans);
1455
1456         return (error);
1457 }
1458
/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 *
 * Rename works by removing any existing target entry, linking the
 * source inode under the new name, then deleting the original entry
 * from the source directory.
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;
	struct namecache *tncp;
	struct hammer_inode *fdip;
	struct hammer_inode *tdip;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	int nlen, error;

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	/*
	 * Both directories and the moved inode must be writable, and
	 * there must be space for the new directory entry.
	 */
	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			/* reparent and flag the inode dirty */
			ip->ino_data.parent_obj_id = tdip->obj_id;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;	/* cover the whole hash chain */
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicy for the purposes
	 * of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0)
		cache_rename(ap->a_fnch, ap->a_tnch);

failed:
	hammer_done_transaction(&trans);
	return (error);
}
1590
1591 /*
1592  * hammer_vop_nrmdir { nch, dvp, cred }
1593  */
1594 static
1595 int
1596 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1597 {
1598         struct hammer_transaction trans;
1599         struct hammer_inode *dip;
1600         int error;
1601
1602         dip = VTOI(ap->a_dvp);
1603
1604         if (hammer_nohistory(dip) == 0 &&
1605             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1606                 return (error);
1607         }
1608
1609         hammer_start_transaction(&trans, dip->hmp);
1610         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1611         hammer_done_transaction(&trans);
1612
1613         return (error);
1614 }
1615
1616 /*
1617  * hammer_vop_setattr { vp, vap, cred }
1618  */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;		/* HAMMER_INODE_* dirty bits accumulated here */
	int error;
	int truncating;		/* 1 if shrinking the file, 0 if growing */
	int blksize;
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;

	/*
	 * Reject attribute changes on read-only mounts and read-only
	 * inodes, and when history is enabled make sure enough free
	 * space exists to record the change.
	 */
	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	error = 0;

	/*
	 * chflags: when va_flags is specified no other attribute in the
	 * vattr is processed -- all paths out of this block goto done.
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	/*
	 * An immutable or append-only file rejects all other attribute
	 * modifications.
	 */
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	/*
	 * chown/chgrp: unix uid/gid values are converted to/from the
	 * uuid-based representation stored on-media.
	 */
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
			}
			/*
			 * NOTE(review): DDIRTY is set even when nothing
			 * actually changed above -- presumably harmless but
			 * forces an inode sync; confirm intent.
			 */
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
	/*
	 * File resize.  The while loop is used as a single-pass breakable
	 * block (note the unconditional break at the bottom), allowing the
	 * switch cases to bail out with break.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX break atomicity, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			blksize = hammer_blocksize(vap->va_size);
			if (vap->va_size < ip->ino_data.size) {
				vtruncbuf(ap->a_vp, vap->va_size, blksize);
				truncating = 1;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
			}
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.  Only the lowest cached
			 * truncation offset is retained.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate1 %016llx\n", ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate2 %016llx\n", ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
#endif
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				aligned_size -= blksize;

				offset = (int)vap->va_size & (blksize - 1);
				error = bread(ap->a_vp, aligned_size,
					      blksize, &bp);
				hammer_ip_frontend_trunc(ip, aligned_size);
				if (error == 0) {
					/* zero the tail of the last block */
					bzero(bp->b_data + offset,
					      blksize - offset);
					bdwrite(bp);
				} else {
					kprintf("ERROR %d\n", error);
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			/*
			 * Database files only track the truncation point;
			 * there is no buffer-cache tail to clean.
			 */
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	/*
	 * Timestamps and mode.  atime/mtime use dedicated dirty bits so
	 * they can be flushed more cheaply than full DDIRTY changes.
	 */
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime =
			hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime =
			hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t   cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
done:
	/* only commit the accumulated dirty bits on full success */
	if (error == 0)
		hammer_modify_inode(ip, modflags);
	hammer_done_transaction(&trans);
	return (error);
}
1820
1821 /*
1822  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1823  */
1824 static
1825 int
1826 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1827 {
1828         struct hammer_transaction trans;
1829         struct hammer_inode *dip;
1830         struct hammer_inode *nip;
1831         struct nchandle *nch;
1832         hammer_record_t record;
1833         int error;
1834         int bytes;
1835
1836         ap->a_vap->va_type = VLNK;
1837
1838         nch = ap->a_nch;
1839         dip = VTOI(ap->a_dvp);
1840
1841         if (dip->flags & HAMMER_INODE_RO)
1842                 return (EROFS);
1843         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1844                 return (error);
1845
1846         /*
1847          * Create a transaction to cover the operations we perform.
1848          */
1849         hammer_start_transaction(&trans, dip->hmp);
1850
1851         /*
1852          * Create a new filesystem object of the requested type.  The
1853          * returned inode will be referenced but not locked.
1854          */
1855
1856         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1857                                     dip, NULL, &nip);
1858         if (error) {
1859                 hammer_done_transaction(&trans);
1860                 *ap->a_vpp = NULL;
1861                 return (error);
1862         }
1863
1864         /*
1865          * Add a record representing the symlink.  symlink stores the link
1866          * as pure data, not a string, and is no \0 terminated.
1867          */
1868         if (error == 0) {
1869                 bytes = strlen(ap->a_target);
1870
1871                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1872                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1873                 } else {
1874                         record = hammer_alloc_mem_record(nip, bytes);
1875                         record->type = HAMMER_MEM_RECORD_GENERAL;
1876
1877                         record->leaf.base.localization = nip->obj_localization +
1878                                                          HAMMER_LOCALIZE_MISC;
1879                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1880                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1881                         record->leaf.data_len = bytes;
1882                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1883                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1884                         error = hammer_ip_add_record(&trans, record);
1885                 }
1886
1887                 /*
1888                  * Set the file size to the length of the link.
1889                  */
1890                 if (error == 0) {
1891                         nip->ino_data.size = bytes;
1892                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1893                 }
1894         }
1895         if (error == 0)
1896                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
1897                                                 nch->ncp->nc_nlen, nip);
1898
1899         /*
1900          * Finish up.
1901          */
1902         if (error) {
1903                 hammer_rel_inode(nip, 0);
1904                 *ap->a_vpp = NULL;
1905         } else {
1906                 error = hammer_get_vnode(nip, ap->a_vpp);
1907                 hammer_rel_inode(nip, 0);
1908                 if (error == 0) {
1909                         cache_setunresolved(ap->a_nch);
1910                         cache_setvp(ap->a_nch, *ap->a_vpp);
1911                 }
1912         }
1913         hammer_done_transaction(&trans);
1914         return (error);
1915 }
1916
1917 /*
1918  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1919  */
1920 static
1921 int
1922 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1923 {
1924         struct hammer_transaction trans;
1925         struct hammer_inode *dip;
1926         int error;
1927
1928         dip = VTOI(ap->a_dvp);
1929
1930         if (hammer_nohistory(dip) == 0 &&
1931             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
1932                 return (error);
1933         }
1934
1935         hammer_start_transaction(&trans, dip->hmp);
1936         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1937                                 ap->a_cred, ap->a_flags);
1938         hammer_done_transaction(&trans);
1939
1940         return (error);
1941 }
1942
1943 /*
1944  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1945  */
1946 static
1947 int
1948 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1949 {
1950         struct hammer_inode *ip = ap->a_vp->v_data;
1951
1952         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1953                             ap->a_fflag, ap->a_cred));
1954 }
1955
1956 static
1957 int
1958 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1959 {
1960         struct mount *mp;
1961         int error;
1962
1963         mp = ap->a_head.a_ops->head.vv_mount;
1964
1965         switch(ap->a_op) {
1966         case MOUNTCTL_SET_EXPORT:
1967                 if (ap->a_ctllen != sizeof(struct export_args))
1968                         error = EINVAL;
1969                 error = hammer_vfs_export(mp, ap->a_op,
1970                                       (const struct export_args *)ap->a_ctl);
1971                 break;
1972         default:
1973                 error = journal_mountctl(ap);
1974                 break;
1975         }
1976         return(error);
1977 }
1978
1979 /*
1980  * hammer_vop_strategy { vp, bio }
1981  *
1982  * Strategy call, used for regular file read & write only.  Note that the
1983  * bp may represent a cluster.
1984  *
1985  * To simplify operation and allow better optimizations in the future,
1986  * this code does not make any assumptions with regards to buffer alignment
1987  * or size.
1988  */
1989 static
1990 int
1991 hammer_vop_strategy(struct vop_strategy_args *ap)
1992 {
1993         struct buf *bp;
1994         int error;
1995
1996         bp = ap->a_bio->bio_buf;
1997
1998         switch(bp->b_cmd) {
1999         case BUF_CMD_READ:
2000                 error = hammer_vop_strategy_read(ap);
2001                 break;
2002         case BUF_CMD_WRITE:
2003                 error = hammer_vop_strategy_write(ap);
2004                 break;
2005         default:
2006                 bp->b_error = error = EINVAL;
2007                 bp->b_flags |= B_ERROR;
2008                 biodone(ap->a_bio);
2009                 break;
2010         }
2011         return (error);
2012 }
2013
2014 /*
2015  * Read from a regular file.  Iterate the related records and fill in the
2016  * BIO/BUF.  Gaps are zero-filled.
2017  *
2018  * The support code in hammer_object.c should be used to deal with mixed
2019  * in-memory and on-disk records.
2020  *
2021  * NOTE: Can be called from the cluster code with an oversized buf.
2022  *
2023  * XXX atime update
2024  */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;		/* current fill offset within the buffer */
	int roff;		/* data offset within the current record */
	int n;			/* byte count for the current copy/zero-fill */

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_RAW_BUFFER) {
		error = hammer_io_direct_read(ip->hmp, nbio);
		return (error);
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the key's
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather then (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record verses our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zerod past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 *
		 * NOTE(review): the direct-read path jumps to done and
		 * skips the biodone() below -- presumably
		 * hammer_io_direct_read() completes the BIO itself;
		 * confirm against its implementation.
		 */
		if (boff == 0 && n == bp->b_bufsize &&
		    ((cursor.leaf->data_offset + roff) & HAMMER_BUFMASK) == 0) {
			disk_offset = hammer_blockmap_lookup(
						trans.hmp,
						cursor.leaf->data_offset + roff,
						&error);
			if (error)
				break;
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(trans.hmp, nbio);
			goto done;
		} else if (n) {
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	/* cache the B-Tree node for the next read on this inode */
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
2246
2247 /*
2248  * BMAP operation - used to support cluster_read() only.
2249  *
2250  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2251  *
 * This routine may return EOPNOTSUPP if the operation is not supported for
2253  * the specified offset.  The contents of the pointer arguments do not
2254  * need to be initialized in that case. 
2255  *
2256  * If a disk address is available and properly aligned return 0 with 
2257  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2258  * to the run-length relative to that offset.  Callers may assume that
2259  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2260  * large, so return EOPNOTSUPP if it is not sufficiently large.
2261  */
2262 static
2263 int
2264 hammer_vop_bmap(struct vop_bmap_args *ap)
2265 {
2266         struct hammer_transaction trans;
2267         struct hammer_inode *ip;
2268         struct hammer_cursor cursor;
2269         hammer_base_elm_t base;
2270         int64_t rec_offset;
2271         int64_t ran_end;
2272         int64_t tmp64;
2273         int64_t base_offset;
2274         int64_t base_disk_offset;
2275         int64_t last_offset;
2276         hammer_off_t last_disk_offset;
2277         hammer_off_t disk_offset;
2278         int     rec_len;
2279         int     error;
2280         int     blksize;
2281
2282         ip = ap->a_vp->v_data;
2283
2284         /*
2285          * We can only BMAP regular files.  We can't BMAP database files,
2286          * directories, etc.
2287          */
2288         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2289                 return(EOPNOTSUPP);
2290
2291         /*
2292          * bmap is typically called with runp/runb both NULL when used
2293          * for writing.  We do not support BMAP for writing atm.
2294          */
2295         if (ap->a_cmd != BUF_CMD_READ)
2296                 return(EOPNOTSUPP);
2297
2298         /*
2299          * Scan the B-Tree to acquire blockmap addresses, then translate
2300          * to raw addresses.
2301          */
2302         hammer_simple_transaction(&trans, ip->hmp);
2303 #if 0
2304         kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2305 #endif
2306         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2307
2308         /*
2309          * Key range (begin and end inclusive) to scan.  Note that the key's
2310          * stored in the actual records represent BASE+LEN, not BASE.  The
2311          * first record containing bio_offset will have a key > bio_offset.
2312          */
2313         cursor.key_beg.localization = ip->obj_localization +
2314                                       HAMMER_LOCALIZE_MISC;
2315         cursor.key_beg.obj_id = ip->obj_id;
2316         cursor.key_beg.create_tid = 0;
2317         cursor.key_beg.delete_tid = 0;
2318         cursor.key_beg.obj_type = 0;
2319         if (ap->a_runb)
2320                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2321         else
2322                 cursor.key_beg.key = ap->a_loffset + 1;
2323         if (cursor.key_beg.key < 0)
2324                 cursor.key_beg.key = 0;
2325         cursor.asof = ip->obj_asof;
2326         cursor.flags |= HAMMER_CURSOR_ASOF;
2327
2328         cursor.key_end = cursor.key_beg;
2329         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2330
2331         ran_end = ap->a_loffset + MAXPHYS;
2332         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2333         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2334         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2335         if (tmp64 < ran_end)
2336                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2337         else
2338                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2339
2340         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2341
2342         error = hammer_ip_first(&cursor);
2343         base_offset = last_offset = 0;
2344         base_disk_offset = last_disk_offset = 0;
2345
2346         while (error == 0) {
2347                 /*
2348                  * Get the base file offset of the record.  The key for
2349                  * data records is (base + bytes) rather then (base).
2350                  *
2351                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
2352                  * The extra bytes should be zero on-disk and the BMAP op
2353                  * should still be ok.
2354                  */
2355                 base = &cursor.leaf->base;
2356                 rec_offset = base->key - cursor.leaf->data_len;
2357                 rec_len    = cursor.leaf->data_len;
2358
2359                 /*
2360                  * Incorporate any cached truncation.
2361                  *
2362                  * NOTE: Modifications to rec_len based on synthesized
2363                  * truncation points remove the guarantee that any extended
2364                  * data on disk is zero (since the truncations may not have
2365                  * taken place on-media yet).
2366                  */
2367                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2368                         if (hammer_cursor_ondisk(&cursor) ||
2369                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2370                                 if (ip->trunc_off <= rec_offset)
2371                                         rec_len = 0;
2372                                 else if (ip->trunc_off < rec_offset + rec_len)
2373                                         rec_len = (int)(ip->trunc_off - rec_offset);
2374                         }
2375                 }
2376                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2377                         if (hammer_cursor_ondisk(&cursor)) {
2378                                 if (ip->sync_trunc_off <= rec_offset)
2379                                         rec_len = 0;
2380                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2381                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2382                         }
2383                 }
2384
2385                 /*
2386                  * Accumulate information.  If we have hit a discontiguous
2387                  * block reset base_offset unless we are already beyond the
2388                  * requested offset.  If we are, that's it, we stop.
2389                  */
2390                 disk_offset = hammer_blockmap_lookup(trans.hmp,
2391                                                      cursor.leaf->data_offset,
2392                                                      &error);
2393                 if (error)
2394                         break;
2395                 if (rec_offset != last_offset ||
2396                     disk_offset != last_disk_offset) {
2397                         if (rec_offset > ap->a_loffset)
2398                                 break;
2399                         base_offset = rec_offset;
2400                         base_disk_offset = disk_offset;
2401                 }
2402                 last_offset = rec_offset + rec_len;
2403                 last_disk_offset = disk_offset + rec_len;
2404
2405                 error = hammer_ip_next(&cursor);
2406         }
2407
2408 #if 0
2409         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2410                 ap->a_loffset, base_offset, last_offset);
2411         kprintf("BMAP %16s:  %016llx - %016llx\n",
2412                 "", base_disk_offset, last_disk_offset);
2413 #endif
2414
2415         if (cursor.node) {
2416                 hammer_cache_node(&ip->cache[1], cursor.node);
2417 #if 0
2418                 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2419 #endif
2420         }
2421         hammer_done_cursor(&cursor);
2422         hammer_done_transaction(&trans);
2423
2424         /*
2425          * If we couldn't find any records or the records we did find were
2426          * all behind the requested offset, return failure.  A forward
2427          * truncation can leave a hole w/ no on-disk records.
2428          */
2429         if (last_offset == 0 || last_offset < ap->a_loffset)
2430                 return (EOPNOTSUPP);
2431
2432         /*
2433          * Figure out the block size at the requested offset and adjust
2434          * our limits so the cluster_read() does not create inappropriately
2435          * sized buffer cache buffers.
2436          */
2437         blksize = hammer_blocksize(ap->a_loffset);
2438         if (hammer_blocksize(base_offset) != blksize) {
2439                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2440         }
2441         if (last_offset != ap->a_loffset &&
2442             hammer_blocksize(last_offset - 1) != blksize) {
2443                 last_offset = hammer_blockdemarc(ap->a_loffset,
2444                                                  last_offset - 1);
2445         }
2446
2447         /*
2448          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2449          * from occuring.
2450          */
2451         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2452
2453         /*
2454          * If doffsetp is not aligned or the forward run size does
2455          * not cover a whole buffer, disallow the direct I/O.
2456          */
2457         if ((disk_offset & HAMMER_BUFMASK) ||
2458             (last_offset - ap->a_loffset) < blksize) {
2459                 error = EOPNOTSUPP;
2460         } else {
2461                 *ap->a_doffsetp = disk_offset;
2462                 if (ap->a_runb) {
2463                         *ap->a_runb = ap->a_loffset - base_offset;
2464                         KKASSERT(*ap->a_runb >= 0);
2465                 }
2466                 if (ap->a_runp) {
2467                         *ap->a_runp = last_offset - ap->a_loffset;
2468                         KKASSERT(*ap->a_runp >= 0);
2469                 }
2470                 error = 0;
2471         }
2472         return(error);
2473 }
2474
/*
 * Write to a regular file.   Because this is a strategy call the OS is
 * trying to actually get data onto the media.
 *
 * Returns 0 on success (the bio is completed asynchronously by the
 * direct-write path) or an errno.  On the failure paths the buffer is
 * flagged B_ERROR and biodone()'d here before returning.
 */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	/*
	 * The buffer handed down must exactly match the HAMMER block
	 * size for this file offset.
	 */
	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/*
	 * Read-only inodes cannot accept writes; fail the bio with EROFS.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * The write is silently discarded (b_resid = 0, no error set)
	 * since the inode is being destroyed anyway.
	 */
	if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end. 
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffers
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15; /* round up to 16 */

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);
	if (record) {
		hammer_io_direct_write(hmp, &record->leaf, bio);
		hammer_rel_mem_record(record);
		/*
		 * Kick off a flush when this inode holds reserved records
		 * and the mount-wide reservation count exceeds the limit.
		 */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/*
		 * Reservation failed: clear any translated block offset
		 * cached in the buffer and fail the bio with the error
		 * from hammer_ip_add_bulk().
		 */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	return(error);
}
2558
/*
 * dounlink - disconnect a directory entry
 *
 * Common backend used to remove a directory entry: locate the entry
 * named by the namecache handle (nch) in directory dvp, resolve the
 * target inode, verify that removal is allowed (a directory target must
 * be empty), and delete the entry.  The entire sequence is restarted
 * from scratch if a deadlock (EDEADLK) is detected.
 *
 * XXX whiteout support not really in yet
 */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred, int flags)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;

	/* Cannot modify a read-only directory inode. */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	/* End of range: same chain with all 32 low (chain) bits set. */
	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  info->last_error contains the
	 * error code on search termination and could be 0, ENOENT, or
	 * something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		/* Match requires identical length and identical bytes. */
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      dip->hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		if (error == ENOENT) {
			/*
			 * The directory entry referenced an object that
			 * could not be found -- should not happen.
			 */
			kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
			Debugger("ENOENT unlinking object that should exist");
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * WARNING: hammer_ip_check_directory_empty() may have to
		 * terminate the cursor to avoid a deadlock.  It is ok to
		 * call hammer_done_cursor() twice.
		 */
		if (error == 0 && ip->ino_data.obj_type ==
				  HAMMER_OBJTYPE_DIRECTORY) {
			error = hammer_ip_check_directory_empty(trans, ip);
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/*
			 * Invalidate the namecache entry and any vnode
			 * association so the removed name disappears from
			 * the in-kernel namespace.
			 */
			cache_setunresolved(nch);
			cache_setvp(nch, NULL);
			/* XXX locking */
			if (ip->vp)
				cache_inval_vp(ip->vp, CINV_DESTROY);
		}
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	/*
	 * A deadlock was detected somewhere along the way; back out
	 * completely and restart the lookup/delete sequence.
	 */
	if (error == EDEADLK)
		goto retry;

	return (error);
}
2690
2691 /************************************************************************
2692  *                          FIFO AND SPECFS OPS                         *
2693  ************************************************************************
2694  *
2695  */
2696
2697 static int
2698 hammer_vop_fifoclose (struct vop_close_args *ap)
2699 {
2700         /* XXX update itimes */
2701         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2702 }
2703
2704 static int
2705 hammer_vop_fiforead (struct vop_read_args *ap)
2706 {
2707         int error;
2708
2709         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2710         /* XXX update access time */
2711         return (error);
2712 }
2713
2714 static int
2715 hammer_vop_fifowrite (struct vop_write_args *ap)
2716 {
2717         int error;
2718
2719         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2720         /* XXX update access time */
2721         return (error);
2722 }
2723
2724 static int
2725 hammer_vop_specclose (struct vop_close_args *ap)
2726 {
2727         /* XXX update itimes */
2728         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2729 }
2730
2731 static int
2732 hammer_vop_specread (struct vop_read_args *ap)
2733 {
2734         /* XXX update access time */
2735         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2736 }
2737
2738 static int
2739 hammer_vop_specwrite (struct vop_write_args *ap)
2740 {
2741         /* XXX update last change time */
2742         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2743 }
2744