HAMMER 40A/Many: Inode/link-count sequencer.
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.43 2008/05/02 01:00:42 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
/*
 * HAMMER VNOPS
 *
 * Forward declarations for the vnode operations defined in this file and
 * installed in the vop_ops tables below.
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *ap);*/
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_pathconf(struct vop_pathconf_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);

/* FIFO vnodes: close/read/write overrides used by hammer_fifo_vops */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);

/* Device vnodes: close/read/write overrides used by hammer_spec_vops */
static int hammer_vop_specclose(struct vop_close_args *ap);
static int hammer_vop_specread(struct vop_read_args *ap);
static int hammer_vop_specwrite(struct vop_write_args *ap);

90
91 struct vop_ops hammer_vnode_vops = {
92         .vop_default =          vop_defaultop,
93         .vop_fsync =            hammer_vop_fsync,
94         .vop_getpages =         vop_stdgetpages,
95         .vop_putpages =         vop_stdputpages,
96         .vop_read =             hammer_vop_read,
97         .vop_write =            hammer_vop_write,
98         .vop_access =           hammer_vop_access,
99         .vop_advlock =          hammer_vop_advlock,
100         .vop_close =            hammer_vop_close,
101         .vop_ncreate =          hammer_vop_ncreate,
102         .vop_getattr =          hammer_vop_getattr,
103         .vop_inactive =         hammer_vop_inactive,
104         .vop_reclaim =          hammer_vop_reclaim,
105         .vop_nresolve =         hammer_vop_nresolve,
106         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
107         .vop_nlink =            hammer_vop_nlink,
108         .vop_nmkdir =           hammer_vop_nmkdir,
109         .vop_nmknod =           hammer_vop_nmknod,
110         .vop_open =             hammer_vop_open,
111         .vop_pathconf =         hammer_vop_pathconf,
112         .vop_print =            hammer_vop_print,
113         .vop_readdir =          hammer_vop_readdir,
114         .vop_readlink =         hammer_vop_readlink,
115         .vop_nremove =          hammer_vop_nremove,
116         .vop_nrename =          hammer_vop_nrename,
117         .vop_nrmdir =           hammer_vop_nrmdir,
118         .vop_setattr =          hammer_vop_setattr,
119         .vop_strategy =         hammer_vop_strategy,
120         .vop_nsymlink =         hammer_vop_nsymlink,
121         .vop_nwhiteout =        hammer_vop_nwhiteout,
122         .vop_ioctl =            hammer_vop_ioctl,
123         .vop_mountctl =         hammer_vop_mountctl
124 };
125
126 struct vop_ops hammer_spec_vops = {
127         .vop_default =          spec_vnoperate,
128         .vop_fsync =            hammer_vop_fsync,
129         .vop_read =             hammer_vop_specread,
130         .vop_write =            hammer_vop_specwrite,
131         .vop_access =           hammer_vop_access,
132         .vop_close =            hammer_vop_specclose,
133         .vop_getattr =          hammer_vop_getattr,
134         .vop_inactive =         hammer_vop_inactive,
135         .vop_reclaim =          hammer_vop_reclaim,
136         .vop_setattr =          hammer_vop_setattr
137 };
138
139 struct vop_ops hammer_fifo_vops = {
140         .vop_default =          fifo_vnoperate,
141         .vop_fsync =            hammer_vop_fsync,
142         .vop_read =             hammer_vop_fiforead,
143         .vop_write =            hammer_vop_fifowrite,
144         .vop_access =           hammer_vop_access,
145         .vop_close =            hammer_vop_fifoclose,
146         .vop_getattr =          hammer_vop_getattr,
147         .vop_inactive =         hammer_vop_inactive,
148         .vop_reclaim =          hammer_vop_reclaim,
149         .vop_setattr =          hammer_vop_setattr
150 };
151
152 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
153                            struct vnode *dvp, struct ucred *cred, int flags);
154 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
155 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
156
#if 0
/*
 * Disabled generic dispatcher: forwards the operation described by ap to
 * hammer_vnode_vops through VOCALL().  The parameter is named so that the
 * body's reference to ap is valid should this code be re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
165
166 /*
167  * hammer_vop_fsync { vp, waitfor }
168  */
169 static
170 int
171 hammer_vop_fsync(struct vop_fsync_args *ap)
172 {
173         hammer_inode_t ip = VTOI(ap->a_vp);
174
175         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
176         if (ap->a_waitfor == MNT_WAIT)
177                 hammer_wait_inode(ip);
178         return (ip->error);
179 }
180
181 /*
182  * hammer_vop_read { vp, uio, ioflag, cred }
183  */
184 static
185 int
186 hammer_vop_read(struct vop_read_args *ap)
187 {
188         struct hammer_transaction trans;
189         hammer_inode_t ip;
190         off_t offset;
191         struct buf *bp;
192         struct uio *uio;
193         int error;
194         int n;
195         int seqcount;
196
197         if (ap->a_vp->v_type != VREG)
198                 return (EINVAL);
199         ip = VTOI(ap->a_vp);
200         error = 0;
201         seqcount = ap->a_ioflag >> 16;
202
203         hammer_start_transaction(&trans, ip->hmp);
204
205         /*
206          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
207          */
208         uio = ap->a_uio;
209         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) {
210                 offset = uio->uio_offset & HAMMER_BUFMASK;
211 #if 0
212                 error = cluster_read(ap->a_vp, ip->ino_rec.ino_size,
213                                      uio->uio_offset - offset, HAMMER_BUFSIZE,
214                                      MAXBSIZE, seqcount, &bp);
215 #endif
216                 error = bread(ap->a_vp, uio->uio_offset - offset,
217                               HAMMER_BUFSIZE, &bp);
218                 if (error) {
219                         brelse(bp);
220                         break;
221                 }
222                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
223                 n = HAMMER_BUFSIZE - offset;
224                 if (n > uio->uio_resid)
225                         n = uio->uio_resid;
226                 if (n > ip->ino_rec.ino_size - uio->uio_offset)
227                         n = (int)(ip->ino_rec.ino_size - uio->uio_offset);
228                 error = uiomove((char *)bp->b_data + offset, n, uio);
229                 if (error) {
230                         bqrelse(bp);
231                         break;
232                 }
233                 bqrelse(bp);
234         }
235         if ((ip->flags & HAMMER_INODE_RO) == 0 &&
236             (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
237                 ip->ino_rec.ino_atime = trans.time;
238                 hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
239         }
240         hammer_done_transaction(&trans);
241         return (error);
242 }
243
244 /*
245  * hammer_vop_write { vp, uio, ioflag, cred }
246  */
247 static
248 int
249 hammer_vop_write(struct vop_write_args *ap)
250 {
251         struct hammer_transaction trans;
252         struct hammer_inode *ip;
253         struct uio *uio;
254         off_t offset;
255         struct buf *bp;
256         int error;
257         int n;
258         int flags;
259         int count;
260
261         if (ap->a_vp->v_type != VREG)
262                 return (EINVAL);
263         ip = VTOI(ap->a_vp);
264         error = 0;
265
266         if (ip->flags & HAMMER_INODE_RO)
267                 return (EROFS);
268
269         /*
270          * Create a transaction to cover the operations we perform.
271          */
272         hammer_start_transaction(&trans, ip->hmp);
273         uio = ap->a_uio;
274
275         /*
276          * Check append mode
277          */
278         if (ap->a_ioflag & IO_APPEND)
279                 uio->uio_offset = ip->ino_rec.ino_size;
280
281         /*
282          * Check for illegal write offsets.  Valid range is 0...2^63-1
283          */
284         if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
285                 hammer_done_transaction(&trans);
286                 return (EFBIG);
287         }
288
289         /*
290          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
291          */
292         count = 0;
293         while (uio->uio_resid > 0) {
294                 int fixsize = 0;
295
296                 /*
297                  * Do not allow huge writes to deadlock the buffer cache
298                  */
299                 if ((++count & 15) == 0) {
300                         vn_unlock(ap->a_vp);
301                         if ((ap->a_ioflag & IO_NOBWILL) == 0)
302                                 bwillwrite();
303                         vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY);
304                 }
305
306                 offset = uio->uio_offset & HAMMER_BUFMASK;
307                 n = HAMMER_BUFSIZE - offset;
308                 if (n > uio->uio_resid)
309                         n = uio->uio_resid;
310                 if (uio->uio_offset + n > ip->ino_rec.ino_size) {
311                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
312                         fixsize = 1;
313                 }
314
315                 if (uio->uio_segflg == UIO_NOCOPY) {
316                         /*
317                          * Issuing a write with the same data backing the
318                          * buffer.  Instantiate the buffer to collect the
319                          * backing vm pages, then read-in any missing bits.
320                          *
321                          * This case is used by vop_stdputpages().
322                          */
323                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
324                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
325                         if ((bp->b_flags & B_CACHE) == 0) {
326                                 bqrelse(bp);
327                                 error = bread(ap->a_vp,
328                                               uio->uio_offset - offset,
329                                               HAMMER_BUFSIZE, &bp);
330                         }
331                 } else if (offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
332                         /*
333                          * Even though we are entirely overwriting the buffer
334                          * we may still have to zero it out to avoid a 
335                          * mmap/write visibility issue.
336                          */
337                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
338                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
339                         if ((bp->b_flags & B_CACHE) == 0)
340                                 vfs_bio_clrbuf(bp);
341                 } else if (uio->uio_offset - offset >= ip->ino_rec.ino_size) {
342                         /*
343                          * If the base offset of the buffer is beyond the
344                          * file EOF, we don't have to issue a read.
345                          */
346                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
347                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
348                         vfs_bio_clrbuf(bp);
349                 } else {
350                         /*
351                          * Partial overwrite, read in any missing bits then
352                          * replace the portion being written.
353                          */
354                         error = bread(ap->a_vp, uio->uio_offset - offset,
355                                       HAMMER_BUFSIZE, &bp);
356                         if (error == 0)
357                                 bheavy(bp);
358                 }
359                 if (error == 0)
360                         error = uiomove((char *)bp->b_data + offset, n, uio);
361
362                 /*
363                  * If we screwed up we have to undo any VM size changes we
364                  * made.
365                  */
366                 if (error) {
367                         brelse(bp);
368                         if (fixsize) {
369                                 vtruncbuf(ap->a_vp, ip->ino_rec.ino_size,
370                                           HAMMER_BUFSIZE);
371                         }
372                         break;
373                 }
374                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
375                 hammer_lock_sh(&ip->lock);
376                 if (ip->ino_rec.ino_size < uio->uio_offset) {
377                         ip->ino_rec.ino_size = uio->uio_offset;
378                         flags = HAMMER_INODE_RDIRTY;
379                         vnode_pager_setsize(ap->a_vp, ip->ino_rec.ino_size);
380                 } else {
381                         flags = 0;
382                 }
383                 ip->ino_rec.ino_mtime = trans.time;
384                 flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
385                 hammer_modify_inode(&trans, ip, flags);
386                 hammer_unlock(&ip->lock);
387
388                 if (ap->a_ioflag & IO_SYNC) {
389                         bwrite(bp);
390                 } else if (ap->a_ioflag & IO_DIRECT) {
391                         bawrite(bp);
392 #if 0
393                 } else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
394                            (uio->uio_offset & HAMMER_BUFMASK) == 0) {
395                         /*
396                          * XXX HAMMER can only fsync the whole inode,
397                          * doing it on every buffer would be a bad idea.
398                          */
399                         /*
400                          * If seqcount indicates sequential operation and
401                          * we just finished filling a buffer, push it out
402                          * now to prevent the buffer cache from becoming
403                          * too full, which would trigger non-optimal
404                          * flushes.
405                          */
406                         bdwrite(bp);
407 #endif
408                 } else {
409                         bdwrite(bp);
410                 }
411         }
412         hammer_done_transaction(&trans);
413         return (error);
414 }
415
416 /*
417  * hammer_vop_access { vp, mode, cred }
418  */
419 static
420 int
421 hammer_vop_access(struct vop_access_args *ap)
422 {
423         struct hammer_inode *ip = VTOI(ap->a_vp);
424         uid_t uid;
425         gid_t gid;
426         int error;
427
428         uid = hammer_to_unix_xid(&ip->ino_data.uid);
429         gid = hammer_to_unix_xid(&ip->ino_data.gid);
430
431         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
432                                   ip->ino_data.uflags);
433         return (error);
434 }
435
436 /*
437  * hammer_vop_advlock { vp, id, op, fl, flags }
438  */
439 static
440 int
441 hammer_vop_advlock(struct vop_advlock_args *ap)
442 {
443         struct hammer_inode *ip = VTOI(ap->a_vp);
444
445         return (lf_advlock(ap, &ip->advlock, ip->ino_rec.ino_size));
446 }
447
/*
 * hammer_vop_close { vp, fflag }
 *
 * Nothing HAMMER-specific is done here; the call is forwarded to
 * vop_stdclose().
 */
static int
hammer_vop_close(struct vop_close_args *ap)
{
        int error;

        error = vop_stdclose(ap);
        return (error);
}
457
458 /*
459  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
460  *
461  * The operating system has already ensured that the directory entry
462  * does not exist and done all appropriate namespace locking.
463  */
464 static
465 int
466 hammer_vop_ncreate(struct vop_ncreate_args *ap)
467 {
468         struct hammer_transaction trans;
469         struct hammer_inode *dip;
470         struct hammer_inode *nip;
471         struct nchandle *nch;
472         int error;
473
474         nch = ap->a_nch;
475         dip = VTOI(ap->a_dvp);
476
477         if (dip->flags & HAMMER_INODE_RO)
478                 return (EROFS);
479
480         /*
481          * Create a transaction to cover the operations we perform.
482          */
483         hammer_start_transaction(&trans, dip->hmp);
484
485         /*
486          * Create a new filesystem object of the requested type.  The
487          * returned inode will be referenced and shared-locked to prevent
488          * it from being moved to the flusher.
489          */
490
491         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
492         if (error) {
493                 kprintf("hammer_create_inode error %d\n", error);
494                 hammer_done_transaction(&trans);
495                 *ap->a_vpp = NULL;
496                 return (error);
497         }
498         hammer_lock_sh(&nip->lock);
499         hammer_lock_sh(&dip->lock);
500
501         /*
502          * Add the new filesystem object to the directory.  This will also
503          * bump the inode's link count.
504          */
505         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
506         if (error)
507                 kprintf("hammer_ip_add_directory error %d\n", error);
508         hammer_unlock(&dip->lock);
509         hammer_unlock(&nip->lock);
510
511         /*
512          * Finish up.
513          */
514         if (error) {
515                 hammer_rel_inode(nip, 0);
516                 hammer_done_transaction(&trans);
517                 *ap->a_vpp = NULL;
518         } else {
519                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
520                 hammer_done_transaction(&trans);
521                 hammer_rel_inode(nip, 0);
522                 if (error == 0) {
523                         cache_setunresolved(ap->a_nch);
524                         cache_setvp(ap->a_nch, *ap->a_vpp);
525                 }
526         }
527         return (error);
528 }
529
530 /*
531  * hammer_vop_getattr { vp, vap }
532  */
533 static
534 int
535 hammer_vop_getattr(struct vop_getattr_args *ap)
536 {
537         struct hammer_inode *ip = VTOI(ap->a_vp);
538         struct vattr *vap = ap->a_vap;
539
540 #if 0
541         if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
542             (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
543             ip->obj_asof == XXX
544         ) {
545                 /* LAZYMOD XXX */
546         }
547         hammer_itimes(ap->a_vp);
548 #endif
549
550         vap->va_fsid = ip->hmp->fsid_udev;
551         vap->va_fileid = ip->ino_rec.base.base.obj_id;
552         vap->va_mode = ip->ino_data.mode;
553         vap->va_nlink = ip->ino_rec.ino_nlinks;
554         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
555         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
556         vap->va_rmajor = 0;
557         vap->va_rminor = 0;
558         vap->va_size = ip->ino_rec.ino_size;
559         hammer_to_timespec(ip->ino_rec.ino_atime, &vap->va_atime);
560         hammer_to_timespec(ip->ino_rec.ino_mtime, &vap->va_mtime);
561         hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
562         vap->va_flags = ip->ino_data.uflags;
563         vap->va_gen = 1;        /* hammer inums are unique for all time */
564         vap->va_blocksize = HAMMER_BUFSIZE;
565         vap->va_bytes = (ip->ino_rec.ino_size + 63) & ~63;
566         vap->va_type = hammer_get_vnode_type(ip->ino_rec.base.base.obj_type);
567         vap->va_filerev = 0;    /* XXX */
568         /* mtime uniquely identifies any adjustments made to the file */
569         vap->va_fsmid = ip->ino_rec.ino_mtime;
570         vap->va_uid_uuid = ip->ino_data.uid;
571         vap->va_gid_uuid = ip->ino_data.gid;
572         vap->va_fsid_uuid = ip->hmp->fsid;
573         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
574                           VA_FSID_UUID_VALID;
575
576         switch (ip->ino_rec.base.base.obj_type) {
577         case HAMMER_OBJTYPE_CDEV:
578         case HAMMER_OBJTYPE_BDEV:
579                 vap->va_rmajor = ip->ino_data.rmajor;
580                 vap->va_rminor = ip->ino_data.rminor;
581                 break;
582         default:
583                 break;
584         }
585
586         return(0);
587 }
588
589 /*
590  * hammer_vop_nresolve { nch, dvp, cred }
591  *
592  * Locate the requested directory entry.
593  */
594 static
595 int
596 hammer_vop_nresolve(struct vop_nresolve_args *ap)
597 {
598         struct hammer_transaction trans;
599         struct namecache *ncp;
600         hammer_inode_t dip;
601         hammer_inode_t ip;
602         hammer_tid_t asof;
603         struct hammer_cursor cursor;
604         union hammer_record_ondisk *rec;
605         struct vnode *vp;
606         int64_t namekey;
607         int error;
608         int i;
609         int nlen;
610         int flags;
611         u_int64_t obj_id;
612
613         /*
614          * Misc initialization, plus handle as-of name extensions.  Look for
615          * the '@@' extension.  Note that as-of files and directories cannot
616          * be modified.
617          */
618         dip = VTOI(ap->a_dvp);
619         ncp = ap->a_nch->ncp;
620         asof = dip->obj_asof;
621         nlen = ncp->nc_nlen;
622         flags = dip->flags;
623
624         hammer_simple_transaction(&trans, dip->hmp);
625
626         for (i = 0; i < nlen; ++i) {
627                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
628                         asof = hammer_str_to_tid(ncp->nc_name + i + 2);
629                         flags |= HAMMER_INODE_RO;
630                         break;
631                 }
632         }
633         nlen = i;
634
635         /*
636          * If there is no path component the time extension is relative to
637          * dip.
638          */
639         if (nlen == 0) {
640                 ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id,
641                                       asof, flags, &error);
642                 if (error == 0) {
643                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
644                         hammer_rel_inode(ip, 0);
645                 } else {
646                         vp = NULL;
647                 }
648                 if (error == 0) {
649                         vn_unlock(vp);
650                         cache_setvp(ap->a_nch, vp);
651                         vrele(vp);
652                 }
653                 goto done;
654         }
655
656         /*
657          * Calculate the namekey and setup the key range for the scan.  This
658          * works kinda like a chained hash table where the lower 32 bits
659          * of the namekey synthesize the chain.
660          *
661          * The key range is inclusive of both key_beg and key_end.
662          */
663         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
664
665         error = hammer_init_cursor(&trans, &cursor, &dip->cache[0]);
666         cursor.key_beg.obj_id = dip->obj_id;
667         cursor.key_beg.key = namekey;
668         cursor.key_beg.create_tid = 0;
669         cursor.key_beg.delete_tid = 0;
670         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
671         cursor.key_beg.obj_type = 0;
672
673         cursor.key_end = cursor.key_beg;
674         cursor.key_end.key |= 0xFFFFFFFFULL;
675         cursor.asof = asof;
676         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
677
678         /*
679          * Scan all matching records (the chain), locate the one matching
680          * the requested path component.
681          *
682          * The hammer_ip_*() functions merge in-memory records with on-disk
683          * records for the purposes of the search.
684          */
685         if (error == 0)
686                 error = hammer_ip_first(&cursor, dip);
687
688         rec = NULL;
689         obj_id = 0;
690
691         while (error == 0) {
692                 error = hammer_ip_resolve_data(&cursor);
693                 if (error)
694                         break;
695                 rec = cursor.record;
696                 if (nlen == rec->entry.base.data_len &&
697                     bcmp(ncp->nc_name, cursor.data, nlen) == 0) {
698                         obj_id = rec->entry.obj_id;
699                         break;
700                 }
701                 error = hammer_ip_next(&cursor);
702         }
703         hammer_done_cursor(&cursor);
704         if (error == 0) {
705                 ip = hammer_get_inode(&trans, &dip->cache[1],
706                                       obj_id, asof, flags, &error);
707                 if (error == 0) {
708                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
709                         hammer_rel_inode(ip, 0);
710                 } else {
711                         vp = NULL;
712                 }
713                 if (error == 0) {
714                         vn_unlock(vp);
715                         cache_setvp(ap->a_nch, vp);
716                         vrele(vp);
717                 }
718         } else if (error == ENOENT) {
719                 cache_setvp(ap->a_nch, NULL);
720         }
721 done:
722         hammer_done_transaction(&trans);
723         return (error);
724 }
725
726 /*
727  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
728  *
729  * Locate the parent directory of a directory vnode.
730  *
731  * dvp is referenced but not locked.  *vpp must be returned referenced and
732  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
733  * at the root, instead it could indicate that the directory we were in was
734  * removed.
735  *
736  * NOTE: as-of sequences are not linked into the directory structure.  If
737  * we are at the root with a different asof then the mount point, reload
738  * the same directory with the mount point's asof.   I'm not sure what this
739  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
740  * get confused, but it hasn't been tested.
741  */
742 static
743 int
744 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
745 {
746         struct hammer_transaction trans;
747         struct hammer_inode *dip;
748         struct hammer_inode *ip;
749         int64_t parent_obj_id;
750         hammer_tid_t asof;
751         int error;
752
753         dip = VTOI(ap->a_dvp);
754         asof = dip->obj_asof;
755         parent_obj_id = dip->ino_data.parent_obj_id;
756
757         if (parent_obj_id == 0) {
758                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
759                    asof != dip->hmp->asof) {
760                         parent_obj_id = dip->obj_id;
761                         asof = dip->hmp->asof;
762                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
763                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
764                                    dip->obj_asof);
765                 } else {
766                         *ap->a_vpp = NULL;
767                         return ENOENT;
768                 }
769         }
770
771         hammer_simple_transaction(&trans, dip->hmp);
772
773         ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id,
774                               asof, dip->flags, &error);
775         if (ip) {
776                 error = hammer_get_vnode(ip, LK_EXCLUSIVE, ap->a_vpp);
777                 hammer_rel_inode(ip, 0);
778         } else {
779                 *ap->a_vpp = NULL;
780         }
781         hammer_done_transaction(&trans);
782         return (error);
783 }
784
785 /*
786  * hammer_vop_nlink { nch, dvp, vp, cred }
787  */
788 static
789 int
790 hammer_vop_nlink(struct vop_nlink_args *ap)
791 {
792         struct hammer_transaction trans;
793         struct hammer_inode *dip;
794         struct hammer_inode *ip;
795         struct nchandle *nch;
796         int error;
797
798         nch = ap->a_nch;
799         dip = VTOI(ap->a_dvp);
800         ip = VTOI(ap->a_vp);
801
802         if (dip->flags & HAMMER_INODE_RO)
803                 return (EROFS);
804         if (ip->flags & HAMMER_INODE_RO)
805                 return (EROFS);
806
807         /*
808          * Create a transaction to cover the operations we perform.
809          */
810         hammer_start_transaction(&trans, dip->hmp);
811
812         /*
813          * Add the filesystem object to the directory.  Note that neither
814          * dip nor ip are referenced or locked, but their vnodes are
815          * referenced.  This function will bump the inode's link count.
816          */
817         hammer_lock_sh(&ip->lock);
818         hammer_lock_sh(&dip->lock);
819         error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
820         hammer_unlock(&dip->lock);
821         hammer_unlock(&ip->lock);
822
823         /*
824          * Finish up.
825          */
826         if (error == 0) {
827                 cache_setunresolved(nch);
828                 cache_setvp(nch, ap->a_vp);
829         }
830         hammer_done_transaction(&trans);
831         return (error);
832 }
833
834 /*
835  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
836  *
837  * The operating system has already ensured that the directory entry
838  * does not exist and done all appropriate namespace locking.
839  */
840 static
841 int
842 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
843 {
844         struct hammer_transaction trans;
845         struct hammer_inode *dip;
846         struct hammer_inode *nip;
847         struct nchandle *nch;
848         int error;
849
850         nch = ap->a_nch;
851         dip = VTOI(ap->a_dvp);
852
853         if (dip->flags & HAMMER_INODE_RO)
854                 return (EROFS);
855
856         /*
857          * Create a transaction to cover the operations we perform.
858          */
859         hammer_start_transaction(&trans, dip->hmp);
860
861         /*
862          * Create a new filesystem object of the requested type.  The
863          * returned inode will be referenced but not locked.
864          */
865         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
866         if (error) {
867                 kprintf("hammer_mkdir error %d\n", error);
868                 hammer_done_transaction(&trans);
869                 *ap->a_vpp = NULL;
870                 return (error);
871         }
872         /*
873          * Add the new filesystem object to the directory.  This will also
874          * bump the inode's link count.
875          */
876         hammer_lock_sh(&nip->lock);
877         hammer_lock_sh(&dip->lock);
878         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
879         hammer_unlock(&dip->lock);
880         hammer_unlock(&nip->lock);
881         if (error)
882                 kprintf("hammer_mkdir (add) error %d\n", error);
883
884         /*
885          * Finish up.
886          */
887         if (error) {
888                 hammer_rel_inode(nip, 0);
889                 *ap->a_vpp = NULL;
890         } else {
891                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
892                 hammer_rel_inode(nip, 0);
893                 if (error == 0) {
894                         cache_setunresolved(ap->a_nch);
895                         cache_setvp(ap->a_nch, *ap->a_vpp);
896                 }
897         }
898         hammer_done_transaction(&trans);
899         return (error);
900 }
901
902 /*
903  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
904  *
905  * The operating system has already ensured that the directory entry
906  * does not exist and done all appropriate namespace locking.
907  */
908 static
909 int
910 hammer_vop_nmknod(struct vop_nmknod_args *ap)
911 {
912         struct hammer_transaction trans;
913         struct hammer_inode *dip;
914         struct hammer_inode *nip;
915         struct nchandle *nch;
916         int error;
917
918         nch = ap->a_nch;
919         dip = VTOI(ap->a_dvp);
920
921         if (dip->flags & HAMMER_INODE_RO)
922                 return (EROFS);
923
924         /*
925          * Create a transaction to cover the operations we perform.
926          */
927         hammer_start_transaction(&trans, dip->hmp);
928
929         /*
930          * Create a new filesystem object of the requested type.  The
931          * returned inode will be referenced but not locked.
932          */
933         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
934         if (error) {
935                 hammer_done_transaction(&trans);
936                 *ap->a_vpp = NULL;
937                 return (error);
938         }
939
940         /*
941          * Add the new filesystem object to the directory.  This will also
942          * bump the inode's link count.
943          */
944         hammer_lock_sh(&nip->lock);
945         hammer_lock_sh(&dip->lock);
946         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
947         hammer_unlock(&dip->lock);
948         hammer_unlock(&nip->lock);
949
950         /*
951          * Finish up.
952          */
953         if (error) {
954                 hammer_rel_inode(nip, 0);
955                 *ap->a_vpp = NULL;
956         } else {
957                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
958                 hammer_rel_inode(nip, 0);
959                 if (error == 0) {
960                         cache_setunresolved(ap->a_nch);
961                         cache_setvp(ap->a_nch, *ap->a_vpp);
962                 }
963         }
964         hammer_done_transaction(&trans);
965         return (error);
966 }
967
968 /*
969  * hammer_vop_open { vp, mode, cred, fp }
970  */
971 static
972 int
973 hammer_vop_open(struct vop_open_args *ap)
974 {
975         if ((ap->a_mode & FWRITE) && (VTOI(ap->a_vp)->flags & HAMMER_INODE_RO))
976                 return (EROFS);
977
978         return(vop_stdopen(ap));
979 }
980
/*
 * hammer_vop_pathconf { vp, name, retval }
 *
 * Not implemented; every request fails with EOPNOTSUPP.
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
        return (EOPNOTSUPP);
}
990
/*
 * hammer_vop_print { vp }
 *
 * Not implemented; always fails with EOPNOTSUPP.
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
        return (EOPNOTSUPP);
}
1000
1001 /*
1002  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1003  */
1004 static
1005 int
1006 hammer_vop_readdir(struct vop_readdir_args *ap)
1007 {
1008         struct hammer_transaction trans;
1009         struct hammer_cursor cursor;
1010         struct hammer_inode *ip;
1011         struct uio *uio;
1012         hammer_record_ondisk_t rec;
1013         hammer_base_elm_t base;
1014         int error;
1015         int cookie_index;
1016         int ncookies;
1017         off_t *cookies;
1018         off_t saveoff;
1019         int r;
1020
1021         ip = VTOI(ap->a_vp);
1022         uio = ap->a_uio;
1023         saveoff = uio->uio_offset;
1024
1025         if (ap->a_ncookies) {
1026                 ncookies = uio->uio_resid / 16 + 1;
1027                 if (ncookies > 1024)
1028                         ncookies = 1024;
1029                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1030                 cookie_index = 0;
1031         } else {
1032                 ncookies = -1;
1033                 cookies = NULL;
1034                 cookie_index = 0;
1035         }
1036
1037         hammer_simple_transaction(&trans, ip->hmp);
1038
1039         /*
1040          * Handle artificial entries
1041          */
1042         error = 0;
1043         if (saveoff == 0) {
1044                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1045                 if (r)
1046                         goto done;
1047                 if (cookies)
1048                         cookies[cookie_index] = saveoff;
1049                 ++saveoff;
1050                 ++cookie_index;
1051                 if (cookie_index == ncookies)
1052                         goto done;
1053         }
1054         if (saveoff == 1) {
1055                 if (ip->ino_data.parent_obj_id) {
1056                         r = vop_write_dirent(&error, uio,
1057                                              ip->ino_data.parent_obj_id,
1058                                              DT_DIR, 2, "..");
1059                 } else {
1060                         r = vop_write_dirent(&error, uio,
1061                                              ip->obj_id, DT_DIR, 2, "..");
1062                 }
1063                 if (r)
1064                         goto done;
1065                 if (cookies)
1066                         cookies[cookie_index] = saveoff;
1067                 ++saveoff;
1068                 ++cookie_index;
1069                 if (cookie_index == ncookies)
1070                         goto done;
1071         }
1072
1073         /*
1074          * Key range (begin and end inclusive) to scan.  Directory keys
1075          * directly translate to a 64 bit 'seek' position.
1076          */
1077         hammer_init_cursor(&trans, &cursor, &ip->cache[0]);
1078         cursor.key_beg.obj_id = ip->obj_id;
1079         cursor.key_beg.create_tid = 0;
1080         cursor.key_beg.delete_tid = 0;
1081         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1082         cursor.key_beg.obj_type = 0;
1083         cursor.key_beg.key = saveoff;
1084
1085         cursor.key_end = cursor.key_beg;
1086         cursor.key_end.key = HAMMER_MAX_KEY;
1087         cursor.asof = ip->obj_asof;
1088         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1089
1090         error = hammer_ip_first(&cursor, ip);
1091
1092         while (error == 0) {
1093                 error = hammer_ip_resolve_record_and_data(&cursor);
1094                 if (error)
1095                         break;
1096                 rec = cursor.record;
1097                 base = &rec->base.base;
1098                 saveoff = base->key;
1099
1100                 if (base->obj_id != ip->obj_id)
1101                         panic("readdir: bad record at %p", cursor.node);
1102
1103                 r = vop_write_dirent(
1104                              &error, uio, rec->entry.obj_id,
1105                              hammer_get_dtype(rec->entry.base.base.obj_type),
1106                              rec->entry.base.data_len,
1107                              (void *)cursor.data);
1108                 if (r)
1109                         break;
1110                 ++saveoff;
1111                 if (cookies)
1112                         cookies[cookie_index] = base->key;
1113                 ++cookie_index;
1114                 if (cookie_index == ncookies)
1115                         break;
1116                 error = hammer_ip_next(&cursor);
1117         }
1118         hammer_done_cursor(&cursor);
1119
1120 done:
1121         hammer_done_transaction(&trans);
1122
1123         if (ap->a_eofflag)
1124                 *ap->a_eofflag = (error == ENOENT);
1125         uio->uio_offset = saveoff;
1126         if (error && cookie_index == 0) {
1127                 if (error == ENOENT)
1128                         error = 0;
1129                 if (cookies) {
1130                         kfree(cookies, M_TEMP);
1131                         *ap->a_ncookies = 0;
1132                         *ap->a_cookies = NULL;
1133                 }
1134         } else {
1135                 if (error == ENOENT)
1136                         error = 0;
1137                 if (cookies) {
1138                         *ap->a_ncookies = cookie_index;
1139                         *ap->a_cookies = cookies;
1140                 }
1141         }
1142         return(error);
1143 }
1144
1145 /*
1146  * hammer_vop_readlink { vp, uio, cred }
1147  */
1148 static
1149 int
1150 hammer_vop_readlink(struct vop_readlink_args *ap)
1151 {
1152         struct hammer_transaction trans;
1153         struct hammer_cursor cursor;
1154         struct hammer_inode *ip;
1155         int error;
1156
1157         ip = VTOI(ap->a_vp);
1158
1159         hammer_simple_transaction(&trans, ip->hmp);
1160
1161         hammer_init_cursor(&trans, &cursor, &ip->cache[0]);
1162
1163         /*
1164          * Key range (begin and end inclusive) to scan.  Directory keys
1165          * directly translate to a 64 bit 'seek' position.
1166          */
1167         cursor.key_beg.obj_id = ip->obj_id;
1168         cursor.key_beg.create_tid = 0;
1169         cursor.key_beg.delete_tid = 0;
1170         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1171         cursor.key_beg.obj_type = 0;
1172         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1173         cursor.asof = ip->obj_asof;
1174         cursor.flags |= HAMMER_CURSOR_ASOF;
1175
1176         error = hammer_ip_lookup(&cursor, ip);
1177         if (error == 0) {
1178                 error = hammer_ip_resolve_data(&cursor);
1179                 if (error == 0) {
1180                         error = uiomove((char *)cursor.data,
1181                                         cursor.record->base.data_len,
1182                                         ap->a_uio);
1183                 }
1184         }
1185         hammer_done_cursor(&cursor);
1186         hammer_done_transaction(&trans);
1187         return(error);
1188 }
1189
1190 /*
1191  * hammer_vop_nremove { nch, dvp, cred }
1192  */
1193 static
1194 int
1195 hammer_vop_nremove(struct vop_nremove_args *ap)
1196 {
1197         struct hammer_transaction trans;
1198         int error;
1199
1200         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1201         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1202         hammer_done_transaction(&trans);
1203
1204         return (error);
1205 }
1206
1207 /*
1208  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1209  */
1210 static
1211 int
1212 hammer_vop_nrename(struct vop_nrename_args *ap)
1213 {
1214         struct hammer_transaction trans;
1215         struct namecache *fncp;
1216         struct namecache *tncp;
1217         struct hammer_inode *fdip;
1218         struct hammer_inode *tdip;
1219         struct hammer_inode *ip;
1220         struct hammer_cursor cursor;
1221         union hammer_record_ondisk *rec;
1222         int64_t namekey;
1223         int error;
1224
1225         fdip = VTOI(ap->a_fdvp);
1226         tdip = VTOI(ap->a_tdvp);
1227         fncp = ap->a_fnch->ncp;
1228         tncp = ap->a_tnch->ncp;
1229         ip = VTOI(fncp->nc_vp);
1230         KKASSERT(ip != NULL);
1231
1232         if (fdip->flags & HAMMER_INODE_RO)
1233                 return (EROFS);
1234         if (tdip->flags & HAMMER_INODE_RO)
1235                 return (EROFS);
1236         if (ip->flags & HAMMER_INODE_RO)
1237                 return (EROFS);
1238
1239         hammer_start_transaction(&trans, fdip->hmp);
1240
1241         hammer_lock_sh(&ip->lock);
1242         if (fdip->obj_id < tdip->obj_id) {
1243                 hammer_lock_sh(&fdip->lock);
1244                 hammer_lock_sh(&tdip->lock);
1245         } else {
1246                 hammer_lock_sh(&tdip->lock);
1247                 hammer_lock_sh(&fdip->lock);
1248         }
1249
1250         /*
1251          * Remove tncp from the target directory and then link ip as
1252          * tncp. XXX pass trans to dounlink
1253          *
1254          * Force the inode sync-time to match the transaction so it is
1255          * in-sync with the creation of the target directory entry.
1256          */
1257         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
1258         if (error == 0 || error == ENOENT) {
1259                 error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
1260                 if (error == 0) {
1261                         ip->ino_data.parent_obj_id = tdip->obj_id;
1262                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
1263                 }
1264         }
1265         if (error)
1266                 goto failed; /* XXX */
1267
1268         /*
1269          * Locate the record in the originating directory and remove it.
1270          *
1271          * Calculate the namekey and setup the key range for the scan.  This
1272          * works kinda like a chained hash table where the lower 32 bits
1273          * of the namekey synthesize the chain.
1274          *
1275          * The key range is inclusive of both key_beg and key_end.
1276          */
1277         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1278 retry:
1279         hammer_init_cursor(&trans, &cursor, &fdip->cache[0]);
1280         cursor.key_beg.obj_id = fdip->obj_id;
1281         cursor.key_beg.key = namekey;
1282         cursor.key_beg.create_tid = 0;
1283         cursor.key_beg.delete_tid = 0;
1284         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1285         cursor.key_beg.obj_type = 0;
1286
1287         cursor.key_end = cursor.key_beg;
1288         cursor.key_end.key |= 0xFFFFFFFFULL;
1289         cursor.asof = fdip->obj_asof;
1290         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1291
1292         /*
1293          * Scan all matching records (the chain), locate the one matching
1294          * the requested path component.
1295          *
1296          * The hammer_ip_*() functions merge in-memory records with on-disk
1297          * records for the purposes of the search.
1298          */
1299         error = hammer_ip_first(&cursor, fdip);
1300         while (error == 0) {
1301                 if (hammer_ip_resolve_data(&cursor) != 0)
1302                         break;
1303                 rec = cursor.record;
1304                 if (fncp->nc_nlen == rec->entry.base.data_len &&
1305                     bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) {
1306                         break;
1307                 }
1308                 error = hammer_ip_next(&cursor);
1309         }
1310
1311         /*
1312          * If all is ok we have to get the inode so we can adjust nlinks.
1313          *
1314          * WARNING: hammer_ip_del_directory() may have to terminate the
1315          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1316          * twice.
1317          */
1318         if (error == 0)
1319                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1320
1321         /*
1322          * XXX A deadlock here will break rename's atomicy for the purposes
1323          * of crash recovery.
1324          */
1325         if (error == EDEADLK) {
1326                 hammer_unlock(&ip->lock);
1327                 hammer_unlock(&fdip->lock);
1328                 hammer_unlock(&tdip->lock);
1329                 hammer_done_cursor(&cursor);
1330                 hammer_lock_sh(&ip->lock);
1331                 if (fdip->obj_id < tdip->obj_id) {
1332                         hammer_lock_sh(&fdip->lock);
1333                         hammer_lock_sh(&tdip->lock);
1334                 } else {
1335                         hammer_lock_sh(&tdip->lock);
1336                         hammer_lock_sh(&fdip->lock);
1337                 }
1338                 goto retry;
1339         }
1340
1341         /*
1342          * Cleanup and tell the kernel that the rename succeeded.
1343          */
1344         hammer_done_cursor(&cursor);
1345         if (error == 0)
1346                 cache_rename(ap->a_fnch, ap->a_tnch);
1347
1348 failed:
1349         hammer_unlock(&ip->lock);
1350         hammer_unlock(&fdip->lock);
1351         hammer_unlock(&tdip->lock);
1352         hammer_done_transaction(&trans);
1353         return (error);
1354 }
1355
1356 /*
1357  * hammer_vop_nrmdir { nch, dvp, cred }
1358  */
1359 static
1360 int
1361 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1362 {
1363         struct hammer_transaction trans;
1364         int error;
1365
1366         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1367         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1368         hammer_done_transaction(&trans);
1369
1370         return (error);
1371 }
1372
1373 /*
1374  * hammer_vop_setattr { vp, vap, cred }
1375  */
1376 static
1377 int
1378 hammer_vop_setattr(struct vop_setattr_args *ap)
1379 {
1380         struct hammer_transaction trans;
1381         struct vattr *vap;
1382         struct hammer_inode *ip;
1383         int modflags;
1384         int error;
1385         int truncating;
1386         off_t aligned_size;
1387         u_int32_t flags;
1388         uuid_t uuid;
1389
1390         vap = ap->a_vap;
1391         ip = ap->a_vp->v_data;
1392         modflags = 0;
1393
1394         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1395                 return(EROFS);
1396         if (ip->flags & HAMMER_INODE_RO)
1397                 return (EROFS);
1398
1399         hammer_start_transaction(&trans, ip->hmp);
1400         hammer_lock_sh(&ip->lock);
1401         error = 0;
1402
1403         if (vap->va_flags != VNOVAL) {
1404                 flags = ip->ino_data.uflags;
1405                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1406                                          hammer_to_unix_xid(&ip->ino_data.uid),
1407                                          ap->a_cred);
1408                 if (error == 0) {
1409                         if (ip->ino_data.uflags != flags) {
1410                                 ip->ino_data.uflags = flags;
1411                                 modflags |= HAMMER_INODE_DDIRTY;
1412                         }
1413                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1414                                 error = 0;
1415                                 goto done;
1416                         }
1417                 }
1418                 goto done;
1419         }
1420         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1421                 error = EPERM;
1422                 goto done;
1423         }
1424         if (vap->va_uid != (uid_t)VNOVAL) {
1425                 hammer_guid_to_uuid(&uuid, vap->va_uid);
1426                 if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) != 0) {
1427                         ip->ino_data.uid = uuid;
1428                         modflags |= HAMMER_INODE_DDIRTY;
1429                 }
1430         }
1431         if (vap->va_gid != (uid_t)VNOVAL) {
1432                 hammer_guid_to_uuid(&uuid, vap->va_gid);
1433                 if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) != 0) {
1434                         ip->ino_data.gid = uuid;
1435                         modflags |= HAMMER_INODE_DDIRTY;
1436                 }
1437         }
1438         while (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
1439                 switch(ap->a_vp->v_type) {
1440                 case VREG:
1441                         if (vap->va_size == ip->ino_rec.ino_size)
1442                                 break;
1443                         /*
1444                          * XXX break atomicy, we can deadlock the backend
1445                          * if we do not release the lock.  Probably not a
1446                          * big deal here.
1447                          */
1448                         hammer_unlock(&ip->lock);
1449                         if (vap->va_size < ip->ino_rec.ino_size) {
1450                                 vtruncbuf(ap->a_vp, vap->va_size,
1451                                           HAMMER_BUFSIZE);
1452                                 truncating = 1;
1453                         } else {
1454                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1455                                 truncating = 0;
1456                         }
1457                         hammer_lock_sh(&ip->lock);
1458                         ip->ino_rec.ino_size = vap->va_size;
1459                         modflags |= HAMMER_INODE_RDIRTY;
1460                         aligned_size = (vap->va_size + HAMMER_BUFMASK) &
1461                                        ~HAMMER_BUFMASK64;
1462
1463                         /*
1464                          * on-media truncation is cached in the inode until
1465                          * the inode is synchronized.
1466                          */
1467                         if (truncating) {
1468                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1469                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1470                                         ip->trunc_off = vap->va_size;
1471                                 } else if (ip->trunc_off > vap->va_size) {
1472                                         ip->trunc_off = vap->va_size;
1473                                 }
1474                         }
1475
1476                         /*
1477                          * If truncating we have to clean out a portion of
1478                          * the last block on-disk.  We do this in the
1479                          * front-end buffer cache.
1480                          */
1481                         if (truncating && vap->va_size < aligned_size) {
1482                                 struct buf *bp;
1483                                 int offset;
1484
1485                                 offset = vap->va_size & HAMMER_BUFMASK;
1486                                 error = bread(ap->a_vp,
1487                                               aligned_size - HAMMER_BUFSIZE,
1488                                               HAMMER_BUFSIZE, &bp);
1489                                 if (error == 0) {
1490                                         bzero(bp->b_data + offset,
1491                                               HAMMER_BUFSIZE - offset);
1492                                         bdwrite(bp);
1493                                 } else {
1494                                         brelse(bp);
1495                                 }
1496                         }
1497                         break;
1498                 case VDATABASE:
1499                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1500                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1501                                 ip->trunc_off = vap->va_size;
1502                         } else if (ip->trunc_off > vap->va_size) {
1503                                 ip->trunc_off = vap->va_size;
1504                         }
1505                         ip->ino_rec.ino_size = vap->va_size;
1506                         modflags |= HAMMER_INODE_RDIRTY;
1507                         break;
1508                 default:
1509                         error = EINVAL;
1510                         goto done;
1511                 }
1512                 break;
1513         }
1514         if (vap->va_atime.tv_sec != VNOVAL) {
1515                 ip->ino_rec.ino_atime =
1516                         hammer_timespec_to_transid(&vap->va_atime);
1517                 modflags |= HAMMER_INODE_ITIMES;
1518         }
1519         if (vap->va_mtime.tv_sec != VNOVAL) {
1520                 ip->ino_rec.ino_mtime =
1521                         hammer_timespec_to_transid(&vap->va_mtime);
1522                 modflags |= HAMMER_INODE_ITIMES;
1523         }
1524         if (vap->va_mode != (mode_t)VNOVAL) {
1525                 if (ip->ino_data.mode != vap->va_mode) {
1526                         ip->ino_data.mode = vap->va_mode;
1527                         modflags |= HAMMER_INODE_DDIRTY;
1528                 }
1529         }
1530 done:
1531         if (error == 0)
1532                 hammer_modify_inode(&trans, ip, modflags);
1533         hammer_unlock(&ip->lock);
1534         hammer_done_transaction(&trans);
1535         return (error);
1536 }
1537
1538 /*
1539  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1540  */
1541 static
1542 int
1543 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1544 {
1545         struct hammer_transaction trans;
1546         struct hammer_inode *dip;
1547         struct hammer_inode *nip;
1548         struct nchandle *nch;
1549         hammer_record_t record;
1550         int error;
1551         int bytes;
1552
1553         ap->a_vap->va_type = VLNK;
1554
1555         nch = ap->a_nch;
1556         dip = VTOI(ap->a_dvp);
1557
1558         if (dip->flags & HAMMER_INODE_RO)
1559                 return (EROFS);
1560
1561         /*
1562          * Create a transaction to cover the operations we perform.
1563          */
1564         hammer_start_transaction(&trans, dip->hmp);
1565
1566         /*
1567          * Create a new filesystem object of the requested type.  The
1568          * returned inode will be referenced but not locked.
1569          */
1570
1571         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
1572         if (error) {
1573                 hammer_done_transaction(&trans);
1574                 *ap->a_vpp = NULL;
1575                 return (error);
1576         }
1577
1578         /*
1579          * Add the new filesystem object to the directory.  This will also
1580          * bump the inode's link count.
1581          */
1582         hammer_lock_sh(&nip->lock);
1583         hammer_lock_sh(&dip->lock);
1584
1585         /*
1586          * Add a record representing the symlink.  symlink stores the link
1587          * as pure data, not a string, and is no \0 terminated.
1588          */
1589         if (error == 0) {
1590                 record = hammer_alloc_mem_record(nip);
1591                 bytes = strlen(ap->a_target);
1592
1593                 record->rec.base.base.key = HAMMER_FIXKEY_SYMLINK;
1594                 record->rec.base.base.rec_type = HAMMER_RECTYPE_FIX;
1595                 record->rec.base.data_len = bytes;
1596                 record->data = (void *)ap->a_target;
1597                 /* will be reallocated by routine below */
1598                 error = hammer_ip_add_record(&trans, record);
1599
1600                 /*
1601                  * Set the file size to the length of the link.
1602                  */
1603                 if (error == 0) {
1604                         nip->ino_rec.ino_size = bytes;
1605                         hammer_modify_inode(&trans, nip, HAMMER_INODE_RDIRTY);
1606                 }
1607         }
1608         if (error == 0)
1609                 error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1610         hammer_unlock(&dip->lock);
1611         hammer_unlock(&nip->lock);
1612
1613         /*
1614          * Finish up.
1615          */
1616         if (error) {
1617                 hammer_rel_inode(nip, 0);
1618                 *ap->a_vpp = NULL;
1619         } else {
1620                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
1621                 hammer_rel_inode(nip, 0);
1622                 if (error == 0) {
1623                         cache_setunresolved(ap->a_nch);
1624                         cache_setvp(ap->a_nch, *ap->a_vpp);
1625                 }
1626         }
1627         hammer_done_transaction(&trans);
1628         return (error);
1629 }
1630
1631 /*
1632  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1633  */
1634 static
1635 int
1636 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1637 {
1638         struct hammer_transaction trans;
1639         int error;
1640
1641         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1642         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1643                                 ap->a_cred, ap->a_flags);
1644         hammer_done_transaction(&trans);
1645
1646         return (error);
1647 }
1648
1649 /*
1650  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1651  */
1652 static
1653 int
1654 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1655 {
1656         struct hammer_inode *ip = ap->a_vp->v_data;
1657
1658         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1659                             ap->a_fflag, ap->a_cred));
1660 }
1661
1662 static
1663 int
1664 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1665 {
1666         struct mount *mp;
1667         int error;
1668
1669         mp = ap->a_head.a_ops->head.vv_mount;
1670
1671         switch(ap->a_op) {
1672         case MOUNTCTL_SET_EXPORT:
1673                 if (ap->a_ctllen != sizeof(struct export_args))
1674                         error = EINVAL;
1675                 error = hammer_vfs_export(mp, ap->a_op,
1676                                       (const struct export_args *)ap->a_ctl);
1677                 break;
1678         default:
1679                 error = journal_mountctl(ap);
1680                 break;
1681         }
1682         return(error);
1683 }
1684
1685 /*
1686  * hammer_vop_strategy { vp, bio }
1687  *
1688  * Strategy call, used for regular file read & write only.  Note that the
1689  * bp may represent a cluster.
1690  *
1691  * To simplify operation and allow better optimizations in the future,
1692  * this code does not make any assumptions with regards to buffer alignment
1693  * or size.
1694  */
1695 static
1696 int
1697 hammer_vop_strategy(struct vop_strategy_args *ap)
1698 {
1699         struct buf *bp;
1700         int error;
1701
1702         bp = ap->a_bio->bio_buf;
1703
1704         switch(bp->b_cmd) {
1705         case BUF_CMD_READ:
1706                 error = hammer_vop_strategy_read(ap);
1707                 break;
1708         case BUF_CMD_WRITE:
1709                 error = hammer_vop_strategy_write(ap);
1710                 break;
1711         default:
1712                 bp->b_error = error = EINVAL;
1713                 bp->b_flags |= B_ERROR;
1714                 biodone(ap->a_bio);
1715                 break;
1716         }
1717         return (error);
1718 }
1719
1720 /*
1721  * Read from a regular file.  Iterate the related records and fill in the
1722  * BIO/BUF.  Gaps are zero-filled.
1723  *
1724  * The support code in hammer_object.c should be used to deal with mixed
1725  * in-memory and on-disk records.
1726  *
1727  * XXX atime update
1728  */
1729 static
1730 int
1731 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1732 {
1733         struct hammer_transaction trans;
1734         struct hammer_inode *ip;
1735         struct hammer_cursor cursor;
1736         hammer_record_ondisk_t rec;
1737         hammer_base_elm_t base;
1738         struct bio *bio;
1739         struct buf *bp;
1740         int64_t rec_offset;
1741         int64_t ran_end;
1742         int64_t tmp64;
1743         int error;
1744         int boff;
1745         int roff;
1746         int n;
1747
1748         bio = ap->a_bio;
1749         bp = bio->bio_buf;
1750         ip = ap->a_vp->v_data;
1751
1752         hammer_simple_transaction(&trans, ip->hmp);
1753         hammer_init_cursor(&trans, &cursor, &ip->cache[0]);
1754
1755         /*
1756          * Key range (begin and end inclusive) to scan.  Note that the key's
1757          * stored in the actual records represent BASE+LEN, not BASE.  The
1758          * first record containing bio_offset will have a key > bio_offset.
1759          */
1760         cursor.key_beg.obj_id = ip->obj_id;
1761         cursor.key_beg.create_tid = 0;
1762         cursor.key_beg.delete_tid = 0;
1763         cursor.key_beg.obj_type = 0;
1764         cursor.key_beg.key = bio->bio_offset + 1;
1765         cursor.asof = ip->obj_asof;
1766         cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
1767
1768         cursor.key_end = cursor.key_beg;
1769         KKASSERT(ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_REGFILE);
1770 #if 0
1771         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1772                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1773                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1774                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1775         } else
1776 #endif
1777         {
1778                 ran_end = bio->bio_offset + bp->b_bufsize;
1779                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1780                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1781                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1782                 if (tmp64 < ran_end)
1783                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1784                 else
1785                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1786         }
1787         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1788
1789         error = hammer_ip_first(&cursor, ip);
1790         boff = 0;
1791
1792         while (error == 0) {
1793                 error = hammer_ip_resolve_data(&cursor);
1794                 if (error)
1795                         break;
1796                 rec = cursor.record;
1797                 base = &rec->base.base;
1798
1799                 rec_offset = base->key - rec->data.base.data_len;
1800
1801                 /*
1802                  * Calculate the gap, if any, and zero-fill it.
1803                  */
1804                 n = (int)(rec_offset - (bio->bio_offset + boff));
1805                 if (n > 0) {
1806                         if (n > bp->b_bufsize - boff)
1807                                 n = bp->b_bufsize - boff;
1808                         bzero((char *)bp->b_data + boff, n);
1809                         boff += n;
1810                         n = 0;
1811                 }
1812
1813                 /*
1814                  * Calculate the data offset in the record and the number
1815                  * of bytes we can copy.
1816                  *
1817                  * Note there is a degenerate case here where boff may
1818                  * already be at bp->b_bufsize.
1819                  */
1820                 roff = -n;
1821                 rec_offset += roff;
1822                 n = rec->data.base.data_len - roff;
1823                 KKASSERT(n > 0);
1824                 if (n > bp->b_bufsize - boff)
1825                         n = bp->b_bufsize - boff;
1826
1827                 /*
1828                  * If we cached a truncation point on our front-end the
1829                  * on-disk version may still have physical records beyond
1830                  * that point.  Truncate visibility.
1831                  */
1832                 if (ip->trunc_off <= rec_offset)
1833                         n = 0;
1834                 else if (ip->trunc_off < rec_offset + n)
1835                         n = (int)(ip->trunc_off - rec_offset);
1836
1837                 /*
1838                  * Copy
1839                  */
1840                 if (n) {
1841                         bcopy((char *)cursor.data + roff,
1842                               (char *)bp->b_data + boff, n);
1843                         boff += n;
1844                 }
1845                 if (boff == bp->b_bufsize)
1846                         break;
1847                 error = hammer_ip_next(&cursor);
1848         }
1849         hammer_done_cursor(&cursor);
1850         hammer_done_transaction(&trans);
1851
1852         /*
1853          * There may have been a gap after the last record
1854          */
1855         if (error == ENOENT)
1856                 error = 0;
1857         if (error == 0 && boff != bp->b_bufsize) {
1858                 KKASSERT(boff < bp->b_bufsize);
1859                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
1860                 /* boff = bp->b_bufsize; */
1861         }
1862         bp->b_resid = 0;
1863         bp->b_error = error;
1864         if (error)
1865                 bp->b_flags |= B_ERROR;
1866         biodone(ap->a_bio);
1867         return(error);
1868 }
1869
1870 /*
1871  * Write to a regular file.   Because this is a strategy call the OS is
1872  * trying to actually sync data to the media.   HAMMER can only flush
1873  * the entire inode (so the TID remains properly synchronized).
1874  *
1875  * Basically all we do here is place the bio on the inode's flush queue
1876  * and activate the flusher.
1877  */
1878 static
1879 int
1880 hammer_vop_strategy_write(struct vop_strategy_args *ap)
1881 {
1882         hammer_inode_t ip;
1883         struct bio *bio;
1884         struct buf *bp;
1885
1886         bio = ap->a_bio;
1887         bp = bio->bio_buf;
1888         ip = ap->a_vp->v_data;
1889
1890         if (ip->flags & HAMMER_INODE_RO) {
1891                 bp->b_error = EROFS;
1892                 bp->b_flags |= B_ERROR;
1893                 biodone(ap->a_bio);
1894                 return(EROFS);
1895         }
1896
1897         /*
1898          * If the inode is being flushed we cannot re-queue buffers
1899          * it may have already flushed, or it could result in duplicate
1900          * records in the database.
1901          */
1902         BUF_KERNPROC(bp);
1903         if (ip->flags & HAMMER_INODE_WRITE_ALT)
1904                 TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, bio_act);
1905         else
1906                 TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
1907         ++hammer_bio_count;
1908         hammer_modify_inode(NULL, ip, HAMMER_INODE_BUFS);
1909         hammer_flush_inode(ip, HAMMER_FLUSH_FORCE|HAMMER_FLUSH_SIGNAL);
1910         return(0);
1911 }
1912
1913 /*
1914  * Backend code which actually performs the write to the media.  This
1915  * routine is typically called from the flusher.  The bio will be disposed
1916  * of (biodone'd) by this routine.
1917  *
1918  * Iterate the related records and mark for deletion.  If existing edge
1919  * records (left and right side) overlap our write they have to be marked
1920  * deleted and new records created, usually referencing a portion of the
1921  * original data.  Then add a record to represent the buffer.
1922  */
1923 int
1924 hammer_dowrite(hammer_transaction_t trans, hammer_inode_t ip, struct bio *bio)
1925 {
1926         struct buf *bp = bio->bio_buf;
1927         int error;
1928
1929         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
1930
1931         /*
1932          * Delete any records overlapping our range.  This function will
1933          * (eventually) properly truncate partial overlaps.
1934          */
1935         if (ip->sync_ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1936                 error = hammer_ip_delete_range(trans, ip, bio->bio_offset,
1937                                                bio->bio_offset);
1938         } else {
1939                 error = hammer_ip_delete_range(trans, ip, bio->bio_offset,
1940                                                bio->bio_offset +
1941                                                 bp->b_bufsize - 1);
1942         }
1943
1944         /*
1945          * Add a single record to cover the write.  We can write a record
1946          * with only the actual file data - for example, a small 200 byte
1947          * file does not have to write out a 16K record.
1948          *
1949          * While the data size does not have to be aligned, we still do it
1950          * to reduce fragmentation in a future allocation model.
1951          */
1952         if (error == 0) {
1953                 int limit_size;
1954
1955                 if (ip->sync_ino_rec.ino_size - bio->bio_offset > 
1956                     bp->b_bufsize) {
1957                             limit_size = bp->b_bufsize;
1958                 } else {
1959                         limit_size = (int)(ip->sync_ino_rec.ino_size -
1960                                            bio->bio_offset);
1961                         KKASSERT(limit_size >= 0);
1962                         limit_size = (limit_size + 63) & ~63;
1963                 }
1964
1965                 error = hammer_ip_sync_data(trans, ip, bio->bio_offset,
1966                                             bp->b_data, limit_size);
1967
1968         }
1969         if (error)
1970                 Debugger("hammer_dowrite: error");
1971
1972         if (error) {
1973                 bp->b_resid = bp->b_bufsize;
1974                 bp->b_error = error;
1975                 bp->b_flags |= B_ERROR;
1976         } else {
1977                 bp->b_resid = 0;
1978         }
1979         biodone(bio);
1980         --hammer_bio_count;
1981         return(error);
1982 }
1983
1984 /*
1985  * dounlink - disconnect a directory entry
1986  *
1987  * XXX whiteout support not really in yet
1988  */
1989 static int
1990 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
1991                 struct vnode *dvp, struct ucred *cred, int flags)
1992 {
1993         struct namecache *ncp;
1994         hammer_inode_t dip;
1995         hammer_inode_t ip;
1996         hammer_record_ondisk_t rec;
1997         struct hammer_cursor cursor;
1998         int64_t namekey;
1999         int error;
2000
2001         /*
2002          * Calculate the namekey and setup the key range for the scan.  This
2003          * works kinda like a chained hash table where the lower 32 bits
2004          * of the namekey synthesize the chain.
2005          *
2006          * The key range is inclusive of both key_beg and key_end.
2007          */
2008         dip = VTOI(dvp);
2009         ncp = nch->ncp;
2010
2011         if (dip->flags & HAMMER_INODE_RO)
2012                 return (EROFS);
2013
2014         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2015 retry:
2016         hammer_init_cursor(trans, &cursor, &dip->cache[0]);
2017         cursor.key_beg.obj_id = dip->obj_id;
2018         cursor.key_beg.key = namekey;
2019         cursor.key_beg.create_tid = 0;
2020         cursor.key_beg.delete_tid = 0;
2021         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2022         cursor.key_beg.obj_type = 0;
2023
2024         cursor.key_end = cursor.key_beg;
2025         cursor.key_end.key |= 0xFFFFFFFFULL;
2026         cursor.asof = dip->obj_asof;
2027         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2028
2029         /*
2030          * Scan all matching records (the chain), locate the one matching
2031          * the requested path component.  info->last_error contains the
2032          * error code on search termination and could be 0, ENOENT, or
2033          * something else.
2034          *
2035          * The hammer_ip_*() functions merge in-memory records with on-disk
2036          * records for the purposes of the search.
2037          */
2038         error = hammer_ip_first(&cursor, dip);
2039         while (error == 0) {
2040                 error = hammer_ip_resolve_data(&cursor);
2041                 if (error)
2042                         break;
2043                 rec = cursor.record;
2044                 if (ncp->nc_nlen == rec->entry.base.data_len &&
2045                     bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) {
2046                         break;
2047                 }
2048                 error = hammer_ip_next(&cursor);
2049         }
2050
2051         /*
2052          * If all is ok we have to get the inode so we can adjust nlinks.
2053          *
2054          * If the target is a directory, it must be empty.
2055          */
2056         if (error == 0) {
2057                 ip = hammer_get_inode(trans, &dip->cache[1],
2058                                       rec->entry.obj_id,
2059                                       dip->hmp->asof, 0, &error);
2060                 if (error == ENOENT) {
2061                         kprintf("obj_id %016llx\n", rec->entry.obj_id);
2062                         Debugger("ENOENT unlinking object that should exist");
2063                 }
2064
2065                 /*
2066                  * If we are trying to remove a directory the directory must
2067                  * be empty.
2068                  *
2069                  * WARNING: hammer_ip_check_directory_empty() may have to
2070                  * terminate the cursor to avoid a deadlock.  It is ok to
2071                  * call hammer_done_cursor() twice.
2072                  */
2073                 if (error == 0 && ip->ino_rec.base.base.obj_type ==
2074                                   HAMMER_OBJTYPE_DIRECTORY) {
2075                         error = hammer_ip_check_directory_empty(trans, &cursor,
2076                                                                 ip);
2077                 }
2078
2079                 /*
2080                  * Delete the directory entry.
2081                  *
2082                  * WARNING: hammer_ip_del_directory() may have to terminate
2083                  * the cursor to avoid a deadlock.  It is ok to call
2084                  * hammer_done_cursor() twice.
2085                  */
2086                 if (error == 0) {
2087                         hammer_lock_sh(&ip->lock);
2088                         hammer_lock_sh(&dip->lock);
2089                         error = hammer_ip_del_directory(trans, &cursor,
2090                                                         dip, ip);
2091                         hammer_unlock(&dip->lock);
2092                         hammer_unlock(&ip->lock);
2093                 }
2094                 if (error == 0) {
2095                         cache_setunresolved(nch);
2096                         cache_setvp(nch, NULL);
2097                         /* XXX locking */
2098                         if (ip->vp)
2099                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2100                 }
2101                 hammer_rel_inode(ip, 0);
2102         }
2103         hammer_done_cursor(&cursor);
2104         if (error == EDEADLK)
2105                 goto retry;
2106
2107         return (error);
2108 }
2109
2110 /************************************************************************
2111  *                          FIFO AND SPECFS OPS                         *
2112  ************************************************************************
2113  *
2114  */
2115
2116 static int
2117 hammer_vop_fifoclose (struct vop_close_args *ap)
2118 {
2119         /* XXX update itimes */
2120         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2121 }
2122
2123 static int
2124 hammer_vop_fiforead (struct vop_read_args *ap)
2125 {
2126         int error;
2127
2128         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2129         /* XXX update access time */
2130         return (error);
2131 }
2132
2133 static int
2134 hammer_vop_fifowrite (struct vop_write_args *ap)
2135 {
2136         int error;
2137
2138         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2139         /* XXX update access time */
2140         return (error);
2141 }
2142
2143 static int
2144 hammer_vop_specclose (struct vop_close_args *ap)
2145 {
2146         /* XXX update itimes */
2147         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2148 }
2149
2150 static int
2151 hammer_vop_specread (struct vop_read_args *ap)
2152 {
2153         /* XXX update access time */
2154         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2155 }
2156
2157 static int
2158 hammer_vop_specwrite (struct vop_write_args *ap)
2159 {
2160         /* XXX update last change time */
2161         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2162 }
2163