HAMMER 38C/Many: Undo/Synchronization and crash recovery
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.39 2008/04/25 21:49:49 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
/*
 * Forward declarations of the vnode operations implemented in this file.
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *ap);*/

/* Operations used by the primary vnode table. */
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_pathconf(struct vop_pathconf_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);

/* Overrides used by the fifo vnode table. */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);

/* Overrides used by the special-device vnode table. */
static int hammer_vop_specclose(struct vop_close_args *ap);
static int hammer_vop_specread(struct vop_read_args *ap);
static int hammer_vop_specwrite(struct vop_write_args *ap);
90
91 struct vop_ops hammer_vnode_vops = {
92         .vop_default =          vop_defaultop,
93         .vop_fsync =            hammer_vop_fsync,
94         .vop_getpages =         vop_stdgetpages,
95         .vop_putpages =         vop_stdputpages,
96         .vop_read =             hammer_vop_read,
97         .vop_write =            hammer_vop_write,
98         .vop_access =           hammer_vop_access,
99         .vop_advlock =          hammer_vop_advlock,
100         .vop_close =            hammer_vop_close,
101         .vop_ncreate =          hammer_vop_ncreate,
102         .vop_getattr =          hammer_vop_getattr,
103         .vop_inactive =         hammer_vop_inactive,
104         .vop_reclaim =          hammer_vop_reclaim,
105         .vop_nresolve =         hammer_vop_nresolve,
106         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
107         .vop_nlink =            hammer_vop_nlink,
108         .vop_nmkdir =           hammer_vop_nmkdir,
109         .vop_nmknod =           hammer_vop_nmknod,
110         .vop_open =             hammer_vop_open,
111         .vop_pathconf =         hammer_vop_pathconf,
112         .vop_print =            hammer_vop_print,
113         .vop_readdir =          hammer_vop_readdir,
114         .vop_readlink =         hammer_vop_readlink,
115         .vop_nremove =          hammer_vop_nremove,
116         .vop_nrename =          hammer_vop_nrename,
117         .vop_nrmdir =           hammer_vop_nrmdir,
118         .vop_setattr =          hammer_vop_setattr,
119         .vop_strategy =         hammer_vop_strategy,
120         .vop_nsymlink =         hammer_vop_nsymlink,
121         .vop_nwhiteout =        hammer_vop_nwhiteout,
122         .vop_ioctl =            hammer_vop_ioctl,
123         .vop_mountctl =         hammer_vop_mountctl
124 };
125
126 struct vop_ops hammer_spec_vops = {
127         .vop_default =          spec_vnoperate,
128         .vop_fsync =            hammer_vop_fsync,
129         .vop_read =             hammer_vop_specread,
130         .vop_write =            hammer_vop_specwrite,
131         .vop_access =           hammer_vop_access,
132         .vop_close =            hammer_vop_specclose,
133         .vop_getattr =          hammer_vop_getattr,
134         .vop_inactive =         hammer_vop_inactive,
135         .vop_reclaim =          hammer_vop_reclaim,
136         .vop_setattr =          hammer_vop_setattr
137 };
138
139 struct vop_ops hammer_fifo_vops = {
140         .vop_default =          fifo_vnoperate,
141         .vop_fsync =            hammer_vop_fsync,
142         .vop_read =             hammer_vop_fiforead,
143         .vop_write =            hammer_vop_fifowrite,
144         .vop_access =           hammer_vop_access,
145         .vop_close =            hammer_vop_fifoclose,
146         .vop_getattr =          hammer_vop_getattr,
147         .vop_inactive =         hammer_vop_inactive,
148         .vop_reclaim =          hammer_vop_reclaim,
149         .vop_setattr =          hammer_vop_setattr
150 };
151
152 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
153                            struct vnode *dvp, struct ucred *cred, int flags);
154 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
155 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
156
#if 0
/*
 * hammer_vop_vnoperate { generic args }
 *
 * Disabled generic dispatch stub: forwards the operation to
 * hammer_vnode_vops through VOCALL().  The parameter is named so that the
 * body's use of 'ap' compiles if this block is ever re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
165
166 /*
167  * hammer_vop_fsync { vp, waitfor }
168  */
169 static
170 int
171 hammer_vop_fsync(struct vop_fsync_args *ap)
172 {
173         hammer_inode_t ip = VTOI(ap->a_vp);
174
175         hammer_flush_inode(ip, 0);
176         if (ap->a_waitfor == MNT_WAIT)
177                 hammer_wait_inode(ip);
178         return (ip->error);
179 }
180
181 /*
182  * hammer_vop_read { vp, uio, ioflag, cred }
183  */
184 static
185 int
186 hammer_vop_read(struct vop_read_args *ap)
187 {
188         struct hammer_transaction trans;
189         hammer_inode_t ip;
190         off_t offset;
191         struct buf *bp;
192         struct uio *uio;
193         int error;
194         int n;
195         int seqcount;
196
197         if (ap->a_vp->v_type != VREG)
198                 return (EINVAL);
199         ip = VTOI(ap->a_vp);
200         error = 0;
201         seqcount = ap->a_ioflag >> 16;
202
203         hammer_start_transaction(&trans, ip->hmp);
204
205         /*
206          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
207          */
208         uio = ap->a_uio;
209         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) {
210                 offset = uio->uio_offset & HAMMER_BUFMASK;
211 #if 0
212                 error = cluster_read(ap->a_vp, ip->ino_rec.ino_size,
213                                      uio->uio_offset - offset, HAMMER_BUFSIZE,
214                                      MAXBSIZE, seqcount, &bp);
215 #endif
216                 error = bread(ap->a_vp, uio->uio_offset - offset,
217                               HAMMER_BUFSIZE, &bp);
218                 if (error) {
219                         brelse(bp);
220                         break;
221                 }
222                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
223                 n = HAMMER_BUFSIZE - offset;
224                 if (n > uio->uio_resid)
225                         n = uio->uio_resid;
226                 if (n > ip->ino_rec.ino_size - uio->uio_offset)
227                         n = (int)(ip->ino_rec.ino_size - uio->uio_offset);
228                 error = uiomove((char *)bp->b_data + offset, n, uio);
229                 if (error) {
230                         bqrelse(bp);
231                         break;
232                 }
233                 bqrelse(bp);
234         }
235         if ((ip->flags & HAMMER_INODE_RO) == 0 &&
236             (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
237                 ip->ino_rec.ino_atime = trans.time;
238                 hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
239         }
240         hammer_done_transaction(&trans);
241         return (error);
242 }
243
244 /*
245  * hammer_vop_write { vp, uio, ioflag, cred }
246  */
247 static
248 int
249 hammer_vop_write(struct vop_write_args *ap)
250 {
251         struct hammer_transaction trans;
252         struct hammer_inode *ip;
253         struct uio *uio;
254         off_t offset;
255         struct buf *bp;
256         int error;
257         int n;
258         int flags;
259         int count;
260
261         if (ap->a_vp->v_type != VREG)
262                 return (EINVAL);
263         ip = VTOI(ap->a_vp);
264         error = 0;
265
266         if (ip->flags & HAMMER_INODE_RO)
267                 return (EROFS);
268
269         /*
270          * Create a transaction to cover the operations we perform.
271          */
272         hammer_start_transaction(&trans, ip->hmp);
273         uio = ap->a_uio;
274
275         /*
276          * Check append mode
277          */
278         if (ap->a_ioflag & IO_APPEND)
279                 uio->uio_offset = ip->ino_rec.ino_size;
280
281         /*
282          * Check for illegal write offsets.  Valid range is 0...2^63-1
283          */
284         if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
285                 hammer_done_transaction(&trans);
286                 return (EFBIG);
287         }
288
289         /*
290          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
291          */
292         count = 0;
293         while (uio->uio_resid > 0) {
294                 int fixsize = 0;
295
296                 /*
297                  * Do not allow huge writes to deadlock the buffer cache
298                  */
299                 if ((++count & 15) == 0) {
300                         vn_unlock(ap->a_vp);
301                         if ((ap->a_ioflag & IO_NOBWILL) == 0)
302                                 bwillwrite();
303                         vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY);
304                 }
305
306                 offset = uio->uio_offset & HAMMER_BUFMASK;
307                 n = HAMMER_BUFSIZE - offset;
308                 if (n > uio->uio_resid)
309                         n = uio->uio_resid;
310                 if (uio->uio_offset + n > ip->ino_rec.ino_size) {
311                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
312                         fixsize = 1;
313                 }
314
315                 if (uio->uio_segflg == UIO_NOCOPY) {
316                         /*
317                          * Issuing a write with the same data backing the
318                          * buffer.  Instantiate the buffer to collect the
319                          * backing vm pages, then read-in any missing bits.
320                          *
321                          * This case is used by vop_stdputpages().
322                          */
323                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
324                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
325                         if ((bp->b_flags & B_CACHE) == 0) {
326                                 bqrelse(bp);
327                                 error = bread(ap->a_vp,
328                                               uio->uio_offset - offset,
329                                               HAMMER_BUFSIZE, &bp);
330                         }
331                 } else if (offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
332                         /*
333                          * entirely overwrite the buffer
334                          */
335                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
336                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
337                 } else if (offset == 0 && uio->uio_offset >= ip->ino_rec.ino_size) {
338                         /*
339                          * XXX
340                          */
341                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
342                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
343                         vfs_bio_clrbuf(bp);
344                 } else {
345                         /*
346                          * Partial overwrite, read in any missing bits then
347                          * replace the portion being written.
348                          */
349                         error = bread(ap->a_vp, uio->uio_offset - offset,
350                                       HAMMER_BUFSIZE, &bp);
351                         if (error == 0)
352                                 bheavy(bp);
353                 }
354                 if (error == 0)
355                         error = uiomove((char *)bp->b_data + offset, n, uio);
356
357                 /*
358                  * If we screwed up we have to undo any VM size changes we
359                  * made.
360                  */
361                 if (error) {
362                         brelse(bp);
363                         if (fixsize) {
364                                 vtruncbuf(ap->a_vp, ip->ino_rec.ino_size,
365                                           HAMMER_BUFSIZE);
366                         }
367                         break;
368                 }
369                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
370                 hammer_lock_sh(&ip->lock);
371                 if (ip->ino_rec.ino_size < uio->uio_offset) {
372                         ip->ino_rec.ino_size = uio->uio_offset;
373                         flags = HAMMER_INODE_RDIRTY;
374                         vnode_pager_setsize(ap->a_vp, ip->ino_rec.ino_size);
375                 } else {
376                         flags = 0;
377                 }
378                 ip->ino_rec.ino_mtime = trans.time;
379                 flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
380                 hammer_modify_inode(&trans, ip, flags);
381                 hammer_unlock(&ip->lock);
382
383                 if (ap->a_ioflag & IO_SYNC) {
384                         bwrite(bp);
385                 } else if (ap->a_ioflag & IO_DIRECT) {
386                         bawrite(bp);
387 #if 0
388                 } else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
389                            (uio->uio_offset & HAMMER_BUFMASK) == 0) {
390                         /*
391                          * XXX HAMMER can only fsync the whole inode,
392                          * doing it on every buffer would be a bad idea.
393                          */
394                         /*
395                          * If seqcount indicates sequential operation and
396                          * we just finished filling a buffer, push it out
397                          * now to prevent the buffer cache from becoming
398                          * too full, which would trigger non-optimal
399                          * flushes.
400                          */
401                         bdwrite(bp);
402 #endif
403                 } else {
404                         bdwrite(bp);
405                 }
406         }
407         hammer_done_transaction(&trans);
408         return (error);
409 }
410
411 /*
412  * hammer_vop_access { vp, mode, cred }
413  */
414 static
415 int
416 hammer_vop_access(struct vop_access_args *ap)
417 {
418         struct hammer_inode *ip = VTOI(ap->a_vp);
419         uid_t uid;
420         gid_t gid;
421         int error;
422
423         uid = hammer_to_unix_xid(&ip->ino_data.uid);
424         gid = hammer_to_unix_xid(&ip->ino_data.gid);
425
426         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
427                                   ip->ino_data.uflags);
428         return (error);
429 }
430
431 /*
432  * hammer_vop_advlock { vp, id, op, fl, flags }
433  */
434 static
435 int
436 hammer_vop_advlock(struct vop_advlock_args *ap)
437 {
438         struct hammer_inode *ip = VTOI(ap->a_vp);
439
440         return (lf_advlock(ap, &ip->advlock, ip->ino_rec.ino_size));
441 }
442
/*
 * hammer_vop_close { vp, fflag }
 *
 * No HAMMER-specific work is done here; the request is passed straight
 * to vop_stdclose().
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	int error;

	error = vop_stdclose(ap);
	return (error);
}
452
453 /*
454  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
455  *
456  * The operating system has already ensured that the directory entry
457  * does not exist and done all appropriate namespace locking.
458  */
459 static
460 int
461 hammer_vop_ncreate(struct vop_ncreate_args *ap)
462 {
463         struct hammer_transaction trans;
464         struct hammer_inode *dip;
465         struct hammer_inode *nip;
466         struct nchandle *nch;
467         int error;
468
469         nch = ap->a_nch;
470         dip = VTOI(ap->a_dvp);
471
472         if (dip->flags & HAMMER_INODE_RO)
473                 return (EROFS);
474
475         /*
476          * Create a transaction to cover the operations we perform.
477          */
478         hammer_start_transaction(&trans, dip->hmp);
479
480         /*
481          * Create a new filesystem object of the requested type.  The
482          * returned inode will be referenced and shared-locked to prevent
483          * it from being moved to the flusher.
484          */
485
486         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
487         if (error)
488                 kprintf("hammer_create_inode error %d\n", error);
489         if (error) {
490                 hammer_done_transaction(&trans);
491                 *ap->a_vpp = NULL;
492                 return (error);
493         }
494         hammer_lock_sh(&dip->lock);
495
496         /*
497          * Add the new filesystem object to the directory.  This will also
498          * bump the inode's link count.
499          */
500         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
501         if (error)
502                 kprintf("hammer_ip_add_directory error %d\n", error);
503         hammer_unlock(&dip->lock);
504         hammer_unlock(&nip->lock);
505
506         /*
507          * Finish up.
508          */
509         if (error) {
510                 hammer_rel_inode(nip, 0);
511                 hammer_done_transaction(&trans);
512                 *ap->a_vpp = NULL;
513         } else {
514                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
515                 hammer_done_transaction(&trans);
516                 hammer_rel_inode(nip, 0);
517                 if (error == 0) {
518                         cache_setunresolved(ap->a_nch);
519                         cache_setvp(ap->a_nch, *ap->a_vpp);
520                 }
521         }
522         return (error);
523 }
524
525 /*
526  * hammer_vop_getattr { vp, vap }
527  */
528 static
529 int
530 hammer_vop_getattr(struct vop_getattr_args *ap)
531 {
532         struct hammer_inode *ip = VTOI(ap->a_vp);
533         struct vattr *vap = ap->a_vap;
534
535 #if 0
536         if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
537             (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
538             ip->obj_asof == XXX
539         ) {
540                 /* LAZYMOD XXX */
541         }
542         hammer_itimes(ap->a_vp);
543 #endif
544
545         vap->va_fsid = ip->hmp->fsid_udev;
546         vap->va_fileid = ip->ino_rec.base.base.obj_id;
547         vap->va_mode = ip->ino_data.mode;
548         vap->va_nlink = ip->ino_rec.ino_nlinks;
549         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
550         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
551         vap->va_rmajor = 0;
552         vap->va_rminor = 0;
553         vap->va_size = ip->ino_rec.ino_size;
554         hammer_to_timespec(ip->ino_rec.ino_atime, &vap->va_atime);
555         hammer_to_timespec(ip->ino_rec.ino_mtime, &vap->va_mtime);
556         hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
557         vap->va_flags = ip->ino_data.uflags;
558         vap->va_gen = 1;        /* hammer inums are unique for all time */
559         vap->va_blocksize = HAMMER_BUFSIZE;
560         vap->va_bytes = (ip->ino_rec.ino_size + 63) & ~63;
561         vap->va_type = hammer_get_vnode_type(ip->ino_rec.base.base.obj_type);
562         vap->va_filerev = 0;    /* XXX */
563         /* mtime uniquely identifies any adjustments made to the file */
564         vap->va_fsmid = ip->ino_rec.ino_mtime;
565         vap->va_uid_uuid = ip->ino_data.uid;
566         vap->va_gid_uuid = ip->ino_data.gid;
567         vap->va_fsid_uuid = ip->hmp->fsid;
568         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
569                           VA_FSID_UUID_VALID;
570
571         switch (ip->ino_rec.base.base.obj_type) {
572         case HAMMER_OBJTYPE_CDEV:
573         case HAMMER_OBJTYPE_BDEV:
574                 vap->va_rmajor = ip->ino_data.rmajor;
575                 vap->va_rminor = ip->ino_data.rminor;
576                 break;
577         default:
578                 break;
579         }
580
581         return(0);
582 }
583
584 /*
585  * hammer_vop_nresolve { nch, dvp, cred }
586  *
587  * Locate the requested directory entry.
588  */
589 static
590 int
591 hammer_vop_nresolve(struct vop_nresolve_args *ap)
592 {
593         struct hammer_transaction trans;
594         struct namecache *ncp;
595         hammer_inode_t dip;
596         hammer_inode_t ip;
597         hammer_tid_t asof;
598         struct hammer_cursor cursor;
599         union hammer_record_ondisk *rec;
600         struct vnode *vp;
601         int64_t namekey;
602         int error;
603         int i;
604         int nlen;
605         int flags;
606         u_int64_t obj_id;
607
608         /*
609          * Misc initialization, plus handle as-of name extensions.  Look for
610          * the '@@' extension.  Note that as-of files and directories cannot
611          * be modified.
612          */
613         dip = VTOI(ap->a_dvp);
614         ncp = ap->a_nch->ncp;
615         asof = dip->obj_asof;
616         nlen = ncp->nc_nlen;
617         flags = dip->flags;
618
619         hammer_simple_transaction(&trans, dip->hmp);
620
621         for (i = 0; i < nlen; ++i) {
622                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
623                         asof = hammer_str_to_tid(ncp->nc_name + i + 2);
624                         flags |= HAMMER_INODE_RO;
625                         break;
626                 }
627         }
628         nlen = i;
629
630         /*
631          * If there is no path component the time extension is relative to
632          * dip.
633          */
634         if (nlen == 0) {
635                 ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id,
636                                       asof, flags, &error);
637                 if (error == 0) {
638                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
639                         hammer_rel_inode(ip, 0);
640                 } else {
641                         vp = NULL;
642                 }
643                 if (error == 0) {
644                         vn_unlock(vp);
645                         cache_setvp(ap->a_nch, vp);
646                         vrele(vp);
647                 }
648                 goto done;
649         }
650
651         /*
652          * Calculate the namekey and setup the key range for the scan.  This
653          * works kinda like a chained hash table where the lower 32 bits
654          * of the namekey synthesize the chain.
655          *
656          * The key range is inclusive of both key_beg and key_end.
657          */
658         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
659
660         error = hammer_init_cursor(&trans, &cursor, &dip->cache[0]);
661         cursor.key_beg.obj_id = dip->obj_id;
662         cursor.key_beg.key = namekey;
663         cursor.key_beg.create_tid = 0;
664         cursor.key_beg.delete_tid = 0;
665         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
666         cursor.key_beg.obj_type = 0;
667
668         cursor.key_end = cursor.key_beg;
669         cursor.key_end.key |= 0xFFFFFFFFULL;
670         cursor.asof = asof;
671         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
672
673         /*
674          * Scan all matching records (the chain), locate the one matching
675          * the requested path component.
676          *
677          * The hammer_ip_*() functions merge in-memory records with on-disk
678          * records for the purposes of the search.
679          */
680         if (error == 0)
681                 error = hammer_ip_first(&cursor, dip);
682
683         rec = NULL;
684         obj_id = 0;
685
686         while (error == 0) {
687                 error = hammer_ip_resolve_data(&cursor);
688                 if (error)
689                         break;
690                 rec = cursor.record;
691                 if (nlen == rec->entry.base.data_len &&
692                     bcmp(ncp->nc_name, cursor.data, nlen) == 0) {
693                         obj_id = rec->entry.obj_id;
694                         break;
695                 }
696                 error = hammer_ip_next(&cursor);
697         }
698         hammer_done_cursor(&cursor);
699         if (error == 0) {
700                 ip = hammer_get_inode(&trans, &dip->cache[1],
701                                       obj_id, asof, flags, &error);
702                 if (error == 0) {
703                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
704                         hammer_rel_inode(ip, 0);
705                 } else {
706                         vp = NULL;
707                 }
708                 if (error == 0) {
709                         vn_unlock(vp);
710                         cache_setvp(ap->a_nch, vp);
711                         vrele(vp);
712                 }
713         } else if (error == ENOENT) {
714                 cache_setvp(ap->a_nch, NULL);
715         }
716 done:
717         hammer_done_transaction(&trans);
718         return (error);
719 }
720
721 /*
722  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
723  *
724  * Locate the parent directory of a directory vnode.
725  *
726  * dvp is referenced but not locked.  *vpp must be returned referenced and
727  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
728  * at the root, instead it could indicate that the directory we were in was
729  * removed.
730  *
731  * NOTE: as-of sequences are not linked into the directory structure.  If
732  * we are at the root with a different asof then the mount point, reload
733  * the same directory with the mount point's asof.   I'm not sure what this
734  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
735  * get confused, but it hasn't been tested.
736  */
737 static
738 int
739 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
740 {
741         struct hammer_transaction trans;
742         struct hammer_inode *dip;
743         struct hammer_inode *ip;
744         int64_t parent_obj_id;
745         hammer_tid_t asof;
746         int error;
747
748         dip = VTOI(ap->a_dvp);
749         asof = dip->obj_asof;
750         parent_obj_id = dip->ino_data.parent_obj_id;
751
752         if (parent_obj_id == 0) {
753                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
754                    asof != dip->hmp->asof) {
755                         parent_obj_id = dip->obj_id;
756                         asof = dip->hmp->asof;
757                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
758                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
759                                    dip->obj_asof);
760                 } else {
761                         *ap->a_vpp = NULL;
762                         return ENOENT;
763                 }
764         }
765
766         hammer_simple_transaction(&trans, dip->hmp);
767
768         ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id,
769                               asof, dip->flags, &error);
770         if (ip) {
771                 error = hammer_get_vnode(ip, LK_EXCLUSIVE, ap->a_vpp);
772                 hammer_rel_inode(ip, 0);
773         } else {
774                 *ap->a_vpp = NULL;
775         }
776         hammer_done_transaction(&trans);
777         return (error);
778 }
779
780 /*
781  * hammer_vop_nlink { nch, dvp, vp, cred }
782  */
783 static
784 int
785 hammer_vop_nlink(struct vop_nlink_args *ap)
786 {
787         struct hammer_transaction trans;
788         struct hammer_inode *dip;
789         struct hammer_inode *ip;
790         struct nchandle *nch;
791         int error;
792
793         nch = ap->a_nch;
794         dip = VTOI(ap->a_dvp);
795         ip = VTOI(ap->a_vp);
796
797         if (dip->flags & HAMMER_INODE_RO)
798                 return (EROFS);
799         if (ip->flags & HAMMER_INODE_RO)
800                 return (EROFS);
801
802         /*
803          * Create a transaction to cover the operations we perform.
804          */
805         hammer_start_transaction(&trans, dip->hmp);
806
807         /*
808          * Add the filesystem object to the directory.  Note that neither
809          * dip nor ip are referenced or locked, but their vnodes are
810          * referenced.  This function will bump the inode's link count.
811          */
812         hammer_lock_sh(&ip->lock);
813         hammer_lock_sh(&dip->lock);
814         error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
815         hammer_unlock(&dip->lock);
816         hammer_unlock(&ip->lock);
817
818         /*
819          * Finish up.
820          */
821         if (error == 0) {
822                 cache_setunresolved(nch);
823                 cache_setvp(nch, ap->a_vp);
824         }
825         hammer_done_transaction(&trans);
826         return (error);
827 }
828
829 /*
830  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
831  *
832  * The operating system has already ensured that the directory entry
833  * does not exist and done all appropriate namespace locking.
834  */
835 static
836 int
837 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
838 {
839         struct hammer_transaction trans;
840         struct hammer_inode *dip;
841         struct hammer_inode *nip;
842         struct nchandle *nch;
843         int error;
844
845         nch = ap->a_nch;
846         dip = VTOI(ap->a_dvp);
847
848         if (dip->flags & HAMMER_INODE_RO)
849                 return (EROFS);
850
851         /*
852          * Create a transaction to cover the operations we perform.
853          */
854         hammer_start_transaction(&trans, dip->hmp);
855
856         /*
857          * Create a new filesystem object of the requested type.  The
858          * returned inode will be referenced but not locked.
859          */
860         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
861         if (error) {
862                 kprintf("hammer_mkdir error %d\n", error);
863                 hammer_done_transaction(&trans);
864                 *ap->a_vpp = NULL;
865                 return (error);
866         }
867         /*
868          * Add the new filesystem object to the directory.  This will also
869          * bump the inode's link count.
870          */
871         hammer_lock_sh(&dip->lock);
872         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
873         hammer_unlock(&dip->lock);
874         hammer_unlock(&nip->lock);
875         if (error)
876                 kprintf("hammer_mkdir (add) error %d\n", error);
877
878         /*
879          * Finish up.
880          */
881         if (error) {
882                 hammer_rel_inode(nip, 0);
883                 *ap->a_vpp = NULL;
884         } else {
885                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
886                 hammer_rel_inode(nip, 0);
887                 if (error == 0) {
888                         cache_setunresolved(ap->a_nch);
889                         cache_setvp(ap->a_nch, *ap->a_vpp);
890                 }
891         }
892         hammer_done_transaction(&trans);
893         return (error);
894 }
895
896 /*
897  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
898  *
899  * The operating system has already ensured that the directory entry
900  * does not exist and done all appropriate namespace locking.
901  */
902 static
903 int
904 hammer_vop_nmknod(struct vop_nmknod_args *ap)
905 {
906         struct hammer_transaction trans;
907         struct hammer_inode *dip;
908         struct hammer_inode *nip;
909         struct nchandle *nch;
910         int error;
911
912         nch = ap->a_nch;
913         dip = VTOI(ap->a_dvp);
914
915         if (dip->flags & HAMMER_INODE_RO)
916                 return (EROFS);
917
918         /*
919          * Create a transaction to cover the operations we perform.
920          */
921         hammer_start_transaction(&trans, dip->hmp);
922
923         /*
924          * Create a new filesystem object of the requested type.  The
925          * returned inode will be referenced but not locked.
926          */
927         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
928         if (error) {
929                 hammer_done_transaction(&trans);
930                 *ap->a_vpp = NULL;
931                 return (error);
932         }
933
934         /*
935          * Add the new filesystem object to the directory.  This will also
936          * bump the inode's link count.
937          */
938         hammer_lock_sh(&dip->lock);
939         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
940         hammer_unlock(&dip->lock);
941         hammer_unlock(&nip->lock);
942
943         /*
944          * Finish up.
945          */
946         if (error) {
947                 hammer_rel_inode(nip, 0);
948                 *ap->a_vpp = NULL;
949         } else {
950                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
951                 hammer_rel_inode(nip, 0);
952                 if (error == 0) {
953                         cache_setunresolved(ap->a_nch);
954                         cache_setvp(ap->a_nch, *ap->a_vpp);
955                 }
956         }
957         hammer_done_transaction(&trans);
958         return (error);
959 }
960
961 /*
962  * hammer_vop_open { vp, mode, cred, fp }
963  */
964 static
965 int
966 hammer_vop_open(struct vop_open_args *ap)
967 {
968         if ((ap->a_mode & FWRITE) && (VTOI(ap->a_vp)->flags & HAMMER_INODE_RO))
969                 return (EROFS);
970
971         return(vop_stdopen(ap));
972 }
973
974 /*
975  * hammer_vop_pathconf { vp, name, retval }
976  */
977 static
978 int
979 hammer_vop_pathconf(struct vop_pathconf_args *ap)
980 {
981         return EOPNOTSUPP;
982 }
983
984 /*
985  * hammer_vop_print { vp }
986  */
987 static
988 int
989 hammer_vop_print(struct vop_print_args *ap)
990 {
991         return EOPNOTSUPP;
992 }
993
994 /*
995  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
996  */
997 static
998 int
999 hammer_vop_readdir(struct vop_readdir_args *ap)
1000 {
1001         struct hammer_transaction trans;
1002         struct hammer_cursor cursor;
1003         struct hammer_inode *ip;
1004         struct uio *uio;
1005         hammer_record_ondisk_t rec;
1006         hammer_base_elm_t base;
1007         int error;
1008         int cookie_index;
1009         int ncookies;
1010         off_t *cookies;
1011         off_t saveoff;
1012         int r;
1013
1014         ip = VTOI(ap->a_vp);
1015         uio = ap->a_uio;
1016         saveoff = uio->uio_offset;
1017
1018         if (ap->a_ncookies) {
1019                 ncookies = uio->uio_resid / 16 + 1;
1020                 if (ncookies > 1024)
1021                         ncookies = 1024;
1022                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1023                 cookie_index = 0;
1024         } else {
1025                 ncookies = -1;
1026                 cookies = NULL;
1027                 cookie_index = 0;
1028         }
1029
1030         hammer_simple_transaction(&trans, ip->hmp);
1031
1032         /*
1033          * Handle artificial entries
1034          */
1035         error = 0;
1036         if (saveoff == 0) {
1037                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1038                 if (r)
1039                         goto done;
1040                 if (cookies)
1041                         cookies[cookie_index] = saveoff;
1042                 ++saveoff;
1043                 ++cookie_index;
1044                 if (cookie_index == ncookies)
1045                         goto done;
1046         }
1047         if (saveoff == 1) {
1048                 if (ip->ino_data.parent_obj_id) {
1049                         r = vop_write_dirent(&error, uio,
1050                                              ip->ino_data.parent_obj_id,
1051                                              DT_DIR, 2, "..");
1052                 } else {
1053                         r = vop_write_dirent(&error, uio,
1054                                              ip->obj_id, DT_DIR, 2, "..");
1055                 }
1056                 if (r)
1057                         goto done;
1058                 if (cookies)
1059                         cookies[cookie_index] = saveoff;
1060                 ++saveoff;
1061                 ++cookie_index;
1062                 if (cookie_index == ncookies)
1063                         goto done;
1064         }
1065
1066         /*
1067          * Key range (begin and end inclusive) to scan.  Directory keys
1068          * directly translate to a 64 bit 'seek' position.
1069          */
1070         hammer_init_cursor(&trans, &cursor, &ip->cache[0]);
1071         cursor.key_beg.obj_id = ip->obj_id;
1072         cursor.key_beg.create_tid = 0;
1073         cursor.key_beg.delete_tid = 0;
1074         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1075         cursor.key_beg.obj_type = 0;
1076         cursor.key_beg.key = saveoff;
1077
1078         cursor.key_end = cursor.key_beg;
1079         cursor.key_end.key = HAMMER_MAX_KEY;
1080         cursor.asof = ip->obj_asof;
1081         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1082
1083         error = hammer_ip_first(&cursor, ip);
1084
1085         while (error == 0) {
1086                 error = hammer_ip_resolve_record_and_data(&cursor);
1087                 if (error)
1088                         break;
1089                 rec = cursor.record;
1090                 base = &rec->base.base;
1091                 saveoff = base->key;
1092
1093                 if (base->obj_id != ip->obj_id)
1094                         panic("readdir: bad record at %p", cursor.node);
1095
1096                 r = vop_write_dirent(
1097                              &error, uio, rec->entry.obj_id,
1098                              hammer_get_dtype(rec->entry.base.base.obj_type),
1099                              rec->entry.base.data_len,
1100                              (void *)cursor.data);
1101                 if (r)
1102                         break;
1103                 ++saveoff;
1104                 if (cookies)
1105                         cookies[cookie_index] = base->key;
1106                 ++cookie_index;
1107                 if (cookie_index == ncookies)
1108                         break;
1109                 error = hammer_ip_next(&cursor);
1110         }
1111         hammer_done_cursor(&cursor);
1112
1113 done:
1114         hammer_done_transaction(&trans);
1115
1116         if (ap->a_eofflag)
1117                 *ap->a_eofflag = (error == ENOENT);
1118         uio->uio_offset = saveoff;
1119         if (error && cookie_index == 0) {
1120                 if (error == ENOENT)
1121                         error = 0;
1122                 if (cookies) {
1123                         kfree(cookies, M_TEMP);
1124                         *ap->a_ncookies = 0;
1125                         *ap->a_cookies = NULL;
1126                 }
1127         } else {
1128                 if (error == ENOENT)
1129                         error = 0;
1130                 if (cookies) {
1131                         *ap->a_ncookies = cookie_index;
1132                         *ap->a_cookies = cookies;
1133                 }
1134         }
1135         return(error);
1136 }
1137
1138 /*
1139  * hammer_vop_readlink { vp, uio, cred }
1140  */
1141 static
1142 int
1143 hammer_vop_readlink(struct vop_readlink_args *ap)
1144 {
1145         struct hammer_transaction trans;
1146         struct hammer_cursor cursor;
1147         struct hammer_inode *ip;
1148         int error;
1149
1150         ip = VTOI(ap->a_vp);
1151
1152         hammer_simple_transaction(&trans, ip->hmp);
1153
1154         hammer_init_cursor(&trans, &cursor, &ip->cache[0]);
1155
1156         /*
1157          * Key range (begin and end inclusive) to scan.  Directory keys
1158          * directly translate to a 64 bit 'seek' position.
1159          */
1160         cursor.key_beg.obj_id = ip->obj_id;
1161         cursor.key_beg.create_tid = 0;
1162         cursor.key_beg.delete_tid = 0;
1163         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1164         cursor.key_beg.obj_type = 0;
1165         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1166         cursor.asof = ip->obj_asof;
1167         cursor.flags |= HAMMER_CURSOR_ASOF;
1168
1169         error = hammer_ip_lookup(&cursor, ip);
1170         if (error == 0) {
1171                 error = hammer_ip_resolve_data(&cursor);
1172                 if (error == 0) {
1173                         error = uiomove((char *)cursor.data,
1174                                         cursor.record->base.data_len,
1175                                         ap->a_uio);
1176                 }
1177         }
1178         hammer_done_cursor(&cursor);
1179         hammer_done_transaction(&trans);
1180         return(error);
1181 }
1182
1183 /*
1184  * hammer_vop_nremove { nch, dvp, cred }
1185  */
1186 static
1187 int
1188 hammer_vop_nremove(struct vop_nremove_args *ap)
1189 {
1190         struct hammer_transaction trans;
1191         int error;
1192
1193         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1194         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1195         hammer_done_transaction(&trans);
1196
1197         return (error);
1198 }
1199
1200 /*
1201  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1202  */
1203 static
1204 int
1205 hammer_vop_nrename(struct vop_nrename_args *ap)
1206 {
1207         struct hammer_transaction trans;
1208         struct namecache *fncp;
1209         struct namecache *tncp;
1210         struct hammer_inode *fdip;
1211         struct hammer_inode *tdip;
1212         struct hammer_inode *ip;
1213         struct hammer_cursor cursor;
1214         union hammer_record_ondisk *rec;
1215         int64_t namekey;
1216         int error;
1217
1218         fdip = VTOI(ap->a_fdvp);
1219         tdip = VTOI(ap->a_tdvp);
1220         fncp = ap->a_fnch->ncp;
1221         tncp = ap->a_tnch->ncp;
1222         ip = VTOI(fncp->nc_vp);
1223         KKASSERT(ip != NULL);
1224
1225         if (fdip->flags & HAMMER_INODE_RO)
1226                 return (EROFS);
1227         if (tdip->flags & HAMMER_INODE_RO)
1228                 return (EROFS);
1229         if (ip->flags & HAMMER_INODE_RO)
1230                 return (EROFS);
1231
1232         hammer_start_transaction(&trans, fdip->hmp);
1233
1234         hammer_lock_sh(&ip->lock);
1235         if (fdip->obj_id < tdip->obj_id) {
1236                 hammer_lock_sh(&fdip->lock);
1237                 hammer_lock_sh(&tdip->lock);
1238         } else {
1239                 hammer_lock_sh(&tdip->lock);
1240                 hammer_lock_sh(&fdip->lock);
1241         }
1242
1243         /*
1244          * Remove tncp from the target directory and then link ip as
1245          * tncp. XXX pass trans to dounlink
1246          *
1247          * Force the inode sync-time to match the transaction so it is
1248          * in-sync with the creation of the target directory entry.
1249          */
1250         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
1251         if (error == 0 || error == ENOENT) {
1252                 error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
1253                 if (error == 0) {
1254                         ip->ino_data.parent_obj_id = tdip->obj_id;
1255                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
1256                 }
1257         }
1258         if (error)
1259                 goto failed; /* XXX */
1260
1261         /*
1262          * Locate the record in the originating directory and remove it.
1263          *
1264          * Calculate the namekey and setup the key range for the scan.  This
1265          * works kinda like a chained hash table where the lower 32 bits
1266          * of the namekey synthesize the chain.
1267          *
1268          * The key range is inclusive of both key_beg and key_end.
1269          */
1270         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1271 retry:
1272         hammer_init_cursor(&trans, &cursor, &fdip->cache[0]);
1273         cursor.key_beg.obj_id = fdip->obj_id;
1274         cursor.key_beg.key = namekey;
1275         cursor.key_beg.create_tid = 0;
1276         cursor.key_beg.delete_tid = 0;
1277         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1278         cursor.key_beg.obj_type = 0;
1279
1280         cursor.key_end = cursor.key_beg;
1281         cursor.key_end.key |= 0xFFFFFFFFULL;
1282         cursor.asof = fdip->obj_asof;
1283         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1284
1285         /*
1286          * Scan all matching records (the chain), locate the one matching
1287          * the requested path component.
1288          *
1289          * The hammer_ip_*() functions merge in-memory records with on-disk
1290          * records for the purposes of the search.
1291          */
1292         error = hammer_ip_first(&cursor, fdip);
1293         while (error == 0) {
1294                 if (hammer_ip_resolve_data(&cursor) != 0)
1295                         break;
1296                 rec = cursor.record;
1297                 if (fncp->nc_nlen == rec->entry.base.data_len &&
1298                     bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) {
1299                         break;
1300                 }
1301                 error = hammer_ip_next(&cursor);
1302         }
1303
1304         /*
1305          * If all is ok we have to get the inode so we can adjust nlinks.
1306          *
1307          * WARNING: hammer_ip_del_directory() may have to terminate the
1308          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1309          * twice.
1310          */
1311         if (error == 0)
1312                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1313
1314         /*
1315          * XXX A deadlock here will break rename's atomicy for the purposes
1316          * of crash recovery.
1317          */
1318         if (error == EDEADLK) {
1319                 hammer_unlock(&ip->lock);
1320                 hammer_unlock(&fdip->lock);
1321                 hammer_unlock(&tdip->lock);
1322                 hammer_done_cursor(&cursor);
1323                 hammer_lock_sh(&ip->lock);
1324                 if (fdip->obj_id < tdip->obj_id) {
1325                         hammer_lock_sh(&fdip->lock);
1326                         hammer_lock_sh(&tdip->lock);
1327                 } else {
1328                         hammer_lock_sh(&tdip->lock);
1329                         hammer_lock_sh(&fdip->lock);
1330                 }
1331                 goto retry;
1332         }
1333
1334         /*
1335          * Cleanup and tell the kernel that the rename succeeded.
1336          */
1337         hammer_done_cursor(&cursor);
1338         if (error == 0)
1339                 cache_rename(ap->a_fnch, ap->a_tnch);
1340
1341 failed:
1342         hammer_unlock(&ip->lock);
1343         hammer_unlock(&fdip->lock);
1344         hammer_unlock(&tdip->lock);
1345         hammer_done_transaction(&trans);
1346         return (error);
1347 }
1348
1349 /*
1350  * hammer_vop_nrmdir { nch, dvp, cred }
1351  */
1352 static
1353 int
1354 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1355 {
1356         struct hammer_transaction trans;
1357         int error;
1358
1359         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1360         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1361         hammer_done_transaction(&trans);
1362
1363         return (error);
1364 }
1365
1366 /*
1367  * hammer_vop_setattr { vp, vap, cred }
1368  */
1369 static
1370 int
1371 hammer_vop_setattr(struct vop_setattr_args *ap)
1372 {
1373         struct hammer_transaction trans;
1374         struct vattr *vap;
1375         struct hammer_inode *ip;
1376         int modflags;
1377         int error;
1378         int truncating;
1379         off_t aligned_size;
1380         u_int32_t flags;
1381         uuid_t uuid;
1382
1383         vap = ap->a_vap;
1384         ip = ap->a_vp->v_data;
1385         modflags = 0;
1386
1387         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1388                 return(EROFS);
1389         if (ip->flags & HAMMER_INODE_RO)
1390                 return (EROFS);
1391
1392         hammer_start_transaction(&trans, ip->hmp);
1393         hammer_lock_sh(&ip->lock);
1394         error = 0;
1395
1396         if (vap->va_flags != VNOVAL) {
1397                 flags = ip->ino_data.uflags;
1398                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1399                                          hammer_to_unix_xid(&ip->ino_data.uid),
1400                                          ap->a_cred);
1401                 if (error == 0) {
1402                         if (ip->ino_data.uflags != flags) {
1403                                 ip->ino_data.uflags = flags;
1404                                 modflags |= HAMMER_INODE_DDIRTY;
1405                         }
1406                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1407                                 error = 0;
1408                                 goto done;
1409                         }
1410                 }
1411                 goto done;
1412         }
1413         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1414                 error = EPERM;
1415                 goto done;
1416         }
1417         if (vap->va_uid != (uid_t)VNOVAL) {
1418                 hammer_guid_to_uuid(&uuid, vap->va_uid);
1419                 if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) != 0) {
1420                         ip->ino_data.uid = uuid;
1421                         modflags |= HAMMER_INODE_DDIRTY;
1422                 }
1423         }
1424         if (vap->va_gid != (uid_t)VNOVAL) {
1425                 hammer_guid_to_uuid(&uuid, vap->va_gid);
1426                 if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) != 0) {
1427                         ip->ino_data.gid = uuid;
1428                         modflags |= HAMMER_INODE_DDIRTY;
1429                 }
1430         }
1431         while (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
1432                 switch(ap->a_vp->v_type) {
1433                 case VREG:
1434                         if (vap->va_size == ip->ino_rec.ino_size)
1435                                 break;
1436                         /*
1437                          * XXX break atomicy, we can deadlock the backend
1438                          * if we do not release the lock.  Probably not a
1439                          * big deal here.
1440                          */
1441                         hammer_unlock(&ip->lock);
1442                         if (vap->va_size < ip->ino_rec.ino_size) {
1443                                 vtruncbuf(ap->a_vp, vap->va_size,
1444                                           HAMMER_BUFSIZE);
1445                                 truncating = 1;
1446                         } else {
1447                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1448                                 truncating = 0;
1449                         }
1450                         hammer_lock_sh(&ip->lock);
1451                         ip->ino_rec.ino_size = vap->va_size;
1452                         modflags |= HAMMER_INODE_RDIRTY;
1453                         aligned_size = (vap->va_size + HAMMER_BUFMASK) &
1454                                        ~HAMMER_BUFMASK64;
1455
1456                         /*
1457                          * on-media truncation is cached in the inode until
1458                          * the inode is synchronized.
1459                          */
1460                         if (truncating) {
1461                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1462                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1463                                         ip->trunc_off = vap->va_size;
1464                                 } else if (ip->trunc_off > vap->va_size) {
1465                                         ip->trunc_off = vap->va_size;
1466                                 }
1467                         }
1468
1469                         /*
1470                          * If truncating we have to clean out a portion of
1471                          * the last block on-disk.  We do this in the
1472                          * front-end buffer cache.
1473                          */
1474                         if (truncating && vap->va_size < aligned_size) {
1475                                 struct buf *bp;
1476                                 int offset;
1477
1478                                 offset = vap->va_size & HAMMER_BUFMASK;
1479                                 error = bread(ap->a_vp,
1480                                               aligned_size - HAMMER_BUFSIZE,
1481                                               HAMMER_BUFSIZE, &bp);
1482                                 if (error == 0) {
1483                                         bzero(bp->b_data + offset,
1484                                               HAMMER_BUFSIZE - offset);
1485                                         bdwrite(bp);
1486                                 } else {
1487                                         brelse(bp);
1488                                 }
1489                         }
1490                         break;
1491                 case VDATABASE:
1492                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1493                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1494                                 ip->trunc_off = vap->va_size;
1495                         } else if (ip->trunc_off > vap->va_size) {
1496                                 ip->trunc_off = vap->va_size;
1497                         }
1498                         ip->ino_rec.ino_size = vap->va_size;
1499                         modflags |= HAMMER_INODE_RDIRTY;
1500                         break;
1501                 default:
1502                         error = EINVAL;
1503                         goto done;
1504                 }
1505                 break;
1506         }
1507         if (vap->va_atime.tv_sec != VNOVAL) {
1508                 ip->ino_rec.ino_atime =
1509                         hammer_timespec_to_transid(&vap->va_atime);
1510                 modflags |= HAMMER_INODE_ITIMES;
1511         }
1512         if (vap->va_mtime.tv_sec != VNOVAL) {
1513                 ip->ino_rec.ino_mtime =
1514                         hammer_timespec_to_transid(&vap->va_mtime);
1515                 modflags |= HAMMER_INODE_ITIMES;
1516         }
1517         if (vap->va_mode != (mode_t)VNOVAL) {
1518                 if (ip->ino_data.mode != vap->va_mode) {
1519                         ip->ino_data.mode = vap->va_mode;
1520                         modflags |= HAMMER_INODE_DDIRTY;
1521                 }
1522         }
1523 done:
1524         if (error == 0)
1525                 hammer_modify_inode(&trans, ip, modflags);
1526         hammer_unlock(&ip->lock);
1527         hammer_done_transaction(&trans);
1528         return (error);
1529 }
1530
1531 /*
1532  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1533  */
1534 static
1535 int
1536 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1537 {
1538         struct hammer_transaction trans;
1539         struct hammer_inode *dip;
1540         struct hammer_inode *nip;
1541         struct nchandle *nch;
1542         hammer_record_t record;
1543         int error;
1544         int bytes;
1545
1546         ap->a_vap->va_type = VLNK;
1547
1548         nch = ap->a_nch;
1549         dip = VTOI(ap->a_dvp);
1550
1551         if (dip->flags & HAMMER_INODE_RO)
1552                 return (EROFS);
1553
1554         /*
1555          * Create a transaction to cover the operations we perform.
1556          */
1557         hammer_start_transaction(&trans, dip->hmp);
1558
1559         /*
1560          * Create a new filesystem object of the requested type.  The
1561          * returned inode will be referenced but not locked.
1562          */
1563
1564         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
1565         if (error) {
1566                 hammer_done_transaction(&trans);
1567                 *ap->a_vpp = NULL;
1568                 return (error);
1569         }
1570
1571         /*
1572          * Add the new filesystem object to the directory.  This will also
1573          * bump the inode's link count.
1574          */
1575         hammer_lock_sh(&dip->lock);
1576         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1577
1578         /*
1579          * Add a record representing the symlink.  symlink stores the link
1580          * as pure data, not a string, and is no \0 terminated.
1581          */
1582         if (error == 0) {
1583                 record = hammer_alloc_mem_record(nip);
1584                 bytes = strlen(ap->a_target);
1585
1586                 record->rec.base.base.key = HAMMER_FIXKEY_SYMLINK;
1587                 record->rec.base.base.rec_type = HAMMER_RECTYPE_FIX;
1588                 record->rec.base.data_len = bytes;
1589                 record->data = (void *)ap->a_target;
1590                 /* will be reallocated by routine below */
1591                 error = hammer_ip_add_record(&trans, record);
1592
1593                 /*
1594                  * Set the file size to the length of the link.
1595                  */
1596                 if (error == 0) {
1597                         nip->ino_rec.ino_size = bytes;
1598                         hammer_modify_inode(&trans, nip, HAMMER_INODE_RDIRTY);
1599                 }
1600         }
1601         hammer_unlock(&dip->lock);
1602         hammer_unlock(&nip->lock);
1603
1604         /*
1605          * Finish up.
1606          */
1607         if (error) {
1608                 hammer_rel_inode(nip, 0);
1609                 *ap->a_vpp = NULL;
1610         } else {
1611                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
1612                 hammer_rel_inode(nip, 0);
1613                 if (error == 0) {
1614                         cache_setunresolved(ap->a_nch);
1615                         cache_setvp(ap->a_nch, *ap->a_vpp);
1616                 }
1617         }
1618         hammer_done_transaction(&trans);
1619         return (error);
1620 }
1621
1622 /*
1623  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1624  */
1625 static
1626 int
1627 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1628 {
1629         struct hammer_transaction trans;
1630         int error;
1631
1632         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1633         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1634                                 ap->a_cred, ap->a_flags);
1635         hammer_done_transaction(&trans);
1636
1637         return (error);
1638 }
1639
1640 /*
1641  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1642  */
1643 static
1644 int
1645 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1646 {
1647         struct hammer_inode *ip = ap->a_vp->v_data;
1648
1649         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1650                             ap->a_fflag, ap->a_cred));
1651 }
1652
1653 static
1654 int
1655 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1656 {
1657         struct mount *mp;
1658         int error;
1659
1660         mp = ap->a_head.a_ops->head.vv_mount;
1661
1662         switch(ap->a_op) {
1663         case MOUNTCTL_SET_EXPORT:
1664                 if (ap->a_ctllen != sizeof(struct export_args))
1665                         error = EINVAL;
1666                 error = hammer_vfs_export(mp, ap->a_op,
1667                                       (const struct export_args *)ap->a_ctl);
1668                 break;
1669         default:
1670                 error = journal_mountctl(ap);
1671                 break;
1672         }
1673         return(error);
1674 }
1675
1676 /*
1677  * hammer_vop_strategy { vp, bio }
1678  *
1679  * Strategy call, used for regular file read & write only.  Note that the
1680  * bp may represent a cluster.
1681  *
1682  * To simplify operation and allow better optimizations in the future,
1683  * this code does not make any assumptions with regards to buffer alignment
1684  * or size.
1685  */
1686 static
1687 int
1688 hammer_vop_strategy(struct vop_strategy_args *ap)
1689 {
1690         struct buf *bp;
1691         int error;
1692
1693         bp = ap->a_bio->bio_buf;
1694
1695         switch(bp->b_cmd) {
1696         case BUF_CMD_READ:
1697                 error = hammer_vop_strategy_read(ap);
1698                 break;
1699         case BUF_CMD_WRITE:
1700                 error = hammer_vop_strategy_write(ap);
1701                 break;
1702         default:
1703                 bp->b_error = error = EINVAL;
1704                 bp->b_flags |= B_ERROR;
1705                 biodone(ap->a_bio);
1706                 break;
1707         }
1708         return (error);
1709 }
1710
1711 /*
1712  * Read from a regular file.  Iterate the related records and fill in the
1713  * BIO/BUF.  Gaps are zero-filled.
1714  *
1715  * The support code in hammer_object.c should be used to deal with mixed
1716  * in-memory and on-disk records.
1717  *
1718  * XXX atime update
1719  */
1720 static
1721 int
1722 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1723 {
1724         struct hammer_transaction trans;
1725         struct hammer_inode *ip;
1726         struct hammer_cursor cursor;
1727         hammer_record_ondisk_t rec;
1728         hammer_base_elm_t base;
1729         struct bio *bio;
1730         struct buf *bp;
1731         int64_t rec_offset;
1732         int64_t ran_end;
1733         int64_t tmp64;
1734         int error;
1735         int boff;
1736         int roff;
1737         int n;
1738
1739         bio = ap->a_bio;
1740         bp = bio->bio_buf;
1741         ip = ap->a_vp->v_data;
1742
1743         hammer_simple_transaction(&trans, ip->hmp);
1744         hammer_init_cursor(&trans, &cursor, &ip->cache[0]);
1745
1746         /*
1747          * Key range (begin and end inclusive) to scan.  Note that the key's
1748          * stored in the actual records represent BASE+LEN, not BASE.  The
1749          * first record containing bio_offset will have a key > bio_offset.
1750          */
1751         cursor.key_beg.obj_id = ip->obj_id;
1752         cursor.key_beg.create_tid = 0;
1753         cursor.key_beg.delete_tid = 0;
1754         cursor.key_beg.obj_type = 0;
1755         cursor.key_beg.key = bio->bio_offset + 1;
1756         cursor.asof = ip->obj_asof;
1757         cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
1758
1759         cursor.key_end = cursor.key_beg;
1760         KKASSERT(ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_REGFILE);
1761 #if 0
1762         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1763                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1764                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1765                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1766         } else
1767 #endif
1768         {
1769                 ran_end = bio->bio_offset + bp->b_bufsize;
1770                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1771                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1772                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1773                 if (tmp64 < ran_end)
1774                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1775                 else
1776                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1777         }
1778         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1779
1780         error = hammer_ip_first(&cursor, ip);
1781         boff = 0;
1782
1783         while (error == 0) {
1784                 error = hammer_ip_resolve_data(&cursor);
1785                 if (error)
1786                         break;
1787                 rec = cursor.record;
1788                 base = &rec->base.base;
1789
1790                 rec_offset = base->key - rec->data.base.data_len;
1791
1792                 /*
1793                  * Calculate the gap, if any, and zero-fill it.
1794                  */
1795                 n = (int)(rec_offset - (bio->bio_offset + boff));
1796                 if (n > 0) {
1797                         if (n > bp->b_bufsize - boff)
1798                                 n = bp->b_bufsize - boff;
1799                         bzero((char *)bp->b_data + boff, n);
1800                         boff += n;
1801                         n = 0;
1802                 }
1803
1804                 /*
1805                  * Calculate the data offset in the record and the number
1806                  * of bytes we can copy.
1807                  *
1808                  * Note there is a degenerate case here where boff may
1809                  * already be at bp->b_bufsize.
1810                  */
1811                 roff = -n;
1812                 rec_offset += roff;
1813                 n = rec->data.base.data_len - roff;
1814                 KKASSERT(n > 0);
1815                 if (n > bp->b_bufsize - boff)
1816                         n = bp->b_bufsize - boff;
1817
1818                 /*
1819                  * If we cached a truncation point on our front-end the
1820                  * on-disk version may still have physical records beyond
1821                  * that point.  Truncate visibility.
1822                  */
1823                 if (ip->trunc_off <= rec_offset)
1824                         n = 0;
1825                 else if (ip->trunc_off < rec_offset + n)
1826                         n = (int)(ip->trunc_off - rec_offset);
1827
1828                 /*
1829                  * Copy
1830                  */
1831                 if (n) {
1832                         bcopy((char *)cursor.data + roff,
1833                               (char *)bp->b_data + boff, n);
1834                         boff += n;
1835                 }
1836                 if (boff == bp->b_bufsize)
1837                         break;
1838                 error = hammer_ip_next(&cursor);
1839         }
1840         hammer_done_cursor(&cursor);
1841         hammer_done_transaction(&trans);
1842
1843         /*
1844          * There may have been a gap after the last record
1845          */
1846         if (error == ENOENT)
1847                 error = 0;
1848         if (error == 0 && boff != bp->b_bufsize) {
1849                 KKASSERT(boff < bp->b_bufsize);
1850                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
1851                 /* boff = bp->b_bufsize; */
1852         }
1853         bp->b_resid = 0;
1854         bp->b_error = error;
1855         if (error)
1856                 bp->b_flags |= B_ERROR;
1857         biodone(ap->a_bio);
1858         return(error);
1859 }
1860
1861 /*
1862  * Write to a regular file.   Because this is a strategy call the OS is
1863  * trying to actually sync data to the media.   HAMMER can only flush
1864  * the entire inode (so the TID remains properly synchronized).
1865  *
1866  * Basically all we do here is place the bio on the inode's flush queue
1867  * and activate the flusher.
1868  */
1869 static
1870 int
1871 hammer_vop_strategy_write(struct vop_strategy_args *ap)
1872 {
1873         hammer_inode_t ip;
1874         struct bio *bio;
1875         struct buf *bp;
1876
1877         bio = ap->a_bio;
1878         bp = bio->bio_buf;
1879         ip = ap->a_vp->v_data;
1880
1881         if (ip->flags & HAMMER_INODE_RO) {
1882                 bp->b_error = EROFS;
1883                 bp->b_flags |= B_ERROR;
1884                 biodone(ap->a_bio);
1885                 return(EROFS);
1886         }
1887
1888         /*
1889          * If the inode is being flushed we cannot re-queue buffers
1890          * it may have already flushed, or it could result in duplicate
1891          * records in the database.
1892          */
1893         BUF_KERNPROC(bp);
1894         if (ip->flush_state == HAMMER_FST_FLUSH)
1895                 TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, bio_act);
1896         else
1897                 TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
1898         hammer_modify_inode(NULL, ip, HAMMER_INODE_XDIRTY);
1899         hammer_flush_inode(ip, 0);
1900         return(0);
1901 }
1902
1903 /*
1904  * Backend code which actually performs the write to the media.  This
1905  * routine is typically called from the flusher.  The bio will be disposed
1906  * of (biodone'd) by this routine.
1907  *
1908  * Iterate the related records and mark for deletion.  If existing edge
1909  * records (left and right side) overlap our write they have to be marked
1910  * deleted and new records created, usually referencing a portion of the
1911  * original data.  Then add a record to represent the buffer.
1912  */
1913 int
1914 hammer_dowrite(hammer_transaction_t trans, hammer_inode_t ip, struct bio *bio)
1915 {
1916         struct buf *bp = bio->bio_buf;
1917         int error;
1918
1919         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
1920
1921         /*
1922          * Delete any records overlapping our range.  This function will
1923          * (eventually) properly truncate partial overlaps.
1924          */
1925         if (ip->sync_ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1926                 error = hammer_ip_delete_range(trans, ip, bio->bio_offset,
1927                                                bio->bio_offset);
1928         } else {
1929                 error = hammer_ip_delete_range(trans, ip, bio->bio_offset,
1930                                                bio->bio_offset +
1931                                                 bp->b_bufsize - 1);
1932         }
1933
1934         /*
1935          * Add a single record to cover the write.  We can write a record
1936          * with only the actual file data - for example, a small 200 byte
1937          * file does not have to write out a 16K record.
1938          *
1939          * While the data size does not have to be aligned, we still do it
1940          * to reduce fragmentation in a future allocation model.
1941          */
1942         if (error == 0) {
1943                 int limit_size;
1944
1945                 if (ip->sync_ino_rec.ino_size - bio->bio_offset > 
1946                     bp->b_bufsize) {
1947                             limit_size = bp->b_bufsize;
1948                 } else {
1949                         limit_size = (int)(ip->sync_ino_rec.ino_size -
1950                                            bio->bio_offset);
1951                         KKASSERT(limit_size >= 0);
1952                         limit_size = (limit_size + 63) & ~63;
1953                 }
1954                 error = hammer_ip_sync_data(trans, ip, bio->bio_offset,
1955                                             bp->b_data, limit_size);
1956         }
1957
1958         if (error) {
1959                 bp->b_resid = bp->b_bufsize;
1960                 bp->b_error = error;
1961                 bp->b_flags |= B_ERROR;
1962         } else {
1963                 bp->b_resid = 0;
1964         }
1965         biodone(bio);
1966         return(error);
1967 }
1968
1969 /*
1970  * dounlink - disconnect a directory entry
1971  *
1972  * XXX whiteout support not really in yet
1973  */
1974 static int
1975 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
1976                 struct vnode *dvp, struct ucred *cred, int flags)
1977 {
1978         struct namecache *ncp;
1979         hammer_inode_t dip;
1980         hammer_inode_t ip;
1981         hammer_record_ondisk_t rec;
1982         struct hammer_cursor cursor;
1983         int64_t namekey;
1984         int error;
1985
1986         /*
1987          * Calculate the namekey and setup the key range for the scan.  This
1988          * works kinda like a chained hash table where the lower 32 bits
1989          * of the namekey synthesize the chain.
1990          *
1991          * The key range is inclusive of both key_beg and key_end.
1992          */
1993         dip = VTOI(dvp);
1994         ncp = nch->ncp;
1995
1996         if (dip->flags & HAMMER_INODE_RO)
1997                 return (EROFS);
1998
1999         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2000 retry:
2001         hammer_init_cursor(trans, &cursor, &dip->cache[0]);
2002         cursor.key_beg.obj_id = dip->obj_id;
2003         cursor.key_beg.key = namekey;
2004         cursor.key_beg.create_tid = 0;
2005         cursor.key_beg.delete_tid = 0;
2006         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2007         cursor.key_beg.obj_type = 0;
2008
2009         cursor.key_end = cursor.key_beg;
2010         cursor.key_end.key |= 0xFFFFFFFFULL;
2011         cursor.asof = dip->obj_asof;
2012         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2013
2014         /*
2015          * Scan all matching records (the chain), locate the one matching
2016          * the requested path component.  info->last_error contains the
2017          * error code on search termination and could be 0, ENOENT, or
2018          * something else.
2019          *
2020          * The hammer_ip_*() functions merge in-memory records with on-disk
2021          * records for the purposes of the search.
2022          */
2023         error = hammer_ip_first(&cursor, dip);
2024         while (error == 0) {
2025                 error = hammer_ip_resolve_data(&cursor);
2026                 if (error)
2027                         break;
2028                 rec = cursor.record;
2029                 if (ncp->nc_nlen == rec->entry.base.data_len &&
2030                     bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) {
2031                         break;
2032                 }
2033                 error = hammer_ip_next(&cursor);
2034         }
2035
2036         /*
2037          * If all is ok we have to get the inode so we can adjust nlinks.
2038          *
2039          * If the target is a directory, it must be empty.
2040          */
2041         if (error == 0) {
2042                 ip = hammer_get_inode(trans, &dip->cache[1],
2043                                       rec->entry.obj_id,
2044                                       dip->hmp->asof, 0, &error);
2045                 if (error == ENOENT) {
2046                         kprintf("obj_id %016llx\n", rec->entry.obj_id);
2047                         Debugger("ENOENT unlinking object that should exist");
2048                 }
2049                 if (error == 0 && ip->ino_rec.base.base.obj_type ==
2050                                   HAMMER_OBJTYPE_DIRECTORY) {
2051                         error = hammer_ip_check_directory_empty(trans, ip);
2052                 }
2053                 /*
2054                  * WARNING: hammer_ip_del_directory() may have to terminate
2055                  * the cursor to avoid a lock recursion.  It's ok to call
2056                  * hammer_done_cursor() twice.
2057                  */
2058                 if (error == 0) {
2059                         hammer_lock_sh(&ip->lock);
2060                         hammer_lock_sh(&dip->lock);
2061                         error = hammer_ip_del_directory(trans, &cursor,
2062                                                         dip, ip);
2063                         hammer_unlock(&dip->lock);
2064                         hammer_unlock(&ip->lock);
2065                 }
2066                 if (error == 0) {
2067                         cache_setunresolved(nch);
2068                         cache_setvp(nch, NULL);
2069                         /* XXX locking */
2070                         if (ip->vp)
2071                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2072                 }
2073                 hammer_rel_inode(ip, 0);
2074         }
2075         hammer_done_cursor(&cursor);
2076         if (error == EDEADLK)
2077                 goto retry;
2078
2079         return (error);
2080 }
2081
2082 /************************************************************************
2083  *                          FIFO AND SPECFS OPS                         *
2084  ************************************************************************
2085  *
2086  */
2087
2088 static int
2089 hammer_vop_fifoclose (struct vop_close_args *ap)
2090 {
2091         /* XXX update itimes */
2092         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2093 }
2094
2095 static int
2096 hammer_vop_fiforead (struct vop_read_args *ap)
2097 {
2098         int error;
2099
2100         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2101         /* XXX update access time */
2102         return (error);
2103 }
2104
2105 static int
2106 hammer_vop_fifowrite (struct vop_write_args *ap)
2107 {
2108         int error;
2109
2110         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2111         /* XXX update access time */
2112         return (error);
2113 }
2114
2115 static int
2116 hammer_vop_specclose (struct vop_close_args *ap)
2117 {
2118         /* XXX update itimes */
2119         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2120 }
2121
2122 static int
2123 hammer_vop_specread (struct vop_read_args *ap)
2124 {
2125         /* XXX update access time */
2126         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2127 }
2128
2129 static int
2130 hammer_vop_specwrite (struct vop_write_args *ap)
2131 {
2132         /* XXX update last change time */
2133         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2134 }
2135