HAMMER 38E/Many: Undo/Synchronization and crash recovery
sys/vfs/hammer/hammer_vnops.c
/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.41 2008/04/27 00:45:37 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>
#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         hammer_vop_pathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_setattr =          hammer_vop_setattr,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl
};

struct vop_ops hammer_spec_vops = {
        .vop_default =          spec_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_specread,
        .vop_write =            hammer_vop_specwrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_specclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

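/*
 * Note: one of the three ops vectors above is hung on a vnode according
 * to its type.  The spec and fifo vectors default device and fifo I/O
 * through spec_vnoperate and fifo_vnoperate, while HAMMER keeps the
 * attribute and lifecycle operations (fsync, getattr/setattr, inactive,
 * reclaim) so the backing inode is still managed by the filesystem.
 */
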
static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred, int flags);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

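        /*
         * Signal the flusher to queue this inode for synchronization.
         * With MNT_WAIT we block until the flush completes; any error
         * recorded by the backend flush is returned via ip->error.
         */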
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (ap->a_waitfor == MNT_WAIT)
                hammer_wait_inode(ip);
        return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
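        /*
         * The upper 16 bits of a_ioflag carry the sequential-access
         * heuristic (seqcount); it would feed cluster_read() when the
         * clustering path below is enabled.
         */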
        seqcount = ap->a_ioflag >> 16;

        hammer_start_transaction(&trans, ip->hmp);

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        uio = ap->a_uio;
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) {
                offset = uio->uio_offset & HAMMER_BUFMASK;
#if 0
                error = cluster_read(ap->a_vp, ip->ino_rec.ino_size,
                                     uio->uio_offset - offset, HAMMER_BUFSIZE,
                                     MAXBSIZE, seqcount, &bp);
#endif
                error = bread(ap->a_vp, uio->uio_offset - offset,
                              HAMMER_BUFSIZE, &bp);
                if (error) {
                        brelse(bp);
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                n = HAMMER_BUFSIZE - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_rec.ino_size - uio->uio_offset)
                        n = (int)(ip->ino_rec.ino_size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);
                if (error) {
                        bqrelse(bp);
                        break;
                }
                bqrelse(bp);
        }
        if ((ip->flags & HAMMER_INODE_RO) == 0 &&
            (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                ip->ino_rec.ino_atime = trans.time;
                hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        struct uio *uio;
        off_t offset;
        struct buf *bp;
        int error;
        int n;
        int flags;
        int count;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, ip->hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_rec.ino_size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1
         */
        if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        count = 0;
        while (uio->uio_resid > 0) {
                int fixsize = 0;

                /*
                 * Do not allow huge writes to deadlock the buffer cache
                 */
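                /*
                 * Every 16 buffers, drop the vnode lock and call
                 * bwillwrite() so the buffer cache can throttle this
                 * process before too many dirty buffers accumulate.
                 */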
                if ((++count & 15) == 0) {
                        vn_unlock(ap->a_vp);
                        if ((ap->a_ioflag & IO_NOBWILL) == 0)
                                bwillwrite();
                        vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY);
                }

                offset = uio->uio_offset & HAMMER_BUFMASK;
                n = HAMMER_BUFSIZE - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (uio->uio_offset + n > ip->ino_rec.ino_size) {
                        vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
                        fixsize = 1;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, uio->uio_offset - offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp,
                                              uio->uio_offset - offset,
                                              HAMMER_BUFSIZE, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
                        /*
                         * entirely overwrite the buffer
                         */
                        bp = getblk(ap->a_vp, uio->uio_offset - offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                } else if (offset == 0 && uio->uio_offset >= ip->ino_rec.ino_size) {
                        /*
                         * XXX
                         */
                        bp = getblk(ap->a_vp, uio->uio_offset - offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, uio->uio_offset - offset,
                                      HAMMER_BUFSIZE, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
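                /*
                 * Note: the four cases above pick the cheapest way to
                 * obtain the buffer: UIO_NOCOPY revalidates the pages
                 * already backing it, a full-block overwrite skips the
                 * read entirely, a block-aligned write at/past EOF
                 * zero-fills instead of reading, and everything else
                 * falls back to read-modify-write.
                 */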
                if (error == 0)
                        error = uiomove((char *)bp->b_data + offset, n, uio);

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                vtruncbuf(ap->a_vp, ip->ino_rec.ino_size,
                                          HAMMER_BUFSIZE);
                        }
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                hammer_lock_sh(&ip->lock);
                if (ip->ino_rec.ino_size < uio->uio_offset) {
                        ip->ino_rec.ino_size = uio->uio_offset;
                        flags = HAMMER_INODE_RDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_rec.ino_size);
                } else {
                        flags = 0;
                }
                ip->ino_rec.ino_mtime = trans.time;
                flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
                hammer_modify_inode(&trans, ip, flags);
                hammer_unlock(&ip->lock);

                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
#if 0
                } else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
                           (uio->uio_offset & HAMMER_BUFMASK) == 0) {
                        /*
                         * XXX HAMMER can only fsync the whole inode,
                         * doing it on every buffer would be a bad idea.
                         */
                        /*
                         * If seqcount indicates sequential operation and
                         * we just finished filling a buffer, push it out
                         * now to prevent the buffer cache from becoming
                         * too full, which would trigger non-optimal
                         * flushes.
                         */
                        bdwrite(bp);
#endif
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_rec.ino_size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */

        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                kprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }
        hammer_lock_sh(&nip->lock);
        hammer_lock_sh(&dip->lock);

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        hammer_finalize_inode(&trans, nip, error);
        if (error)
                kprintf("hammer_ip_add_directory error %d\n", error);
        hammer_unlock(&dip->lock);
        hammer_unlock(&nip->lock);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

#if 0
        if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
            (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
            ip->obj_asof == XXX
        ) {
                /* LAZYMOD XXX */
        }
        hammer_itimes(ap->a_vp);
#endif

        vap->va_fsid = ip->hmp->fsid_udev;
        vap->va_fileid = ip->ino_rec.base.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_rec.ino_nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_rec.ino_size;
        hammer_to_timespec(ip->ino_rec.ino_atime, &vap->va_atime);
        hammer_to_timespec(ip->ino_rec.ino_mtime, &vap->va_mtime);
        hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
        vap->va_bytes = (ip->ino_rec.ino_size + 63) & ~63;
        vap->va_type = hammer_get_vnode_type(ip->ino_rec.base.base.obj_type);
        vap->va_filerev = 0;    /* XXX */
        /* mtime uniquely identifies any adjustments made to the file */
        vap->va_fsmid = ip->ino_rec.ino_mtime;
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_rec.base.base.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }

        return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        union hammer_record_ondisk *rec;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        u_int64_t obj_id;

        /*
         * Misc initialization, plus handle as-of name extensions.  Look for
         * the '@@' extension.  Note that as-of files and directories cannot
         * be modified.
         */
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        nlen = ncp->nc_nlen;
        flags = dip->flags;

        hammer_simple_transaction(&trans, dip->hmp);

        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        asof = hammer_str_to_tid(ncp->nc_name + i + 2);
                        flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;
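        /*
         * Note: nlen is clipped at the '@@' so only the real name part
         * participates in the lookup; the as-of TID parsed from the
         * extension overrides the directory's asof and forces the
         * result read-only.
         */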

        /*
         * If there is no path component the time extension is relative to
         * dip.
         */
        if (nlen == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id,
                                      asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(ncp->nc_name, nlen);

        error = hammer_init_cursor(&trans, &cursor, &dip->cache[0]);
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
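        /*
         * The name hash occupies the high bits of the key; OR-ing all
         * ones into the low 32 bits makes key_end span every possible
         * chain position for that hash.
         */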
        cursor.asof = asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        if (error == 0)
                error = hammer_ip_first(&cursor, dip);

        rec = NULL;
        obj_id = 0;

        while (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error)
                        break;
                rec = cursor.record;
                if (nlen == rec->entry.base.data_len &&
                    bcmp(ncp->nc_name, cursor.data, nlen) == 0) {
                        obj_id = rec->entry.obj_id;
                        break;
                }
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);
        if (error == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1],
                                      obj_id, asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
        } else if (error == ENOENT) {
                cache_setvp(ap->a_nch, NULL);
        }
done:
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root; instead, it could indicate that the directory we were in
 * was removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        int64_t parent_obj_id;
        hammer_tid_t asof;
        int error;

        dip = VTOI(ap->a_dvp);
        asof = dip->obj_asof;
        parent_obj_id = dip->ino_data.parent_obj_id;

        if (parent_obj_id == 0) {
                if (dip->obj_id == HAMMER_OBJID_ROOT &&
                   asof != dip->hmp->asof) {
                        parent_obj_id = dip->obj_id;
                        asof = dip->hmp->asof;
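                        /*
                         * 19 bytes: "0x" + 16 hex digits + NUL for the
                         * synthetic as-of component name.
                         */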
                        *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
                        ksnprintf(*ap->a_fakename, 19, "0x%016llx",
                                   dip->obj_asof);
                } else {
                        *ap->a_vpp = NULL;
                        return ENOENT;
                }
        }

        hammer_simple_transaction(&trans, dip->hmp);

        ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id,
                              asof, dip->flags, &error);
        if (ip) {
                error = hammer_get_vnode(ip, LK_EXCLUSIVE, ap->a_vpp);
                hammer_rel_inode(ip, 0);
        } else {
                *ap->a_vpp = NULL;
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        ip = VTOI(ap->a_vp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Add the filesystem object to the directory.  Note that neither
         * dip nor ip are referenced or locked, but their vnodes are
         * referenced.  This function will bump the inode's link count.
         */
        hammer_lock_sh(&ip->lock);
        hammer_lock_sh(&dip->lock);
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
        hammer_unlock(&dip->lock);
        hammer_unlock(&ip->lock);

        /*
         * Finish up.
         */
        if (error == 0) {
                cache_setunresolved(nch);
                cache_setvp(nch, ap->a_vp);
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                kprintf("hammer_mkdir error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }
        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        hammer_lock_sh(&nip->lock);
        hammer_lock_sh(&dip->lock);
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        hammer_finalize_inode(&trans, nip, error);
        hammer_unlock(&dip->lock);
        hammer_unlock(&nip->lock);
        if (error)
                kprintf("hammer_mkdir (add) error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        hammer_lock_sh(&nip->lock);
        hammer_lock_sh(&dip->lock);
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        hammer_finalize_inode(&trans, nip, error);
        hammer_unlock(&dip->lock);
        hammer_unlock(&nip->lock);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
        if ((ap->a_mode & FWRITE) && (VTOI(ap->a_vp)->flags & HAMMER_INODE_RO))
                return (EROFS);

        return(vop_stdopen(ap));
}

/*
 * hammer_vop_pathconf { vp, name, retval }
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
        return EOPNOTSUPP;
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
        return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        struct uio *uio;
        hammer_record_ondisk_t rec;
        hammer_base_elm_t base;
        int error;
        int cookie_index;
        int ncookies;
        off_t *cookies;
        off_t saveoff;
        int r;

        ip = VTOI(ap->a_vp);
        uio = ap->a_uio;
        saveoff = uio->uio_offset;

        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
                cookie_index = 0;
        } else {
                ncookies = -1;
                cookies = NULL;
                cookie_index = 0;
        }

        hammer_simple_transaction(&trans, ip->hmp);

        /*
         * Handle artificial entries
         */
        error = 0;
        if (saveoff == 0) {
                r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }
        if (saveoff == 1) {
                if (ip->ino_data.parent_obj_id) {
                        r = vop_write_dirent(&error, uio,
                                             ip->ino_data.parent_obj_id,
                                             DT_DIR, 2, "..");
                } else {
                        r = vop_write_dirent(&error, uio,
                                             ip->obj_id, DT_DIR, 2, "..");
                }
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }
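        /*
         * Note: offsets 0 and 1 are reserved for the synthetic "." and
         * ".." entries; from here on the uio offset and the returned
         * cookies are the 64 bit directory keys themselves.
         */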

        /*
         * Key range (begin and end inclusive) to scan.  Directory keys
         * directly translate to a 64 bit 'seek' position.
         */
        hammer_init_cursor(&trans, &cursor, &ip->cache[0]);
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = saveoff;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key = HAMMER_MAX_KEY;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        error = hammer_ip_first(&cursor, ip);

        while (error == 0) {
                error = hammer_ip_resolve_record_and_data(&cursor);
                if (error)
                        break;
                rec = cursor.record;
                base = &rec->base.base;
                saveoff = base->key;

                if (base->obj_id != ip->obj_id)
                        panic("readdir: bad record at %p", cursor.node);

                r = vop_write_dirent(
                             &error, uio, rec->entry.obj_id,
                             hammer_get_dtype(rec->entry.base.base.obj_type),
                             rec->entry.base.data_len,
                             (void *)cursor.data);
                if (r)
                        break;
                ++saveoff;
                if (cookies)
                        cookies[cookie_index] = base->key;
                ++cookie_index;
                if (cookie_index == ncookies)
                        break;
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);

done:
        hammer_done_transaction(&trans);

        if (ap->a_eofflag)
                *ap->a_eofflag = (error == ENOENT);
        uio->uio_offset = saveoff;
        if (error && cookie_index == 0) {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        int error;

        ip = VTOI(ap->a_vp);

        hammer_simple_transaction(&trans, ip->hmp);

        hammer_init_cursor(&trans, &cursor, &ip->cache[0]);

        /*
         * Key range (begin and end inclusive) to scan.  Directory keys
         * directly translate to a 64 bit 'seek' position.
         */
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        error = hammer_ip_lookup(&cursor, ip);
        if (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error == 0) {
                        error = uiomove((char *)cursor.data,
                                        cursor.record->base.data_len,
                                        ap->a_uio);
                }
        }
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
        return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
        struct hammer_transaction trans;
        int error;

        hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
        hammer_done_transaction(&trans);

        return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *fncp;
        struct namecache *tncp;
        struct hammer_inode *fdip;
        struct hammer_inode *tdip;
        struct hammer_inode *ip;
        struct hammer_cursor cursor;
        union hammer_record_ondisk *rec;
        int64_t namekey;
        int error;

        fdip = VTOI(ap->a_fdvp);
        tdip = VTOI(ap->a_tdvp);
        fncp = ap->a_fnch->ncp;
        tncp = ap->a_tnch->ncp;
        ip = VTOI(fncp->nc_vp);
        KKASSERT(ip != NULL);

        if (fdip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (tdip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        hammer_start_transaction(&trans, fdip->hmp);

        hammer_lock_sh(&ip->lock);
        if (fdip->obj_id < tdip->obj_id) {
                hammer_lock_sh(&fdip->lock);
                hammer_lock_sh(&tdip->lock);
        } else {
                hammer_lock_sh(&tdip->lock);
                hammer_lock_sh(&fdip->lock);
        }
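        /*
         * Note: the two directory inodes are always locked in obj_id
         * order so concurrent renames in opposite directions cannot
         * deadlock against each other.
         */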

        /*
         * Remove tncp from the target directory and then link ip as
         * tncp. XXX pass trans to dounlink
         *
         * Force the inode sync-time to match the transaction so it is
         * in-sync with the creation of the target directory entry.
         */
        error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
        if (error == 0 || error == ENOENT) {
                error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
                if (error == 0) {
                        ip->ino_data.parent_obj_id = tdip->obj_id;
                        hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
                }
        }
        if (error)
                goto failed; /* XXX */

        /*
         * Locate the record in the originating directory and remove it.
         *
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
        hammer_init_cursor(&trans, &cursor, &fdip->cache[0]);
        cursor.key_beg.obj_id = fdip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
        cursor.asof = fdip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        error = hammer_ip_first(&cursor, fdip);
        while (error == 0) {
                if (hammer_ip_resolve_data(&cursor) != 0)
                        break;
                rec = cursor.record;
                if (fncp->nc_nlen == rec->entry.base.data_len &&
                    bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) {
                        break;
                }
                error = hammer_ip_next(&cursor);
        }

        /*
         * If all is ok we have to get the inode so we can adjust nlinks.
         *
         * WARNING: hammer_ip_del_directory() may have to terminate the
         * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
         * twice.
         */
        if (error == 0)
                error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

        /*
         * XXX A deadlock here will break rename's atomicity for the
         * purposes of crash recovery.
         */
        if (error == EDEADLK) {
                hammer_unlock(&ip->lock);
                hammer_unlock(&fdip->lock);
                hammer_unlock(&tdip->lock);
                hammer_done_cursor(&cursor);
                hammer_lock_sh(&ip->lock);
                if (fdip->obj_id < tdip->obj_id) {
                        hammer_lock_sh(&fdip->lock);
                        hammer_lock_sh(&tdip->lock);
                } else {
                        hammer_lock_sh(&tdip->lock);
                        hammer_lock_sh(&fdip->lock);
                }
                goto retry;
        }
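        /*
         * Note: on EDEADLK everything is unwound (all three locks dropped
         * and the cursor terminated) and the directory scan is restarted
         * from scratch with the locks reacquired in canonical order.
         */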

        /*
         * Cleanup and tell the kernel that the rename succeeded.
         */
        hammer_done_cursor(&cursor);
        if (error == 0)
                cache_rename(ap->a_fnch, ap->a_tnch);

failed:
        hammer_unlock(&ip->lock);
        hammer_unlock(&fdip->lock);
        hammer_unlock(&tdip->lock);
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
        struct hammer_transaction trans;
        int error;

        hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
        hammer_done_transaction(&trans);

        return (error);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
        struct hammer_transaction trans;
        struct vattr *vap;
        struct hammer_inode *ip;
        int modflags;
        int error;
        int truncating;
        off_t aligned_size;
        u_int32_t flags;
        uuid_t uuid;

        vap = ap->a_vap;
        ip = ap->a_vp->v_data;
        modflags = 0;

        if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
                return(EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        hammer_start_transaction(&trans, ip->hmp);
        hammer_lock_sh(&ip->lock);
        error = 0;

        if (vap->va_flags != VNOVAL) {
                flags = ip->ino_data.uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                         hammer_to_unix_xid(&ip->ino_data.uid),
                                         ap->a_cred);
                if (error == 0) {
                        if (ip->ino_data.uflags != flags) {
                                ip->ino_data.uflags = flags;
                                modflags |= HAMMER_INODE_DDIRTY;
                        }
                        if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }
        if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }
        if (vap->va_uid != (uid_t)VNOVAL) {
                hammer_guid_to_uuid(&uuid, vap->va_uid);
                if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) != 0) {
                        ip->ino_data.uid = uuid;
                        modflags |= HAMMER_INODE_DDIRTY;
                }
        }
        if (vap->va_gid != (uid_t)VNOVAL) {
                hammer_guid_to_uuid(&uuid, vap->va_gid);
                if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) != 0) {
                        ip->ino_data.gid = uuid;
                        modflags |= HAMMER_INODE_DDIRTY;
                }
        }
        while (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
                switch(ap->a_vp->v_type) {
                case VREG:
                        if (vap->va_size == ip->ino_rec.ino_size)
                                break;
                        /*
                         * XXX breaks atomicity; we can deadlock the
                         * backend if we do not release the lock.
                         * Probably not a big deal here.
                         */
                        hammer_unlock(&ip->lock);
                        if (vap->va_size < ip->ino_rec.ino_size) {
                                vtruncbuf(ap->a_vp, vap->va_size,
                                          HAMMER_BUFSIZE);
                                truncating = 1;
                        } else {
                                vnode_pager_setsize(ap->a_vp, vap->va_size);
                                truncating = 0;
                        }
                        hammer_lock_sh(&ip->lock);
                        ip->ino_rec.ino_size = vap->va_size;
                        modflags |= HAMMER_INODE_RDIRTY;
                        aligned_size = (vap->va_size + HAMMER_BUFMASK) &
                                       ~HAMMER_BUFMASK64;

                        /*
                         * on-media truncation is cached in the inode until
                         * the inode is synchronized.
                         */
                        if (truncating) {
                                if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
                                        ip->flags |= HAMMER_INODE_TRUNCATED;
                                        ip->trunc_off = vap->va_size;
                                } else if (ip->trunc_off > vap->va_size) {
                                        ip->trunc_off = vap->va_size;
                                }
                        }
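                        /*
                         * Note: successive truncations merge; until the
                         * inode is flushed, trunc_off only moves downward,
                         * so the backend need only execute one on-media
                         * truncation at the lowest offset seen.
                         */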

                        /*
                         * If truncating we have to clean out a portion of
                         * the last block on-disk.  We do this in the
                         * front-end buffer cache.
                         */
                        if (truncating && vap->va_size < aligned_size) {
                                struct buf *bp;
                                int offset;

                                offset = vap->va_size & HAMMER_BUFMASK;
                                error = bread(ap->a_vp,
                                              aligned_size - HAMMER_BUFSIZE,
                                              HAMMER_BUFSIZE, &bp);
                                if (error == 0) {
                                        bzero(bp->b_data + offset,
                                              HAMMER_BUFSIZE - offset);
                                        bdwrite(bp);
                                } else {
                                        brelse(bp);
                                }
                        }
                        break;
                case VDATABASE:
                        if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
                                ip->flags |= HAMMER_INODE_TRUNCATED;
                                ip->trunc_off = vap->va_size;
                        } else if (ip->trunc_off > vap->va_size) {
                                ip->trunc_off = vap->va_size;
                        }
                        ip->ino_rec.ino_size = vap->va_size;
                        modflags |= HAMMER_INODE_RDIRTY;
                        break;
                default:
                        error = EINVAL;
                        goto done;
                }
                break;
        }
        if (vap->va_atime.tv_sec != VNOVAL) {
                ip->ino_rec.ino_atime =
                        hammer_timespec_to_transid(&vap->va_atime);
                modflags |= HAMMER_INODE_ITIMES;
        }
        if (vap->va_mtime.tv_sec != VNOVAL) {
                ip->ino_rec.ino_mtime =
                        hammer_timespec_to_transid(&vap->va_mtime);
                modflags |= HAMMER_INODE_ITIMES;
        }
        if (vap->va_mode != (mode_t)VNOVAL) {
                if (ip->ino_data.mode != vap->va_mode) {
                        ip->ino_data.mode = vap->va_mode;
                        modflags |= HAMMER_INODE_DDIRTY;
                }
        }
done:
        if (error == 0)
                hammer_modify_inode(&trans, ip, modflags);
        hammer_unlock(&ip->lock);
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        hammer_record_t record;
        int error;
        int bytes;

        ap->a_vap->va_type = VLNK;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */

        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
1574         }
1575
1576         /*
1577          * Add the new filesystem object to the directory.  This will also
1578          * bump the inode's link count.
1579          */
1580         hammer_lock_sh(&nip->lock);
1581         hammer_lock_sh(&dip->lock);
1582         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1583
1584         /*
1585          * Add a record representing the symlink.  The symlink is stored
1586          * as pure data, not a string, and is not \0-terminated.
1587          */
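             /*
              * (The fixed key assigned below is the same key the readlink
              * path later uses to look the link data back up.)
              */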
1588         if (error == 0) {
1589                 record = hammer_alloc_mem_record(nip);
1590                 bytes = strlen(ap->a_target);
1591
1592                 record->rec.base.base.key = HAMMER_FIXKEY_SYMLINK;
1593                 record->rec.base.base.rec_type = HAMMER_RECTYPE_FIX;
1594                 record->rec.base.data_len = bytes;
1595                 record->data = (void *)ap->a_target;
1596                 /* will be reallocated by routine below */
1597                 error = hammer_ip_add_record(&trans, record);
1598
1599                 /*
1600                  * Set the file size to the length of the link.
1601                  */
1602                 if (error == 0) {
1603                         nip->ino_rec.ino_size = bytes;
1604                         hammer_modify_inode(&trans, nip, HAMMER_INODE_RDIRTY);
1605                 }
1606         }
1607         hammer_finalize_inode(&trans, nip, error);
1608         hammer_unlock(&dip->lock);
1609         hammer_unlock(&nip->lock);
1610
1611         /*
1612          * Finish up.
1613          */
1614         if (error) {
1615                 hammer_rel_inode(nip, 0);
1616                 *ap->a_vpp = NULL;
1617         } else {
1618                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
1619                 hammer_rel_inode(nip, 0);
1620                 if (error == 0) {
1621                         cache_setunresolved(ap->a_nch);
1622                         cache_setvp(ap->a_nch, *ap->a_vpp);
1623                 }
1624         }
1625         hammer_done_transaction(&trans);
1626         return (error);
1627 }
1628
1629 /*
1630  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1631  */
1632 static
1633 int
1634 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1635 {
1636         struct hammer_transaction trans;
1637         int error;
1638
1639         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1640         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1641                                 ap->a_cred, ap->a_flags);
1642         hammer_done_transaction(&trans);
1643
1644         return (error);
1645 }
1646
1647 /*
1648  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1649  */
1650 static
1651 int
1652 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1653 {
1654         struct hammer_inode *ip = ap->a_vp->v_data;
1655
1656         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1657                             ap->a_fflag, ap->a_cred));
1658 }
1659
1660 static
1661 int
1662 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1663 {
1664         struct mount *mp;
1665         int error;
1666
1667         mp = ap->a_head.a_ops->head.vv_mount;
1668
1669         switch(ap->a_op) {
1670         case MOUNTCTL_SET_EXPORT:
1671                 if (ap->a_ctllen != sizeof(struct export_args))
1672                         error = EINVAL;
1673                 else
1674                         error = hammer_vfs_export(mp, ap->a_op,
1675                                               (const struct export_args *)ap->a_ctl);
1675                 break;
1676         default:
1677                 error = journal_mountctl(ap);
1678                 break;
1679         }
1680         return(error);
1681 }
1682
1683 /*
1684  * hammer_vop_strategy { vp, bio }
1685  *
1686  * Strategy call, used for regular file read & write only.  Note that the
1687  * bp may represent a cluster.
1688  *
1689  * To simplify operation and allow better optimizations in the future,
1690  * this code does not make any assumptions with regards to buffer alignment
1691  * or size.
1692  */
1693 static
1694 int
1695 hammer_vop_strategy(struct vop_strategy_args *ap)
1696 {
1697         struct buf *bp;
1698         int error;
1699
1700         bp = ap->a_bio->bio_buf;
1701
1702         switch(bp->b_cmd) {
1703         case BUF_CMD_READ:
1704                 error = hammer_vop_strategy_read(ap);
1705                 break;
1706         case BUF_CMD_WRITE:
1707                 error = hammer_vop_strategy_write(ap);
1708                 break;
1709         default:
1710                 bp->b_error = error = EINVAL;
1711                 bp->b_flags |= B_ERROR;
1712                 biodone(ap->a_bio);
1713                 break;
1714         }
1715         return (error);
1716 }
1717
1718 /*
1719  * Read from a regular file.  Iterate the related records and fill in the
1720  * BIO/BUF.  Gaps are zero-filled.
1721  *
1722  * The support code in hammer_object.c should be used to deal with mixed
1723  * in-memory and on-disk records.
1724  *
1725  * XXX atime update
1726  */
1727 static
1728 int
1729 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1730 {
1731         struct hammer_transaction trans;
1732         struct hammer_inode *ip;
1733         struct hammer_cursor cursor;
1734         hammer_record_ondisk_t rec;
1735         hammer_base_elm_t base;
1736         struct bio *bio;
1737         struct buf *bp;
1738         int64_t rec_offset;
1739         int64_t ran_end;
1740         int64_t tmp64;
1741         int error;
1742         int boff;
1743         int roff;
1744         int n;
1745
1746         bio = ap->a_bio;
1747         bp = bio->bio_buf;
1748         ip = ap->a_vp->v_data;
1749
1750         hammer_simple_transaction(&trans, ip->hmp);
1751         hammer_init_cursor(&trans, &cursor, &ip->cache[0]);
1752
1753         /*
1754          * Key range (begin and end inclusive) to scan.  Note that the keys
1755          * stored in the actual records represent BASE+LEN, not BASE.  The
1756          * first record containing bio_offset will have a key > bio_offset.
1757          */
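             /*
              * For example, a 16K record covering file bytes 0-16383 is
              * keyed at 16384, so a scan beginning at bio_offset 0 starts
              * at key 1 and still matches that record.
              */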
1758         cursor.key_beg.obj_id = ip->obj_id;
1759         cursor.key_beg.create_tid = 0;
1760         cursor.key_beg.delete_tid = 0;
1761         cursor.key_beg.obj_type = 0;
1762         cursor.key_beg.key = bio->bio_offset + 1;
1763         cursor.asof = ip->obj_asof;
1764         cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
1765
1766         cursor.key_end = cursor.key_beg;
1767         KKASSERT(ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_REGFILE);
1768 #if 0
1769         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1770                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1771                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1772                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1773         } else
1774 #endif
1775         {
1776                 ran_end = bio->bio_offset + bp->b_bufsize;
1777                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1778                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1779                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1780                 if (tmp64 < ran_end)
1781                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1782                 else
1783                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1784         }
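             /*
              * The end key is padded by MAXPHYS: because keys encode
              * BASE+LEN, a record whose data begins inside the request
              * can carry a key beyond ran_end by up to the largest
              * record size, which MAXPHYS conservatively bounds.
              */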
1785         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1786
1787         error = hammer_ip_first(&cursor, ip);
1788         boff = 0;
1789
1790         while (error == 0) {
1791                 error = hammer_ip_resolve_data(&cursor);
1792                 if (error)
1793                         break;
1794                 rec = cursor.record;
1795                 base = &rec->base.base;
1796
1797                 rec_offset = base->key - rec->data.base.data_len;
1798
1799                 /*
1800                  * Calculate the gap, if any, and zero-fill it.
1801                  */
1802                 n = (int)(rec_offset - (bio->bio_offset + boff));
1803                 if (n > 0) {
1804                         if (n > bp->b_bufsize - boff)
1805                                 n = bp->b_bufsize - boff;
1806                         bzero((char *)bp->b_data + boff, n);
1807                         boff += n;
1808                         n = 0;
1809                 }
1810
1811                 /*
1812                  * Calculate the data offset in the record and the number
1813                  * of bytes we can copy.  n is <= 0 at this point, so
1814                  * roff = -n below is the offset into the record's data.
1814                  *
1815                  * Note there is a degenerate case here where boff may
1816                  * already be at bp->b_bufsize.
1817                  */
1818                 roff = -n;
1819                 rec_offset += roff;
1820                 n = rec->data.base.data_len - roff;
1821                 KKASSERT(n > 0);
1822                 if (n > bp->b_bufsize - boff)
1823                         n = bp->b_bufsize - boff;
1824
1825                 /*
1826                  * If we cached a truncation point on our front-end the
1827                  * on-disk version may still have physical records beyond
1828                  * that point.  Truncate visibility.
1829                  */
1830                 if (ip->trunc_off <= rec_offset)
1831                         n = 0;
1832                 else if (ip->trunc_off < rec_offset + n)
1833                         n = (int)(ip->trunc_off - rec_offset);
1834
1835                 /*
1836                  * Copy
1837                  */
1838                 if (n) {
1839                         bcopy((char *)cursor.data + roff,
1840                               (char *)bp->b_data + boff, n);
1841                         boff += n;
1842                 }
1843                 if (boff == bp->b_bufsize)
1844                         break;
1845                 error = hammer_ip_next(&cursor);
1846         }
1847         hammer_done_cursor(&cursor);
1848         hammer_done_transaction(&trans);
1849
1850         /*
1851          * There may have been a gap after the last record.
1852          */
1853         if (error == ENOENT)
1854                 error = 0;
1855         if (error == 0 && boff != bp->b_bufsize) {
1856                 KKASSERT(boff < bp->b_bufsize);
1857                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
1858                 /* boff = bp->b_bufsize; */
1859         }
1860         bp->b_resid = 0;
1861         bp->b_error = error;
1862         if (error)
1863                 bp->b_flags |= B_ERROR;
1864         biodone(ap->a_bio);
1865         return(error);
1866 }
1867
1868 /*
1869  * Write to a regular file.   Because this is a strategy call the OS is
1870  * trying to actually sync data to the media.   HAMMER can only flush
1871  * the entire inode (so the TID remains properly synchronized).
1872  *
1873  * Basically all we do here is place the bio on the inode's flush queue
1874  * and activate the flusher.
1875  */
1876 static
1877 int
1878 hammer_vop_strategy_write(struct vop_strategy_args *ap)
1879 {
1880         hammer_inode_t ip;
1881         struct bio *bio;
1882         struct buf *bp;
1883
1884         bio = ap->a_bio;
1885         bp = bio->bio_buf;
1886         ip = ap->a_vp->v_data;
1887
1888         if (ip->flags & HAMMER_INODE_RO) {
1889                 bp->b_error = EROFS;
1890                 bp->b_flags |= B_ERROR;
1891                 biodone(ap->a_bio);
1892                 return(EROFS);
1893         }
1894
1895         /*
1896          * If the inode is being flushed we cannot re-queue buffers
1897          * it may already have flushed, or we could wind up with
1898          * duplicate records in the database.
1899          */
1900         BUF_KERNPROC(bp);
1901         if (ip->flush_state == HAMMER_FST_FLUSH)
1902                 TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, bio_act);
1903         else
1904                 TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
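             /*
              * (Bios placed on bio_alt_list are picked up by a later
              * flush cycle once the current one completes.)
              */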
1905         hammer_modify_inode(NULL, ip, HAMMER_INODE_XDIRTY);
1906         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1908         return(0);
1909 }
1910
1911 /*
1912  * Backend code which actually performs the write to the media.  This
1913  * routine is typically called from the flusher.  The bio will be disposed
1914  * of (biodone'd) by this routine.
1915  *
1916  * Iterate the related records and mark for deletion.  If existing edge
1917  * records (left and right side) overlap our write they have to be marked
1918  * deleted and new records created, usually referencing a portion of the
1919  * original data.  Then add a record to represent the buffer.
1920  */
1921 int
1922 hammer_dowrite(hammer_transaction_t trans, hammer_inode_t ip, struct bio *bio)
1923 {
1924         struct buf *bp = bio->bio_buf;
1925         int error;
1926
1927         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
1928
1929         /*
1930          * Delete any records overlapping our range.  This function will
1931          * (eventually) properly truncate partial overlaps.
1932          */
1933         if (ip->sync_ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1934                 error = hammer_ip_delete_range(trans, ip, bio->bio_offset,
1935                                                bio->bio_offset);
1936         } else {
1937                 error = hammer_ip_delete_range(trans, ip, bio->bio_offset,
1938                                                bio->bio_offset +
1939                                                 bp->b_bufsize - 1);
1940         }
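             /*
              * For a DB file bio_offset is the record key itself, so the
              * range above degenerates to a single key; for a regular
              * file it spans every byte the buffer overlaps.
              */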
1941
1942         /*
1943          * Add a single record to cover the write.  We can write a record
1944          * with only the actual file data - for example, a small 200 byte
1945          * file does not have to write out a 16K record.
1946          *
1947          * While the data size does not have to be aligned, we still do it
1948          * to reduce fragmentation in a future allocation model.
1949          */
1950         if (error == 0) {
1951                 int limit_size;
1952
1953                 if (ip->sync_ino_rec.ino_size - bio->bio_offset >
1954                     bp->b_bufsize) {
1955                         limit_size = bp->b_bufsize;
1956                 } else {
1957                         limit_size = (int)(ip->sync_ino_rec.ino_size -
1958                                            bio->bio_offset);
1959                         KKASSERT(limit_size >= 0);
1960                         limit_size = (limit_size + 63) & ~63;
1961                 }
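                     /*
                      * (E.g. for the 200 byte file mentioned above the
                      * write is limited to (200 + 63) & ~63 = 256 bytes
                      * rather than a full 16K buffer.)
                      */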
1962                 error = hammer_ip_sync_data(trans, ip, bio->bio_offset,
1963                                             bp->b_data, limit_size);
1964         }
1965
1966         if (error) {
1967                 bp->b_resid = bp->b_bufsize;
1968                 bp->b_error = error;
1969                 bp->b_flags |= B_ERROR;
1970         } else {
1971                 bp->b_resid = 0;
1972         }
1973         biodone(bio);
1974         return(error);
1975 }
1976
1977 /*
1978  * dounlink - disconnect a directory entry
1979  *
1980  * XXX whiteout support not really in yet
1981  */
1982 static int
1983 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
1984                 struct vnode *dvp, struct ucred *cred, int flags)
1985 {
1986         struct namecache *ncp;
1987         hammer_inode_t dip;
1988         hammer_inode_t ip;
1989         hammer_record_ondisk_t rec;
1990         struct hammer_cursor cursor;
1991         int64_t namekey;
1992         int error;
1993
1994         /*
1995          * Calculate the namekey and setup the key range for the scan.  This
1996          * works kinda like a chained hash table where the lower 32 bits
1997          * of the namekey synthesize the chain.
1998          *
1999          * The key range is inclusive of both key_beg and key_end.
2000          */
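             /*
              * (The namekey's low 32 bits start out zero and act as the
              * collision chain, which is why key_end below ORs in
              * 0xFFFFFFFF to sweep every entry sharing the upper bits.)
              */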
2001         dip = VTOI(dvp);
2002         ncp = nch->ncp;
2003
2004         if (dip->flags & HAMMER_INODE_RO)
2005                 return (EROFS);
2006
2007         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2008 retry:
2009         hammer_init_cursor(trans, &cursor, &dip->cache[0]);
2010         cursor.key_beg.obj_id = dip->obj_id;
2011         cursor.key_beg.key = namekey;
2012         cursor.key_beg.create_tid = 0;
2013         cursor.key_beg.delete_tid = 0;
2014         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2015         cursor.key_beg.obj_type = 0;
2016
2017         cursor.key_end = cursor.key_beg;
2018         cursor.key_end.key |= 0xFFFFFFFFULL;
2019         cursor.asof = dip->obj_asof;
2020         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2021
2022         /*
2023          * Scan all matching records (the chain), locate the one matching
2024          * the requested path component.  info->last_error contains the
2025          * the requested path component.  error holds the result of
2026          * the search on termination and could be 0, ENOENT, or
2027          * something else.
2028          * The hammer_ip_*() functions merge in-memory records with on-disk
2029          * records for the purposes of the search.
2030          */
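             /*
              * A directory entry stores the file name as its record data,
              * so a match requires both the length and the name bytes to
              * compare equal.
              */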
2031         error = hammer_ip_first(&cursor, dip);
2032         while (error == 0) {
2033                 error = hammer_ip_resolve_data(&cursor);
2034                 if (error)
2035                         break;
2036                 rec = cursor.record;
2037                 if (ncp->nc_nlen == rec->entry.base.data_len &&
2038                     bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) {
2039                         break;
2040                 }
2041                 error = hammer_ip_next(&cursor);
2042         }
2043
2044         /*
2045          * If all is ok we have to get the inode so we can adjust nlinks.
2046          *
2047          * If the target is a directory, it must be empty.
2048          */
2049         if (error == 0) {
2050                 ip = hammer_get_inode(trans, &dip->cache[1],
2051                                       rec->entry.obj_id,
2052                                       dip->hmp->asof, 0, &error);
2053                 if (error == ENOENT) {
2054                         kprintf("obj_id %016llx\n", rec->entry.obj_id);
2055                         Debugger("ENOENT unlinking object that should exist");
2056                 }
2057                 if (error == 0 && ip->ino_rec.base.base.obj_type ==
2058                                   HAMMER_OBJTYPE_DIRECTORY) {
2059                         error = hammer_ip_check_directory_empty(trans, ip);
2060                 }
2061                 /*
2062                  * WARNING: hammer_ip_del_directory() may have to terminate
2063                  * the cursor to avoid a lock recursion.  It's ok to call
2064                  * hammer_done_cursor() twice.
2065                  */
2066                 if (error == 0) {
2067                         hammer_lock_sh(&ip->lock);
2068                         hammer_lock_sh(&dip->lock);
2069                         error = hammer_ip_del_directory(trans, &cursor,
2070                                                         dip, ip);
2071                         hammer_unlock(&dip->lock);
2072                         hammer_unlock(&ip->lock);
2073                 }
2074                 if (error == 0) {
2075                         cache_setunresolved(nch);
2076                         cache_setvp(nch, NULL);
2077                         /* XXX locking */
2078                         if (ip->vp)
2079                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2080                 }
2081                 hammer_rel_inode(ip, 0);
2082         }
2083         hammer_done_cursor(&cursor);
2084         if (error == EDEADLK)
2085                 goto retry;
2086
2087         return (error);
2088 }
2089
2090 /************************************************************************
2091  *                          FIFO AND SPECFS OPS                         *
2092  ************************************************************************
2093  *
2094  */
2095
2096 static int
2097 hammer_vop_fifoclose (struct vop_close_args *ap)
2098 {
2099         /* XXX update itimes */
2100         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2101 }
2102
2103 static int
2104 hammer_vop_fiforead (struct vop_read_args *ap)
2105 {
2106         int error;
2107
2108         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2109         /* XXX update access time */
2110         return (error);
2111 }
2112
2113 static int
2114 hammer_vop_fifowrite (struct vop_write_args *ap)
2115 {
2116         int error;
2117
2118         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2119         /* XXX update modification time */
2120         return (error);
2121 }
2122
2123 static int
2124 hammer_vop_specclose (struct vop_close_args *ap)
2125 {
2126         /* XXX update itimes */
2127         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2128 }
2129
2130 static int
2131 hammer_vop_specread (struct vop_read_args *ap)
2132 {
2133         /* XXX update access time */
2134         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2135 }
2136
2137 static int
2138 hammer_vop_specwrite (struct vop_write_args *ap)
2139 {
2140         /* XXX update last change time */
2141         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2142 }
2143