Fix some NFS related bugs which cause the mount point's mnt_refs counter
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.27 2008/02/05 20:52:01 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
51 /*
52  * USERFS VNOPS
53  */
54 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
55 static int hammer_vop_fsync(struct vop_fsync_args *);
56 static int hammer_vop_read(struct vop_read_args *);
57 static int hammer_vop_write(struct vop_write_args *);
58 static int hammer_vop_access(struct vop_access_args *);
59 static int hammer_vop_advlock(struct vop_advlock_args *);
60 static int hammer_vop_close(struct vop_close_args *);
61 static int hammer_vop_ncreate(struct vop_ncreate_args *);
62 static int hammer_vop_getattr(struct vop_getattr_args *);
63 static int hammer_vop_nresolve(struct vop_nresolve_args *);
64 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
65 static int hammer_vop_nlink(struct vop_nlink_args *);
66 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
67 static int hammer_vop_nmknod(struct vop_nmknod_args *);
68 static int hammer_vop_open(struct vop_open_args *);
69 static int hammer_vop_pathconf(struct vop_pathconf_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_setattr(struct vop_setattr_args *);
77 static int hammer_vop_strategy(struct vop_strategy_args *);
78 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
79 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
80 static int hammer_vop_ioctl(struct vop_ioctl_args *);
81 static int hammer_vop_mountctl(struct vop_mountctl_args *);
82
83 static int hammer_vop_fifoclose (struct vop_close_args *);
84 static int hammer_vop_fiforead (struct vop_read_args *);
85 static int hammer_vop_fifowrite (struct vop_write_args *);
86
87 static int hammer_vop_specclose (struct vop_close_args *);
88 static int hammer_vop_specread (struct vop_read_args *);
89 static int hammer_vop_specwrite (struct vop_write_args *);
90
91 struct vop_ops hammer_vnode_vops = {
92         .vop_default =          vop_defaultop,
93         .vop_fsync =            hammer_vop_fsync,
94         .vop_getpages =         vop_stdgetpages,
95         .vop_putpages =         vop_stdputpages,
96         .vop_read =             hammer_vop_read,
97         .vop_write =            hammer_vop_write,
98         .vop_access =           hammer_vop_access,
99         .vop_advlock =          hammer_vop_advlock,
100         .vop_close =            hammer_vop_close,
101         .vop_ncreate =          hammer_vop_ncreate,
102         .vop_getattr =          hammer_vop_getattr,
103         .vop_inactive =         hammer_vop_inactive,
104         .vop_reclaim =          hammer_vop_reclaim,
105         .vop_nresolve =         hammer_vop_nresolve,
106         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
107         .vop_nlink =            hammer_vop_nlink,
108         .vop_nmkdir =           hammer_vop_nmkdir,
109         .vop_nmknod =           hammer_vop_nmknod,
110         .vop_open =             hammer_vop_open,
111         .vop_pathconf =         hammer_vop_pathconf,
112         .vop_print =            hammer_vop_print,
113         .vop_readdir =          hammer_vop_readdir,
114         .vop_readlink =         hammer_vop_readlink,
115         .vop_nremove =          hammer_vop_nremove,
116         .vop_nrename =          hammer_vop_nrename,
117         .vop_nrmdir =           hammer_vop_nrmdir,
118         .vop_setattr =          hammer_vop_setattr,
119         .vop_strategy =         hammer_vop_strategy,
120         .vop_nsymlink =         hammer_vop_nsymlink,
121         .vop_nwhiteout =        hammer_vop_nwhiteout,
122         .vop_ioctl =            hammer_vop_ioctl,
123         .vop_mountctl =         hammer_vop_mountctl
124 };
125
126 struct vop_ops hammer_spec_vops = {
127         .vop_default =          spec_vnoperate,
128         .vop_fsync =            hammer_vop_fsync,
129         .vop_read =             hammer_vop_specread,
130         .vop_write =            hammer_vop_specwrite,
131         .vop_access =           hammer_vop_access,
132         .vop_close =            hammer_vop_specclose,
133         .vop_getattr =          hammer_vop_getattr,
134         .vop_inactive =         hammer_vop_inactive,
135         .vop_reclaim =          hammer_vop_reclaim,
136         .vop_setattr =          hammer_vop_setattr
137 };
138
139 struct vop_ops hammer_fifo_vops = {
140         .vop_default =          fifo_vnoperate,
141         .vop_fsync =            hammer_vop_fsync,
142         .vop_read =             hammer_vop_fiforead,
143         .vop_write =            hammer_vop_fifowrite,
144         .vop_access =           hammer_vop_access,
145         .vop_close =            hammer_vop_fifoclose,
146         .vop_getattr =          hammer_vop_getattr,
147         .vop_inactive =         hammer_vop_inactive,
148         .vop_reclaim =          hammer_vop_reclaim,
149         .vop_setattr =          hammer_vop_setattr
150 };
151
152 static int hammer_dounlink(struct nchandle *nch, struct vnode *dvp,
153                            struct ucred *cred, int flags);
154 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
155 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
156
157 #if 0
158 static
159 int
160 hammer_vop_vnoperate(struct vop_generic_args *)
161 {
162         return (VOCALL(&hammer_vnode_vops, ap));
163 }
164 #endif
165
166 /*
167  * hammer_vop_fsync { vp, waitfor }
168  */
169 static
170 int
171 hammer_vop_fsync(struct vop_fsync_args *ap)
172 {
173         hammer_inode_t ip;
174         int error;
175
176         ip = VTOI(ap->a_vp);
177         error = hammer_sync_inode(ip, ap->a_waitfor, 0);
178         return (error);
179 }
180
181 /*
182  * hammer_vop_read { vp, uio, ioflag, cred }
183  */
184 static
185 int
186 hammer_vop_read(struct vop_read_args *ap)
187 {
188         struct hammer_transaction trans;
189         hammer_inode_t ip;
190         off_t offset;
191         struct buf *bp;
192         struct uio *uio;
193         int error;
194         int n;
195         int seqcount;
196
197         if (ap->a_vp->v_type != VREG)
198                 return (EINVAL);
199         ip = VTOI(ap->a_vp);
200         error = 0;
201         seqcount = ap->a_ioflag >> 16;
202
203         hammer_start_transaction(&trans, ip->hmp);
204
205         /*
206          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
207          */
208         uio = ap->a_uio;
209         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) {
210                 offset = uio->uio_offset & HAMMER_BUFMASK;
211 #if 0
212                 error = cluster_read(ap->a_vp, ip->ino_rec.ino_size,
213                                      uio->uio_offset - offset, HAMMER_BUFSIZE,
214                                      MAXBSIZE, seqcount, &bp);
215 #endif
216                 error = bread(ap->a_vp, uio->uio_offset - offset,
217                               HAMMER_BUFSIZE, &bp);
218                 if (error) {
219                         brelse(bp);
220                         break;
221                 }
222                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
223                 n = HAMMER_BUFSIZE - offset;
224                 if (n > uio->uio_resid)
225                         n = uio->uio_resid;
226                 if (n > ip->ino_rec.ino_size - uio->uio_offset)
227                         n = (int)(ip->ino_rec.ino_size - uio->uio_offset);
228                 error = uiomove((char *)bp->b_data + offset, n, uio);
229                 if (error) {
230                         bqrelse(bp);
231                         break;
232                 }
233                 if ((ip->flags & HAMMER_INODE_RO) == 0) {
234                         ip->ino_rec.ino_atime = trans.tid;
235                         hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
236                 }
237                 bqrelse(bp);
238         }
239         hammer_commit_transaction(&trans);
240         return (error);
241 }
242
243 /*
244  * hammer_vop_write { vp, uio, ioflag, cred }
245  */
246 static
247 int
248 hammer_vop_write(struct vop_write_args *ap)
249 {
250         struct hammer_transaction trans;
251         struct hammer_inode *ip;
252         struct uio *uio;
253         off_t offset;
254         struct buf *bp;
255         int error;
256         int n;
257         int flags;
258
259         if (ap->a_vp->v_type != VREG)
260                 return (EINVAL);
261         ip = VTOI(ap->a_vp);
262         error = 0;
263
264         if (ip->flags & HAMMER_INODE_RO)
265                 return (EROFS);
266
267         /*
268          * Create a transaction to cover the operations we perform.
269          */
270         hammer_start_transaction(&trans, ip->hmp);
271         uio = ap->a_uio;
272
273         /*
274          * Check append mode
275          */
276         if (ap->a_ioflag & IO_APPEND)
277                 uio->uio_offset = ip->ino_rec.ino_size;
278
279         /*
280          * Check for illegal write offsets.  Valid range is 0...2^63-1
281          */
282         if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
283                 hammer_commit_transaction(&trans);
284                 return (EFBIG);
285         }
286
287         /*
288          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
289          */
290         while (uio->uio_resid > 0) {
291                 int fixsize = 0;
292
293                 offset = uio->uio_offset & HAMMER_BUFMASK;
294                 n = HAMMER_BUFSIZE - offset;
295                 if (n > uio->uio_resid)
296                         n = uio->uio_resid;
297                 if (uio->uio_offset + n > ip->ino_rec.ino_size) {
298                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
299                         fixsize = 1;
300                 }
301
302                 if (uio->uio_segflg == UIO_NOCOPY) {
303                         /*
304                          * Issuing a write with the same data backing the
305                          * buffer.  Instantiate the buffer to collect the
306                          * backing vm pages, then read-in any missing bits.
307                          *
308                          * This case is used by vop_stdputpages().
309                          */
310                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
311                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
312                         if ((bp->b_flags & B_CACHE) == 0) {
313                                 bqrelse(bp);
314                                 error = bread(ap->a_vp,
315                                               uio->uio_offset - offset,
316                                               HAMMER_BUFSIZE, &bp);
317                         }
318                 } else if (offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
319                         /*
320                          * entirely overwrite the buffer
321                          */
322                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
323                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
324                 } else if (offset == 0 && uio->uio_offset >= ip->ino_rec.ino_size) {
325                         /*
326                          * XXX
327                          */
328                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
329                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
330                         vfs_bio_clrbuf(bp);
331                 } else {
332                         /*
333                          * Partial overwrite, read in any missing bits then
334                          * replace the portion being written.
335                          */
336                         error = bread(ap->a_vp, uio->uio_offset - offset,
337                                       HAMMER_BUFSIZE, &bp);
338                         if (error == 0)
339                                 bheavy(bp);
340                 }
341                 if (error == 0)
342                         error = uiomove((char *)bp->b_data + offset, n, uio);
343
344                 /*
345                  * If we screwed up we have to undo any VM size changes we
346                  * made.
347                  */
348                 if (error) {
349                         brelse(bp);
350                         if (fixsize) {
351                                 vtruncbuf(ap->a_vp, ip->ino_rec.ino_size,
352                                           HAMMER_BUFSIZE);
353                         }
354                         break;
355                 }
356                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
357                 if (ip->ino_rec.ino_size < uio->uio_offset) {
358                         ip->ino_rec.ino_size = uio->uio_offset;
359                         flags = HAMMER_INODE_RDIRTY;
360                         vnode_pager_setsize(ap->a_vp, ip->ino_rec.ino_size);
361                 } else {
362                         flags = 0;
363                 }
364                 ip->ino_rec.ino_mtime = trans.tid;
365                 flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
366                 hammer_modify_inode(&trans, ip, flags);
367
368                 /*
369                  * The file write must be tagged with the same TID as the
370                  * inode, for consistency in case the inode changed size.
371                  * This guarantees the on-disk data records will have a
372                  * TID <= the inode TID representing the size change.
373                  *
374                  * If a prior write has not yet flushed, retain its TID.
375                  */
376                 if (bp->b_tid == 0)
377                         bp->b_tid = ip->last_tid;
378
379                 if (ap->a_ioflag & IO_SYNC) {
380                         bwrite(bp);
381                 } else if (ap->a_ioflag & IO_DIRECT) {
382                         bawrite(bp);
383                 } else {
384                         bdwrite(bp);
385                 }
386         }
387         if (error)
388                 hammer_abort_transaction(&trans);
389         else
390                 hammer_commit_transaction(&trans);
391         return (error);
392 }
393
394 /*
395  * hammer_vop_access { vp, mode, cred }
396  */
397 static
398 int
399 hammer_vop_access(struct vop_access_args *ap)
400 {
401         struct hammer_inode *ip = VTOI(ap->a_vp);
402         uid_t uid;
403         gid_t gid;
404         int error;
405
406         uid = hammer_to_unix_xid(&ip->ino_data.uid);
407         gid = hammer_to_unix_xid(&ip->ino_data.gid);
408
409         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
410                                   ip->ino_data.uflags);
411         return (error);
412 }
413
414 /*
415  * hammer_vop_advlock { vp, id, op, fl, flags }
416  */
417 static
418 int
419 hammer_vop_advlock(struct vop_advlock_args *ap)
420 {
421         struct hammer_inode *ip = VTOI(ap->a_vp);
422
423         return (lf_advlock(ap, &ip->advlock, ip->ino_rec.ino_size));
424 }
425
426 /*
427  * hammer_vop_close { vp, fflag }
428  */
429 static
430 int
431 hammer_vop_close(struct vop_close_args *ap)
432 {
433         return (vop_stdclose(ap));
434 }
435
436 /*
437  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
438  *
439  * The operating system has already ensured that the directory entry
440  * does not exist and done all appropriate namespace locking.
441  */
442 static
443 int
444 hammer_vop_ncreate(struct vop_ncreate_args *ap)
445 {
446         struct hammer_transaction trans;
447         struct hammer_inode *dip;
448         struct hammer_inode *nip;
449         struct nchandle *nch;
450         int error;
451
452         nch = ap->a_nch;
453         dip = VTOI(ap->a_dvp);
454
455         if (dip->flags & HAMMER_INODE_RO)
456                 return (EROFS);
457
458         /*
459          * Create a transaction to cover the operations we perform.
460          */
461         hammer_start_transaction(&trans, dip->hmp);
462
463         /*
464          * Create a new filesystem object of the requested type.  The
465          * returned inode will be referenced but not locked.
466          */
467
468         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
469         if (error)
470                 kprintf("hammer_create_inode error %d\n", error);
471         if (error) {
472                 hammer_abort_transaction(&trans);
473                 *ap->a_vpp = NULL;
474                 return (error);
475         }
476
477         /*
478          * Add the new filesystem object to the directory.  This will also
479          * bump the inode's link count.
480          */
481         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
482         if (error)
483                 kprintf("hammer_ip_add_directory error %d\n", error);
484
485         /*
486          * Finish up.
487          */
488         if (error) {
489                 hammer_rel_inode(nip, 0);
490                 hammer_abort_transaction(&trans);
491                 *ap->a_vpp = NULL;
492         } else {
493                 hammer_commit_transaction(&trans);
494                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
495                 hammer_rel_inode(nip, 0);
496                 if (error == 0) {
497                         cache_setunresolved(ap->a_nch);
498                         cache_setvp(ap->a_nch, *ap->a_vpp);
499                 }
500         }
501         return (error);
502 }
503
504 /*
505  * hammer_vop_getattr { vp, vap }
506  */
507 static
508 int
509 hammer_vop_getattr(struct vop_getattr_args *ap)
510 {
511         struct hammer_inode *ip = VTOI(ap->a_vp);
512         struct vattr *vap = ap->a_vap;
513
514 #if 0
515         if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
516             (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
517             ip->obj_asof == XXX
518         ) {
519                 /* LAZYMOD XXX */
520         }
521         hammer_itimes(ap->a_vp);
522 #endif
523
524         vap->va_fsid = ip->hmp->fsid_udev;
525         vap->va_fileid = ip->ino_rec.base.base.obj_id;
526         vap->va_mode = ip->ino_data.mode;
527         vap->va_nlink = ip->ino_rec.ino_nlinks;
528         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
529         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
530         vap->va_rmajor = 0;
531         vap->va_rminor = 0;
532         vap->va_size = ip->ino_rec.ino_size;
533         hammer_to_timespec(ip->ino_rec.ino_atime, &vap->va_atime);
534         hammer_to_timespec(ip->ino_rec.ino_mtime, &vap->va_mtime);
535         hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
536         vap->va_flags = ip->ino_data.uflags;
537         vap->va_gen = 1;        /* hammer inums are unique for all time */
538         vap->va_blocksize = 32768; /* XXX - extract from root volume */
539         vap->va_bytes = ip->ino_rec.ino_size;
540         vap->va_type = hammer_get_vnode_type(ip->ino_rec.base.base.obj_type);
541         vap->va_filerev = 0;    /* XXX */
542         /* mtime uniquely identifies any adjustments made to the file */
543         vap->va_fsmid = ip->ino_rec.ino_mtime;
544         vap->va_uid_uuid = ip->ino_data.uid;
545         vap->va_gid_uuid = ip->ino_data.gid;
546         vap->va_fsid_uuid = ip->hmp->fsid;
547         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
548                           VA_FSID_UUID_VALID;
549
550         switch (ip->ino_rec.base.base.obj_type) {
551         case HAMMER_OBJTYPE_CDEV:
552         case HAMMER_OBJTYPE_BDEV:
553                 vap->va_rmajor = ip->ino_data.rmajor;
554                 vap->va_rminor = ip->ino_data.rminor;
555                 break;
556         default:
557                 break;
558         }
559
560         return(0);
561 }
562
563 /*
564  * hammer_vop_nresolve { nch, dvp, cred }
565  *
566  * Locate the requested directory entry.
567  */
568 static
569 int
570 hammer_vop_nresolve(struct vop_nresolve_args *ap)
571 {
572         struct namecache *ncp;
573         hammer_inode_t dip;
574         hammer_inode_t ip;
575         hammer_tid_t asof;
576         struct hammer_cursor cursor;
577         union hammer_record_ondisk *rec;
578         struct vnode *vp;
579         int64_t namekey;
580         int error;
581         int i;
582         int nlen;
583         int flags;
584         u_int64_t obj_id;
585
586         /*
587          * Misc initialization, plus handle as-of name extensions.  Look for
588          * the '@@' extension.  Note that as-of files and directories cannot
589          * be modified.
590          */
591         dip = VTOI(ap->a_dvp);
592         ncp = ap->a_nch->ncp;
593         asof = dip->obj_asof;
594         nlen = ncp->nc_nlen;
595         flags = dip->flags;
596
597         for (i = 0; i < nlen; ++i) {
598                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
599                         asof = hammer_str_to_tid(ncp->nc_name + i + 2);
600                         flags |= HAMMER_INODE_RO;
601                         break;
602                 }
603         }
604         nlen = i;
605
606         /*
607          * If there is no path component the time extension is relative to
608          * dip.
609          */
610         if (nlen == 0) {
611                 ip = hammer_get_inode(dip->hmp, &dip->cache[1], dip->obj_id,
612                                       asof, flags, &error);
613                 if (error == 0) {
614                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
615                         hammer_rel_inode(ip, 0);
616                 } else {
617                         vp = NULL;
618                 }
619                 if (error == 0) {
620                         vn_unlock(vp);
621                         cache_setvp(ap->a_nch, vp);
622                         vrele(vp);
623                 }
624                 return(error);
625         }
626
627         /*
628          * Calculate the namekey and setup the key range for the scan.  This
629          * works kinda like a chained hash table where the lower 32 bits
630          * of the namekey synthesize the chain.
631          *
632          * The key range is inclusive of both key_beg and key_end.
633          */
634         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
635
636         error = hammer_init_cursor_hmp(&cursor, &dip->cache[0], dip->hmp);
637         cursor.key_beg.obj_id = dip->obj_id;
638         cursor.key_beg.key = namekey;
639         cursor.key_beg.create_tid = 0;
640         cursor.key_beg.delete_tid = 0;
641         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
642         cursor.key_beg.obj_type = 0;
643
644         cursor.key_end = cursor.key_beg;
645         cursor.key_end.key |= 0xFFFFFFFFULL;
646         cursor.asof = asof;
647         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
648
649         /*
650          * Scan all matching records (the chain), locate the one matching
651          * the requested path component.
652          *
653          * The hammer_ip_*() functions merge in-memory records with on-disk
654          * records for the purposes of the search.
655          */
656         if (error == 0)
657                 error = hammer_ip_first(&cursor, dip);
658
659         rec = NULL;
660         obj_id = 0;
661
662         while (error == 0) {
663                 error = hammer_ip_resolve_data(&cursor);
664                 if (error)
665                         break;
666                 rec = cursor.record;
667                 if (nlen == rec->entry.base.data_len &&
668                     bcmp(ncp->nc_name, cursor.data, nlen) == 0) {
669                         obj_id = rec->entry.obj_id;
670                         break;
671                 }
672                 error = hammer_ip_next(&cursor);
673         }
674         hammer_done_cursor(&cursor);
675         if (error == 0) {
676                 ip = hammer_get_inode(dip->hmp, &dip->cache[1],
677                                       obj_id, asof, flags, &error);
678                 if (error == 0) {
679                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
680                         hammer_rel_inode(ip, 0);
681                 } else {
682                         vp = NULL;
683                 }
684                 if (error == 0) {
685                         vn_unlock(vp);
686                         cache_setvp(ap->a_nch, vp);
687                         vrele(vp);
688                 }
689         } else if (error == ENOENT) {
690                 cache_setvp(ap->a_nch, NULL);
691         }
692         return (error);
693 }
694
695 /*
696  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
697  *
698  * Locate the parent directory of a directory vnode.
699  *
700  * dvp is referenced but not locked.  *vpp must be returned referenced and
701  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
702  * at the root, instead it could indicate that the directory we were in was
703  * removed.
704  */
705 static
706 int
707 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
708 {
709         struct hammer_inode *dip;
710         struct hammer_inode *ip;
711         u_int64_t parent_obj_id;
712         int error;
713
714         dip = VTOI(ap->a_dvp);
715         if ((parent_obj_id = dip->ino_data.parent_obj_id) == 0) {
716                 *ap->a_vpp = NULL;
717                 return ENOENT;
718         }
719
720         ip = hammer_get_inode(dip->hmp, &dip->cache[1], parent_obj_id,
721                               dip->obj_asof, dip->flags, &error);
722         if (ip == NULL) {
723                 *ap->a_vpp = NULL;
724                 return(error);
725         }
726         error = hammer_get_vnode(ip, LK_EXCLUSIVE, ap->a_vpp);
727         hammer_rel_inode(ip, 0);
728         return (error);
729 }
730
731 /*
732  * hammer_vop_nlink { nch, dvp, vp, cred }
733  */
734 static
735 int
736 hammer_vop_nlink(struct vop_nlink_args *ap)
737 {
738         struct hammer_transaction trans;
739         struct hammer_inode *dip;
740         struct hammer_inode *ip;
741         struct nchandle *nch;
742         int error;
743
744         nch = ap->a_nch;
745         dip = VTOI(ap->a_dvp);
746         ip = VTOI(ap->a_vp);
747
748         if (dip->flags & HAMMER_INODE_RO)
749                 return (EROFS);
750         if (ip->flags & HAMMER_INODE_RO)
751                 return (EROFS);
752
753         /*
754          * Create a transaction to cover the operations we perform.
755          */
756         hammer_start_transaction(&trans, dip->hmp);
757
758         /*
759          * Add the filesystem object to the directory.  Note that neither
760          * dip nor ip are referenced or locked, but their vnodes are
761          * referenced.  This function will bump the inode's link count.
762          */
763         error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
764
765         /*
766          * Finish up.
767          */
768         if (error) {
769                 hammer_abort_transaction(&trans);
770         } else {
771                 cache_setunresolved(nch);
772                 cache_setvp(nch, ap->a_vp);
773                 hammer_commit_transaction(&trans);
774         }
775         return (error);
776 }
777
778 /*
779  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
780  *
781  * The operating system has already ensured that the directory entry
782  * does not exist and done all appropriate namespace locking.
783  */
784 static
785 int
786 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
787 {
788         struct hammer_transaction trans;
789         struct hammer_inode *dip;
790         struct hammer_inode *nip;
791         struct nchandle *nch;
792         int error;
793
794         nch = ap->a_nch;
795         dip = VTOI(ap->a_dvp);
796
797         if (dip->flags & HAMMER_INODE_RO)
798                 return (EROFS);
799
800         /*
801          * Create a transaction to cover the operations we perform.
802          */
803         hammer_start_transaction(&trans, dip->hmp);
804
805         /*
806          * Create a new filesystem object of the requested type.  The
807          * returned inode will be referenced but not locked.
808          */
809         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
810         if (error)
811                 kprintf("hammer_mkdir error %d\n", error);
812         if (error) {
813                 hammer_abort_transaction(&trans);
814                 *ap->a_vpp = NULL;
815                 return (error);
816         }
817
818         /*
819          * Add the new filesystem object to the directory.  This will also
820          * bump the inode's link count.
821          */
822         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
823         if (error)
824                 kprintf("hammer_mkdir (add) error %d\n", error);
825
826         /*
827          * Finish up.
828          */
829         if (error) {
830                 hammer_rel_inode(nip, 0);
831                 hammer_abort_transaction(&trans);
832                 *ap->a_vpp = NULL;
833         } else {
834                 hammer_commit_transaction(&trans);
835                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
836                 hammer_rel_inode(nip, 0);
837                 if (error == 0) {
838                         cache_setunresolved(ap->a_nch);
839                         cache_setvp(ap->a_nch, *ap->a_vpp);
840                 }
841         }
842         return (error);
843 }
844
845 /*
846  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
847  *
848  * The operating system has already ensured that the directory entry
849  * does not exist and done all appropriate namespace locking.
850  */
851 static
852 int
853 hammer_vop_nmknod(struct vop_nmknod_args *ap)
854 {
855         struct hammer_transaction trans;
856         struct hammer_inode *dip;
857         struct hammer_inode *nip;
858         struct nchandle *nch;
859         int error;
860
861         nch = ap->a_nch;
862         dip = VTOI(ap->a_dvp);
863
864         if (dip->flags & HAMMER_INODE_RO)
865                 return (EROFS);
866
867         /*
868          * Create a transaction to cover the operations we perform.
869          */
870         hammer_start_transaction(&trans, dip->hmp);
871
872         /*
873          * Create a new filesystem object of the requested type.  The
874          * returned inode will be referenced but not locked.
875          */
876         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
877         if (error) {
878                 hammer_abort_transaction(&trans);
879                 *ap->a_vpp = NULL;
880                 return (error);
881         }
882
883         /*
884          * Add the new filesystem object to the directory.  This will also
885          * bump the inode's link count.
886          */
887         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
888
889         /*
890          * Finish up.
891          */
892         if (error) {
893                 hammer_rel_inode(nip, 0);
894                 hammer_abort_transaction(&trans);
895                 *ap->a_vpp = NULL;
896         } else {
897                 hammer_commit_transaction(&trans);
898                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
899                 hammer_rel_inode(nip, 0);
900                 if (error == 0) {
901                         cache_setunresolved(ap->a_nch);
902                         cache_setvp(ap->a_nch, *ap->a_vpp);
903                 }
904         }
905         return (error);
906 }
907
908 /*
909  * hammer_vop_open { vp, mode, cred, fp }
910  */
911 static
912 int
913 hammer_vop_open(struct vop_open_args *ap)
914 {
915         if ((ap->a_mode & FWRITE) && (VTOI(ap->a_vp)->flags & HAMMER_INODE_RO))
916                 return (EROFS);
917
918         return(vop_stdopen(ap));
919 }
920
921 /*
922  * hammer_vop_pathconf { vp, name, retval }
923  */
924 static
925 int
926 hammer_vop_pathconf(struct vop_pathconf_args *ap)
927 {
928         return EOPNOTSUPP;
929 }
930
931 /*
932  * hammer_vop_print { vp }
933  */
934 static
935 int
936 hammer_vop_print(struct vop_print_args *ap)
937 {
938         return EOPNOTSUPP;
939 }
940
941 /*
942  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
943  */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;
	int cookie_index;	/* next unused slot in cookies[] */
	int ncookies;		/* capacity of cookies[], -1 if none wanted */
	off_t *cookies;
	off_t saveoff;		/* directory key of the next entry to emit */
	int r;

	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * Allocate a seek-cookie array if the caller (e.g. NFS) asked
	 * for one.  16 bytes per entry is an estimate of the minimum
	 * dirent size; the count is clamped to a sane upper bound.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	/*
	 * Handle artificial entries: "." at key 0 and ".." at key 1.
	 * These have no backing directory records.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/* the root directory has no parent; ".." points at itself */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Iterate directory records, emitting one dirent per record
	 * until the uio fills up, the cookie array fills up, or the
	 * iterator runs out of records (ENOENT).
	 */
	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		rec = cursor.record;
		base = &rec->base.base;
		saveoff = base->key;

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		r = vop_write_dirent(
			     &error, uio, rec->entry.obj_id,
			     hammer_get_dtype(rec->entry.base.base.obj_type),
			     rec->entry.base.data_len,
			     (void *)cursor.data);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	/*
	 * ENOENT from the iterator simply means end-of-directory.  Only
	 * a failure with zero entries returned propagates an error (and
	 * releases the cookie array); a partial result stands.
	 */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
1079
1080 /*
1081  * hammer_vop_readlink { vp, uio, cred }
1082  */
1083 static
1084 int
1085 hammer_vop_readlink(struct vop_readlink_args *ap)
1086 {
1087         struct hammer_cursor cursor;
1088         struct hammer_inode *ip;
1089         int error;
1090
1091         ip = VTOI(ap->a_vp);
1092         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1093
1094         /*
1095          * Key range (begin and end inclusive) to scan.  Directory keys
1096          * directly translate to a 64 bit 'seek' position.
1097          */
1098         cursor.key_beg.obj_id = ip->obj_id;
1099         cursor.key_beg.create_tid = 0;
1100         cursor.key_beg.delete_tid = 0;
1101         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1102         cursor.key_beg.obj_type = 0;
1103         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1104         cursor.asof = ip->obj_asof;
1105         cursor.flags |= HAMMER_CURSOR_ASOF;
1106
1107         error = hammer_ip_lookup(&cursor, ip);
1108         if (error == 0) {
1109                 error = hammer_ip_resolve_data(&cursor);
1110                 if (error == 0) {
1111                         error = uiomove((char *)cursor.data,
1112                                         cursor.record->generic.base.data_len,
1113                                         ap->a_uio);
1114                 }
1115         }
1116         hammer_done_cursor(&cursor);
1117         return(error);
1118 }
1119
1120 /*
1121  * hammer_vop_nremove { nch, dvp, cred }
1122  */
1123 static
1124 int
1125 hammer_vop_nremove(struct vop_nremove_args *ap)
1126 {
1127         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0));
1128 }
1129
1130 /*
1131  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1132  */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;		/* source name */
	struct namecache *tncp;		/* target name */
	struct hammer_inode *fdip;	/* source directory */
	struct hammer_inode *tdip;	/* target directory */
	struct hammer_inode *ip;	/* inode being renamed */
	struct hammer_cursor cursor;
	union hammer_record_ondisk *rec;
	int64_t namekey;
	int error;

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	hammer_start_transaction(&trans, fdip->hmp);

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * ENOENT from the unlink just means the target name did not
	 * previously exist, which is fine for a rename.
	 */
	error = hammer_dounlink(ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
	if (error == 0 || error == ENOENT)
		error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
	hammer_init_cursor_hmp(&cursor, &fdip->cache[0], fdip->hmp);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor, fdip);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		rec = cursor.record;
		if (fncp->nc_nlen == rec->entry.base.data_len &&
		    bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
	hammer_done_cursor(&cursor);
	if (error == 0)
		cache_rename(ap->a_fnch, ap->a_tnch);
	/* a deadlock during deletion requires rescanning the chain */
	if (error == EDEADLK)
		goto retry;
failed:
	if (error == 0) {
		hammer_commit_transaction(&trans);
	} else {
		hammer_abort_transaction(&trans);
	}
	return (error);
}
1239
1240 /*
1241  * hammer_vop_nrmdir { nch, dvp, cred }
1242  */
1243 static
1244 int
1245 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1246 {
1247         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0));
1248 }
1249
1250 /*
1251  * hammer_vop_setattr { vp, vap, cred }
1252  */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor *spike = NULL;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;		/* accumulated HAMMER_INODE_* dirty flags */
	int error;
	int truncating;
	int64_t aligned_size;	/* file size rounded up to a buffer boundary */
	u_int32_t flags;
	uuid_t uuid;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	hammer_start_transaction(&trans, ip->hmp);
	error = 0;

	/*
	 * chflags.  Note this is handled exclusively of the other
	 * attributes: when va_flags is set we jump to done without
	 * examining uid/gid/size/times/mode below.
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	/* immutable/append-only files reject all other attribute changes */
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	/* chown/chgrp: HAMMER stores ids as uuids */
	if (vap->va_uid != (uid_t)VNOVAL) {
		hammer_guid_to_uuid(&uuid, vap->va_uid);
		if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) != 0) {
			ip->ino_data.uid = uuid;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
	if (vap->va_gid != (uid_t)VNOVAL) {
		hammer_guid_to_uuid(&uuid, vap->va_gid);
		if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) != 0) {
			ip->ino_data.gid = uuid;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
	/*
	 * truncate/extend.  A while loop rather than an if so the range
	 * deletion can be retried via 'continue' after a successful
	 * spike resolves an ENOSPC condition.
	 */
	while (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_rec.ino_size)
				break;
			if (vap->va_size < ip->ino_rec.ino_size) {
				vtruncbuf(ap->a_vp, vap->va_size,
					  HAMMER_BUFSIZE);
				truncating = 1;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
			}
			ip->ino_rec.ino_size = vap->va_size;
			modflags |= HAMMER_INODE_RDIRTY;
			aligned_size = (vap->va_size + HAMMER_BUFMASK) &
					~(int64_t)HAMMER_BUFMASK;

			/* delete all records beyond the new (aligned) EOF */
			if (truncating) {
				error = hammer_ip_delete_range(&trans, ip,
						    aligned_size,
						    0x7FFFFFFFFFFFFFFFLL,
						    &spike);
			}
			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.
			 */
			if (truncating && error == 0 &&
			    vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				offset = vap->va_size & HAMMER_BUFMASK;
				error = bread(ap->a_vp,
					      aligned_size - HAMMER_BUFSIZE,
					      HAMMER_BUFSIZE, &bp);
				if (error == 0) {
					bzero(bp->b_data + offset,
					      HAMMER_BUFSIZE - offset);
					bdwrite(bp);
				} else {
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			error = hammer_ip_delete_range(&trans, ip,
					    vap->va_size,
					    0x7FFFFFFFFFFFFFFFLL,
					    &spike);
			ip->ino_rec.ino_size = vap->va_size;
			modflags |= HAMMER_INODE_RDIRTY;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		/*
		 * ENOSPC recovery: perform the spike and, if it
		 * succeeds, retry the deletion from the top of the loop.
		 */
		if (error == ENOSPC) {
			error = hammer_spike(&spike);
			if (error == 0)
				continue;
		}
		KKASSERT(spike == NULL);
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_rec.ino_atime =
			hammer_timespec_to_transid(&vap->va_atime);
		modflags |= HAMMER_INODE_ITIMES;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_rec.ino_mtime =
			hammer_timespec_to_transid(&vap->va_mtime);
		modflags |= HAMMER_INODE_ITIMES;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		if (ip->ino_data.mode != vap->va_mode) {
			ip->ino_data.mode = vap->va_mode;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
done:
	if (error) {
		hammer_abort_transaction(&trans);
	} else {
		/* flush the accumulated dirty flags into the inode */
		hammer_modify_inode(&trans, ip, modflags);
		hammer_commit_transaction(&trans);
	}
	return (error);
}
1406
1407 /*
1408  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1409  */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* new symlink inode */
	struct nchandle *nch;
	hammer_record_t record;
	int error;
	int bytes;			/* length of the link target */

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
	if (error) {
		hammer_abort_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is no \0 terminated.
	 */
	if (error == 0) {
		record = hammer_alloc_mem_record(nip);
		bytes = strlen(ap->a_target);

		record->rec.generic.base.base.key = HAMMER_FIXKEY_SYMLINK;
		record->rec.generic.base.base.rec_type = HAMMER_RECTYPE_FIX;
		record->rec.generic.base.data_len = bytes;
		/*
		 * Short targets fit in the record's embedded filler
		 * space and are copied now; longer targets point at the
		 * caller's buffer temporarily.
		 */
		if (bytes <= sizeof(record->rec.generic.filler)) {
			record->data = (void *)record->rec.generic.filler;
			bcopy(ap->a_target, record->data, bytes);
		} else {
			record->data = (void *)ap->a_target;
			/* will be reallocated by routine below */
		}
		error = hammer_ip_add_record(&trans, record);
	}

	/*
	 * Finish up.  On error release the inode reference and abort;
	 * otherwise commit, resolve a vnode, and update the namecache.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_abort_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		hammer_commit_transaction(&trans);
		error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	return (error);
}
1492
1493 /*
1494  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1495  */
1496 static
1497 int
1498 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1499 {
1500         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, ap->a_flags));
1501 }
1502
1503 /*
1504  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1505  */
1506 static
1507 int
1508 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1509 {
1510         struct hammer_inode *ip = ap->a_vp->v_data;
1511
1512         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1513                             ap->a_fflag, ap->a_cred));
1514 }
1515
1516 static
1517 int
1518 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1519 {
1520         struct mount *mp;
1521         int error;
1522
1523         mp = ap->a_head.a_ops->head.vv_mount;
1524
1525         switch(ap->a_op) {
1526         case MOUNTCTL_SET_EXPORT:
1527                 if (ap->a_ctllen != sizeof(struct export_args))
1528                         error = EINVAL;
1529                 error = hammer_vfs_export(mp, ap->a_op,
1530                                       (const struct export_args *)ap->a_ctl);
1531                 break;
1532         default:
1533                 error = journal_mountctl(ap);
1534                 break;
1535         }
1536         return(error);
1537 }
1538
1539 /*
1540  * hammer_vop_strategy { vp, bio }
1541  *
1542  * Strategy call, used for regular file read & write only.  Note that the
1543  * bp may represent a cluster.
1544  *
1545  * To simplify operation and allow better optimizations in the future,
1546  * this code does not make any assumptions with regards to buffer alignment
1547  * or size.
1548  */
1549 static
1550 int
1551 hammer_vop_strategy(struct vop_strategy_args *ap)
1552 {
1553         struct buf *bp;
1554         int error;
1555
1556         bp = ap->a_bio->bio_buf;
1557
1558         switch(bp->b_cmd) {
1559         case BUF_CMD_READ:
1560                 error = hammer_vop_strategy_read(ap);
1561                 break;
1562         case BUF_CMD_WRITE:
1563                 error = hammer_vop_strategy_write(ap);
1564                 break;
1565         default:
1566                 error = EINVAL;
1567                 break;
1568         }
1569         bp->b_error = error;
1570         if (error)
1571                 bp->b_flags |= B_ERROR;
1572         biodone(ap->a_bio);
1573         return (error);
1574 }
1575
1576 /*
1577  * Read from a regular file.  Iterate the related records and fill in the
1578  * BIO/BUF.  Gaps are zero-filled.
1579  *
1580  * The support code in hammer_object.c should be used to deal with mixed
1581  * in-memory and on-disk records.
1582  *
1583  * XXX atime update
1584  */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_inode *ip = ap->a_vp->v_data;
	struct hammer_cursor cursor;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	struct bio *bio;
	struct buf *bp;
	int64_t rec_offset;	/* logical offset of the record's first byte */
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;		/* current fill offset within the buffer */
	int roff;		/* read offset within the current record */
	int n;

	bio = ap->a_bio;
	bp = bio->bio_buf;

	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the key's
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else {
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		/* clamp the end key if the addition overflowed */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor, ip);
	boff = 0;

	/*
	 * Walk the records overlapping the request, zero-filling any
	 * holes and copying record data into the buffer until it is
	 * full or records run out.
	 */
	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		rec = cursor.record;
		base = &rec->base.base;

		/* keys are BASE+LEN; recover the record's base offset */
		rec_offset = base->key - rec->data.base.data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.  At this point n <= 0, so -n is
		 * how far into the record the buffer position lies.
		 *
		 * Note there is a degenerate case here where boff may
		 * already be at bp->b_bufsize.
		 */
		roff = -n;
		n = rec->data.base.data_len - roff;
		KKASSERT(n > 0);
		if (n > bp->b_bufsize - boff)
			n = bp->b_bufsize - boff;
		bcopy((char *)cursor.data + roff, (char *)bp->b_data + boff, n);
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

	/*
	 * There may have been a gap after the last record
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}
	bp->b_resid = 0;
	return(error);
}
1695
1696 /*
1697  * Write to a regular file.  Iterate the related records and mark for
1698  * deletion.  If existing edge records (left and right side) overlap our
1699  * write they have to be marked deleted and new records created, usually
1700  * referencing a portion of the original data.  Then add a record to
1701  * represent the buffer.
1702  *
1703  * The support code in hammer_object.c should be used to deal with mixed
1704  * in-memory and on-disk records.
1705  */
1706 static
1707 int
1708 hammer_vop_strategy_write(struct vop_strategy_args *ap)
1709 {
1710         struct hammer_transaction trans;
1711         struct hammer_cursor *spike = NULL;
1712         hammer_inode_t ip;
1713         struct bio *bio;
1714         struct buf *bp;
1715         int error;
1716
1717         bio = ap->a_bio;
1718         bp = bio->bio_buf;
1719         ip = ap->a_vp->v_data;
1720
1721         if (ip->flags & HAMMER_INODE_RO)
1722                 return (EROFS);
1723
1724         /*
1725          * Start a transaction using the TID stored with the bp.
1726          */
1727         KKASSERT(bp->b_tid != 0);
1728         hammer_start_transaction_tid(&trans, ip->hmp, bp->b_tid);
1729
1730 retry:
1731         /*
1732          * Delete any records overlapping our range.  This function will
1733          * (eventually) properly truncate partial overlaps.
1734          */
1735         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1736                 error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
1737                                                bio->bio_offset, &spike);
1738         } else {
1739                 error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
1740                                                bio->bio_offset +
1741                                                 bp->b_bufsize - 1,
1742                                                &spike);
1743         }
1744
1745         /*
1746          * Add a single record to cover the write
1747          */
1748         if (error == 0) {
1749                 error = hammer_ip_sync_data(&trans, ip, bio->bio_offset,
1750                                             bp->b_data, bp->b_bufsize,
1751                                             &spike);
1752         }
1753
1754         /*
1755          * If we ran out of space the spike structure will be filled in
1756          * and we must call hammer_spike with it, then retry.
1757          */
1758         if (error == ENOSPC) {
1759                 error = hammer_spike(&spike);
1760                 if (error == 0)
1761                         goto retry;
1762         }
1763         KKASSERT(spike == NULL);
1764
1765         /*
1766          * If an error occured abort the transaction
1767          */
1768         if (error) {
1769                 /* XXX undo deletion */
1770                 hammer_abort_transaction(&trans);
1771                 bp->b_resid = bp->b_bufsize;
1772         } else {
1773                 hammer_commit_transaction(&trans);
1774                 bp->b_resid = 0;
1775                 bp->b_tid = 0;
1776         }
1777         return(error);
1778 }
1779
1780 /*
1781  * dounlink - disconnect a directory entry
1782  *
1783  * XXX whiteout support not really in yet
1784  */
1785 static int
1786 hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred,
1787                 int flags)
1788 {
1789         struct hammer_transaction trans;
1790         struct namecache *ncp;
1791         hammer_inode_t dip;
1792         hammer_inode_t ip;
1793         hammer_record_ondisk_t rec;
1794         struct hammer_cursor cursor;
1795         int64_t namekey;
1796         int error;
1797
1798         /*
1799          * Calculate the namekey and setup the key range for the scan.  This
1800          * works kinda like a chained hash table where the lower 32 bits
1801          * of the namekey synthesize the chain.
1802          *
1803          * The key range is inclusive of both key_beg and key_end.
1804          */
1805         dip = VTOI(dvp);
1806         ncp = nch->ncp;
1807
1808         if (dip->flags & HAMMER_INODE_RO)
1809                 return (EROFS);
1810
1811         hammer_start_transaction(&trans, dip->hmp);
1812
1813         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
1814 retry:
1815         hammer_init_cursor_hmp(&cursor, &dip->cache[0], dip->hmp);
1816         cursor.key_beg.obj_id = dip->obj_id;
1817         cursor.key_beg.key = namekey;
1818         cursor.key_beg.create_tid = 0;
1819         cursor.key_beg.delete_tid = 0;
1820         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1821         cursor.key_beg.obj_type = 0;
1822
1823         cursor.key_end = cursor.key_beg;
1824         cursor.key_end.key |= 0xFFFFFFFFULL;
1825         cursor.asof = dip->obj_asof;
1826         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1827
1828         /*
1829          * Scan all matching records (the chain), locate the one matching
1830          * the requested path component.  info->last_error contains the
1831          * error code on search termination and could be 0, ENOENT, or
1832          * something else.
1833          *
1834          * The hammer_ip_*() functions merge in-memory records with on-disk
1835          * records for the purposes of the search.
1836          */
1837         error = hammer_ip_first(&cursor, dip);
1838         while (error == 0) {
1839                 error = hammer_ip_resolve_data(&cursor);
1840                 if (error)
1841                         break;
1842                 rec = cursor.record;
1843                 if (ncp->nc_nlen == rec->entry.base.data_len &&
1844                     bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) {
1845                         break;
1846                 }
1847                 error = hammer_ip_next(&cursor);
1848         }
1849
1850         /*
1851          * If all is ok we have to get the inode so we can adjust nlinks.
1852          *
1853          * If the target is a directory, it must be empty.
1854          */
1855         if (error == 0) {
1856                 ip = hammer_get_inode(dip->hmp, &dip->cache[1],
1857                                       rec->entry.obj_id,
1858                                       dip->hmp->asof, 0, &error);
1859                 if (error == ENOENT) {
1860                         kprintf("obj_id %016llx\n", rec->entry.obj_id);
1861                         Debugger("ENOENT unlinking object that should exist, cont to sync");
1862                         hammer_sync_hmp(dip->hmp, MNT_NOWAIT);
1863                         Debugger("ENOENT - sync done");
1864                 }
1865                 if (error == 0 && ip->ino_rec.base.base.obj_type ==
1866                                   HAMMER_OBJTYPE_DIRECTORY) {
1867                         error = hammer_ip_check_directory_empty(&trans, ip);
1868                 }
1869                 /*
1870                  * WARNING: hammer_ip_del_directory() may have to terminate
1871                  * the cursor to avoid a lock recursion.  It's ok to call
1872                  * hammer_done_cursor() twice.
1873                  */
1874                 if (error == 0)
1875                         error = hammer_ip_del_directory(&trans, &cursor, dip, ip);
1876                 if (error == 0) {
1877                         cache_setunresolved(nch);
1878                         cache_setvp(nch, NULL);
1879                         /* XXX locking */
1880                         if (ip->vp)
1881                                 cache_inval_vp(ip->vp, CINV_DESTROY);
1882                 }
1883                 hammer_rel_inode(ip, 0);
1884         }
1885         hammer_done_cursor(&cursor);
1886         if (error == EDEADLK)
1887                 goto retry;
1888
1889         if (error == 0)
1890                 hammer_commit_transaction(&trans);
1891         else
1892                 hammer_abort_transaction(&trans);
1893         return (error);
1894 }
1895
1896 /************************************************************************
1897  *                          FIFO AND SPECFS OPS                         *
1898  ************************************************************************
1899  *
1900  */
1901
1902 static int
1903 hammer_vop_fifoclose (struct vop_close_args *ap)
1904 {
1905         /* XXX update itimes */
1906         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
1907 }
1908
1909 static int
1910 hammer_vop_fiforead (struct vop_read_args *ap)
1911 {
1912         int error;
1913
1914         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
1915         /* XXX update access time */
1916         return (error);
1917 }
1918
1919 static int
1920 hammer_vop_fifowrite (struct vop_write_args *ap)
1921 {
1922         int error;
1923
1924         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
1925         /* XXX update access time */
1926         return (error);
1927 }
1928
1929 static int
1930 hammer_vop_specclose (struct vop_close_args *ap)
1931 {
1932         /* XXX update itimes */
1933         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1934 }
1935
1936 static int
1937 hammer_vop_specread (struct vop_read_args *ap)
1938 {
1939         /* XXX update access time */
1940         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1941 }
1942
1943 static int
1944 hammer_vop_specwrite (struct vop_write_args *ap)
1945 {
1946         /* XXX update last change time */
1947         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1948 }
1949