HAMMER 25/many: Add an ioctl API for HAMMER.
sys/vfs/hammer/hammer_vnops.c (dragonfly.git)
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.25 2008/02/04 08:33:17 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
51 /*
52  * USERFS VNOPS
53  */
54 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
55 static int hammer_vop_fsync(struct vop_fsync_args *);
56 static int hammer_vop_read(struct vop_read_args *);
57 static int hammer_vop_write(struct vop_write_args *);
58 static int hammer_vop_access(struct vop_access_args *);
59 static int hammer_vop_advlock(struct vop_advlock_args *);
60 static int hammer_vop_close(struct vop_close_args *);
61 static int hammer_vop_ncreate(struct vop_ncreate_args *);
62 static int hammer_vop_getattr(struct vop_getattr_args *);
63 static int hammer_vop_nresolve(struct vop_nresolve_args *);
64 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
65 static int hammer_vop_nlink(struct vop_nlink_args *);
66 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
67 static int hammer_vop_nmknod(struct vop_nmknod_args *);
68 static int hammer_vop_open(struct vop_open_args *);
69 static int hammer_vop_pathconf(struct vop_pathconf_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_setattr(struct vop_setattr_args *);
77 static int hammer_vop_strategy(struct vop_strategy_args *);
78 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
79 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
80 static int hammer_vop_ioctl(struct vop_ioctl_args *);
81
82 static int hammer_vop_fifoclose (struct vop_close_args *);
83 static int hammer_vop_fiforead (struct vop_read_args *);
84 static int hammer_vop_fifowrite (struct vop_write_args *);
85
86 static int hammer_vop_specclose (struct vop_close_args *);
87 static int hammer_vop_specread (struct vop_read_args *);
88 static int hammer_vop_specwrite (struct vop_write_args *);
89
90 struct vop_ops hammer_vnode_vops = {
91         .vop_default =          vop_defaultop,
92         .vop_fsync =            hammer_vop_fsync,
93         .vop_getpages =         vop_stdgetpages,
94         .vop_putpages =         vop_stdputpages,
95         .vop_read =             hammer_vop_read,
96         .vop_write =            hammer_vop_write,
97         .vop_access =           hammer_vop_access,
98         .vop_advlock =          hammer_vop_advlock,
99         .vop_close =            hammer_vop_close,
100         .vop_ncreate =          hammer_vop_ncreate,
101         .vop_getattr =          hammer_vop_getattr,
102         .vop_inactive =         hammer_vop_inactive,
103         .vop_reclaim =          hammer_vop_reclaim,
104         .vop_nresolve =         hammer_vop_nresolve,
105         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
106         .vop_nlink =            hammer_vop_nlink,
107         .vop_nmkdir =           hammer_vop_nmkdir,
108         .vop_nmknod =           hammer_vop_nmknod,
109         .vop_open =             hammer_vop_open,
110         .vop_pathconf =         hammer_vop_pathconf,
111         .vop_print =            hammer_vop_print,
112         .vop_readdir =          hammer_vop_readdir,
113         .vop_readlink =         hammer_vop_readlink,
114         .vop_nremove =          hammer_vop_nremove,
115         .vop_nrename =          hammer_vop_nrename,
116         .vop_nrmdir =           hammer_vop_nrmdir,
117         .vop_setattr =          hammer_vop_setattr,
118         .vop_strategy =         hammer_vop_strategy,
119         .vop_nsymlink =         hammer_vop_nsymlink,
120         .vop_nwhiteout =        hammer_vop_nwhiteout,
121         .vop_ioctl =            hammer_vop_ioctl
122 };
123
124 struct vop_ops hammer_spec_vops = {
125         .vop_default =          spec_vnoperate,
126         .vop_fsync =            hammer_vop_fsync,
127         .vop_read =             hammer_vop_specread,
128         .vop_write =            hammer_vop_specwrite,
129         .vop_access =           hammer_vop_access,
130         .vop_close =            hammer_vop_specclose,
131         .vop_getattr =          hammer_vop_getattr,
132         .vop_inactive =         hammer_vop_inactive,
133         .vop_reclaim =          hammer_vop_reclaim,
134         .vop_setattr =          hammer_vop_setattr
135 };
136
137 struct vop_ops hammer_fifo_vops = {
138         .vop_default =          fifo_vnoperate,
139         .vop_fsync =            hammer_vop_fsync,
140         .vop_read =             hammer_vop_fiforead,
141         .vop_write =            hammer_vop_fifowrite,
142         .vop_access =           hammer_vop_access,
143         .vop_close =            hammer_vop_fifoclose,
144         .vop_getattr =          hammer_vop_getattr,
145         .vop_inactive =         hammer_vop_inactive,
146         .vop_reclaim =          hammer_vop_reclaim,
147         .vop_setattr =          hammer_vop_setattr
148 };
149
150 static int hammer_dounlink(struct nchandle *nch, struct vnode *dvp,
151                            struct ucred *cred, int flags);
152 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
153 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
154
155 #if 0
156 static
157 int
158 hammer_vop_vnoperate(struct vop_generic_args *ap)
159 {
160         return (VOCALL(&hammer_vnode_vops, ap));
161 }
162 #endif
163
164 /*
165  * hammer_vop_fsync { vp, waitfor }
166  */
167 static
168 int
169 hammer_vop_fsync(struct vop_fsync_args *ap)
170 {
171         hammer_inode_t ip;
172         int error;
173
174         ip = VTOI(ap->a_vp);
175         error = hammer_sync_inode(ip, ap->a_waitfor, 0);
176         return (error);
177 }
178
179 /*
180  * hammer_vop_read { vp, uio, ioflag, cred }
181  */
182 static
183 int
184 hammer_vop_read(struct vop_read_args *ap)
185 {
186         struct hammer_transaction trans;
187         hammer_inode_t ip;
188         off_t offset;
189         struct buf *bp;
190         struct uio *uio;
191         int error;
192         int n;
193         int seqcount;
194
195         if (ap->a_vp->v_type != VREG)
196                 return (EINVAL);
197         ip = VTOI(ap->a_vp);
198         error = 0;
199         seqcount = ap->a_ioflag >> 16;
200
201         hammer_start_transaction(&trans, ip->hmp);
202
203         /*
204          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
205          */
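        /*
         * Worked example of the block arithmetic below, assuming the usual
         * 16KB HAMMER_BUFSIZE (HAMMER_BUFMASK == 0x3FFF): a read starting at
         * uio_offset 20000 gives offset = 20000 & 0x3FFF = 3616, so bread()
         * targets the block whose base is 16384, and n starts as
         * 16384 - 3616 = 12768 bytes before being clamped to uio_resid and
         * to the bytes remaining before ino_size.
         */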
206         uio = ap->a_uio;
207         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) {
208                 offset = uio->uio_offset & HAMMER_BUFMASK;
209 #if 0
210                 error = cluster_read(ap->a_vp, ip->ino_rec.ino_size,
211                                      uio->uio_offset - offset, HAMMER_BUFSIZE,
212                                      MAXBSIZE, seqcount, &bp);
213 #endif
214                 error = bread(ap->a_vp, uio->uio_offset - offset,
215                               HAMMER_BUFSIZE, &bp);
216                 if (error) {
217                         brelse(bp);
218                         break;
219                 }
220                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
221                 n = HAMMER_BUFSIZE - offset;
222                 if (n > uio->uio_resid)
223                         n = uio->uio_resid;
224                 if (n > ip->ino_rec.ino_size - uio->uio_offset)
225                         n = (int)(ip->ino_rec.ino_size - uio->uio_offset);
226                 error = uiomove((char *)bp->b_data + offset, n, uio);
227                 if (error) {
228                         bqrelse(bp);
229                         break;
230                 }
231                 if ((ip->flags & HAMMER_INODE_RO) == 0) {
232                         ip->ino_rec.ino_atime = trans.tid;
233                         hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
234                 }
235                 bqrelse(bp);
236         }
237         hammer_commit_transaction(&trans);
238         return (error);
239 }
240
241 /*
242  * hammer_vop_write { vp, uio, ioflag, cred }
243  */
244 static
245 int
246 hammer_vop_write(struct vop_write_args *ap)
247 {
248         struct hammer_transaction trans;
249         struct hammer_inode *ip;
250         struct uio *uio;
251         off_t offset;
252         struct buf *bp;
253         int error;
254         int n;
255         int flags;
256
257         if (ap->a_vp->v_type != VREG)
258                 return (EINVAL);
259         ip = VTOI(ap->a_vp);
260         error = 0;
261
262         if (ip->flags & HAMMER_INODE_RO)
263                 return (EROFS);
264
265         /*
266          * Create a transaction to cover the operations we perform.
267          */
268         hammer_start_transaction(&trans, ip->hmp);
269         uio = ap->a_uio;
270
271         /*
272          * Check append mode
273          */
274         if (ap->a_ioflag & IO_APPEND)
275                 uio->uio_offset = ip->ino_rec.ino_size;
276
277         /*
278          * Check for illegal write offsets.  Valid range is 0...2^63-1
279          */
280         if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
281                 hammer_commit_transaction(&trans);
282                 return (EFBIG);
283         }
284
285         /*
286          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
287          */
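        /*
         * The loop below picks one of four ways to obtain each block's
         * buffer: UIO_NOCOPY writes (issued by vop_stdputpages) instantiate
         * the buffer and only bread() it if it is not already cached, a
         * write covering a whole block skips the read entirely, a write of
         * a fresh block entirely past EOF gets a zeroed buffer, and a
         * partial overwrite of existing data falls back to bread() plus
         * bheavy().  For example, a 100 byte write at offset 0 of an
         * existing non-empty file is a partial overwrite and takes the
         * bread() path.
         */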
288         while (uio->uio_resid > 0) {
289                 int fixsize = 0;
290
291                 offset = uio->uio_offset & HAMMER_BUFMASK;
292                 n = HAMMER_BUFSIZE - offset;
293                 if (n > uio->uio_resid)
294                         n = uio->uio_resid;
295                 if (uio->uio_offset + n > ip->ino_rec.ino_size) {
296                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
297                         fixsize = 1;
298                 }
299
300                 if (uio->uio_segflg == UIO_NOCOPY) {
301                         /*
302                          * Issuing a write with the same data backing the
303                          * buffer.  Instantiate the buffer to collect the
304                          * backing vm pages, then read-in any missing bits.
305                          *
306                          * This case is used by vop_stdputpages().
307                          */
308                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
309                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
310                         if ((bp->b_flags & B_CACHE) == 0) {
311                                 bqrelse(bp);
312                                 error = bread(ap->a_vp,
313                                               uio->uio_offset - offset,
314                                               HAMMER_BUFSIZE, &bp);
315                         }
316                 } else if (offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
317                         /*
318                          * entirely overwrite the buffer
319                          */
320                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
321                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
322                 } else if (offset == 0 && uio->uio_offset >= ip->ino_rec.ino_size) {
323                         /*
324                          * XXX
325                          */
326                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
327                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
328                         vfs_bio_clrbuf(bp);
329                 } else {
330                         /*
331                          * Partial overwrite, read in any missing bits then
332                          * replace the portion being written.
333                          */
334                         error = bread(ap->a_vp, uio->uio_offset - offset,
335                                       HAMMER_BUFSIZE, &bp);
336                         if (error == 0)
337                                 bheavy(bp);
338                 }
339                 if (error == 0)
340                         error = uiomove((char *)bp->b_data + offset, n, uio);
341
342                 /*
343                  * If we screwed up we have to undo any VM size changes we
344                  * made.
345                  */
346                 if (error) {
347                         brelse(bp);
348                         if (fixsize) {
349                                 vtruncbuf(ap->a_vp, ip->ino_rec.ino_size,
350                                           HAMMER_BUFSIZE);
351                         }
352                         break;
353                 }
354                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
355                 if (ip->ino_rec.ino_size < uio->uio_offset) {
356                         ip->ino_rec.ino_size = uio->uio_offset;
357                         flags = HAMMER_INODE_RDIRTY;
358                         vnode_pager_setsize(ap->a_vp, ip->ino_rec.ino_size);
359                 } else {
360                         flags = 0;
361                 }
362                 ip->ino_rec.ino_mtime = trans.tid;
363                 flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
364                 hammer_modify_inode(&trans, ip, flags);
365                 if (ap->a_ioflag & IO_SYNC) {
366                         bwrite(bp);
367                 } else if (ap->a_ioflag & IO_DIRECT) {
368                         bawrite(bp);
369                 } else {
370                         bdwrite(bp);
371                 }
372         }
373         if (error)
374                 hammer_abort_transaction(&trans);
375         else
376                 hammer_commit_transaction(&trans);
377         return (error);
378 }
379
380 /*
381  * hammer_vop_access { vp, mode, cred }
382  */
383 static
384 int
385 hammer_vop_access(struct vop_access_args *ap)
386 {
387         struct hammer_inode *ip = VTOI(ap->a_vp);
388         uid_t uid;
389         gid_t gid;
390         int error;
391
392         uid = hammer_to_unix_xid(&ip->ino_data.uid);
393         gid = hammer_to_unix_xid(&ip->ino_data.gid);
394
395         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
396                                   ip->ino_data.uflags);
397         return (error);
398 }
399
400 /*
401  * hammer_vop_advlock { vp, id, op, fl, flags }
402  */
403 static
404 int
405 hammer_vop_advlock(struct vop_advlock_args *ap)
406 {
407         struct hammer_inode *ip = VTOI(ap->a_vp);
408
409         return (lf_advlock(ap, &ip->advlock, ip->ino_rec.ino_size));
410 }
411
412 /*
413  * hammer_vop_close { vp, fflag }
414  */
415 static
416 int
417 hammer_vop_close(struct vop_close_args *ap)
418 {
419         return (vop_stdclose(ap));
420 }
421
422 /*
423  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
424  *
425  * The operating system has already ensured that the directory entry
426  * does not exist and done all appropriate namespace locking.
427  */
428 static
429 int
430 hammer_vop_ncreate(struct vop_ncreate_args *ap)
431 {
432         struct hammer_transaction trans;
433         struct hammer_inode *dip;
434         struct hammer_inode *nip;
435         struct nchandle *nch;
436         int error;
437
438         nch = ap->a_nch;
439         dip = VTOI(ap->a_dvp);
440
441         if (dip->flags & HAMMER_INODE_RO)
442                 return (EROFS);
443
444         /*
445          * Create a transaction to cover the operations we perform.
446          */
447         hammer_start_transaction(&trans, dip->hmp);
448
449         /*
450          * Create a new filesystem object of the requested type.  The
451          * returned inode will be referenced but not locked.
452          */
453
454         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
455         if (error)
456                 kprintf("hammer_create_inode error %d\n", error);
457         if (error) {
458                 hammer_abort_transaction(&trans);
459                 *ap->a_vpp = NULL;
460                 return (error);
461         }
462
463         /*
464          * Add the new filesystem object to the directory.  This will also
465          * bump the inode's link count.
466          */
467         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
468         if (error)
469                 kprintf("hammer_ip_add_directory error %d\n", error);
470
471         /*
472          * Finish up.
473          */
474         if (error) {
475                 hammer_rel_inode(nip, 0);
476                 hammer_abort_transaction(&trans);
477                 *ap->a_vpp = NULL;
478         } else {
479                 hammer_commit_transaction(&trans);
480                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
481                 hammer_rel_inode(nip, 0);
482                 if (error == 0) {
483                         cache_setunresolved(ap->a_nch);
484                         cache_setvp(ap->a_nch, *ap->a_vpp);
485                 }
486         }
487         return (error);
488 }
489
490 /*
491  * hammer_vop_getattr { vp, vap }
492  */
493 static
494 int
495 hammer_vop_getattr(struct vop_getattr_args *ap)
496 {
497         struct hammer_inode *ip = VTOI(ap->a_vp);
498         struct vattr *vap = ap->a_vap;
499
500 #if 0
501         if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
502             (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
503             ip->obj_asof == XXX
504         ) {
505                 /* LAZYMOD XXX */
506         }
507         hammer_itimes(ap->a_vp);
508 #endif
509
510         vap->va_fsid = ip->hmp->fsid_udev;
511         vap->va_fileid = ip->ino_rec.base.base.obj_id;
512         vap->va_mode = ip->ino_data.mode;
513         vap->va_nlink = ip->ino_rec.ino_nlinks;
514         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
515         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
516         vap->va_rmajor = 0;
517         vap->va_rminor = 0;
518         vap->va_size = ip->ino_rec.ino_size;
519         hammer_to_timespec(ip->ino_rec.ino_atime, &vap->va_atime);
520         hammer_to_timespec(ip->ino_rec.ino_mtime, &vap->va_mtime);
521         hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
522         vap->va_flags = ip->ino_data.uflags;
523         vap->va_gen = 1;        /* hammer inums are unique for all time */
524         vap->va_blocksize = 32768; /* XXX - extract from root volume */
525         vap->va_bytes = ip->ino_rec.ino_size;
526         vap->va_type = hammer_get_vnode_type(ip->ino_rec.base.base.obj_type);
527         vap->va_filerev = 0;    /* XXX */
528         /* mtime uniquely identifies any adjustments made to the file */
529         vap->va_fsmid = ip->ino_rec.ino_mtime;
530         vap->va_uid_uuid = ip->ino_data.uid;
531         vap->va_gid_uuid = ip->ino_data.gid;
532         vap->va_fsid_uuid = ip->hmp->fsid;
533         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
534                           VA_FSID_UUID_VALID;
535
536         switch (ip->ino_rec.base.base.obj_type) {
537         case HAMMER_OBJTYPE_CDEV:
538         case HAMMER_OBJTYPE_BDEV:
539                 vap->va_rmajor = ip->ino_data.rmajor;
540                 vap->va_rminor = ip->ino_data.rminor;
541                 break;
542         default:
543                 break;
544         }
545
546         return(0);
547 }
548
549 /*
550  * hammer_vop_nresolve { nch, dvp, cred }
551  *
552  * Locate the requested directory entry.
553  */
554 static
555 int
556 hammer_vop_nresolve(struct vop_nresolve_args *ap)
557 {
558         struct namecache *ncp;
559         hammer_inode_t dip;
560         hammer_inode_t ip;
561         hammer_tid_t asof;
562         struct hammer_cursor cursor;
563         union hammer_record_ondisk *rec;
564         struct vnode *vp;
565         int64_t namekey;
566         int error;
567         int i;
568         int nlen;
569         int flags;
570         u_int64_t obj_id;
571
572         /*
573          * Misc initialization, plus handle as-of name extensions.  Look for
574          * the '@@' extension.  Note that as-of files and directories cannot
575          * be modified.
576          */
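        /*
         * Example of the as-of syntax handled below: resolving a name such
         * as "foo@@0x10c20d35" truncates the component to "foo" and looks it
         * up as of the transaction id parsed by hammer_str_to_tid(), with
         * the resulting inode forced read-only.  The exact text accepted
         * after the "@@" is whatever hammer_str_to_tid() understands (a
         * hexadecimal TID in this sketch).
         */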
577         dip = VTOI(ap->a_dvp);
578         ncp = ap->a_nch->ncp;
579         asof = dip->obj_asof;
580         nlen = ncp->nc_nlen;
581         flags = dip->flags;
582
583         for (i = 0; i < nlen; ++i) {
584                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
585                         asof = hammer_str_to_tid(ncp->nc_name + i + 2);
586                         flags |= HAMMER_INODE_RO;
587                         break;
588                 }
589         }
590         nlen = i;
591
592         /*
593          * If there is no path component the time extension is relative to
594          * dip.
595          */
596         if (nlen == 0) {
597                 ip = hammer_get_inode(dip->hmp, &dip->cache[1], dip->obj_id,
598                                       asof, flags, &error);
599                 if (error == 0) {
600                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
601                         hammer_rel_inode(ip, 0);
602                 } else {
603                         vp = NULL;
604                 }
605                 if (error == 0) {
606                         vn_unlock(vp);
607                         cache_setvp(ap->a_nch, vp);
608                         vrele(vp);
609                 }
610                 return(error);
611         }
612
613         /*
614          * Calculate the namekey and setup the key range for the scan.  This
615          * works kinda like a chained hash table where the lower 32 bits
616          * of the namekey synthesize the chain.
617          *
618          * The key range is inclusive of both key_beg and key_end.
619          */
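        /*
         * Concretely, if a name hashes to (for illustration) namekey
         * 0x1234567800000000, the cursor below scans the inclusive range
         * 0x1234567800000000 through 0x12345678FFFFFFFF and compares each
         * candidate record's stored name, so hash collisions simply turn
         * into a short chain walk.
         */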
620         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
621
622         error = hammer_init_cursor_hmp(&cursor, &dip->cache[0], dip->hmp);
623         cursor.key_beg.obj_id = dip->obj_id;
624         cursor.key_beg.key = namekey;
625         cursor.key_beg.create_tid = 0;
626         cursor.key_beg.delete_tid = 0;
627         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
628         cursor.key_beg.obj_type = 0;
629
630         cursor.key_end = cursor.key_beg;
631         cursor.key_end.key |= 0xFFFFFFFFULL;
632         cursor.asof = asof;
633         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
634
635         /*
636          * Scan all matching records (the chain), locate the one matching
637          * the requested path component.
638          *
639          * The hammer_ip_*() functions merge in-memory records with on-disk
640          * records for the purposes of the search.
641          */
642         if (error == 0)
643                 error = hammer_ip_first(&cursor, dip);
644
645         rec = NULL;
646         obj_id = 0;
647
648         while (error == 0) {
649                 error = hammer_ip_resolve_data(&cursor);
650                 if (error)
651                         break;
652                 rec = cursor.record;
653                 if (nlen == rec->entry.base.data_len &&
654                     bcmp(ncp->nc_name, cursor.data, nlen) == 0) {
655                         obj_id = rec->entry.obj_id;
656                         break;
657                 }
658                 error = hammer_ip_next(&cursor);
659         }
660         hammer_done_cursor(&cursor);
661         if (error == 0) {
662                 ip = hammer_get_inode(dip->hmp, &dip->cache[1],
663                                       obj_id, asof, flags, &error);
664                 if (error == 0) {
665                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
666                         hammer_rel_inode(ip, 0);
667                 } else {
668                         vp = NULL;
669                 }
670                 if (error == 0) {
671                         vn_unlock(vp);
672                         cache_setvp(ap->a_nch, vp);
673                         vrele(vp);
674                 }
675         } else if (error == ENOENT) {
676                 cache_setvp(ap->a_nch, NULL);
677         }
678         return (error);
679 }
680
681 /*
682  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
683  *
684  * Locate the parent directory of a directory vnode.
685  *
686  * dvp is referenced but not locked.  *vpp must be returned referenced and
687  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
688  * at the root, instead it could indicate that the directory we were in was
689  * removed.
690  */
691 static
692 int
693 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
694 {
695         struct hammer_inode *dip;
696         struct hammer_inode *ip;
697         u_int64_t parent_obj_id;
698         int error;
699
700         dip = VTOI(ap->a_dvp);
701         if ((parent_obj_id = dip->ino_data.parent_obj_id) == 0) {
702                 *ap->a_vpp = NULL;
703                 return ENOENT;
704         }
705
706         ip = hammer_get_inode(dip->hmp, &dip->cache[1], parent_obj_id,
707                               dip->obj_asof, dip->flags, &error);
708         if (ip == NULL) {
709                 *ap->a_vpp = NULL;
710                 return(error);
711         }
712         error = hammer_get_vnode(ip, LK_EXCLUSIVE, ap->a_vpp);
713         hammer_rel_inode(ip, 0);
714         return (error);
715 }
716
717 /*
718  * hammer_vop_nlink { nch, dvp, vp, cred }
719  */
720 static
721 int
722 hammer_vop_nlink(struct vop_nlink_args *ap)
723 {
724         struct hammer_transaction trans;
725         struct hammer_inode *dip;
726         struct hammer_inode *ip;
727         struct nchandle *nch;
728         int error;
729
730         nch = ap->a_nch;
731         dip = VTOI(ap->a_dvp);
732         ip = VTOI(ap->a_vp);
733
734         if (dip->flags & HAMMER_INODE_RO)
735                 return (EROFS);
736         if (ip->flags & HAMMER_INODE_RO)
737                 return (EROFS);
738
739         /*
740          * Create a transaction to cover the operations we perform.
741          */
742         hammer_start_transaction(&trans, dip->hmp);
743
744         /*
745          * Add the filesystem object to the directory.  Note that neither
746          * dip nor ip are referenced or locked, but their vnodes are
747          * referenced.  This function will bump the inode's link count.
748          */
749         error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
750
751         /*
752          * Finish up.
753          */
754         if (error) {
755                 hammer_abort_transaction(&trans);
756         } else {
757                 cache_setunresolved(nch);
758                 cache_setvp(nch, ap->a_vp);
759                 hammer_commit_transaction(&trans);
760         }
761         return (error);
762 }
763
764 /*
765  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
766  *
767  * The operating system has already ensured that the directory entry
768  * does not exist and done all appropriate namespace locking.
769  */
770 static
771 int
772 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
773 {
774         struct hammer_transaction trans;
775         struct hammer_inode *dip;
776         struct hammer_inode *nip;
777         struct nchandle *nch;
778         int error;
779
780         nch = ap->a_nch;
781         dip = VTOI(ap->a_dvp);
782
783         if (dip->flags & HAMMER_INODE_RO)
784                 return (EROFS);
785
786         /*
787          * Create a transaction to cover the operations we perform.
788          */
789         hammer_start_transaction(&trans, dip->hmp);
790
791         /*
792          * Create a new filesystem object of the requested type.  The
793          * returned inode will be referenced but not locked.
794          */
795         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
796         if (error)
797                 kprintf("hammer_mkdir error %d\n", error);
798         if (error) {
799                 hammer_abort_transaction(&trans);
800                 *ap->a_vpp = NULL;
801                 return (error);
802         }
803
804         /*
805          * Add the new filesystem object to the directory.  This will also
806          * bump the inode's link count.
807          */
808         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
809         if (error)
810                 kprintf("hammer_mkdir (add) error %d\n", error);
811
812         /*
813          * Finish up.
814          */
815         if (error) {
816                 hammer_rel_inode(nip, 0);
817                 hammer_abort_transaction(&trans);
818                 *ap->a_vpp = NULL;
819         } else {
820                 hammer_commit_transaction(&trans);
821                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
822                 hammer_rel_inode(nip, 0);
823                 if (error == 0) {
824                         cache_setunresolved(ap->a_nch);
825                         cache_setvp(ap->a_nch, *ap->a_vpp);
826                 }
827         }
828         return (error);
829 }
830
831 /*
832  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
833  *
834  * The operating system has already ensured that the directory entry
835  * does not exist and done all appropriate namespace locking.
836  */
837 static
838 int
839 hammer_vop_nmknod(struct vop_nmknod_args *ap)
840 {
841         struct hammer_transaction trans;
842         struct hammer_inode *dip;
843         struct hammer_inode *nip;
844         struct nchandle *nch;
845         int error;
846
847         nch = ap->a_nch;
848         dip = VTOI(ap->a_dvp);
849
850         if (dip->flags & HAMMER_INODE_RO)
851                 return (EROFS);
852
853         /*
854          * Create a transaction to cover the operations we perform.
855          */
856         hammer_start_transaction(&trans, dip->hmp);
857
858         /*
859          * Create a new filesystem object of the requested type.  The
860          * returned inode will be referenced but not locked.
861          */
862         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
863         if (error) {
864                 hammer_abort_transaction(&trans);
865                 *ap->a_vpp = NULL;
866                 return (error);
867         }
868
869         /*
870          * Add the new filesystem object to the directory.  This will also
871          * bump the inode's link count.
872          */
873         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
874
875         /*
876          * Finish up.
877          */
878         if (error) {
879                 hammer_rel_inode(nip, 0);
880                 hammer_abort_transaction(&trans);
881                 *ap->a_vpp = NULL;
882         } else {
883                 hammer_commit_transaction(&trans);
884                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
885                 hammer_rel_inode(nip, 0);
886                 if (error == 0) {
887                         cache_setunresolved(ap->a_nch);
888                         cache_setvp(ap->a_nch, *ap->a_vpp);
889                 }
890         }
891         return (error);
892 }
893
894 /*
895  * hammer_vop_open { vp, mode, cred, fp }
896  */
897 static
898 int
899 hammer_vop_open(struct vop_open_args *ap)
900 {
901         if ((ap->a_mode & FWRITE) && (VTOI(ap->a_vp)->flags & HAMMER_INODE_RO))
902                 return (EROFS);
903
904         return(vop_stdopen(ap));
905 }
906
907 /*
908  * hammer_vop_pathconf { vp, name, retval }
909  */
910 static
911 int
912 hammer_vop_pathconf(struct vop_pathconf_args *ap)
913 {
914         return EOPNOTSUPP;
915 }
916
917 /*
918  * hammer_vop_print { vp }
919  */
920 static
921 int
922 hammer_vop_print(struct vop_print_args *ap)
923 {
924         return EOPNOTSUPP;
925 }
926
927 /*
928  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
929  */
930 static
931 int
932 hammer_vop_readdir(struct vop_readdir_args *ap)
933 {
934         struct hammer_cursor cursor;
935         struct hammer_inode *ip;
936         struct uio *uio;
937         hammer_record_ondisk_t rec;
938         hammer_base_elm_t base;
939         int error;
940         int cookie_index;
941         int ncookies;
942         off_t *cookies;
943         off_t saveoff;
944         int r;
945
946         ip = VTOI(ap->a_vp);
947         uio = ap->a_uio;
948         saveoff = uio->uio_offset;
949
950         if (ap->a_ncookies) {
951                 ncookies = uio->uio_resid / 16 + 1;
952                 if (ncookies > 1024)
953                         ncookies = 1024;
954                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
955                 cookie_index = 0;
956         } else {
957                 ncookies = -1;
958                 cookies = NULL;
959                 cookie_index = 0;
960         }
961
962         /*
963          * Handle artificial entries
964          */
965         error = 0;
966         if (saveoff == 0) {
967                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
968                 if (r)
969                         goto done;
970                 if (cookies)
971                         cookies[cookie_index] = saveoff;
972                 ++saveoff;
973                 ++cookie_index;
974                 if (cookie_index == ncookies)
975                         goto done;
976         }
977         if (saveoff == 1) {
978                 if (ip->ino_data.parent_obj_id) {
979                         r = vop_write_dirent(&error, uio,
980                                              ip->ino_data.parent_obj_id,
981                                              DT_DIR, 2, "..");
982                 } else {
983                         r = vop_write_dirent(&error, uio,
984                                              ip->obj_id, DT_DIR, 2, "..");
985                 }
986                 if (r)
987                         goto done;
988                 if (cookies)
989                         cookies[cookie_index] = saveoff;
990                 ++saveoff;
991                 ++cookie_index;
992                 if (cookie_index == ncookies)
993                         goto done;
994         }
995
996         /*
997          * Key range (begin and end inclusive) to scan.  Directory keys
998          * directly translate to a 64 bit 'seek' position.
999          */
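        /*
         * The resulting offset/cookie space: offset 0 is the synthesized "."
         * entry, offset 1 is "..", and each real entry is identified by its
         * 64 bit directory key.  That key is what goes into the cookie
         * array, while saveoff advances to key + 1 so a subsequent scan
         * resumes just past the last entry returned.
         */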
1000         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1001         cursor.key_beg.obj_id = ip->obj_id;
1002         cursor.key_beg.create_tid = 0;
1003         cursor.key_beg.delete_tid = 0;
1004         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1005         cursor.key_beg.obj_type = 0;
1006         cursor.key_beg.key = saveoff;
1007
1008         cursor.key_end = cursor.key_beg;
1009         cursor.key_end.key = HAMMER_MAX_KEY;
1010         cursor.asof = ip->obj_asof;
1011         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1012
1013         error = hammer_ip_first(&cursor, ip);
1014
1015         while (error == 0) {
1016                 error = hammer_ip_resolve_data(&cursor);
1017                 if (error)
1018                         break;
1019                 rec = cursor.record;
1020                 base = &rec->base.base;
1021                 saveoff = base->key;
1022
1023                 if (base->obj_id != ip->obj_id)
1024                         panic("readdir: bad record at %p", cursor.node);
1025
1026                 r = vop_write_dirent(
1027                              &error, uio, rec->entry.obj_id,
1028                              hammer_get_dtype(rec->entry.base.base.obj_type),
1029                              rec->entry.base.data_len,
1030                              (void *)cursor.data);
1031                 if (r)
1032                         break;
1033                 ++saveoff;
1034                 if (cookies)
1035                         cookies[cookie_index] = base->key;
1036                 ++cookie_index;
1037                 if (cookie_index == ncookies)
1038                         break;
1039                 error = hammer_ip_next(&cursor);
1040         }
1041         hammer_done_cursor(&cursor);
1042
1043 done:
1044         if (ap->a_eofflag)
1045                 *ap->a_eofflag = (error == ENOENT);
1046         uio->uio_offset = saveoff;
1047         if (error && cookie_index == 0) {
1048                 if (error == ENOENT)
1049                         error = 0;
1050                 if (cookies) {
1051                         kfree(cookies, M_TEMP);
1052                         *ap->a_ncookies = 0;
1053                         *ap->a_cookies = NULL;
1054                 }
1055         } else {
1056                 if (error == ENOENT)
1057                         error = 0;
1058                 if (cookies) {
1059                         *ap->a_ncookies = cookie_index;
1060                         *ap->a_cookies = cookies;
1061                 }
1062         }
1063         return(error);
1064 }
1065
1066 /*
1067  * hammer_vop_readlink { vp, uio, cred }
1068  */
1069 static
1070 int
1071 hammer_vop_readlink(struct vop_readlink_args *ap)
1072 {
1073         struct hammer_cursor cursor;
1074         struct hammer_inode *ip;
1075         int error;
1076
1077         ip = VTOI(ap->a_vp);
1078         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1079
1080         /*
1081          * Key range (begin and end inclusive) to scan.  Directory keys
1082          * directly translate to a 64 bit 'seek' position.
1083          */
1084         cursor.key_beg.obj_id = ip->obj_id;
1085         cursor.key_beg.create_tid = 0;
1086         cursor.key_beg.delete_tid = 0;
1087         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1088         cursor.key_beg.obj_type = 0;
1089         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1090         cursor.asof = ip->obj_asof;
1091         cursor.flags |= HAMMER_CURSOR_ASOF;
1092
1093         error = hammer_ip_lookup(&cursor, ip);
1094         if (error == 0) {
1095                 error = hammer_ip_resolve_data(&cursor);
1096                 if (error == 0) {
1097                         error = uiomove((char *)cursor.data,
1098                                         cursor.record->generic.base.data_len,
1099                                         ap->a_uio);
1100                 }
1101         }
1102         hammer_done_cursor(&cursor);
1103         return(error);
1104 }
1105
1106 /*
1107  * hammer_vop_nremove { nch, dvp, cred }
1108  */
1109 static
1110 int
1111 hammer_vop_nremove(struct vop_nremove_args *ap)
1112 {
1113         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0));
1114 }
1115
1116 /*
1117  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1118  */
1119 static
1120 int
1121 hammer_vop_nrename(struct vop_nrename_args *ap)
1122 {
1123         struct hammer_transaction trans;
1124         struct namecache *fncp;
1125         struct namecache *tncp;
1126         struct hammer_inode *fdip;
1127         struct hammer_inode *tdip;
1128         struct hammer_inode *ip;
1129         struct hammer_cursor cursor;
1130         union hammer_record_ondisk *rec;
1131         int64_t namekey;
1132         int error;
1133
1134         fdip = VTOI(ap->a_fdvp);
1135         tdip = VTOI(ap->a_tdvp);
1136         fncp = ap->a_fnch->ncp;
1137         tncp = ap->a_tnch->ncp;
1138         ip = VTOI(fncp->nc_vp);
1139         KKASSERT(ip != NULL);
1140
1141         if (fdip->flags & HAMMER_INODE_RO)
1142                 return (EROFS);
1143         if (tdip->flags & HAMMER_INODE_RO)
1144                 return (EROFS);
1145         if (ip->flags & HAMMER_INODE_RO)
1146                 return (EROFS);
1147
1148         hammer_start_transaction(&trans, fdip->hmp);
1149
1150         /*
1151          * Remove tncp from the target directory and then link ip as
1152          * tncp. XXX pass trans to dounlink
1153          */
1154         error = hammer_dounlink(ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
1155         if (error == 0 || error == ENOENT)
1156                 error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
1157         if (error)
1158                 goto failed; /* XXX */
1159
1160         /*
1161          * Locate the record in the originating directory and remove it.
1162          *
1163          * Calculate the namekey and setup the key range for the scan.  This
1164          * works kinda like a chained hash table where the lower 32 bits
1165          * of the namekey synthesize the chain.
1166          *
1167          * The key range is inclusive of both key_beg and key_end.
1168          */
1169         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1170 retry:
1171         hammer_init_cursor_hmp(&cursor, &fdip->cache[0], fdip->hmp);
1172         cursor.key_beg.obj_id = fdip->obj_id;
1173         cursor.key_beg.key = namekey;
1174         cursor.key_beg.create_tid = 0;
1175         cursor.key_beg.delete_tid = 0;
1176         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1177         cursor.key_beg.obj_type = 0;
1178
1179         cursor.key_end = cursor.key_beg;
1180         cursor.key_end.key |= 0xFFFFFFFFULL;
1181         cursor.asof = fdip->obj_asof;
1182         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1183
1184         /*
1185          * Scan all matching records (the chain), locate the one matching
1186          * the requested path component.
1187          *
1188          * The hammer_ip_*() functions merge in-memory records with on-disk
1189          * records for the purposes of the search.
1190          */
1191         error = hammer_ip_first(&cursor, fdip);
1192         while (error == 0) {
1193                 if (hammer_ip_resolve_data(&cursor) != 0)
1194                         break;
1195                 rec = cursor.record;
1196                 if (fncp->nc_nlen == rec->entry.base.data_len &&
1197                     bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) {
1198                         break;
1199                 }
1200                 error = hammer_ip_next(&cursor);
1201         }
1202
1203         /*
1204          * If all is ok we have to get the inode so we can adjust nlinks.
1205          *
1206          * WARNING: hammer_ip_del_directory() may have to terminate the
1207          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1208          * twice.
1209          */
1210         if (error == 0)
1211                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1212         hammer_done_cursor(&cursor);
1213         if (error == 0)
1214                 cache_rename(ap->a_fnch, ap->a_tnch);
1215         if (error == EDEADLK)
1216                 goto retry;
1217 failed:
1218         if (error == 0) {
1219                 hammer_commit_transaction(&trans);
1220         } else {
1221                 hammer_abort_transaction(&trans);
1222         }
1223         return (error);
1224 }
1225
1226 /*
1227  * hammer_vop_nrmdir { nch, dvp, cred }
1228  */
1229 static
1230 int
1231 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1232 {
1233         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0));
1234 }
1235
1236 /*
1237  * hammer_vop_setattr { vp, vap, cred }
1238  */
1239 static
1240 int
1241 hammer_vop_setattr(struct vop_setattr_args *ap)
1242 {
1243         struct hammer_transaction trans;
1244         struct hammer_cursor *spike = NULL;
1245         struct vattr *vap;
1246         struct hammer_inode *ip;
1247         int modflags;
1248         int error;
1249         int truncating;
1250         int64_t aligned_size;
1251         u_int32_t flags;
1252         uuid_t uuid;
1253
1254         vap = ap->a_vap;
1255         ip = ap->a_vp->v_data;
1256         modflags = 0;
1257
1258         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1259                 return(EROFS);
1260         if (ip->flags & HAMMER_INODE_RO)
1261                 return (EROFS);
1262
1263         hammer_start_transaction(&trans, ip->hmp);
1264         error = 0;
1265
1266         if (vap->va_flags != VNOVAL) {
1267                 flags = ip->ino_data.uflags;
1268                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1269                                          hammer_to_unix_xid(&ip->ino_data.uid),
1270                                          ap->a_cred);
1271                 if (error == 0) {
1272                         if (ip->ino_data.uflags != flags) {
1273                                 ip->ino_data.uflags = flags;
1274                                 modflags |= HAMMER_INODE_DDIRTY;
1275                         }
1276                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1277                                 error = 0;
1278                                 goto done;
1279                         }
1280                 }
1281                 goto done;
1282         }
1283         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1284                 error = EPERM;
1285                 goto done;
1286         }
1287         if (vap->va_uid != (uid_t)VNOVAL) {
1288                 hammer_guid_to_uuid(&uuid, vap->va_uid);
1289                 if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) != 0) {
1290                         ip->ino_data.uid = uuid;
1291                         modflags |= HAMMER_INODE_DDIRTY;
1292                 }
1293         }
1294         if (vap->va_gid != (uid_t)VNOVAL) {
1295                 hammer_guid_to_uuid(&uuid, vap->va_gid);
1296                 if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) != 0) {
1297                         ip->ino_data.gid = uuid;
1298                         modflags |= HAMMER_INODE_DDIRTY;
1299                 }
1300         }
1301         while (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
1302                 switch(ap->a_vp->v_type) {
1303                 case VREG:
1304                         if (vap->va_size == ip->ino_rec.ino_size)
1305                                 break;
1306                         if (vap->va_size < ip->ino_rec.ino_size) {
1307                                 vtruncbuf(ap->a_vp, vap->va_size,
1308                                           HAMMER_BUFSIZE);
1309                                 truncating = 1;
1310                         } else {
1311                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1312                                 truncating = 0;
1313                         }
1314                         ip->ino_rec.ino_size = vap->va_size;
1315                         modflags |= HAMMER_INODE_RDIRTY;
1316                         aligned_size = (vap->va_size + HAMMER_BUFMASK) &
1317                                         ~(int64_t)HAMMER_BUFMASK;
1318
1319                         if (truncating) {
1320                                 error = hammer_ip_delete_range(&trans, ip,
1321                                                     aligned_size,
1322                                                     0x7FFFFFFFFFFFFFFFLL,
1323                                                     &spike);
1324                         }
1325                         /*
1326                          * If truncating we have to clean out a portion of
1327                          * the last block on-disk.
1328                          */
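                        /*
                         * For example, with 16KB HAMMER buffers, truncating
                         * a file to 10000 bytes leaves aligned_size at
                         * 16384, so the code below reads the final buffer
                         * and zeroes bytes 10000 through 16383 before
                         * writing it back out.
                         */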
1329                         if (truncating && error == 0 &&
1330                             vap->va_size < aligned_size) {
1331                                 struct buf *bp;
1332                                 int offset;
1333
1334                                 offset = vap->va_size & HAMMER_BUFMASK;
1335                                 error = bread(ap->a_vp,
1336                                               aligned_size - HAMMER_BUFSIZE,
1337                                               HAMMER_BUFSIZE, &bp);
1338                                 if (error == 0) {
1339                                         bzero(bp->b_data + offset,
1340                                               HAMMER_BUFSIZE - offset);
1341                                         bdwrite(bp);
1342                                 } else {
1343                                         brelse(bp);
1344                                 }
1345                         }
1346                         break;
1347                 case VDATABASE:
1348                         error = hammer_ip_delete_range(&trans, ip,
1349                                                     vap->va_size,
1350                                                     0x7FFFFFFFFFFFFFFFLL,
1351                                                     &spike);
1352                         ip->ino_rec.ino_size = vap->va_size;
1353                         modflags |= HAMMER_INODE_RDIRTY;
1354                         break;
1355                 default:
1356                         error = EINVAL;
1357                         goto done;
1358                 }
1359                 if (error == ENOSPC) {
1360                         error = hammer_spike(&spike);
1361                         if (error == 0)
1362                                 continue;
1363                 }
1364                 KKASSERT(spike == NULL);
1365                 break;
1366         }
1367         if (vap->va_atime.tv_sec != VNOVAL) {
1368                 ip->ino_rec.ino_atime =
1369                         hammer_timespec_to_transid(&vap->va_atime);
1370                 modflags |= HAMMER_INODE_ITIMES;
1371         }
1372         if (vap->va_mtime.tv_sec != VNOVAL) {
1373                 ip->ino_rec.ino_mtime =
1374                         hammer_timespec_to_transid(&vap->va_mtime);
1375                 modflags |= HAMMER_INODE_ITIMES;
1376         }
1377         if (vap->va_mode != (mode_t)VNOVAL) {
1378                 if (ip->ino_data.mode != vap->va_mode) {
1379                         ip->ino_data.mode = vap->va_mode;
1380                         modflags |= HAMMER_INODE_DDIRTY;
1381                 }
1382         }
1383 done:
1384         if (error) {
1385                 hammer_abort_transaction(&trans);
1386         } else {
1387                 hammer_modify_inode(&trans, ip, modflags);
1388                 hammer_commit_transaction(&trans);
1389         }
1390         return (error);
1391 }
1392
1393 /*
1394  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1395  */
1396 static
1397 int
1398 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1399 {
1400         struct hammer_transaction trans;
1401         struct hammer_inode *dip;
1402         struct hammer_inode *nip;
1403         struct nchandle *nch;
1404         hammer_record_t record;
1405         int error;
1406         int bytes;
1407
1408         ap->a_vap->va_type = VLNK;
1409
1410         nch = ap->a_nch;
1411         dip = VTOI(ap->a_dvp);
1412
1413         if (dip->flags & HAMMER_INODE_RO)
1414                 return (EROFS);
1415
1416         /*
1417          * Create a transaction to cover the operations we perform.
1418          */
1419         hammer_start_transaction(&trans, dip->hmp);
1420
1421         /*
1422          * Create a new filesystem object of the requested type.  The
1423          * returned inode will be referenced but not locked.
1424          */
1425
1426         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
1427         if (error) {
1428                 hammer_abort_transaction(&trans);
1429                 *ap->a_vpp = NULL;
1430                 return (error);
1431         }
1432
1433         /*
1434          * Add the new filesystem object to the directory.  This will also
1435          * bump the inode's link count.
1436          */
1437         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1438
1439         /*
1440          * Add a record representing the symlink.  symlink stores the link
1441          * as pure data, not a string, and is not \0 terminated.
1442          */
1443         if (error == 0) {
1444                 record = hammer_alloc_mem_record(nip);
1445                 bytes = strlen(ap->a_target);
1446
1447                 record->rec.generic.base.base.key = HAMMER_FIXKEY_SYMLINK;
1448                 record->rec.generic.base.base.rec_type = HAMMER_RECTYPE_FIX;
1449                 record->rec.generic.base.data_len = bytes;
1450                 if (bytes <= sizeof(record->rec.generic.filler)) {
1451                         record->data = (void *)record->rec.generic.filler;
1452                         bcopy(ap->a_target, record->data, bytes);
1453                 } else {
1454                         record->data = (void *)ap->a_target;
1455                         /* will be reallocated by routine below */
1456                 }
1457                 error = hammer_ip_add_record(&trans, record);
1458         }
1459
1460         /*
1461          * Finish up.
1462          */
1463         if (error) {
1464                 hammer_rel_inode(nip, 0);
1465                 hammer_abort_transaction(&trans);
1466                 *ap->a_vpp = NULL;
1467         } else {
1468                 hammer_commit_transaction(&trans);
1469                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
1470                 hammer_rel_inode(nip, 0);
1471                 if (error == 0) {
1472                         cache_setunresolved(ap->a_nch);
1473                         cache_setvp(ap->a_nch, *ap->a_vpp);
1474                 }
1475         }
1476         return (error);
1477 }
1478
1479 /*
1480  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1481  */
1482 static
1483 int
1484 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1485 {
1486         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, ap->a_flags));
1487 }
1488
1489 /*
1490  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1491  */
1492 static
1493 int
1494 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1495 {
1496         struct hammer_inode *ip = ap->a_vp->v_data;
1497
1498         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1499                             ap->a_fflag, ap->a_cred));
1500 }
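
/*
 * The vop_ioctl entry above is what exposes the new HAMMER ioctl API to
 * userland: an ioctl(2) issued on any file descriptor open on a HAMMER
 * regular file or directory is routed into hammer_ioctl().  A minimal
 * userland sketch, using a hypothetical HAMMERIOC_EXAMPLE command and
 * argument structure (the real commands live in HAMMER's ioctl header):
 *
 *	#include <sys/ioctl.h>
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *
 *	struct hammer_ioc_example arg;	(hypothetical argument structure)
 *	int fd = open("/mnt/hammer/some/file", O_RDONLY);
 *	if (fd >= 0 && ioctl(fd, HAMMERIOC_EXAMPLE, &arg) < 0)
 *		perror("HAMMERIOC_EXAMPLE");
 */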
1501
1502 /*
1503  * hammer_vop_strategy { vp, bio }
1504  *
1505  * Strategy call, used for regular file read & write only.  Note that the
1506  * bp may represent a cluster.
1507  *
1508  * To simplify operation and allow better optimizations in the future,
1509  * this code does not make any assumptions with regards to buffer alignment
1510  * or size.
1511  */
1512 static
1513 int
1514 hammer_vop_strategy(struct vop_strategy_args *ap)
1515 {
1516         struct buf *bp;
1517         int error;
1518
1519         bp = ap->a_bio->bio_buf;
1520
1521         switch(bp->b_cmd) {
1522         case BUF_CMD_READ:
1523                 error = hammer_vop_strategy_read(ap);
1524                 break;
1525         case BUF_CMD_WRITE:
1526                 error = hammer_vop_strategy_write(ap);
1527                 break;
1528         default:
1529                 error = EINVAL;
1530                 break;
1531         }
1532         bp->b_error = error;
1533         if (error)
1534                 bp->b_flags |= B_ERROR;
1535         biodone(ap->a_bio);
1536         return (error);
1537 }
1538
1539 /*
1540  * Read from a regular file.  Iterate the related records and fill in the
1541  * BIO/BUF.  Gaps are zero-filled.
1542  *
1543  * The support code in hammer_object.c should be used to deal with mixed
1544  * in-memory and on-disk records.
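 *
 * In outline: for each record overlapping the buffer, any gap preceding
 * the record is zero-filled, then the overlapping portion of the record's
 * data is copied in; a final zero-fill covers any gap after the last
 * record.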
1545  *
1546  * XXX atime update
1547  */
1548 static
1549 int
1550 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1551 {
1552         struct hammer_inode *ip = ap->a_vp->v_data;
1553         struct hammer_cursor cursor;
1554         hammer_record_ondisk_t rec;
1555         hammer_base_elm_t base;
1556         struct bio *bio;
1557         struct buf *bp;
1558         int64_t rec_offset;
1559         int64_t ran_end;
1560         int64_t tmp64;
1561         int error;
1562         int boff;
1563         int roff;
1564         int n;
1565
1566         bio = ap->a_bio;
1567         bp = bio->bio_buf;
1568
1569         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1570
1571         /*
1572          * Key range (begin and end inclusive) to scan.  Note that the keys
1573          * stored in the actual records represent BASE+LEN, not BASE.  The
1574          * first record containing bio_offset will have a key > bio_offset.
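         *
         * For example, a 4096 byte DATA record covering file offsets
         * [12288, 16384) is keyed at 16384, which is why the scan begins
         * at bio_offset + 1 rather than at bio_offset itself.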
1575          */
1576         cursor.key_beg.obj_id = ip->obj_id;
1577         cursor.key_beg.create_tid = 0;
1578         cursor.key_beg.delete_tid = 0;
1579         cursor.key_beg.obj_type = 0;
1580         cursor.key_beg.key = bio->bio_offset + 1;
1581         cursor.asof = ip->obj_asof;
1582         cursor.flags |= HAMMER_CURSOR_ASOF;
1583
1584         cursor.key_end = cursor.key_beg;
1585         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1586                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1587                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1588                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1589         } else {
1590                 ran_end = bio->bio_offset + bp->b_bufsize;
1591                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1592                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
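                /*
                 * Guard against 64 bit overflow of the end key: if the
                 * addition wraps, clamp the end of the range to the largest
                 * positive key.
                 */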
1593                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1594                 if (tmp64 < ran_end)
1595                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1596                 else
1597                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1598         }
1599         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1600
1601         error = hammer_ip_first(&cursor, ip);
1602         boff = 0;
1603
1604         while (error == 0) {
1605                 error = hammer_ip_resolve_data(&cursor);
1606                 if (error)
1607                         break;
1608                 rec = cursor.record;
1609                 base = &rec->base.base;
1610
1611                 rec_offset = base->key - rec->data.base.data_len;
1612
1613                 /*
1614                  * Calculate the gap, if any, and zero-fill it.
1615                  */
1616                 n = (int)(rec_offset - (bio->bio_offset + boff));
1617                 if (n > 0) {
1618                         if (n > bp->b_bufsize - boff)
1619                                 n = bp->b_bufsize - boff;
1620                         bzero((char *)bp->b_data + boff, n);
1621                         boff += n;
1622                         n = 0;
1623                 }
1624
1625                 /*
1626                  * Calculate the data offset in the record and the number
1627                  * of bytes we can copy.
1628                  *
1629                  * Note there is a degenerate case here where boff may
1630                  * already be at bp->b_bufsize.
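                 *
                 * Here n <= 0: if the record began before the current
                 * buffer position, -n is the byte offset into the record's
                 * data at which the copy must start; if a gap was just
                 * zero-filled, n was reset to 0.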
1631                  */
1632                 roff = -n;
1633                 n = rec->data.base.data_len - roff;
1634                 KKASSERT(n > 0);
1635                 if (n > bp->b_bufsize - boff)
1636                         n = bp->b_bufsize - boff;
1637                 bcopy((char *)cursor.data + roff, (char *)bp->b_data + boff, n);
1638                 boff += n;
1639                 if (boff == bp->b_bufsize)
1640                         break;
1641                 error = hammer_ip_next(&cursor);
1642         }
1643         hammer_done_cursor(&cursor);
1644
1645         /*
1646          * There may have been a gap after the last record.
1647          */
1648         if (error == ENOENT)
1649                 error = 0;
1650         if (error == 0 && boff != bp->b_bufsize) {
1651                 KKASSERT(boff < bp->b_bufsize);
1652                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
1653                 /* boff = bp->b_bufsize; */
1654         }
1655         bp->b_resid = 0;
1656         return(error);
1657 }
1658
1659 /*
1660  * Write to a regular file.  Iterate the related records and mark for
1661  * deletion.  If existing edge records (left and right side) overlap our
1662  * write they have to be marked deleted and new records created, usually
1663  * referencing a portion of the original data.  Then add a record to
1664  * represent the buffer.
1665  *
1666  * The support code in hammer_object.c should be used to deal with mixed
1667  * in-memory and on-disk records.
1668  */
1669 static
1670 int
1671 hammer_vop_strategy_write(struct vop_strategy_args *ap)
1672 {
1673         struct hammer_transaction trans;
1674         struct hammer_cursor *spike = NULL;
1675         hammer_inode_t ip;
1676         struct bio *bio;
1677         struct buf *bp;
1678         int error;
1679
1680         bio = ap->a_bio;
1681         bp = bio->bio_buf;
1682         ip = ap->a_vp->v_data;
1683
1684         if (ip->flags & HAMMER_INODE_RO)
1685                 return (EROFS);
1686
1687         hammer_start_transaction(&trans, ip->hmp);
1688
1689 retry:
1690         /*
1691          * Delete any records overlapping our range.  This function will
1692          * (eventually) properly truncate partial overlaps.
1693          */
1694         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1695                 error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
1696                                                bio->bio_offset, &spike);
1697         } else {
1698                 error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
1699                                                bio->bio_offset +
1700                                                 bp->b_bufsize - 1,
1701                                                &spike);
1702         }
1703
1704         /*
1705          * Add a single record to cover the write
1706          */
1707         if (error == 0) {
1708                 error = hammer_ip_sync_data(&trans, ip, bio->bio_offset,
1709                                             bp->b_data, bp->b_bufsize,
1710                                             &spike);
1711         }
1712
1713         /*
1714          * If we ran out of space the spike structure will be filled in
1715          * and we must call hammer_spike with it, then retry.
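         * After a successful spike the delete/sync sequence is rerun from
         * the top; the KKASSERT below verifies that the spike reference has
         * been consumed by the time we fall through.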
1716          */
1717         if (error == ENOSPC) {
1718                 error = hammer_spike(&spike);
1719                 if (error == 0)
1720                         goto retry;
1721         }
1722         KKASSERT(spike == NULL);
1723
1724         /*
1725          * If an error occurred, abort the transaction.
1726          */
1727         if (error) {
1728                 /* XXX undo deletion */
1729                 hammer_abort_transaction(&trans);
1730                 bp->b_resid = bp->b_bufsize;
1731         } else {
1732                 hammer_commit_transaction(&trans);
1733                 bp->b_resid = 0;
1734         }
1735         return(error);
1736 }
1737
1738 /*
1739  * dounlink - disconnect a directory entry
1740  *
1741  * XXX whiteout support not really in yet
1742  */
1743 static int
1744 hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred,
1745                 int flags)
1746 {
1747         struct hammer_transaction trans;
1748         struct namecache *ncp;
1749         hammer_inode_t dip;
1750         hammer_inode_t ip;
1751         hammer_record_ondisk_t rec;
1752         struct hammer_cursor cursor;
1753         int64_t namekey;
1754         int error;
1755
1756         /*
1757          * Calculate the namekey and setup the key range for the scan.  This
1758          * works kinda like a chained hash table where the lower 32 bits
1759          * of the namekey synthesize the chain.
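         *
         * The scan below therefore covers keys namekey through
         * (namekey | 0xFFFFFFFF), i.e. the entire chain associated with
         * this name's hash.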
1760          *
1761          * The key range is inclusive of both key_beg and key_end.
1762          */
1763         dip = VTOI(dvp);
1764         ncp = nch->ncp;
1765
1766         if (dip->flags & HAMMER_INODE_RO)
1767                 return (EROFS);
1768
1769         hammer_start_transaction(&trans, dip->hmp);
1770
1771         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
1772 retry:
1773         hammer_init_cursor_hmp(&cursor, &dip->cache[0], dip->hmp);
1774         cursor.key_beg.obj_id = dip->obj_id;
1775         cursor.key_beg.key = namekey;
1776         cursor.key_beg.create_tid = 0;
1777         cursor.key_beg.delete_tid = 0;
1778         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1779         cursor.key_beg.obj_type = 0;
1780
1781         cursor.key_end = cursor.key_beg;
1782         cursor.key_end.key |= 0xFFFFFFFFULL;
1783         cursor.asof = dip->obj_asof;
1784         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1785
1786         /*
1787          * Scan all matching records (the chain), locate the one matching
1788          * the requested path component.  The scan terminates with error
1789          * set to 0 when a match is found, ENOENT when the chain is
1790          * exhausted, or some other error code.
1791          *
1792          * The hammer_ip_*() functions merge in-memory records with on-disk
1793          * records for the purposes of the search.
1794          */
1795         error = hammer_ip_first(&cursor, dip);
1796         while (error == 0) {
1797                 error = hammer_ip_resolve_data(&cursor);
1798                 if (error)
1799                         break;
1800                 rec = cursor.record;
1801                 if (ncp->nc_nlen == rec->entry.base.data_len &&
1802                     bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) {
1803                         break;
1804                 }
1805                 error = hammer_ip_next(&cursor);
1806         }
1807
1808         /*
1809          * If all is ok we have to get the inode so we can adjust nlinks.
1810          *
1811          * If the target is a directory, it must be empty.
1812          */
1813         if (error == 0) {
1814                 ip = hammer_get_inode(dip->hmp, &dip->cache[1],
1815                                       rec->entry.obj_id,
1816                                       dip->hmp->asof, 0, &error);
1817                 if (error == ENOENT) {
1818                         kprintf("obj_id %016llx\n", rec->entry.obj_id);
1819                         Debugger("ENOENT unlinking object that should exist, cont to sync");
1820                         hammer_sync_hmp(dip->hmp, MNT_NOWAIT);
1821                         Debugger("ENOENT - sync done");
1822                 }
1823                 if (error == 0 && ip->ino_rec.base.base.obj_type ==
1824                                   HAMMER_OBJTYPE_DIRECTORY) {
1825                         error = hammer_ip_check_directory_empty(&trans, ip);
1826                 }
1827                 /*
1828                  * WARNING: hammer_ip_del_directory() may have to terminate
1829                  * the cursor to avoid a lock recursion.  It's ok to call
1830                  * hammer_done_cursor() twice.
1831                  */
1832                 if (error == 0)
1833                         error = hammer_ip_del_directory(&trans, &cursor, dip, ip);
1834                 if (error == 0) {
1835                         cache_setunresolved(nch);
1836                         cache_setvp(nch, NULL);
1837                         /* XXX locking */
1838                         if (ip->vp)
1839                                 cache_inval_vp(ip->vp, CINV_DESTROY);
1840                 }
1841                 hammer_rel_inode(ip, 0);
1842         }
1843         hammer_done_cursor(&cursor);
1844         if (error == EDEADLK)
1845                 goto retry;
1846
1847         if (error == 0)
1848                 hammer_commit_transaction(&trans);
1849         else
1850                 hammer_abort_transaction(&trans);
1851         return (error);
1852 }
1853
1854 /************************************************************************
1855  *                          FIFO AND SPECFS OPS                         *
1856  ************************************************************************
1857  *
1858  */
1859
1860 static int
1861 hammer_vop_fifoclose (struct vop_close_args *ap)
1862 {
1863         /* XXX update itimes */
1864         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
1865 }
1866
1867 static int
1868 hammer_vop_fiforead (struct vop_read_args *ap)
1869 {
1870         int error;
1871
1872         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
1873         /* XXX update access time */
1874         return (error);
1875 }
1876
1877 static int
1878 hammer_vop_fifowrite (struct vop_write_args *ap)
1879 {
1880         int error;
1881
1882         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
1883         /* XXX update access time */
1884         return (error);
1885 }
1886
1887 static int
1888 hammer_vop_specclose (struct vop_close_args *ap)
1889 {
1890         /* XXX update itimes */
1891         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1892 }
1893
1894 static int
1895 hammer_vop_specread (struct vop_read_args *ap)
1896 {
1897         /* XXX update access time */
1898         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1899 }
1900
1901 static int
1902 hammer_vop_specwrite (struct vop_write_args *ap)
1903 {
1904         /* XXX update last change time */
1905         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1906 }
1907