/* sys/vfs/hammer/hammer_vnops.c (dragonfly.git) */
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.24 2008/01/25 10:36:04 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
/* Forward declarations for the HAMMER vnode operations implemented below. */
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);

/* Fifo vnode operations (wrap the generic fifofs handlers below). */
static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

/* Special-device vnode operations (wrap the specfs handlers below). */
static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);
88
/*
 * Vnode operations vector for regular HAMMER files, directories and
 * symlinks.  Unimplemented operations fall through to vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         hammer_vop_pathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_setattr =          hammer_vop_setattr,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout
};
121
/*
 * Vnode operations vector for special-device (char/block) HAMMER vnodes.
 * Data I/O is routed through specfs; metadata ops stay with HAMMER.
 */
struct vop_ops hammer_spec_vops = {
        .vop_default =          spec_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_specread,
        .vop_write =            hammer_vop_specwrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_specclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};
134
/*
 * Vnode operations vector for fifo HAMMER vnodes.  Data I/O is routed
 * through fifofs; metadata ops stay with HAMMER.
 */
struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};
147
/* Internal helpers shared by the remove/rename and strategy paths below. */
static int hammer_dounlink(struct nchandle *nch, struct vnode *dvp,
                           struct ucred *cred, int flags);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
152
#if 0
/*
 * Generic dispatch through the regular-vnode operations vector.
 * Currently compiled out; kept for reference.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        /*
         * Bug fix: the parameter was previously unnamed while the body
         * referenced 'ap', so this would not compile if re-enabled.
         */
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
161
162 /*
163  * hammer_vop_fsync { vp, waitfor }
164  */
165 static
166 int
167 hammer_vop_fsync(struct vop_fsync_args *ap)
168 {
169         hammer_inode_t ip;
170         int error;
171
172         ip = VTOI(ap->a_vp);
173         error = hammer_sync_inode(ip, ap->a_waitfor, 0);
174         return (error);
175 }
176
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read from a regular file through the buffer cache, one HAMMER_BUFSIZE
 * buffer at a time.  Updates atime (unless the inode is a read-only
 * as-of view) inside a transaction.
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        off_t offset;           /* byte offset within the current buffer */
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;                  /* bytes to copy out of the current buffer */
        int seqcount;

        /* Only regular files are read through this path. */
        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
        /* The upper 16 bits of a_ioflag carry the sequential-read hint. */
        seqcount = ap->a_ioflag >> 16;

        hammer_start_transaction(&trans, ip->hmp);

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        uio = ap->a_uio;
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) {
                offset = uio->uio_offset & HAMMER_BUFMASK;
#if 0
                error = cluster_read(ap->a_vp, ip->ino_rec.ino_size,
                                     uio->uio_offset - offset, HAMMER_BUFSIZE,
                                     MAXBSIZE, seqcount, &bp);
#endif
                error = bread(ap->a_vp, uio->uio_offset - offset,
                              HAMMER_BUFSIZE, &bp);
                if (error) {
                        brelse(bp);
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                /* Clamp the copy to the buffer, the request, and EOF. */
                n = HAMMER_BUFSIZE - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_rec.ino_size - uio->uio_offset)
                        n = (int)(ip->ino_rec.ino_size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);
                if (error) {
                        bqrelse(bp);
                        break;
                }
                /* Update atime unless this is a read-only as-of view. */
                if ((ip->flags & HAMMER_INODE_RO) == 0) {
                        ip->ino_rec.ino_atime = trans.tid;
                        hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
                }
                bqrelse(bp);
        }
        hammer_commit_transaction(&trans);
        return (error);
}
238
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Write to a regular file through the buffer cache.  The whole write is
 * covered by a transaction which is aborted on error and committed on
 * success.  The buffer-acquisition strategy depends on how much of each
 * HAMMER_BUFSIZE block is being overwritten (see the cases in the loop).
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        struct uio *uio;
        off_t offset;           /* byte offset within the current buffer */
        struct buf *bp;
        int error;
        int n;                  /* bytes written into the current buffer */
        int flags;              /* inode modification flags for this pass */

        /* Only regular files are written through this path. */
        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;

        /* As-of (historical) views are immutable. */
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, ip->hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_rec.ino_size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1
         */
        if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
                hammer_commit_transaction(&trans);
                return (EFBIG);
        }

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;        /* set when we extended the VM size */

                offset = uio->uio_offset & HAMMER_BUFMASK;
                n = HAMMER_BUFSIZE - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                /* Extend the VM object up front when writing past EOF. */
                if (uio->uio_offset + n > ip->ino_rec.ino_size) {
                        vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
                        fixsize = 1;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, uio->uio_offset - offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp,
                                              uio->uio_offset - offset,
                                              HAMMER_BUFSIZE, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
                        /*
                         * entirely overwrite the buffer
                         */
                        bp = getblk(ap->a_vp, uio->uio_offset - offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                } else if (offset == 0 && uio->uio_offset >= ip->ino_rec.ino_size) {
                        /*
                         * XXX
                         */
                        bp = getblk(ap->a_vp, uio->uio_offset - offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, uio->uio_offset - offset,
                                      HAMMER_BUFSIZE, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0)
                        error = uiomove((char *)bp->b_data + offset, n, uio);

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                vtruncbuf(ap->a_vp, ip->ino_rec.ino_size,
                                          HAMMER_BUFSIZE);
                        }
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                /* Grow the recorded file size if the write extended it. */
                if (ip->ino_rec.ino_size < uio->uio_offset) {
                        ip->ino_rec.ino_size = uio->uio_offset;
                        flags = HAMMER_INODE_RDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_rec.ino_size);
                } else {
                        flags = 0;
                }
                ip->ino_rec.ino_mtime = trans.tid;
                flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
                hammer_modify_inode(&trans, ip, flags);
                /* Dispose of the buffer per the caller's sync disposition. */
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
                } else {
                        bdwrite(bp);
                }
        }
        if (error)
                hammer_abort_transaction(&trans);
        else
                hammer_commit_transaction(&trans);
        return (error);
}
377
378 /*
379  * hammer_vop_access { vp, mode, cred }
380  */
381 static
382 int
383 hammer_vop_access(struct vop_access_args *ap)
384 {
385         struct hammer_inode *ip = VTOI(ap->a_vp);
386         uid_t uid;
387         gid_t gid;
388         int error;
389
390         uid = hammer_to_unix_xid(&ip->ino_data.uid);
391         gid = hammer_to_unix_xid(&ip->ino_data.gid);
392
393         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
394                                   ip->ino_data.uflags);
395         return (error);
396 }
397
398 /*
399  * hammer_vop_advlock { vp, id, op, fl, flags }
400  */
401 static
402 int
403 hammer_vop_advlock(struct vop_advlock_args *ap)
404 {
405         struct hammer_inode *ip = VTOI(ap->a_vp);
406
407         return (lf_advlock(ap, &ip->advlock, ip->ino_rec.ino_size));
408 }
409
/*
 * hammer_vop_close { vp, fflag }
 *
 * Nothing HAMMER-specific to do on close; use the standard handler.
 */
static int
hammer_vop_close(struct vop_close_args *ap)
{
        return (vop_stdclose(ap));
}
419
420 /*
421  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
422  *
423  * The operating system has already ensured that the directory entry
424  * does not exist and done all appropriate namespace locking.
425  */
426 static
427 int
428 hammer_vop_ncreate(struct vop_ncreate_args *ap)
429 {
430         struct hammer_transaction trans;
431         struct hammer_inode *dip;
432         struct hammer_inode *nip;
433         struct nchandle *nch;
434         int error;
435
436         nch = ap->a_nch;
437         dip = VTOI(ap->a_dvp);
438
439         if (dip->flags & HAMMER_INODE_RO)
440                 return (EROFS);
441
442         /*
443          * Create a transaction to cover the operations we perform.
444          */
445         hammer_start_transaction(&trans, dip->hmp);
446
447         /*
448          * Create a new filesystem object of the requested type.  The
449          * returned inode will be referenced but not locked.
450          */
451
452         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
453         if (error)
454                 kprintf("hammer_create_inode error %d\n", error);
455         if (error) {
456                 hammer_abort_transaction(&trans);
457                 *ap->a_vpp = NULL;
458                 return (error);
459         }
460
461         /*
462          * Add the new filesystem object to the directory.  This will also
463          * bump the inode's link count.
464          */
465         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
466         if (error)
467                 kprintf("hammer_ip_add_directory error %d\n", error);
468
469         /*
470          * Finish up.
471          */
472         if (error) {
473                 hammer_rel_inode(nip, 0);
474                 hammer_abort_transaction(&trans);
475                 *ap->a_vpp = NULL;
476         } else {
477                 hammer_commit_transaction(&trans);
478                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
479                 hammer_rel_inode(nip, 0);
480                 if (error == 0) {
481                         cache_setunresolved(ap->a_nch);
482                         cache_setvp(ap->a_nch, *ap->a_vpp);
483                 }
484         }
485         return (error);
486 }
487
/*
 * hammer_vop_getattr { vp, vap }
 *
 * Fill in *a_vap from the in-memory inode record and data.  Purely a
 * field-copy; no I/O or transaction is required.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

#if 0
        if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
            (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
            ip->obj_asof == XXX
        ) {
                /* LAZYMOD XXX */
        }
        hammer_itimes(ap->a_vp);
#endif

        vap->va_fsid = ip->hmp->fsid_udev;
        vap->va_fileid = ip->ino_rec.base.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_rec.ino_nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        /* rmajor/rminor default to 0; overridden for devices below. */
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_rec.ino_size;
        hammer_to_timespec(ip->ino_rec.ino_atime, &vap->va_atime);
        hammer_to_timespec(ip->ino_rec.ino_mtime, &vap->va_mtime);
        hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = 32768; /* XXX - extract from root volume */
        vap->va_bytes = ip->ino_rec.ino_size;
        vap->va_type = hammer_get_vnode_type(ip->ino_rec.base.base.obj_type);
        vap->va_filerev = 0;    /* XXX */
        /* mtime uniquely identifies any adjustments made to the file */
        vap->va_fsmid = ip->ino_rec.ino_mtime;
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        /* Device nodes additionally expose their major/minor numbers. */
        switch (ip->ino_rec.base.base.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }

        return(0);
}
546
547 /*
548  * hammer_vop_nresolve { nch, dvp, cred }
549  *
550  * Locate the requested directory entry.
551  */
552 static
553 int
554 hammer_vop_nresolve(struct vop_nresolve_args *ap)
555 {
556         struct namecache *ncp;
557         hammer_inode_t dip;
558         hammer_inode_t ip;
559         hammer_tid_t asof;
560         struct hammer_cursor cursor;
561         union hammer_record_ondisk *rec;
562         struct vnode *vp;
563         int64_t namekey;
564         int error;
565         int i;
566         int nlen;
567         int flags;
568         u_int64_t obj_id;
569
570         /*
571          * Misc initialization, plus handle as-of name extensions.  Look for
572          * the '@@' extension.  Note that as-of files and directories cannot
573          * be modified.
574          */
575         dip = VTOI(ap->a_dvp);
576         ncp = ap->a_nch->ncp;
577         asof = dip->obj_asof;
578         nlen = ncp->nc_nlen;
579         flags = dip->flags;
580
581         for (i = 0; i < nlen; ++i) {
582                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
583                         asof = hammer_str_to_tid(ncp->nc_name + i + 2);
584                         kprintf("ASOF %016llx\n", asof);
585                         flags |= HAMMER_INODE_RO;
586                         break;
587                 }
588         }
589         nlen = i;
590
591         /*
592          * If there is no path component the time extension is relative to
593          * dip.
594          */
595         if (nlen == 0) {
596                 ip = hammer_get_inode(dip->hmp, &dip->cache[1], dip->obj_id,
597                                       asof, flags, &error);
598                 if (error == 0) {
599                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
600                         hammer_rel_inode(ip, 0);
601                 } else {
602                         vp = NULL;
603                 }
604                 if (error == 0) {
605                         vn_unlock(vp);
606                         cache_setvp(ap->a_nch, vp);
607                         vrele(vp);
608                 }
609                 return(error);
610         }
611
612         /*
613          * Calculate the namekey and setup the key range for the scan.  This
614          * works kinda like a chained hash table where the lower 32 bits
615          * of the namekey synthesize the chain.
616          *
617          * The key range is inclusive of both key_beg and key_end.
618          */
619         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
620
621         hammer_init_cursor_hmp(&cursor, &dip->cache[0], dip->hmp);
622         cursor.key_beg.obj_id = dip->obj_id;
623         cursor.key_beg.key = namekey;
624         cursor.key_beg.create_tid = 0;
625         cursor.key_beg.delete_tid = 0;
626         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
627         cursor.key_beg.obj_type = 0;
628
629         cursor.key_end = cursor.key_beg;
630         cursor.key_end.key |= 0xFFFFFFFFULL;
631         cursor.asof = asof;
632         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
633
634         /*
635          * Scan all matching records (the chain), locate the one matching
636          * the requested path component.
637          *
638          * The hammer_ip_*() functions merge in-memory records with on-disk
639          * records for the purposes of the search.
640          */
641         error = hammer_ip_first(&cursor, dip);
642         rec = NULL;
643         obj_id = 0;
644
645         while (error == 0) {
646                 error = hammer_ip_resolve_data(&cursor);
647                 if (error)
648                         break;
649                 rec = cursor.record;
650                 if (nlen == rec->entry.base.data_len &&
651                     bcmp(ncp->nc_name, cursor.data, nlen) == 0) {
652                         obj_id = rec->entry.obj_id;
653                         break;
654                 }
655                 error = hammer_ip_next(&cursor);
656         }
657         hammer_done_cursor(&cursor);
658         if (error == 0) {
659                 ip = hammer_get_inode(dip->hmp, &dip->cache[1],
660                                       obj_id, asof, flags, &error);
661                 if (error == 0) {
662                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
663                         hammer_rel_inode(ip, 0);
664                 } else {
665                         vp = NULL;
666                 }
667                 if (error == 0) {
668                         vn_unlock(vp);
669                         cache_setvp(ap->a_nch, vp);
670                         vrele(vp);
671                 }
672         } else if (error == ENOENT) {
673                 cache_setvp(ap->a_nch, NULL);
674         }
675         return (error);
676 }
677
678 /*
679  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
680  *
681  * Locate the parent directory of a directory vnode.
682  *
683  * dvp is referenced but not locked.  *vpp must be returned referenced and
684  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
685  * at the root, instead it could indicate that the directory we were in was
686  * removed.
687  */
688 static
689 int
690 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
691 {
692         struct hammer_inode *dip;
693         struct hammer_inode *ip;
694         u_int64_t parent_obj_id;
695         int error;
696
697         dip = VTOI(ap->a_dvp);
698         if ((parent_obj_id = dip->ino_data.parent_obj_id) == 0) {
699                 *ap->a_vpp = NULL;
700                 return ENOENT;
701         }
702
703         ip = hammer_get_inode(dip->hmp, &dip->cache[1], parent_obj_id,
704                               dip->obj_asof, dip->flags, &error);
705         if (ip == NULL) {
706                 *ap->a_vpp = NULL;
707                 return(error);
708         }
709         error = hammer_get_vnode(ip, LK_EXCLUSIVE, ap->a_vpp);
710         hammer_rel_inode(ip, 0);
711         return (error);
712 }
713
714 /*
715  * hammer_vop_nlink { nch, dvp, vp, cred }
716  */
717 static
718 int
719 hammer_vop_nlink(struct vop_nlink_args *ap)
720 {
721         struct hammer_transaction trans;
722         struct hammer_inode *dip;
723         struct hammer_inode *ip;
724         struct nchandle *nch;
725         int error;
726
727         nch = ap->a_nch;
728         dip = VTOI(ap->a_dvp);
729         ip = VTOI(ap->a_vp);
730
731         if (dip->flags & HAMMER_INODE_RO)
732                 return (EROFS);
733         if (ip->flags & HAMMER_INODE_RO)
734                 return (EROFS);
735
736         /*
737          * Create a transaction to cover the operations we perform.
738          */
739         hammer_start_transaction(&trans, dip->hmp);
740
741         /*
742          * Add the filesystem object to the directory.  Note that neither
743          * dip nor ip are referenced or locked, but their vnodes are
744          * referenced.  This function will bump the inode's link count.
745          */
746         error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
747
748         /*
749          * Finish up.
750          */
751         if (error) {
752                 hammer_abort_transaction(&trans);
753         } else {
754                 cache_setunresolved(nch);
755                 cache_setvp(nch, ap->a_vp);
756                 hammer_commit_transaction(&trans);
757         }
758         return (error);
759 }
760
761 /*
762  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
763  *
764  * The operating system has already ensured that the directory entry
765  * does not exist and done all appropriate namespace locking.
766  */
767 static
768 int
769 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
770 {
771         struct hammer_transaction trans;
772         struct hammer_inode *dip;
773         struct hammer_inode *nip;
774         struct nchandle *nch;
775         int error;
776
777         nch = ap->a_nch;
778         dip = VTOI(ap->a_dvp);
779
780         if (dip->flags & HAMMER_INODE_RO)
781                 return (EROFS);
782
783         /*
784          * Create a transaction to cover the operations we perform.
785          */
786         hammer_start_transaction(&trans, dip->hmp);
787
788         /*
789          * Create a new filesystem object of the requested type.  The
790          * returned inode will be referenced but not locked.
791          */
792         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
793         if (error)
794                 kprintf("hammer_mkdir error %d\n", error);
795         if (error) {
796                 hammer_abort_transaction(&trans);
797                 *ap->a_vpp = NULL;
798                 return (error);
799         }
800
801         /*
802          * Add the new filesystem object to the directory.  This will also
803          * bump the inode's link count.
804          */
805         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
806         if (error)
807                 kprintf("hammer_mkdir (add) error %d\n", error);
808
809         /*
810          * Finish up.
811          */
812         if (error) {
813                 hammer_rel_inode(nip, 0);
814                 hammer_abort_transaction(&trans);
815                 *ap->a_vpp = NULL;
816         } else {
817                 hammer_commit_transaction(&trans);
818                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
819                 hammer_rel_inode(nip, 0);
820                 if (error == 0) {
821                         cache_setunresolved(ap->a_nch);
822                         cache_setvp(ap->a_nch, *ap->a_vpp);
823                 }
824         }
825         return (error);
826 }
827
828 /*
829  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
830  *
831  * The operating system has already ensured that the directory entry
832  * does not exist and done all appropriate namespace locking.
833  */
834 static
835 int
836 hammer_vop_nmknod(struct vop_nmknod_args *ap)
837 {
838         struct hammer_transaction trans;
839         struct hammer_inode *dip;
840         struct hammer_inode *nip;
841         struct nchandle *nch;
842         int error;
843
844         nch = ap->a_nch;
845         dip = VTOI(ap->a_dvp);
846
847         if (dip->flags & HAMMER_INODE_RO)
848                 return (EROFS);
849
850         /*
851          * Create a transaction to cover the operations we perform.
852          */
853         hammer_start_transaction(&trans, dip->hmp);
854
855         /*
856          * Create a new filesystem object of the requested type.  The
857          * returned inode will be referenced but not locked.
858          */
859         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
860         if (error) {
861                 hammer_abort_transaction(&trans);
862                 *ap->a_vpp = NULL;
863                 return (error);
864         }
865
866         /*
867          * Add the new filesystem object to the directory.  This will also
868          * bump the inode's link count.
869          */
870         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
871
872         /*
873          * Finish up.
874          */
875         if (error) {
876                 hammer_rel_inode(nip, 0);
877                 hammer_abort_transaction(&trans);
878                 *ap->a_vpp = NULL;
879         } else {
880                 hammer_commit_transaction(&trans);
881                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
882                 hammer_rel_inode(nip, 0);
883                 if (error == 0) {
884                         cache_setunresolved(ap->a_nch);
885                         cache_setvp(ap->a_nch, *ap->a_vpp);
886                 }
887         }
888         return (error);
889 }
890
891 /*
892  * hammer_vop_open { vp, mode, cred, fp }
893  */
894 static
895 int
896 hammer_vop_open(struct vop_open_args *ap)
897 {
898         if ((ap->a_mode & FWRITE) && (VTOI(ap->a_vp)->flags & HAMMER_INODE_RO))
899                 return (EROFS);
900
901         return(vop_stdopen(ap));
902 }
903
904 /*
905  * hammer_vop_pathconf { vp, name, retval }
906  */
907 static
908 int
909 hammer_vop_pathconf(struct vop_pathconf_args *ap)
910 {
911         return EOPNOTSUPP;
912 }
913
914 /*
915  * hammer_vop_print { vp }
916  */
917 static
918 int
919 hammer_vop_print(struct vop_print_args *ap)
920 {
921         return EOPNOTSUPP;
922 }
923
924 /*
925  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
926  */
927 static
928 int
929 hammer_vop_readdir(struct vop_readdir_args *ap)
930 {
931         struct hammer_cursor cursor;
932         struct hammer_inode *ip;
933         struct uio *uio;
934         hammer_record_ondisk_t rec;
935         hammer_base_elm_t base;
936         int error;
937         int cookie_index;
938         int ncookies;
939         off_t *cookies;
940         off_t saveoff;
941         int r;
942
943         ip = VTOI(ap->a_vp);
944         uio = ap->a_uio;
945         saveoff = uio->uio_offset;
946
947         if (ap->a_ncookies) {
948                 ncookies = uio->uio_resid / 16 + 1;
949                 if (ncookies > 1024)
950                         ncookies = 1024;
951                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
952                 cookie_index = 0;
953         } else {
954                 ncookies = -1;
955                 cookies = NULL;
956                 cookie_index = 0;
957         }
958
959         /*
960          * Handle artificial entries
961          */
962         error = 0;
963         if (saveoff == 0) {
964                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
965                 if (r)
966                         goto done;
967                 if (cookies)
968                         cookies[cookie_index] = saveoff;
969                 ++saveoff;
970                 ++cookie_index;
971                 if (cookie_index == ncookies)
972                         goto done;
973         }
974         if (saveoff == 1) {
975                 if (ip->ino_data.parent_obj_id) {
976                         r = vop_write_dirent(&error, uio,
977                                              ip->ino_data.parent_obj_id,
978                                              DT_DIR, 2, "..");
979                 } else {
980                         r = vop_write_dirent(&error, uio,
981                                              ip->obj_id, DT_DIR, 2, "..");
982                 }
983                 if (r)
984                         goto done;
985                 if (cookies)
986                         cookies[cookie_index] = saveoff;
987                 ++saveoff;
988                 ++cookie_index;
989                 if (cookie_index == ncookies)
990                         goto done;
991         }
992
993         /*
994          * Key range (begin and end inclusive) to scan.  Directory keys
995          * directly translate to a 64 bit 'seek' position.
996          */
997         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
998         cursor.key_beg.obj_id = ip->obj_id;
999         cursor.key_beg.create_tid = 0;
1000         cursor.key_beg.delete_tid = 0;
1001         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1002         cursor.key_beg.obj_type = 0;
1003         cursor.key_beg.key = saveoff;
1004
1005         cursor.key_end = cursor.key_beg;
1006         cursor.key_end.key = HAMMER_MAX_KEY;
1007         cursor.asof = ip->obj_asof;
1008         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1009
1010         error = hammer_ip_first(&cursor, ip);
1011
1012         while (error == 0) {
1013                 error = hammer_ip_resolve_data(&cursor);
1014                 if (error)
1015                         break;
1016                 rec = cursor.record;
1017                 base = &rec->base.base;
1018                 saveoff = base->key;
1019
1020                 if (base->obj_id != ip->obj_id)
1021                         panic("readdir: bad record at %p", cursor.node);
1022
1023                 r = vop_write_dirent(
1024                              &error, uio, rec->entry.obj_id,
1025                              hammer_get_dtype(rec->entry.base.base.obj_type),
1026                              rec->entry.base.data_len,
1027                              (void *)cursor.data);
1028                 if (r)
1029                         break;
1030                 ++saveoff;
1031                 if (cookies)
1032                         cookies[cookie_index] = base->key;
1033                 ++cookie_index;
1034                 if (cookie_index == ncookies)
1035                         break;
1036                 error = hammer_ip_next(&cursor);
1037         }
1038         hammer_done_cursor(&cursor);
1039
1040 done:
1041         if (ap->a_eofflag)
1042                 *ap->a_eofflag = (error == ENOENT);
1043         uio->uio_offset = saveoff;
1044         if (error && cookie_index == 0) {
1045                 if (error == ENOENT)
1046                         error = 0;
1047                 if (cookies) {
1048                         kfree(cookies, M_TEMP);
1049                         *ap->a_ncookies = 0;
1050                         *ap->a_cookies = NULL;
1051                 }
1052         } else {
1053                 if (error == ENOENT)
1054                         error = 0;
1055                 if (cookies) {
1056                         *ap->a_ncookies = cookie_index;
1057                         *ap->a_cookies = cookies;
1058                 }
1059         }
1060         return(error);
1061 }
1062
1063 /*
1064  * hammer_vop_readlink { vp, uio, cred }
1065  */
1066 static
1067 int
1068 hammer_vop_readlink(struct vop_readlink_args *ap)
1069 {
1070         struct hammer_cursor cursor;
1071         struct hammer_inode *ip;
1072         int error;
1073
1074         ip = VTOI(ap->a_vp);
1075         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1076
1077         /*
1078          * Key range (begin and end inclusive) to scan.  Directory keys
1079          * directly translate to a 64 bit 'seek' position.
1080          */
1081         cursor.key_beg.obj_id = ip->obj_id;
1082         cursor.key_beg.create_tid = 0;
1083         cursor.key_beg.delete_tid = 0;
1084         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1085         cursor.key_beg.obj_type = 0;
1086         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1087         cursor.asof = ip->obj_asof;
1088         cursor.flags |= HAMMER_CURSOR_ASOF;
1089
1090         error = hammer_ip_lookup(&cursor, ip);
1091         if (error == 0) {
1092                 error = hammer_ip_resolve_data(&cursor);
1093                 if (error == 0) {
1094                         error = uiomove((char *)cursor.data,
1095                                         cursor.record->generic.base.data_len,
1096                                         ap->a_uio);
1097                 }
1098         }
1099         hammer_done_cursor(&cursor);
1100         return(error);
1101 }
1102
1103 /*
1104  * hammer_vop_nremove { nch, dvp, cred }
1105  */
1106 static
1107 int
1108 hammer_vop_nremove(struct vop_nremove_args *ap)
1109 {
1110         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0));
1111 }
1112
1113 /*
1114  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1115  */
1116 static
1117 int
1118 hammer_vop_nrename(struct vop_nrename_args *ap)
1119 {
1120         struct hammer_transaction trans;
1121         struct namecache *fncp;
1122         struct namecache *tncp;
1123         struct hammer_inode *fdip;
1124         struct hammer_inode *tdip;
1125         struct hammer_inode *ip;
1126         struct hammer_cursor cursor;
1127         union hammer_record_ondisk *rec;
1128         int64_t namekey;
1129         int error;
1130
1131         fdip = VTOI(ap->a_fdvp);
1132         tdip = VTOI(ap->a_tdvp);
1133         fncp = ap->a_fnch->ncp;
1134         tncp = ap->a_tnch->ncp;
1135         ip = VTOI(fncp->nc_vp);
1136         KKASSERT(ip != NULL);
1137
1138         if (fdip->flags & HAMMER_INODE_RO)
1139                 return (EROFS);
1140         if (tdip->flags & HAMMER_INODE_RO)
1141                 return (EROFS);
1142         if (ip->flags & HAMMER_INODE_RO)
1143                 return (EROFS);
1144
1145         hammer_start_transaction(&trans, fdip->hmp);
1146
1147         /*
1148          * Remove tncp from the target directory and then link ip as
1149          * tncp. XXX pass trans to dounlink
1150          */
1151         error = hammer_dounlink(ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
1152         if (error == 0 || error == ENOENT)
1153                 error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
1154         if (error)
1155                 goto failed; /* XXX */
1156
1157         /*
1158          * Locate the record in the originating directory and remove it.
1159          *
1160          * Calculate the namekey and setup the key range for the scan.  This
1161          * works kinda like a chained hash table where the lower 32 bits
1162          * of the namekey synthesize the chain.
1163          *
1164          * The key range is inclusive of both key_beg and key_end.
1165          */
1166         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1167 retry:
1168         hammer_init_cursor_hmp(&cursor, &fdip->cache[0], fdip->hmp);
1169         cursor.key_beg.obj_id = fdip->obj_id;
1170         cursor.key_beg.key = namekey;
1171         cursor.key_beg.create_tid = 0;
1172         cursor.key_beg.delete_tid = 0;
1173         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1174         cursor.key_beg.obj_type = 0;
1175
1176         cursor.key_end = cursor.key_beg;
1177         cursor.key_end.key |= 0xFFFFFFFFULL;
1178         cursor.asof = fdip->obj_asof;
1179         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1180
1181         /*
1182          * Scan all matching records (the chain), locate the one matching
1183          * the requested path component.
1184          *
1185          * The hammer_ip_*() functions merge in-memory records with on-disk
1186          * records for the purposes of the search.
1187          */
1188         error = hammer_ip_first(&cursor, fdip);
1189         while (error == 0) {
1190                 if (hammer_ip_resolve_data(&cursor) != 0)
1191                         break;
1192                 rec = cursor.record;
1193                 if (fncp->nc_nlen == rec->entry.base.data_len &&
1194                     bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) {
1195                         break;
1196                 }
1197                 error = hammer_ip_next(&cursor);
1198         }
1199
1200         /*
1201          * If all is ok we have to get the inode so we can adjust nlinks.
1202          *
1203          * WARNING: hammer_ip_del_directory() may have to terminate the
1204          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1205          * twice.
1206          */
1207         if (error == 0)
1208                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1209         hammer_done_cursor(&cursor);
1210         if (error == 0)
1211                 cache_rename(ap->a_fnch, ap->a_tnch);
1212         if (error == EDEADLK)
1213                 goto retry;
1214 failed:
1215         if (error == 0) {
1216                 hammer_commit_transaction(&trans);
1217         } else {
1218                 hammer_abort_transaction(&trans);
1219         }
1220         return (error);
1221 }
1222
1223 /*
1224  * hammer_vop_nrmdir { nch, dvp, cred }
1225  */
1226 static
1227 int
1228 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1229 {
1230         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0));
1231 }
1232
1233 /*
1234  * hammer_vop_setattr { vp, vap, cred }
1235  */
1236 static
1237 int
1238 hammer_vop_setattr(struct vop_setattr_args *ap)
1239 {
1240         struct hammer_transaction trans;
1241         struct hammer_cursor *spike = NULL;
1242         struct vattr *vap;
1243         struct hammer_inode *ip;
1244         int modflags;
1245         int error;
1246         int truncating;
1247         int64_t aligned_size;
1248         u_int32_t flags;
1249         uuid_t uuid;
1250
1251         vap = ap->a_vap;
1252         ip = ap->a_vp->v_data;
1253         modflags = 0;
1254
1255         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1256                 return(EROFS);
1257         if (ip->flags & HAMMER_INODE_RO)
1258                 return (EROFS);
1259
1260         hammer_start_transaction(&trans, ip->hmp);
1261         error = 0;
1262
1263         if (vap->va_flags != VNOVAL) {
1264                 flags = ip->ino_data.uflags;
1265                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1266                                          hammer_to_unix_xid(&ip->ino_data.uid),
1267                                          ap->a_cred);
1268                 if (error == 0) {
1269                         if (ip->ino_data.uflags != flags) {
1270                                 ip->ino_data.uflags = flags;
1271                                 modflags |= HAMMER_INODE_DDIRTY;
1272                         }
1273                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1274                                 error = 0;
1275                                 goto done;
1276                         }
1277                 }
1278                 goto done;
1279         }
1280         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1281                 error = EPERM;
1282                 goto done;
1283         }
1284         if (vap->va_uid != (uid_t)VNOVAL) {
1285                 hammer_guid_to_uuid(&uuid, vap->va_uid);
1286                 if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) != 0) {
1287                         ip->ino_data.uid = uuid;
1288                         modflags |= HAMMER_INODE_DDIRTY;
1289                 }
1290         }
1291         if (vap->va_gid != (uid_t)VNOVAL) {
1292                 hammer_guid_to_uuid(&uuid, vap->va_gid);
1293                 if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) != 0) {
1294                         ip->ino_data.gid = uuid;
1295                         modflags |= HAMMER_INODE_DDIRTY;
1296                 }
1297         }
1298         while (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
1299                 switch(ap->a_vp->v_type) {
1300                 case VREG:
1301                         if (vap->va_size == ip->ino_rec.ino_size)
1302                                 break;
1303                         if (vap->va_size < ip->ino_rec.ino_size) {
1304                                 vtruncbuf(ap->a_vp, vap->va_size,
1305                                           HAMMER_BUFSIZE);
1306                                 truncating = 1;
1307                         } else {
1308                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1309                                 truncating = 0;
1310                         }
1311                         ip->ino_rec.ino_size = vap->va_size;
1312                         modflags |= HAMMER_INODE_RDIRTY;
1313                         aligned_size = (vap->va_size + HAMMER_BUFMASK) &
1314                                         ~(int64_t)HAMMER_BUFMASK;
1315
1316                         if (truncating) {
1317                                 error = hammer_ip_delete_range(&trans, ip,
1318                                                     aligned_size,
1319                                                     0x7FFFFFFFFFFFFFFFLL,
1320                                                     &spike);
1321                         }
1322                         /*
1323                          * If truncating we have to clean out a portion of
1324                          * the last block on-disk.
1325                          */
1326                         if (truncating && error == 0 &&
1327                             vap->va_size < aligned_size) {
1328                                 struct buf *bp;
1329                                 int offset;
1330
1331                                 offset = vap->va_size & HAMMER_BUFMASK;
1332                                 error = bread(ap->a_vp,
1333                                               aligned_size - HAMMER_BUFSIZE,
1334                                               HAMMER_BUFSIZE, &bp);
1335                                 if (error == 0) {
1336                                         bzero(bp->b_data + offset,
1337                                               HAMMER_BUFSIZE - offset);
1338                                         bdwrite(bp);
1339                                 } else {
1340                                         brelse(bp);
1341                                 }
1342                         }
1343                         break;
1344                 case VDATABASE:
1345                         error = hammer_ip_delete_range(&trans, ip,
1346                                                     vap->va_size,
1347                                                     0x7FFFFFFFFFFFFFFFLL,
1348                                                     &spike);
1349                         ip->ino_rec.ino_size = vap->va_size;
1350                         modflags |= HAMMER_INODE_RDIRTY;
1351                         break;
1352                 default:
1353                         error = EINVAL;
1354                         goto done;
1355                 }
1356                 if (error == ENOSPC) {
1357                         error = hammer_spike(&spike);
1358                         if (error == 0)
1359                                 continue;
1360                 }
1361                 KKASSERT(spike == NULL);
1362                 break;
1363         }
1364         if (vap->va_atime.tv_sec != VNOVAL) {
1365                 ip->ino_rec.ino_atime =
1366                         hammer_timespec_to_transid(&vap->va_atime);
1367                 modflags |= HAMMER_INODE_ITIMES;
1368         }
1369         if (vap->va_mtime.tv_sec != VNOVAL) {
1370                 ip->ino_rec.ino_mtime =
1371                         hammer_timespec_to_transid(&vap->va_mtime);
1372                 modflags |= HAMMER_INODE_ITIMES;
1373         }
1374         if (vap->va_mode != (mode_t)VNOVAL) {
1375                 if (ip->ino_data.mode != vap->va_mode) {
1376                         ip->ino_data.mode = vap->va_mode;
1377                         modflags |= HAMMER_INODE_DDIRTY;
1378                 }
1379         }
1380 done:
1381         if (error) {
1382                 hammer_abort_transaction(&trans);
1383         } else {
1384                 hammer_modify_inode(&trans, ip, modflags);
1385                 hammer_commit_transaction(&trans);
1386         }
1387         return (error);
1388 }
1389
1390 /*
1391  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1392  */
1393 static
1394 int
1395 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1396 {
1397         struct hammer_transaction trans;
1398         struct hammer_inode *dip;
1399         struct hammer_inode *nip;
1400         struct nchandle *nch;
1401         hammer_record_t record;
1402         int error;
1403         int bytes;
1404
1405         ap->a_vap->va_type = VLNK;
1406
1407         nch = ap->a_nch;
1408         dip = VTOI(ap->a_dvp);
1409
1410         if (dip->flags & HAMMER_INODE_RO)
1411                 return (EROFS);
1412
1413         /*
1414          * Create a transaction to cover the operations we perform.
1415          */
1416         hammer_start_transaction(&trans, dip->hmp);
1417
1418         /*
1419          * Create a new filesystem object of the requested type.  The
1420          * returned inode will be referenced but not locked.
1421          */
1422
1423         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
1424         if (error) {
1425                 hammer_abort_transaction(&trans);
1426                 *ap->a_vpp = NULL;
1427                 return (error);
1428         }
1429
1430         /*
1431          * Add the new filesystem object to the directory.  This will also
1432          * bump the inode's link count.
1433          */
1434         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1435
1436         /*
1437          * Add a record representing the symlink.  symlink stores the link
1438          * as pure data, not a string, and is no \0 terminated.
1439          */
1440         if (error == 0) {
1441                 record = hammer_alloc_mem_record(nip);
1442                 bytes = strlen(ap->a_target);
1443
1444                 record->rec.generic.base.base.key = HAMMER_FIXKEY_SYMLINK;
1445                 record->rec.generic.base.base.rec_type = HAMMER_RECTYPE_FIX;
1446                 record->rec.generic.base.data_len = bytes;
1447                 if (bytes <= sizeof(record->rec.generic.filler)) {
1448                         record->data = (void *)record->rec.generic.filler;
1449                         bcopy(ap->a_target, record->data, bytes);
1450                 } else {
1451                         record->data = (void *)ap->a_target;
1452                         /* will be reallocated by routine below */
1453                 }
1454                 error = hammer_ip_add_record(&trans, record);
1455         }
1456
1457         /*
1458          * Finish up.
1459          */
1460         if (error) {
1461                 hammer_rel_inode(nip, 0);
1462                 hammer_abort_transaction(&trans);
1463                 *ap->a_vpp = NULL;
1464         } else {
1465                 hammer_commit_transaction(&trans);
1466                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
1467                 hammer_rel_inode(nip, 0);
1468                 if (error == 0) {
1469                         cache_setunresolved(ap->a_nch);
1470                         cache_setvp(ap->a_nch, *ap->a_vpp);
1471                 }
1472         }
1473         return (error);
1474 }
1475
1476 /*
1477  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1478  */
1479 static
1480 int
1481 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1482 {
1483         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, ap->a_flags));
1484 }
1485
1486 /*
1487  * hammer_vop_strategy { vp, bio }
1488  *
1489  * Strategy call, used for regular file read & write only.  Note that the
1490  * bp may represent a cluster.
1491  *
1492  * To simplify operation and allow better optimizations in the future,
1493  * this code does not make any assumptions with regards to buffer alignment
1494  * or size.
1495  */
1496 static
1497 int
1498 hammer_vop_strategy(struct vop_strategy_args *ap)
1499 {
1500         struct buf *bp;
1501         int error;
1502
1503         bp = ap->a_bio->bio_buf;
1504
1505         switch(bp->b_cmd) {
1506         case BUF_CMD_READ:
1507                 error = hammer_vop_strategy_read(ap);
1508                 break;
1509         case BUF_CMD_WRITE:
1510                 error = hammer_vop_strategy_write(ap);
1511                 break;
1512         default:
1513                 error = EINVAL;
1514                 break;
1515         }
1516         bp->b_error = error;
1517         if (error)
1518                 bp->b_flags |= B_ERROR;
1519         biodone(ap->a_bio);
1520         return (error);
1521 }
1522
1523 /*
1524  * Read from a regular file.  Iterate the related records and fill in the
1525  * BIO/BUF.  Gaps are zero-filled.
1526  *
1527  * The support code in hammer_object.c should be used to deal with mixed
1528  * in-memory and on-disk records.
1529  *
1530  * XXX atime update
1531  */
1532 static
1533 int
1534 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1535 {
1536         struct hammer_inode *ip = ap->a_vp->v_data;
1537         struct hammer_cursor cursor;
1538         hammer_record_ondisk_t rec;
1539         hammer_base_elm_t base;
1540         struct bio *bio;
1541         struct buf *bp;
1542         int64_t rec_offset;
1543         int64_t ran_end;
1544         int64_t tmp64;
1545         int error;
1546         int boff;
1547         int roff;
1548         int n;
1549
1550         bio = ap->a_bio;
1551         bp = bio->bio_buf;
1552
1553         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1554
1555         /*
1556          * Key range (begin and end inclusive) to scan.  Note that the key's
1557          * stored in the actual records represent BASE+LEN, not BASE.  The
1558          * first record containing bio_offset will have a key > bio_offset.
1559          */
1560         cursor.key_beg.obj_id = ip->obj_id;
1561         cursor.key_beg.create_tid = 0;
1562         cursor.key_beg.delete_tid = 0;
1563         cursor.key_beg.obj_type = 0;
1564         cursor.key_beg.key = bio->bio_offset + 1;
1565         cursor.asof = ip->obj_asof;
1566         cursor.flags |= HAMMER_CURSOR_ASOF;
1567
1568         cursor.key_end = cursor.key_beg;
1569         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1570                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1571                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1572                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1573         } else {
1574                 ran_end = bio->bio_offset + bp->b_bufsize;
1575                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1576                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1577                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1578                 if (tmp64 < ran_end)
1579                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1580                 else
1581                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1582         }
1583         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1584
1585         error = hammer_ip_first(&cursor, ip);
1586         boff = 0;
1587
1588         while (error == 0) {
1589                 error = hammer_ip_resolve_data(&cursor);
1590                 if (error)
1591                         break;
1592                 rec = cursor.record;
1593                 base = &rec->base.base;
1594
1595                 rec_offset = base->key - rec->data.base.data_len;
1596
1597                 /*
1598                  * Calculate the gap, if any, and zero-fill it.
1599                  */
1600                 n = (int)(rec_offset - (bio->bio_offset + boff));
1601                 if (n > 0) {
1602                         if (n > bp->b_bufsize - boff)
1603                                 n = bp->b_bufsize - boff;
1604                         bzero((char *)bp->b_data + boff, n);
1605                         boff += n;
1606                         n = 0;
1607                 }
1608
1609                 /*
1610                  * Calculate the data offset in the record and the number
1611                  * of bytes we can copy.
1612                  *
1613                  * Note there is a degenerate case here where boff may
1614                  * already be at bp->b_bufsize.
1615                  */
1616                 roff = -n;
1617                 n = rec->data.base.data_len - roff;
1618                 KKASSERT(n > 0);
1619                 if (n > bp->b_bufsize - boff)
1620                         n = bp->b_bufsize - boff;
1621                 bcopy((char *)cursor.data + roff, (char *)bp->b_data + boff, n);
1622                 boff += n;
1623                 if (boff == bp->b_bufsize)
1624                         break;
1625                 error = hammer_ip_next(&cursor);
1626         }
1627         hammer_done_cursor(&cursor);
1628
1629         /*
1630          * There may have been a gap after the last record
1631          */
1632         if (error == ENOENT)
1633                 error = 0;
1634         if (error == 0 && boff != bp->b_bufsize) {
1635                 KKASSERT(boff < bp->b_bufsize);
1636                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
1637                 /* boff = bp->b_bufsize; */
1638         }
1639         bp->b_resid = 0;
1640         return(error);
1641 }
1642
1643 /*
1644  * Write to a regular file.  Iterate the related records and mark for
1645  * deletion.  If existing edge records (left and right side) overlap our
1646  * write they have to be marked deleted and new records created, usually
1647  * referencing a portion of the original data.  Then add a record to
1648  * represent the buffer.
1649  *
1650  * The support code in hammer_object.c should be used to deal with mixed
1651  * in-memory and on-disk records.
1652  */
1653 static
1654 int
1655 hammer_vop_strategy_write(struct vop_strategy_args *ap)
1656 {
1657         struct hammer_transaction trans;
1658         struct hammer_cursor *spike = NULL;
1659         hammer_inode_t ip;
1660         struct bio *bio;
1661         struct buf *bp;
1662         int error;
1663
1664         bio = ap->a_bio;
1665         bp = bio->bio_buf;
1666         ip = ap->a_vp->v_data;
1667
1668         if (ip->flags & HAMMER_INODE_RO)
1669                 return (EROFS);
1670
1671         hammer_start_transaction(&trans, ip->hmp);
1672
1673 retry:
1674         /*
1675          * Delete any records overlapping our range.  This function will
1676          * (eventually) properly truncate partial overlaps.
1677          */
1678         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1679                 error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
1680                                                bio->bio_offset, &spike);
1681         } else {
1682                 error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
1683                                                bio->bio_offset +
1684                                                 bp->b_bufsize - 1,
1685                                                &spike);
1686         }
1687
1688         /*
1689          * Add a single record to cover the write
1690          */
1691         if (error == 0) {
1692                 error = hammer_ip_sync_data(&trans, ip, bio->bio_offset,
1693                                             bp->b_data, bp->b_bufsize,
1694                                             &spike);
1695         }
1696
1697         /*
1698          * If we ran out of space the spike structure will be filled in
1699          * and we must call hammer_spike with it, then retry.
1700          */
1701         if (error == ENOSPC) {
1702                 error = hammer_spike(&spike);
1703                 if (error == 0)
1704                         goto retry;
1705         }
1706         KKASSERT(spike == NULL);
1707
1708         /*
1709          * If an error occured abort the transaction
1710          */
1711         if (error) {
1712                 /* XXX undo deletion */
1713                 hammer_abort_transaction(&trans);
1714                 bp->b_resid = bp->b_bufsize;
1715         } else {
1716                 hammer_commit_transaction(&trans);
1717                 bp->b_resid = 0;
1718         }
1719         return(error);
1720 }
1721
1722 /*
1723  * dounlink - disconnect a directory entry
1724  *
1725  * XXX whiteout support not really in yet
1726  */
1727 static int
1728 hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred,
1729                 int flags)
1730 {
1731         struct hammer_transaction trans;
1732         struct namecache *ncp;
1733         hammer_inode_t dip;
1734         hammer_inode_t ip;
1735         hammer_record_ondisk_t rec;
1736         struct hammer_cursor cursor;
1737         int64_t namekey;
1738         int error;
1739
1740         /*
1741          * Calculate the namekey and setup the key range for the scan.  This
1742          * works kinda like a chained hash table where the lower 32 bits
1743          * of the namekey synthesize the chain.
1744          *
1745          * The key range is inclusive of both key_beg and key_end.
1746          */
1747         dip = VTOI(dvp);
1748         ncp = nch->ncp;
1749
1750         if (dip->flags & HAMMER_INODE_RO)
1751                 return (EROFS);
1752
1753         hammer_start_transaction(&trans, dip->hmp);
1754
1755         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
1756 retry:
1757         hammer_init_cursor_hmp(&cursor, &dip->cache[0], dip->hmp);
1758         cursor.key_beg.obj_id = dip->obj_id;
1759         cursor.key_beg.key = namekey;
1760         cursor.key_beg.create_tid = 0;
1761         cursor.key_beg.delete_tid = 0;
1762         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1763         cursor.key_beg.obj_type = 0;
1764
1765         cursor.key_end = cursor.key_beg;
1766         cursor.key_end.key |= 0xFFFFFFFFULL;
1767         cursor.asof = dip->obj_asof;
1768         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1769
1770         /*
1771          * Scan all matching records (the chain), locate the one matching
1772          * the requested path component.  info->last_error contains the
1773          * error code on search termination and could be 0, ENOENT, or
1774          * something else.
1775          *
1776          * The hammer_ip_*() functions merge in-memory records with on-disk
1777          * records for the purposes of the search.
1778          */
1779         error = hammer_ip_first(&cursor, dip);
1780         while (error == 0) {
1781                 error = hammer_ip_resolve_data(&cursor);
1782                 if (error)
1783                         break;
1784                 rec = cursor.record;
1785                 if (ncp->nc_nlen == rec->entry.base.data_len &&
1786                     bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) {
1787                         break;
1788                 }
1789                 error = hammer_ip_next(&cursor);
1790         }
1791
1792         /*
1793          * If all is ok we have to get the inode so we can adjust nlinks.
1794          *
1795          * If the target is a directory, it must be empty.
1796          */
1797         if (error == 0) {
1798                 ip = hammer_get_inode(dip->hmp, &dip->cache[1],
1799                                       rec->entry.obj_id,
1800                                       dip->hmp->asof, 0, &error);
1801                 if (error == ENOENT) {
1802                         kprintf("obj_id %016llx\n", rec->entry.obj_id);
1803                         Debugger("ENOENT unlinking object that should exist, cont to sync");
1804                         hammer_sync_hmp(dip->hmp, MNT_NOWAIT);
1805                         Debugger("ENOENT - sync done");
1806                 }
1807                 if (error == 0 && ip->ino_rec.base.base.obj_type ==
1808                                   HAMMER_OBJTYPE_DIRECTORY) {
1809                         error = hammer_ip_check_directory_empty(&trans, ip);
1810                 }
1811                 /*
1812                  * WARNING: hammer_ip_del_directory() may have to terminate
1813                  * the cursor to avoid a lock recursion.  It's ok to call
1814                  * hammer_done_cursor() twice.
1815                  */
1816                 if (error == 0)
1817                         error = hammer_ip_del_directory(&trans, &cursor, dip, ip);
1818                 if (error == 0) {
1819                         cache_setunresolved(nch);
1820                         cache_setvp(nch, NULL);
1821                         /* XXX locking */
1822                         if (ip->vp)
1823                                 cache_inval_vp(ip->vp, CINV_DESTROY);
1824                 }
1825                 hammer_rel_inode(ip, 0);
1826         }
1827         hammer_done_cursor(&cursor);
1828         if (error == EDEADLK)
1829                 goto retry;
1830
1831         if (error == 0)
1832                 hammer_commit_transaction(&trans);
1833         else
1834                 hammer_abort_transaction(&trans);
1835         return (error);
1836 }
1837
1838 /************************************************************************
1839  *                          FIFO AND SPECFS OPS                         *
1840  ************************************************************************
1841  *
1842  */
1843
1844 static int
1845 hammer_vop_fifoclose (struct vop_close_args *ap)
1846 {
1847         /* XXX update itimes */
1848         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
1849 }
1850
1851 static int
1852 hammer_vop_fiforead (struct vop_read_args *ap)
1853 {
1854         int error;
1855
1856         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
1857         /* XXX update access time */
1858         return (error);
1859 }
1860
1861 static int
1862 hammer_vop_fifowrite (struct vop_write_args *ap)
1863 {
1864         int error;
1865
1866         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
1867         /* XXX update access time */
1868         return (error);
1869 }
1870
1871 static int
1872 hammer_vop_specclose (struct vop_close_args *ap)
1873 {
1874         /* XXX update itimes */
1875         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1876 }
1877
1878 static int
1879 hammer_vop_specread (struct vop_read_args *ap)
1880 {
1881         /* XXX update access time */
1882         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1883 }
1884
1885 static int
1886 hammer_vop_specwrite (struct vop_write_args *ap)
1887 {
1888         /* XXX update last change time */
1889         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1890 }
1891