/* HAMMER 21/many: B-Tree node locking finalization. */
/* dragonfly.git: sys/vfs/hammer/hammer_vnops.c */
1 /*
2  * Copyright (c) 2007 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.22 2008/01/18 07:02:41 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
51 /*
52  * USERFS VNOPS
53  */
54 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
55 static int hammer_vop_fsync(struct vop_fsync_args *);
56 static int hammer_vop_read(struct vop_read_args *);
57 static int hammer_vop_write(struct vop_write_args *);
58 static int hammer_vop_access(struct vop_access_args *);
59 static int hammer_vop_advlock(struct vop_advlock_args *);
60 static int hammer_vop_close(struct vop_close_args *);
61 static int hammer_vop_ncreate(struct vop_ncreate_args *);
62 static int hammer_vop_getattr(struct vop_getattr_args *);
63 static int hammer_vop_nresolve(struct vop_nresolve_args *);
64 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
65 static int hammer_vop_nlink(struct vop_nlink_args *);
66 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
67 static int hammer_vop_nmknod(struct vop_nmknod_args *);
68 static int hammer_vop_open(struct vop_open_args *);
69 static int hammer_vop_pathconf(struct vop_pathconf_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_setattr(struct vop_setattr_args *);
77 static int hammer_vop_strategy(struct vop_strategy_args *);
78 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
79 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
80
81 static int hammer_vop_fifoclose (struct vop_close_args *);
82 static int hammer_vop_fiforead (struct vop_read_args *);
83 static int hammer_vop_fifowrite (struct vop_write_args *);
84
85 static int hammer_vop_specclose (struct vop_close_args *);
86 static int hammer_vop_specread (struct vop_read_args *);
87 static int hammer_vop_specwrite (struct vop_write_args *);
88
/*
 * Vnode operations vector for regular HAMMER vnodes (files, directories,
 * symlinks).  Operations not listed fall through to vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
	.vop_default =		vop_defaultop,
	.vop_fsync =		hammer_vop_fsync,
	.vop_getpages =		vop_stdgetpages,
	.vop_putpages =		vop_stdputpages,
	.vop_read =		hammer_vop_read,
	.vop_write =		hammer_vop_write,
	.vop_access =		hammer_vop_access,
	.vop_advlock =		hammer_vop_advlock,
	.vop_close =		hammer_vop_close,
	.vop_ncreate =		hammer_vop_ncreate,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_nresolve =		hammer_vop_nresolve,
	.vop_nlookupdotdot =	hammer_vop_nlookupdotdot,
	.vop_nlink =		hammer_vop_nlink,
	.vop_nmkdir =		hammer_vop_nmkdir,
	.vop_nmknod =		hammer_vop_nmknod,
	.vop_open =		hammer_vop_open,
	.vop_pathconf =		hammer_vop_pathconf,
	.vop_print =		hammer_vop_print,
	.vop_readdir =		hammer_vop_readdir,
	.vop_readlink =		hammer_vop_readlink,
	.vop_nremove =		hammer_vop_nremove,
	.vop_nrename =		hammer_vop_nrename,
	.vop_nrmdir =		hammer_vop_nrmdir,
	.vop_setattr =		hammer_vop_setattr,
	.vop_strategy =		hammer_vop_strategy,
	.vop_nsymlink =		hammer_vop_nsymlink,
	.vop_nwhiteout =	hammer_vop_nwhiteout
};
121
/*
 * Vnode operations vector for special device (char/block) vnodes.
 * Device I/O is delegated to the spec layer; HAMMER handles only the
 * metadata operations (attributes, fsync, inactive/reclaim).
 */
struct vop_ops hammer_spec_vops = {
	.vop_default =		spec_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_specread,
	.vop_write =		hammer_vop_specwrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_specclose,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};
134
/*
 * Vnode operations vector for FIFO vnodes.  Data transfer is delegated
 * to the fifofs layer; HAMMER handles only the metadata operations.
 */
struct vop_ops hammer_fifo_vops = {
	.vop_default =		fifo_vnoperate,
	.vop_fsync =		hammer_vop_fsync,
	.vop_read =		hammer_vop_fiforead,
	.vop_write =		hammer_vop_fifowrite,
	.vop_access =		hammer_vop_access,
	.vop_close =		hammer_vop_fifoclose,
	.vop_getattr =		hammer_vop_getattr,
	.vop_inactive =		hammer_vop_inactive,
	.vop_reclaim =		hammer_vop_reclaim,
	.vop_setattr =		hammer_vop_setattr
};
147
148 static int hammer_dounlink(struct nchandle *nch, struct vnode *dvp,
149                            struct ucred *cred, int flags);
150 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
151 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
152
#if 0
/*
 * Generic pass-through to the HAMMER vnode ops vector (currently
 * compiled out).  Fixed: the parameter was declared without a name
 * ('struct vop_generic_args *') while the body references 'ap', so
 * this would not have compiled if the #if 0 were ever removed.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
161
162 /*
163  * hammer_vop_fsync { vp, waitfor }
164  */
165 static
166 int
167 hammer_vop_fsync(struct vop_fsync_args *ap)
168 {
169         hammer_inode_t ip;
170         int error;
171
172         ip = VTOI(ap->a_vp);
173         error = hammer_sync_inode(ip, ap->a_waitfor, 0);
174         return (error);
175 }
176
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read from a regular file via the buffer cache, one HAMMER_BUFSIZE
 * block at a time, until the uio is exhausted or EOF is reached.
 * The whole operation runs under a single HAMMER transaction so the
 * atime update (skipped for read-only/as-of inodes) is covered.
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
	struct hammer_transaction trans;
	hammer_inode_t ip;
	off_t offset;		/* byte offset within the current block */
	struct buf *bp;
	struct uio *uio;
	int error;
	int n;			/* bytes to copy this iteration */
	int seqcount;

	/* Only regular files are readable through this path */
	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;
	/* sequential-access heuristic encoded in the high ioflag bits */
	seqcount = ap->a_ioflag >> 16;

	hammer_start_transaction(&trans, ip->hmp);

	/*
	 * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
	 */
	uio = ap->a_uio;
	while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_rec.ino_size) {
		offset = uio->uio_offset & HAMMER_BUFMASK;
#if 0
		error = cluster_read(ap->a_vp, ip->ino_rec.ino_size,
				     uio->uio_offset - offset, HAMMER_BUFSIZE,
				     MAXBSIZE, seqcount, &bp);
#endif
		/* read the block-aligned buffer containing uio_offset */
		error = bread(ap->a_vp, uio->uio_offset - offset,
			      HAMMER_BUFSIZE, &bp);
		if (error) {
			brelse(bp);
			break;
		}
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		/* clamp the copy to the buffer, the uio, and EOF */
		n = HAMMER_BUFSIZE - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		if (n > ip->ino_rec.ino_size - uio->uio_offset)
			n = (int)(ip->ino_rec.ino_size - uio->uio_offset);
		error = uiomove((char *)bp->b_data + offset, n, uio);
		if (error) {
			bqrelse(bp);
			break;
		}
		/* update atime unless this is a read-only (as-of) inode */
		if ((ip->flags & HAMMER_INODE_RO) == 0) {
			ip->ino_rec.ino_atime = trans.tid;
			hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
		}
		bqrelse(bp);
	}
	/* commit even on error: only the atime update is transactional here */
	hammer_commit_transaction(&trans);
	return (error);
}
238
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Write to a regular file through the buffer cache in HAMMER_BUFSIZE
 * chunks, growing the file and updating mtime as needed.  All inode
 * modifications are covered by one HAMMER transaction, aborted on
 * error and committed on success.
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct uio *uio;
	off_t offset;		/* byte offset within the current block */
	struct buf *bp;
	int error;
	int n;			/* bytes to copy this iteration */
	int flags;		/* hammer_modify_inode() flags */

	if (ap->a_vp->v_type != VREG)
		return (EINVAL);
	ip = VTOI(ap->a_vp);
	error = 0;

	/* refuse writes to read-only (e.g. as-of) inodes */
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, ip->hmp);
	uio = ap->a_uio;

	/*
	 * Check append mode
	 */
	if (ap->a_ioflag & IO_APPEND)
		uio->uio_offset = ip->ino_rec.ino_size;

	/*
	 * Check for illegal write offsets.  Valid range is 0...2^63-1.
	 * The second test also rejects offset+resid overflowing past
	 * the positive range.
	 */
	if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
		hammer_commit_transaction(&trans);
		return (EFBIG);
	}

	/*
	 * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
	 * Four acquisition strategies depending on how much of the buffer
	 * is being replaced and whether it already has valid backing.
	 */
	while (uio->uio_resid > 0) {
		offset = uio->uio_offset & HAMMER_BUFMASK;
		if (uio->uio_segflg == UIO_NOCOPY) {
			/*
			 * Issuing a write with the same data backing the
			 * buffer.  Instantiate the buffer to collect the
			 * backing vm pages, then read-in any missing bits.
			 *
			 * This case is used by vop_stdputpages().
			 */
			bp = getblk(ap->a_vp, uio->uio_offset, HAMMER_BUFSIZE,
				    GETBLK_BHEAVY, 0);
			if ((bp->b_flags & B_CACHE) == 0) {
				/* not fully valid; release and re-read */
				bqrelse(bp);
				error = bread(ap->a_vp,
					      uio->uio_offset - offset,
					      HAMMER_BUFSIZE, &bp);
				if (error) {
					brelse(bp);
					break;
				}
			}
		} else if (offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
			/*
			 * entirely overwrite the buffer
			 */
			bp = getblk(ap->a_vp, uio->uio_offset, HAMMER_BUFSIZE,
				    GETBLK_BHEAVY, 0);
		} else if (offset == 0 && uio->uio_offset >= ip->ino_rec.ino_size) {
			/*
			 * XXX -- writing at/past EOF on a block boundary:
			 * no old data to preserve, so just zero the buffer
			 * instead of reading from disk.
			 */
			bp = getblk(ap->a_vp, uio->uio_offset, HAMMER_BUFSIZE,
				    GETBLK_BHEAVY, 0);
			vfs_bio_clrbuf(bp);
		} else {
			/*
			 * Partial overwrite, read in any missing bits then
			 * replace the portion being written.
			 */
			error = bread(ap->a_vp, uio->uio_offset - offset,
				      HAMMER_BUFSIZE, &bp);
			if (error) {
				brelse(bp);
				break;
			}
			bheavy(bp);
		}
		n = HAMMER_BUFSIZE - offset;
		if (n > uio->uio_resid)
			n = uio->uio_resid;
		error = uiomove((char *)bp->b_data + offset, n, uio);
		if (error) {
			/* NOTE(review): brelse() invalidates the buffer;
			 * prior valid contents are discarded on copy error */
			brelse(bp);
			break;
		}
		/* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
		if (ip->ino_rec.ino_size < uio->uio_offset) {
			/* file grew: record new size, inform the VM pager */
			ip->ino_rec.ino_size = uio->uio_offset;
			flags = HAMMER_INODE_RDIRTY;
			vnode_pager_setsize(ap->a_vp, ip->ino_rec.ino_size);
		} else {
			flags = 0;
		}
		ip->ino_rec.ino_mtime = trans.tid;
		flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
		hammer_modify_inode(&trans, ip, flags);
		/* IO_SYNC: write-through; IO_DIRECT: async write now;
		 * otherwise delayed write */
		if (ap->a_ioflag & IO_SYNC) {
			bwrite(bp);
		} else if (ap->a_ioflag & IO_DIRECT) {
			bawrite(bp);
		} else {
			bdwrite(bp);
		}
	}
	if (error)
		hammer_abort_transaction(&trans);
	else
		hammer_commit_transaction(&trans);
	return (error);
}
367
368 /*
369  * hammer_vop_access { vp, mode, cred }
370  */
371 static
372 int
373 hammer_vop_access(struct vop_access_args *ap)
374 {
375         struct hammer_inode *ip = VTOI(ap->a_vp);
376         uid_t uid;
377         gid_t gid;
378         int error;
379
380         uid = hammer_to_unix_xid(&ip->ino_data.uid);
381         gid = hammer_to_unix_xid(&ip->ino_data.gid);
382
383         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
384                                   ip->ino_data.uflags);
385         return (error);
386 }
387
388 /*
389  * hammer_vop_advlock { vp, id, op, fl, flags }
390  */
391 static
392 int
393 hammer_vop_advlock(struct vop_advlock_args *ap)
394 {
395         struct hammer_inode *ip = VTOI(ap->a_vp);
396
397         return (lf_advlock(ap, &ip->advlock, ip->ino_rec.ino_size));
398 }
399
/*
 * hammer_vop_close { vp, fflag }
 *
 * No HAMMER-specific close processing; defer entirely to the standard
 * close handler.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	return (vop_stdclose(ap));
}
409
410 /*
411  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
412  *
413  * The operating system has already ensured that the directory entry
414  * does not exist and done all appropriate namespace locking.
415  */
416 static
417 int
418 hammer_vop_ncreate(struct vop_ncreate_args *ap)
419 {
420         struct hammer_transaction trans;
421         struct hammer_inode *dip;
422         struct hammer_inode *nip;
423         struct nchandle *nch;
424         int error;
425
426         nch = ap->a_nch;
427         dip = VTOI(ap->a_dvp);
428
429         if (dip->flags & HAMMER_INODE_RO)
430                 return (EROFS);
431
432         /*
433          * Create a transaction to cover the operations we perform.
434          */
435         hammer_start_transaction(&trans, dip->hmp);
436
437         /*
438          * Create a new filesystem object of the requested type.  The
439          * returned inode will be referenced but not locked.
440          */
441
442         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
443         if (error)
444                 kprintf("hammer_create_inode error %d\n", error);
445         if (error) {
446                 hammer_abort_transaction(&trans);
447                 *ap->a_vpp = NULL;
448                 return (error);
449         }
450
451         /*
452          * Add the new filesystem object to the directory.  This will also
453          * bump the inode's link count.
454          */
455         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
456         if (error)
457                 kprintf("hammer_ip_add_directory error %d\n", error);
458
459         /*
460          * Finish up.
461          */
462         if (error) {
463                 hammer_rel_inode(nip, 0);
464                 hammer_abort_transaction(&trans);
465                 *ap->a_vpp = NULL;
466         } else {
467                 hammer_commit_transaction(&trans);
468                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
469                 hammer_rel_inode(nip, 0);
470                 if (error == 0) {
471                         cache_setunresolved(ap->a_nch);
472                         cache_setvp(ap->a_nch, *ap->a_vpp);
473                 }
474         }
475         return (error);
476 }
477
/*
 * hammer_vop_getattr { vp, vap }
 *
 * Copy the inode's on-disk attributes into the caller's vattr.
 * Purely a read-out; no transaction is required.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

#if 0
	if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
	    (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
	    ip->obj_asof == XXX
	) {
		/* LAZYMOD XXX */
	}
	hammer_itimes(ap->a_vp);
#endif

	vap->va_fsid = ip->hmp->fsid_udev;
	vap->va_fileid = ip->ino_rec.base.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_rec.ino_nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	/* default to no device numbers; device types override below */
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_rec.ino_size;
	hammer_to_timespec(ip->ino_rec.ino_atime, &vap->va_atime);
	hammer_to_timespec(ip->ino_rec.ino_mtime, &vap->va_mtime);
	hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = 32768; /* XXX - extract from root volume */
	vap->va_bytes = ip->ino_rec.ino_size;
	vap->va_type = hammer_get_vnode_type(ip->ino_rec.base.base.obj_type);
	vap->va_filerev = 0;	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file */
	vap->va_fsmid = ip->ino_rec.ino_mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	/* character/block devices additionally expose major/minor numbers */
	switch (ip->ino_rec.base.base.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}

	return(0);
}
536
537 /*
538  * hammer_vop_nresolve { nch, dvp, cred }
539  *
540  * Locate the requested directory entry.
541  */
542 static
543 int
544 hammer_vop_nresolve(struct vop_nresolve_args *ap)
545 {
546         struct namecache *ncp;
547         hammer_inode_t dip;
548         hammer_inode_t ip;
549         hammer_tid_t asof;
550         struct hammer_cursor cursor;
551         union hammer_record_ondisk *rec;
552         struct vnode *vp;
553         int64_t namekey;
554         int error;
555         int i;
556         int nlen;
557         int flags;
558         u_int64_t obj_id;
559
560         /*
561          * Misc initialization, plus handle as-of name extensions.  Look for
562          * the '@@' extension.  Note that as-of files and directories cannot
563          * be modified.
564          */
565         dip = VTOI(ap->a_dvp);
566         ncp = ap->a_nch->ncp;
567         asof = dip->obj_asof;
568         nlen = ncp->nc_nlen;
569         flags = dip->flags;
570
571         for (i = 0; i < nlen; ++i) {
572                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
573                         asof = hammer_str_to_tid(ncp->nc_name + i + 2);
574                         kprintf("ASOF %016llx\n", asof);
575                         flags |= HAMMER_INODE_RO;
576                         break;
577                 }
578         }
579         nlen = i;
580
581         /*
582          * If there is no path component the time extension is relative to
583          * dip.
584          */
585         if (nlen == 0) {
586                 ip = hammer_get_inode(dip->hmp, &dip->cache[1], dip->obj_id,
587                                       asof, flags, &error);
588                 if (error == 0) {
589                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
590                         hammer_rel_inode(ip, 0);
591                 } else {
592                         vp = NULL;
593                 }
594                 if (error == 0) {
595                         vn_unlock(vp);
596                         cache_setvp(ap->a_nch, vp);
597                         vrele(vp);
598                 }
599                 return(error);
600         }
601
602         /*
603          * Calculate the namekey and setup the key range for the scan.  This
604          * works kinda like a chained hash table where the lower 32 bits
605          * of the namekey synthesize the chain.
606          *
607          * The key range is inclusive of both key_beg and key_end.
608          */
609         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
610
611         hammer_init_cursor_hmp(&cursor, &dip->cache[0], dip->hmp);
612         cursor.key_beg.obj_id = dip->obj_id;
613         cursor.key_beg.key = namekey;
614         cursor.key_beg.create_tid = 0;
615         cursor.key_beg.delete_tid = 0;
616         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
617         cursor.key_beg.obj_type = 0;
618
619         cursor.key_end = cursor.key_beg;
620         cursor.key_end.key |= 0xFFFFFFFFULL;
621         cursor.asof = asof;
622         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
623
624         /*
625          * Scan all matching records (the chain), locate the one matching
626          * the requested path component.
627          *
628          * The hammer_ip_*() functions merge in-memory records with on-disk
629          * records for the purposes of the search.
630          */
631         error = hammer_ip_first(&cursor, dip);
632         rec = NULL;
633         obj_id = 0;
634
635         while (error == 0) {
636                 error = hammer_ip_resolve_data(&cursor);
637                 if (error)
638                         break;
639                 rec = cursor.record;
640                 if (nlen == rec->entry.base.data_len &&
641                     bcmp(ncp->nc_name, cursor.data, nlen) == 0) {
642                         obj_id = rec->entry.obj_id;
643                         break;
644                 }
645                 error = hammer_ip_next(&cursor);
646         }
647         hammer_done_cursor(&cursor);
648         if (error == 0) {
649                 ip = hammer_get_inode(dip->hmp, &dip->cache[1],
650                                       obj_id, asof, flags, &error);
651                 if (error == 0) {
652                         error = hammer_get_vnode(ip, LK_EXCLUSIVE, &vp);
653                         hammer_rel_inode(ip, 0);
654                 } else {
655                         vp = NULL;
656                 }
657                 if (error == 0) {
658                         vn_unlock(vp);
659                         cache_setvp(ap->a_nch, vp);
660                         vrele(vp);
661                 }
662         } else if (error == ENOENT) {
663                 cache_setvp(ap->a_nch, NULL);
664         }
665         return (error);
666 }
667
668 /*
669  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
670  *
671  * Locate the parent directory of a directory vnode.
672  *
673  * dvp is referenced but not locked.  *vpp must be returned referenced and
674  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
675  * at the root, instead it could indicate that the directory we were in was
676  * removed.
677  */
678 static
679 int
680 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
681 {
682         struct hammer_inode *dip;
683         struct hammer_inode *ip;
684         u_int64_t parent_obj_id;
685         int error;
686
687         dip = VTOI(ap->a_dvp);
688         if ((parent_obj_id = dip->ino_data.parent_obj_id) == 0) {
689                 *ap->a_vpp = NULL;
690                 return ENOENT;
691         }
692
693         ip = hammer_get_inode(dip->hmp, &dip->cache[1], parent_obj_id,
694                               dip->obj_asof, dip->flags, &error);
695         if (ip == NULL) {
696                 *ap->a_vpp = NULL;
697                 return(error);
698         }
699         error = hammer_get_vnode(ip, LK_EXCLUSIVE, ap->a_vpp);
700         hammer_rel_inode(ip, 0);
701         return (error);
702 }
703
704 /*
705  * hammer_vop_nlink { nch, dvp, vp, cred }
706  */
707 static
708 int
709 hammer_vop_nlink(struct vop_nlink_args *ap)
710 {
711         struct hammer_transaction trans;
712         struct hammer_inode *dip;
713         struct hammer_inode *ip;
714         struct nchandle *nch;
715         int error;
716
717         nch = ap->a_nch;
718         dip = VTOI(ap->a_dvp);
719         ip = VTOI(ap->a_vp);
720
721         if (dip->flags & HAMMER_INODE_RO)
722                 return (EROFS);
723         if (ip->flags & HAMMER_INODE_RO)
724                 return (EROFS);
725
726         /*
727          * Create a transaction to cover the operations we perform.
728          */
729         hammer_start_transaction(&trans, dip->hmp);
730
731         /*
732          * Add the filesystem object to the directory.  Note that neither
733          * dip nor ip are referenced or locked, but their vnodes are
734          * referenced.  This function will bump the inode's link count.
735          */
736         error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
737
738         /*
739          * Finish up.
740          */
741         if (error) {
742                 hammer_abort_transaction(&trans);
743         } else {
744                 cache_setunresolved(nch);
745                 cache_setvp(nch, ap->a_vp);
746                 hammer_commit_transaction(&trans);
747         }
748         return (error);
749 }
750
751 /*
752  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
753  *
754  * The operating system has already ensured that the directory entry
755  * does not exist and done all appropriate namespace locking.
756  */
757 static
758 int
759 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
760 {
761         struct hammer_transaction trans;
762         struct hammer_inode *dip;
763         struct hammer_inode *nip;
764         struct nchandle *nch;
765         int error;
766
767         nch = ap->a_nch;
768         dip = VTOI(ap->a_dvp);
769
770         if (dip->flags & HAMMER_INODE_RO)
771                 return (EROFS);
772
773         /*
774          * Create a transaction to cover the operations we perform.
775          */
776         hammer_start_transaction(&trans, dip->hmp);
777
778         /*
779          * Create a new filesystem object of the requested type.  The
780          * returned inode will be referenced but not locked.
781          */
782         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
783         if (error)
784                 kprintf("hammer_mkdir error %d\n", error);
785         if (error) {
786                 hammer_abort_transaction(&trans);
787                 *ap->a_vpp = NULL;
788                 return (error);
789         }
790
791         /*
792          * Add the new filesystem object to the directory.  This will also
793          * bump the inode's link count.
794          */
795         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
796         if (error)
797                 kprintf("hammer_mkdir (add) error %d\n", error);
798
799         /*
800          * Finish up.
801          */
802         if (error) {
803                 hammer_rel_inode(nip, 0);
804                 hammer_abort_transaction(&trans);
805                 *ap->a_vpp = NULL;
806         } else {
807                 hammer_commit_transaction(&trans);
808                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
809                 hammer_rel_inode(nip, 0);
810                 if (error == 0) {
811                         cache_setunresolved(ap->a_nch);
812                         cache_setvp(ap->a_nch, *ap->a_vpp);
813                 }
814         }
815         return (error);
816 }
817
818 /*
819  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
820  *
821  * The operating system has already ensured that the directory entry
822  * does not exist and done all appropriate namespace locking.
823  */
824 static
825 int
826 hammer_vop_nmknod(struct vop_nmknod_args *ap)
827 {
828         struct hammer_transaction trans;
829         struct hammer_inode *dip;
830         struct hammer_inode *nip;
831         struct nchandle *nch;
832         int error;
833
834         nch = ap->a_nch;
835         dip = VTOI(ap->a_dvp);
836
837         if (dip->flags & HAMMER_INODE_RO)
838                 return (EROFS);
839
840         /*
841          * Create a transaction to cover the operations we perform.
842          */
843         hammer_start_transaction(&trans, dip->hmp);
844
845         /*
846          * Create a new filesystem object of the requested type.  The
847          * returned inode will be referenced but not locked.
848          */
849         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
850         if (error) {
851                 hammer_abort_transaction(&trans);
852                 *ap->a_vpp = NULL;
853                 return (error);
854         }
855
856         /*
857          * Add the new filesystem object to the directory.  This will also
858          * bump the inode's link count.
859          */
860         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
861
862         /*
863          * Finish up.
864          */
865         if (error) {
866                 hammer_rel_inode(nip, 0);
867                 hammer_abort_transaction(&trans);
868                 *ap->a_vpp = NULL;
869         } else {
870                 hammer_commit_transaction(&trans);
871                 error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
872                 hammer_rel_inode(nip, 0);
873                 if (error == 0) {
874                         cache_setunresolved(ap->a_nch);
875                         cache_setvp(ap->a_nch, *ap->a_vpp);
876                 }
877         }
878         return (error);
879 }
880
881 /*
882  * hammer_vop_open { vp, mode, cred, fp }
883  */
884 static
885 int
886 hammer_vop_open(struct vop_open_args *ap)
887 {
888         if ((ap->a_mode & FWRITE) && (VTOI(ap->a_vp)->flags & HAMMER_INODE_RO))
889                 return (EROFS);
890
891         return(vop_stdopen(ap));
892 }
893
894 /*
895  * hammer_vop_pathconf { vp, name, retval }
896  */
897 static
898 int
899 hammer_vop_pathconf(struct vop_pathconf_args *ap)
900 {
901         return EOPNOTSUPP;
902 }
903
904 /*
905  * hammer_vop_print { vp }
906  */
907 static
908 int
909 hammer_vop_print(struct vop_print_args *ap)
910 {
911         return EOPNOTSUPP;
912 }
913
914 /*
915  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
916  */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_record_ondisk_t rec;
	hammer_base_elm_t base;
	int error;
	int cookie_index;	/* number of seek cookies filled in so far */
	int ncookies;		/* cookie array capacity, -1 when unused */
	off_t *cookies;
	off_t saveoff;		/* current directory seek position */
	int r;

	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * Allocate the seek-cookie array if the caller asked for one.
	 * The capacity estimate is one entry per 16 uio bytes, capped
	 * at 1024 entries.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	/*
	 * Handle artificial entries.  "." is emitted at offset 0 and
	 * ".." at offset 1; the B-Tree scan below resumes at saveoff.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/*
		 * An inode with no parent (the root) points ".." back at
		 * itself.
		 */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor, ip);

	/*
	 * Emit one dirent per directory record, stopping when the uio
	 * fills (vop_write_dirent returns non-zero), the cookie array
	 * fills, or the scan runs out of records (ENOENT).
	 */
	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		rec = cursor.record;
		base = &rec->base.base;
		saveoff = base->key;

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		r = vop_write_dirent(
			     &error, uio, rec->entry.obj_id,
			     hammer_get_dtype(rec->entry.base.base.obj_type),
			     rec->entry.base.data_len,
			     (void *)cursor.data);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	/*
	 * ENOENT from the scan simply means end-of-directory.  Hand the
	 * cookie array to the caller on success; if we produced nothing
	 * and failed, free it and report zero cookies instead.
	 */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
1052
1053 /*
1054  * hammer_vop_readlink { vp, uio, cred }
1055  */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	int error;

	ip = VTOI(ap->a_vp);
	hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);

	/*
	 * The symlink target is stored as a FIX record at key
	 * HAMMER_FIXKEY_SYMLINK, looked up as-of the inode's TID.
	 */
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	/*
	 * Locate the record, resolve its data reference, and copy the
	 * raw (non-terminated) link data to the caller's uio.
	 */
	error = hammer_ip_lookup(&cursor, ip);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			error = uiomove((char *)cursor.data,
					cursor.record->generic.base.data_len,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	return(error);
}
1092
1093 /*
1094  * hammer_vop_nremove { nch, dvp, cred }
1095  */
1096 static
1097 int
1098 hammer_vop_nremove(struct vop_nremove_args *ap)
1099 {
1100         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0));
1101 }
1102
1103 /*
1104  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1105  */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;		/* source name */
	struct namecache *tncp;		/* target name */
	struct hammer_inode *fdip;	/* source directory */
	struct hammer_inode *tdip;	/* target directory */
	struct hammer_inode *ip;	/* inode being renamed */
	struct hammer_cursor cursor;
	union hammer_record_ondisk *rec;
	int64_t namekey;
	int error;

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	/*
	 * Neither directory nor the file itself may live in a read-only
	 * (as-of/snapshot) view.
	 */
	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	hammer_start_transaction(&trans, fdip->hmp);

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 */
	error = hammer_dounlink(ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
	if (error == 0 || error == ENOENT)
		error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
	hammer_init_cursor_hmp(&cursor, &fdip->cache[0], fdip->hmp);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor, fdip);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		rec = cursor.record;
		/* name match requires equal length and equal bytes */
		if (fncp->nc_nlen == rec->entry.base.data_len &&
		    bcmp(fncp->nc_name, cursor.data, fncp->nc_nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
	hammer_done_cursor(&cursor);
	if (error == 0)
		cache_rename(ap->a_fnch, ap->a_tnch);
	/* a B-Tree deadlock forces a full cursor restart from scratch */
	if (error == EDEADLK)
		goto retry;
failed:
	if (error == 0) {
		hammer_commit_transaction(&trans);
	} else {
		hammer_abort_transaction(&trans);
	}
	return (error);
}
1212
1213 /*
1214  * hammer_vop_nrmdir { nch, dvp, cred }
1215  */
1216 static
1217 int
1218 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1219 {
1220         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, 0));
1221 }
1222
1223 /*
1224  * hammer_vop_setattr { vp, vap, cred }
1225  */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor *spike = NULL;	/* ENOSPC retry state */
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;	/* HAMMER_INODE_* dirty bits accumulated below */
	int error;
	int64_t aligned_size;
	u_int32_t flags;
	uuid_t uuid;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	hammer_start_transaction(&trans, ip->hmp);
	error = 0;

	/*
	 * Chflags-style flag changes.
	 *
	 * NOTE(review): this branch always jumps to done, so a request
	 * that sets va_flags skips all of the other attribute updates
	 * below -- confirm callers never combine them.
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	/*
	 * Any other attribute change on an immutable/append-only file
	 * is refused.
	 */
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	/*
	 * Ownership changes, stored as UUIDs in the inode data.
	 */
	if (vap->va_uid != (uid_t)VNOVAL) {
		hammer_guid_to_uuid(&uuid, vap->va_uid);
		if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) != 0) {
			ip->ino_data.uid = uuid;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
	if (vap->va_gid != (uid_t)VNOVAL) {
		hammer_guid_to_uuid(&uuid, vap->va_gid);
		if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) != 0) {
			ip->ino_data.gid = uuid;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
	/*
	 * Truncate/extend.  The while loop exists solely to retry the
	 * range deletion after an ENOSPC is resolved with hammer_spike();
	 * all other paths break or goto out of it.
	 */
	while (vap->va_size != VNOVAL && ip->ino_rec.ino_size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size < ip->ino_rec.ino_size) {
				vtruncbuf(ap->a_vp, vap->va_size,
					  HAMMER_BUFSIZE);
			} else if (vap->va_size > ip->ino_rec.ino_size) {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
			}
			/*
			 * Delete whole buffers beyond the new EOF; the
			 * deletion range starts at the buffer-aligned
			 * size.
			 */
			aligned_size = (vap->va_size + HAMMER_BUFMASK) &
					~(int64_t)HAMMER_BUFMASK;
			error = hammer_ip_delete_range(&trans, ip,
						    aligned_size,
						    0x7FFFFFFFFFFFFFFFLL,
						    &spike);
			ip->ino_rec.ino_size = vap->va_size;
			modflags |= HAMMER_INODE_RDIRTY;
			break;
		case VDATABASE:
			error = hammer_ip_delete_range(&trans, ip,
						    vap->va_size,
						    0x7FFFFFFFFFFFFFFFLL,
						    &spike);
			ip->ino_rec.ino_size = vap->va_size;
			modflags |= HAMMER_INODE_RDIRTY;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		if (error == ENOSPC) {
			error = hammer_spike(&spike);
			if (error == 0)
				continue;
		}
		KKASSERT(spike == NULL);
		break;
	}
	/*
	 * Timestamps are stored as transaction ids.
	 */
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_rec.ino_atime =
			hammer_timespec_to_transid(&vap->va_atime);
		modflags |= HAMMER_INODE_ITIMES;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_rec.ino_mtime =
			hammer_timespec_to_transid(&vap->va_mtime);
		modflags |= HAMMER_INODE_ITIMES;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		if (ip->ino_data.mode != vap->va_mode) {
			ip->ino_data.mode = vap->va_mode;
			modflags |= HAMMER_INODE_DDIRTY;
		}
	}
done:
	/*
	 * Flush the accumulated dirty bits and commit, or abort on any
	 * error.
	 */
	if (error) {
		hammer_abort_transaction(&trans);
	} else {
		hammer_modify_inode(&trans, ip, modflags);
		hammer_commit_transaction(&trans);
	}
	return (error);
}
1350
1351 /*
1352  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1353  */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_record_t record;
	int error;
	int bytes;	/* length of the link target, without a NUL */

	/* the new object is always a symlink regardless of caller's vap */
	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
	if (error) {
		hammer_abort_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is not \0 terminated.
	 */
	if (error == 0) {
		record = hammer_alloc_mem_record(nip);
		bytes = strlen(ap->a_target);

		record->rec.generic.base.base.key = HAMMER_FIXKEY_SYMLINK;
		record->rec.generic.base.base.rec_type = HAMMER_RECTYPE_FIX;
		record->rec.generic.base.data_len = bytes;
		/*
		 * A short target is embedded directly in the record's
		 * filler area; a long one temporarily points at the
		 * caller's buffer.
		 */
		if (bytes <= sizeof(record->rec.generic.filler)) {
			record->data = (void *)record->rec.generic.filler;
			bcopy(ap->a_target, record->data, bytes);
		} else {
			record->data = (void *)ap->a_target;
			/* will be reallocated by routine below */
		}
		error = hammer_ip_add_record(&trans, record);
	}

	/*
	 * Finish up: on success commit and return a locked vnode with
	 * the namecache resolved, otherwise undo everything.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		hammer_abort_transaction(&trans);
		*ap->a_vpp = NULL;
	} else {
		hammer_commit_transaction(&trans);
		error = hammer_get_vnode(nip, LK_EXCLUSIVE, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	return (error);
}
1436
1437 /*
1438  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1439  */
1440 static
1441 int
1442 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1443 {
1444         return(hammer_dounlink(ap->a_nch, ap->a_dvp, ap->a_cred, ap->a_flags));
1445 }
1446
1447 /*
1448  * hammer_vop_strategy { vp, bio }
1449  *
1450  * Strategy call, used for regular file read & write only.  Note that the
1451  * bp may represent a cluster.
1452  *
1453  * To simplify operation and allow better optimizations in the future,
1454  * this code does not make any assumptions with regards to buffer alignment
1455  * or size.
1456  */
1457 static
1458 int
1459 hammer_vop_strategy(struct vop_strategy_args *ap)
1460 {
1461         struct buf *bp;
1462         int error;
1463
1464         bp = ap->a_bio->bio_buf;
1465
1466         switch(bp->b_cmd) {
1467         case BUF_CMD_READ:
1468                 error = hammer_vop_strategy_read(ap);
1469                 break;
1470         case BUF_CMD_WRITE:
1471                 error = hammer_vop_strategy_write(ap);
1472                 break;
1473         default:
1474                 error = EINVAL;
1475                 break;
1476         }
1477         bp->b_error = error;
1478         if (error)
1479                 bp->b_flags |= B_ERROR;
1480         biodone(ap->a_bio);
1481         return (error);
1482 }
1483
1484 /*
1485  * Read from a regular file.  Iterate the related records and fill in the
1486  * BIO/BUF.  Gaps are zero-filled.
1487  *
1488  * The support code in hammer_object.c should be used to deal with mixed
1489  * in-memory and on-disk records.
1490  *
1491  * XXX atime update
1492  */
1493 static
1494 int
1495 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1496 {
1497         struct hammer_inode *ip = ap->a_vp->v_data;
1498         struct hammer_cursor cursor;
1499         hammer_record_ondisk_t rec;
1500         hammer_base_elm_t base;
1501         struct bio *bio;
1502         struct buf *bp;
1503         int64_t rec_offset;
1504         int64_t ran_end;
1505         int64_t tmp64;
1506         int error;
1507         int boff;
1508         int roff;
1509         int n;
1510
1511         bio = ap->a_bio;
1512         bp = bio->bio_buf;
1513
1514         hammer_init_cursor_hmp(&cursor, &ip->cache[0], ip->hmp);
1515
1516         /*
1517          * Key range (begin and end inclusive) to scan.  Note that the key's
1518          * stored in the actual records represent BASE+LEN, not BASE.  The
1519          * first record containing bio_offset will have a key > bio_offset.
1520          */
1521         cursor.key_beg.obj_id = ip->obj_id;
1522         cursor.key_beg.create_tid = 0;
1523         cursor.key_beg.delete_tid = 0;
1524         cursor.key_beg.obj_type = 0;
1525         cursor.key_beg.key = bio->bio_offset + 1;
1526         cursor.asof = ip->obj_asof;
1527         cursor.flags |= HAMMER_CURSOR_ASOF;
1528
1529         cursor.key_end = cursor.key_beg;
1530         if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
1531                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1532                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1533                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1534         } else {
1535                 ran_end = bio->bio_offset + bp->b_bufsize;
1536                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1537                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1538                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1539                 if (tmp64 < ran_end)
1540                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1541                 else
1542                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1543         }
1544         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1545
1546         error = hammer_ip_first(&cursor, ip);
1547         boff = 0;
1548
1549         while (error == 0) {
1550                 error = hammer_ip_resolve_data(&cursor);
1551                 if (error)
1552                         break;
1553                 rec = cursor.record;
1554                 base = &rec->base.base;
1555
1556                 rec_offset = base->key - rec->data.base.data_len;
1557
1558                 /*
1559                  * Calculate the gap, if any, and zero-fill it.
1560                  */
1561                 n = (int)(rec_offset - (bio->bio_offset + boff));
1562                 if (n > 0) {
1563                         if (n > bp->b_bufsize - boff)
1564                                 n = bp->b_bufsize - boff;
1565                         kprintf("zfill %d bytes\n", n);
1566                         bzero((char *)bp->b_data + boff, n);
1567                         boff += n;
1568                         n = 0;
1569                 }
1570
1571                 /*
1572                  * Calculate the data offset in the record and the number
1573                  * of bytes we can copy.
1574                  *
1575                  * Note there is a degenerate case here where boff may
1576                  * already be at bp->b_bufsize.
1577                  */
1578                 roff = -n;
1579                 n = rec->data.base.data_len - roff;
1580                 KKASSERT(n > 0);
1581                 if (n > bp->b_bufsize - boff)
1582                         n = bp->b_bufsize - boff;
1583                 bcopy((char *)cursor.data + roff, (char *)bp->b_data + boff, n);
1584                 boff += n;
1585                 if (boff == bp->b_bufsize)
1586                         break;
1587                 error = hammer_ip_next(&cursor);
1588         }
1589         hammer_done_cursor(&cursor);
1590
1591         /*
1592          * There may have been a gap after the last record
1593          */
1594         if (error == ENOENT)
1595                 error = 0;
1596         if (error == 0 && boff != bp->b_bufsize) {
1597                 KKASSERT(boff < bp->b_bufsize);
1598                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
1599                 /* boff = bp->b_bufsize; */
1600         }
1601         bp->b_resid = 0;
1602         return(error);
1603 }
1604
1605 /*
1606  * Write to a regular file.  Iterate the related records and mark for
1607  * deletion.  If existing edge records (left and right side) overlap our
1608  * write they have to be marked deleted and new records created, usually
1609  * referencing a portion of the original data.  Then add a record to
1610  * represent the buffer.
1611  *
1612  * The support code in hammer_object.c should be used to deal with mixed
1613  * in-memory and on-disk records.
1614  */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor *spike = NULL;	/* ENOSPC retry state */
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;

	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);

	hammer_start_transaction(&trans, ip->hmp);

retry:
	/*
	 * Delete any records overlapping our range.  This function will
	 * (eventually) properly truncate partial overlaps.
	 *
	 * Database files delete only the single record at bio_offset;
	 * regular files delete the whole buffer-sized range.
	 */
	if (ip->ino_rec.base.base.obj_type == HAMMER_OBJTYPE_DBFILE) {
		error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
					       bio->bio_offset, &spike);
	} else {
		error = hammer_ip_delete_range(&trans, ip, bio->bio_offset,
					       bio->bio_offset +
						bp->b_bufsize - 1,
					       &spike);
	}

	/*
	 * Add a single record to cover the write
	 */
	if (error == 0) {
		error = hammer_ip_sync_data(&trans, ip, bio->bio_offset,
					    bp->b_data, bp->b_bufsize,
					    &spike);
	}

	/*
	 * If we ran out of space the spike structure will be filled in
	 * and we must call hammer_spike with it, then retry.
	 */
	if (error == ENOSPC) {
		error = hammer_spike(&spike);
		if (error == 0)
			goto retry;
	}
	KKASSERT(spike == NULL);

	/*
	 * If an error occurred abort the transaction
	 */
	if (error) {
		/* XXX undo deletion */
		hammer_abort_transaction(&trans);
		bp->b_resid = bp->b_bufsize;
	} else {
		hammer_commit_transaction(&trans);
		bp->b_resid = 0;
	}
	return(error);
}
1683
1684 /*
1685  * dounlink - disconnect a directory entry
1686  *
1687  * XXX whiteout support not really in yet
1688  */
1689 static int
1690 hammer_dounlink(struct nchandle *nch, struct vnode *dvp, struct ucred *cred,
1691                 int flags)
1692 {
1693         struct hammer_transaction trans;
1694         struct namecache *ncp;
1695         hammer_inode_t dip;
1696         hammer_inode_t ip;
1697         hammer_record_ondisk_t rec;
1698         struct hammer_cursor cursor;
1699         int64_t namekey;
1700         int error;
1701
1702         /*
1703          * Calculate the namekey and setup the key range for the scan.  This
1704          * works kinda like a chained hash table where the lower 32 bits
1705          * of the namekey synthesize the chain.
1706          *
1707          * The key range is inclusive of both key_beg and key_end.
1708          */
1709         dip = VTOI(dvp);
1710         ncp = nch->ncp;
1711
1712         if (dip->flags & HAMMER_INODE_RO)
1713                 return (EROFS);
1714
1715         hammer_start_transaction(&trans, dip->hmp);
1716
1717         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
1718 retry:
1719         hammer_init_cursor_hmp(&cursor, &dip->cache[0], dip->hmp);
1720         cursor.key_beg.obj_id = dip->obj_id;
1721         cursor.key_beg.key = namekey;
1722         cursor.key_beg.create_tid = 0;
1723         cursor.key_beg.delete_tid = 0;
1724         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1725         cursor.key_beg.obj_type = 0;
1726
1727         cursor.key_end = cursor.key_beg;
1728         cursor.key_end.key |= 0xFFFFFFFFULL;
1729         cursor.asof = dip->obj_asof;
1730         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1731
1732         /*
1733          * Scan all matching records (the chain), locate the one matching
1734          * the requested path component.  info->last_error contains the
1735          * error code on search termination and could be 0, ENOENT, or
1736          * something else.
1737          *
1738          * The hammer_ip_*() functions merge in-memory records with on-disk
1739          * records for the purposes of the search.
1740          */
1741         error = hammer_ip_first(&cursor, dip);
1742         while (error == 0) {
1743                 error = hammer_ip_resolve_data(&cursor);
1744                 if (error)
1745                         break;
1746                 rec = cursor.record;
1747                 if (ncp->nc_nlen == rec->entry.base.data_len &&
1748                     bcmp(ncp->nc_name, cursor.data, ncp->nc_nlen) == 0) {
1749                         break;
1750                 }
1751                 error = hammer_ip_next(&cursor);
1752         }
1753
1754         /*
1755          * If all is ok we have to get the inode so we can adjust nlinks.
1756          *
1757          * If the target is a directory, it must be empty.
1758          */
1759         if (error == 0) {
1760                 ip = hammer_get_inode(dip->hmp, &dip->cache[1],
1761                                       rec->entry.obj_id,
1762                                       dip->hmp->asof, 0, &error);
1763                 KKASSERT(error != ENOENT);
1764                 if (error == 0 && ip->ino_rec.base.base.obj_type ==
1765                                   HAMMER_OBJTYPE_DIRECTORY) {
1766                         error = hammer_ip_check_directory_empty(&trans, ip);
1767                 }
1768                 /*
1769                  * WARNING: hammer_ip_del_directory() may have to terminate
1770                  * the cursor to avoid a lock recursion.  It's ok to call
1771                  * hammer_done_cursor() twice.
1772                  */
1773                 if (error == 0)
1774                         error = hammer_ip_del_directory(&trans, &cursor, dip, ip);
1775                 if (error == 0) {
1776                         cache_setunresolved(nch);
1777                         cache_setvp(nch, NULL);
1778                         /* XXX locking */
1779                         if (ip->vp)
1780                                 cache_inval_vp(ip->vp, CINV_DESTROY);
1781                 }
1782                 hammer_rel_inode(ip, 0);
1783         }
1784         hammer_done_cursor(&cursor);
1785         if (error == EDEADLK)
1786                 goto retry;
1787
1788         if (error == 0)
1789                 hammer_commit_transaction(&trans);
1790         else
1791                 hammer_abort_transaction(&trans);
1792         return (error);
1793 }
1794
1795 /************************************************************************
1796  *                          FIFO AND SPECFS OPS                         *
1797  ************************************************************************
1798  *
1799  */
1800
1801 static int
1802 hammer_vop_fifoclose (struct vop_close_args *ap)
1803 {
1804         /* XXX update itimes */
1805         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
1806 }
1807
1808 static int
1809 hammer_vop_fiforead (struct vop_read_args *ap)
1810 {
1811         int error;
1812
1813         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
1814         /* XXX update access time */
1815         return (error);
1816 }
1817
1818 static int
1819 hammer_vop_fifowrite (struct vop_write_args *ap)
1820 {
1821         int error;
1822
1823         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
1824         /* XXX update access time */
1825         return (error);
1826 }
1827
1828 static int
1829 hammer_vop_specclose (struct vop_close_args *ap)
1830 {
1831         /* XXX update itimes */
1832         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1833 }
1834
1835 static int
1836 hammer_vop_specread (struct vop_read_args *ap)
1837 {
1838         /* XXX update access time */
1839         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1840 }
1841
1842 static int
1843 hammer_vop_specwrite (struct vop_write_args *ap)
1844 {
1845         /* XXX update last change time */
1846         return (VOCALL(&spec_vnode_vops, &ap->a_head));
1847 }
1848