HAMMER 55: Performance tuning and bug fixes - MEDIA STRUCTURES CHANGED!
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.70 2008/06/14 01:42:13 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
/*
 * USERFS VNOPS
 *
 * Forward declarations for the vnode operations implemented in this file.
 * These are wired into the three vop_ops tables below (regular, spec,
 * fifo vnodes).
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);

/* fifo vnodes get HAMMER attribute handling but fifofs I/O */
static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

/* spec (device) vnodes get HAMMER attribute handling but specfs I/O */
static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);
91
/*
 * Vnode operations vector for regular HAMMER files and directories.
 * Operations not listed here fall through to vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         hammer_vop_pathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl
};
127
/*
 * Vnode operations vector for device special files.  HAMMER supplies
 * attribute and lifecycle handling; everything else (including actual
 * I/O dispatch) falls through to specfs via spec_vnoperate.
 */
struct vop_ops hammer_spec_vops = {
        .vop_default =          spec_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_specread,
        .vop_write =            hammer_vop_specwrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_specclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};
140
/*
 * Vnode operations vector for fifos.  HAMMER supplies attribute and
 * lifecycle handling; everything else falls through to fifofs via
 * fifo_vnoperate.
 */
struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};
153
#ifdef DEBUG_TRUNCATE
/* Debug hook: inode being watched by truncation debugging code. */
struct hammer_inode *HammerTruncIp;
#endif

/* Internal helpers shared by the vnops below. */
static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred, int flags);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
static void hammer_cleanup_write_io(hammer_inode_t ip);
static void hammer_update_rsv_databufs(hammer_inode_t ip);
164
#if 0
/*
 * Generic pass-through dispatcher into the HAMMER vnode ops table.
 * Currently unused (the tables above are registered directly).
 *
 * Fix: the parameter was previously declared without a name
 * (struct vop_generic_args *) while the body referenced 'ap', so this
 * would not have compiled if the #if 0 were ever removed.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
173
/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * Synchronize the file to media.  The buffer-cache data is pushed out
 * first with vfsync() so the HAMMER flusher sees the dirty buffers,
 * then the inode is queued to the flusher.  For MNT_WAIT we block
 * until the flusher has disposed of the inode.
 *
 * Returns any error accumulated on the inode by the flusher.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        /* order matters: data buffers must be flushed before the inode */
        vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (ap->a_waitfor == MNT_WAIT)
                hammer_wait_inode(ip);
        /* ip->error holds media write errors recorded asynchronously */
        return (ip->error);
}
189
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read from a regular file through the buffer cache in HAMMER_BUFSIZE
 * chunks.  A transaction is held across the operation so the atime
 * update (when permitted) is covered.  Only VREG vnodes are handled;
 * spec/fifo reads go through their own tables.
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        off_t offset;           /* byte offset within the current buffer */
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;                  /* bytes to copy out of the current buffer */
        int seqcount;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
        /* upper 16 bits of ioflag carry the sequential-access heuristic */
        seqcount = ap->a_ioflag >> 16;

        hammer_start_transaction(&trans, ip->hmp);

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        uio = ap->a_uio;
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                offset = uio->uio_offset & HAMMER_BUFMASK;
                if (hammer_debug_cluster_enable) {
                        /* read-ahead clustering, gated by a debug tunable */
                        error = cluster_read(ap->a_vp, ip->ino_data.size,
                                             uio->uio_offset - offset,
                                             HAMMER_BUFSIZE,
                                             MAXBSIZE, seqcount, &bp);
                } else {
                        error = bread(ap->a_vp, uio->uio_offset - offset,
                                      HAMMER_BUFSIZE, &bp);
                }
                if (error) {
                        brelse(bp);
                        break;
                }

                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                /* clamp n to the buffer, the request, and file EOF */
                n = HAMMER_BUFSIZE - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);

                /* data has a lower priority than meta-data */
                bp->b_flags |= B_AGE;
                bqrelse(bp);
                if (error)
                        break;
        }
        /*
         * Update atime unless the inode is read-only (e.g. an as-of
         * snapshot view) or the mount is noatime.
         */
        if ((ip->flags & HAMMER_INODE_RO) == 0 &&
            (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                ip->ino_leaf.atime = trans.time;
                hammer_modify_inode(ip, HAMMER_INODE_ITIMES);
        }
        hammer_done_transaction(&trans);
        return (error);
}
256
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Write to a regular file through the buffer cache in HAMMER_BUFSIZE
 * chunks.  Handles append mode, EOF extension (including the VM size
 * fixup/rollback on error), the UIO_NOCOPY case used by putpages, and
 * throttling so HAMMER does not blow out the buffer cache or its
 * in-memory record reservation.
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        struct uio *uio;
        int rel_offset;         /* byte offset within the current buffer */
        off_t base_offset;      /* buffer-aligned file offset */
        struct buf *bp;
        int error;
        int n;                  /* bytes to copy into the current buffer */
        int flags;              /* hammer_modify_inode flags */
        int count;              /* loop iterations, for periodic throttling */

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, ip->hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_off assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }
        /* offset + resid overflowing past 2^63-1 wraps negative */
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        count = 0;
        while (uio->uio_resid > 0) {
                int fixsize = 0;        /* set when we grew the VM object */

                if ((error = hammer_checkspace(trans.hmp)) != 0)
                        break;

                /*
                 * Do not allow HAMMER to blow out the buffer cache.
                 *
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are decoupled
                 * from the buffer cache.
                 *
                 * Always check at the beginning so separate writes are
                 * not able to bypass this code.
                 *
                 * WARNING: Cannot unlock vp when doing a NOCOPY write as
                 * part of a putpages operation.  Doing so could cause us
                 * to deadlock against the VM system when we try to re-lock.
                 */
                if ((count++ & 15) == 0) {
                        if (uio->uio_segflg != UIO_NOCOPY) {
                                vn_unlock(ap->a_vp);
                                if ((ap->a_ioflag & IO_NOBWILL) == 0)
                                        bwillwrite();
                        }
                        if (ip->rsv_recs > hammer_limit_irecs)
                                hammer_wait_inode_recs(ip);
                        if (uio->uio_segflg != UIO_NOCOPY)
                                vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY);
                }

                rel_offset = (int)(uio->uio_offset & HAMMER_BUFMASK);
                base_offset = uio->uio_offset & ~HAMMER_BUFMASK64;
                n = HAMMER_BUFSIZE - rel_offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (uio->uio_offset + n > ip->ino_data.size) {
                        vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
                        fixsize = 1;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              HAMMER_BUFSIZE, &bp);
                        }
                } else if (rel_offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a 
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset,
                                      HAMMER_BUFSIZE, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        error = uiomove((char *)bp->b_data + rel_offset,
                                        n, uio);
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                vtruncbuf(ap->a_vp, ip->ino_data.size,
                                          HAMMER_BUFSIZE);
                        }
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_DDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
                flags |= HAMMER_INODE_DDIRTY;   /* XXX mtime */
                hammer_modify_inode(ip, flags);

                /*
                 * Try to keep track of cached dirty data.
                 */
                if ((bp->b_flags & B_DIRTY) == 0) {
                        ++ip->rsv_databufs;
                        ++ip->hmp->rsv_databufs;
                }

                /*
                 * Final buffer disposition.
                 */
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
#if 1
                } else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
                           (uio->uio_offset & HAMMER_BUFMASK) == 0) {
                        /*
                         * If seqcount indicates sequential operation and
                         * we just finished filling a buffer, push it out
                         * now to prevent the buffer cache from becoming
                         * too full, which would trigger non-optimal
                         * flushes.
                         */
                        bawrite(bp);
#endif
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}
464
465 /*
466  * hammer_vop_access { vp, mode, cred }
467  */
468 static
469 int
470 hammer_vop_access(struct vop_access_args *ap)
471 {
472         struct hammer_inode *ip = VTOI(ap->a_vp);
473         uid_t uid;
474         gid_t gid;
475         int error;
476
477         uid = hammer_to_unix_xid(&ip->ino_data.uid);
478         gid = hammer_to_unix_xid(&ip->ino_data.gid);
479
480         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
481                                   ip->ino_data.uflags);
482         return (error);
483 }
484
485 /*
486  * hammer_vop_advlock { vp, id, op, fl, flags }
487  */
488 static
489 int
490 hammer_vop_advlock(struct vop_advlock_args *ap)
491 {
492         struct hammer_inode *ip = VTOI(ap->a_vp);
493
494         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
495 }
496
/*
 * hammer_vop_close { vp, fflag }
 *
 * No HAMMER-specific close-time processing is required; defer to the
 * stock vop_stdclose implementation.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
        int error;

        error = vop_stdclose(ap);
        return (error);
}
506
/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * Create a regular file in directory dvp and return its vnode in *vpp.
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 *
 * On success *ap->a_vpp holds the new vnode and the namecache entry is
 * resolved to it; on failure *ap->a_vpp is NULL and an errno is
 * returned.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;       /* parent directory inode */
        struct hammer_inode *nip;       /* newly created inode */
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        /* cannot create in a read-only (e.g. as-of snapshot) directory */
        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */

        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.  On success the vnode is obtained before the inode
         * reference is dropped; the namecache entry is then pointed at
         * the new vnode.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        return (error);
}
576
/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

#if 0
        if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
            (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
            ip->obj_asof == XXX
        ) {
                /* LAZYMOD XXX */
        }
        hammer_itimes(ap->a_vp);
#endif

        vap->va_fsid = ip->hmp->fsid_udev;
        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;
        /*
         * Read-only (historical/as-of) inodes never update atime, so
         * report mtime as atime for consistent snapshot results.
         */
        if (ip->flags & HAMMER_INODE_RO)
                hammer_to_timespec(ip->ino_data.mtime, &vap->va_atime);
        else
                hammer_to_timespec(ip->ino_leaf.atime, &vap->va_atime);
        hammer_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
        /* round size up to a 64-byte boundary for the byte count */
        vap->va_bytes = (ip->ino_data.size + 63) & ~63;
        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        /* mtime uniquely identifies any adjustments made to the file */
        vap->va_fsmid = ip->ino_data.mtime;
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        /* device nodes additionally expose their rmajor/rminor */
        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }

        return(0);
}
643
/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.  Supports '@@<tid>' as-of name
 * extensions, which resolve to a read-only historical view.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        u_int64_t obj_id;

        /*
         * Misc initialization, plus handle as-of name extensions.  Look for
         * the '@@' extension.  Note that as-of files and directories cannot
         * be modified.
         */
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        nlen = ncp->nc_nlen;
        flags = dip->flags;

        hammer_simple_transaction(&trans, dip->hmp);

        /*
         * NOTE(review): the i+1 lookahead reads nc_name[nlen] when the
         * final byte is '@' — presumably nc_name is NUL-terminated so
         * the read is safe and cannot false-match; confirm against the
         * namecache allocation.
         */
        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        asof = hammer_str_to_tid(ncp->nc_name + i + 2);
                        flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;       /* strip the '@@...' suffix from the lookup name */

        /*
         * If there is no path component the time extension is relative to
         * dip.
         */
        if (nlen == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id,
                                      asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(ncp->nc_name, nlen);

        error = hammer_init_cursor(&trans, &cursor, &dip->cache[0], dip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
        cursor.asof = asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        obj_id = 0;

        if (error == 0) {
                error = hammer_ip_first(&cursor);
                while (error == 0) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error)
                                break;
                        /* compare stored name length and bytes exactly */
                        if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
                            bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                                obj_id = cursor.data->entry.obj_id;
                                break;
                        }
                        error = hammer_ip_next(&cursor);
                }
        }
        hammer_done_cursor(&cursor);
        if (error == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1],
                                      obj_id, asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
        } else if (error == ENOENT) {
                /* negative cache the miss */
                cache_setvp(ap->a_nch, NULL);
        }
done:
        hammer_done_transaction(&trans);
        return (error);
}
778
/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof then the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        int64_t parent_obj_id;
        hammer_tid_t asof;
        int error;

        dip = VTOI(ap->a_dvp);
        asof = dip->obj_asof;
        parent_obj_id = dip->ino_data.parent_obj_id;

        if (parent_obj_id == 0) {
                if (dip->obj_id == HAMMER_OBJID_ROOT &&
                   asof != dip->hmp->asof) {
                        /*
                         * Root of an as-of view: re-resolve the same
                         * directory at the mount's asof and fabricate a
                         * "0x%016llx" name.  19 bytes = "0x" + 16 hex
                         * digits + NUL.
                         */
                        parent_obj_id = dip->obj_id;
                        asof = dip->hmp->asof;
                        *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
                        ksnprintf(*ap->a_fakename, 19, "0x%016llx",
                                   dip->obj_asof);
                } else {
                        /* parent unknown or directory removed */
                        *ap->a_vpp = NULL;
                        return ENOENT;
                }
        }

        hammer_simple_transaction(&trans, dip->hmp);

        ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id,
                              asof, dip->flags, &error);
        if (ip) {
                error = hammer_get_vnode(ip, ap->a_vpp);
                hammer_rel_inode(ip, 0);
        } else {
                *ap->a_vpp = NULL;
        }
        hammer_done_transaction(&trans);
        return (error);
}
837
838 /*
839  * hammer_vop_nlink { nch, dvp, vp, cred }
840  */
841 static
842 int
843 hammer_vop_nlink(struct vop_nlink_args *ap)
844 {
845         struct hammer_transaction trans;
846         struct hammer_inode *dip;
847         struct hammer_inode *ip;
848         struct nchandle *nch;
849         int error;
850
851         nch = ap->a_nch;
852         dip = VTOI(ap->a_dvp);
853         ip = VTOI(ap->a_vp);
854
855         if (dip->flags & HAMMER_INODE_RO)
856                 return (EROFS);
857         if (ip->flags & HAMMER_INODE_RO)
858                 return (EROFS);
859         if ((error = hammer_checkspace(dip->hmp)) != 0)
860                 return (error);
861
862         /*
863          * Create a transaction to cover the operations we perform.
864          */
865         hammer_start_transaction(&trans, dip->hmp);
866
867         /*
868          * Add the filesystem object to the directory.  Note that neither
869          * dip nor ip are referenced or locked, but their vnodes are
870          * referenced.  This function will bump the inode's link count.
871          */
872         error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
873
874         /*
875          * Finish up.
876          */
877         if (error == 0) {
878                 cache_setunresolved(nch);
879                 cache_setvp(nch, ap->a_vp);
880         }
881         hammer_done_transaction(&trans);
882         return (error);
883 }
884
885 /*
886  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
887  *
888  * The operating system has already ensured that the directory entry
889  * does not exist and done all appropriate namespace locking.
890  */
891 static
892 int
893 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
894 {
895         struct hammer_transaction trans;
896         struct hammer_inode *dip;
897         struct hammer_inode *nip;
898         struct nchandle *nch;
899         int error;
900
901         nch = ap->a_nch;
902         dip = VTOI(ap->a_dvp);
903
904         if (dip->flags & HAMMER_INODE_RO)
905                 return (EROFS);
906         if ((error = hammer_checkspace(dip->hmp)) != 0)
907                 return (error);
908
909         /*
910          * Create a transaction to cover the operations we perform.
911          */
912         hammer_start_transaction(&trans, dip->hmp);
913
914         /*
915          * Create a new filesystem object of the requested type.  The
916          * returned inode will be referenced but not locked.
917          */
918         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
919         if (error) {
920                 hkprintf("hammer_mkdir error %d\n", error);
921                 hammer_done_transaction(&trans);
922                 *ap->a_vpp = NULL;
923                 return (error);
924         }
925         /*
926          * Add the new filesystem object to the directory.  This will also
927          * bump the inode's link count.
928          */
929         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
930         if (error)
931                 hkprintf("hammer_mkdir (add) error %d\n", error);
932
933         /*
934          * Finish up.
935          */
936         if (error) {
937                 hammer_rel_inode(nip, 0);
938                 *ap->a_vpp = NULL;
939         } else {
940                 error = hammer_get_vnode(nip, ap->a_vpp);
941                 hammer_rel_inode(nip, 0);
942                 if (error == 0) {
943                         cache_setunresolved(ap->a_nch);
944                         cache_setvp(ap->a_nch, *ap->a_vpp);
945                 }
946         }
947         hammer_done_transaction(&trans);
948         return (error);
949 }
950
951 /*
952  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
953  *
954  * The operating system has already ensured that the directory entry
955  * does not exist and done all appropriate namespace locking.
956  */
957 static
958 int
959 hammer_vop_nmknod(struct vop_nmknod_args *ap)
960 {
961         struct hammer_transaction trans;
962         struct hammer_inode *dip;
963         struct hammer_inode *nip;
964         struct nchandle *nch;
965         int error;
966
967         nch = ap->a_nch;
968         dip = VTOI(ap->a_dvp);
969
970         if (dip->flags & HAMMER_INODE_RO)
971                 return (EROFS);
972         if ((error = hammer_checkspace(dip->hmp)) != 0)
973                 return (error);
974
975         /*
976          * Create a transaction to cover the operations we perform.
977          */
978         hammer_start_transaction(&trans, dip->hmp);
979
980         /*
981          * Create a new filesystem object of the requested type.  The
982          * returned inode will be referenced but not locked.
983          */
984         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
985         if (error) {
986                 hammer_done_transaction(&trans);
987                 *ap->a_vpp = NULL;
988                 return (error);
989         }
990
991         /*
992          * Add the new filesystem object to the directory.  This will also
993          * bump the inode's link count.
994          */
995         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
996
997         /*
998          * Finish up.
999          */
1000         if (error) {
1001                 hammer_rel_inode(nip, 0);
1002                 *ap->a_vpp = NULL;
1003         } else {
1004                 error = hammer_get_vnode(nip, ap->a_vpp);
1005                 hammer_rel_inode(nip, 0);
1006                 if (error == 0) {
1007                         cache_setunresolved(ap->a_nch);
1008                         cache_setvp(ap->a_nch, *ap->a_vpp);
1009                 }
1010         }
1011         hammer_done_transaction(&trans);
1012         return (error);
1013 }
1014
1015 /*
1016  * hammer_vop_open { vp, mode, cred, fp }
1017  */
1018 static
1019 int
1020 hammer_vop_open(struct vop_open_args *ap)
1021 {
1022         hammer_inode_t ip;
1023
1024         ip = VTOI(ap->a_vp);
1025
1026         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1027                 return (EROFS);
1028         return(vop_stdopen(ap));
1029 }
1030
1031 /*
1032  * hammer_vop_pathconf { vp, name, retval }
1033  */
1034 static
1035 int
1036 hammer_vop_pathconf(struct vop_pathconf_args *ap)
1037 {
1038         return EOPNOTSUPP;
1039 }
1040
1041 /*
1042  * hammer_vop_print { vp }
1043  */
1044 static
1045 int
1046 hammer_vop_print(struct vop_print_args *ap)
1047 {
1048         return EOPNOTSUPP;
1049 }
1050
1051 /*
1052  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1053  */
1054 static
1055 int
1056 hammer_vop_readdir(struct vop_readdir_args *ap)
1057 {
1058         struct hammer_transaction trans;
1059         struct hammer_cursor cursor;
1060         struct hammer_inode *ip;
1061         struct uio *uio;
1062         hammer_base_elm_t base;
1063         int error;
1064         int cookie_index;
1065         int ncookies;
1066         off_t *cookies;
1067         off_t saveoff;
1068         int r;
1069
1070         ip = VTOI(ap->a_vp);
1071         uio = ap->a_uio;
1072         saveoff = uio->uio_offset;
1073
1074         if (ap->a_ncookies) {
1075                 ncookies = uio->uio_resid / 16 + 1;
1076                 if (ncookies > 1024)
1077                         ncookies = 1024;
1078                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1079                 cookie_index = 0;
1080         } else {
1081                 ncookies = -1;
1082                 cookies = NULL;
1083                 cookie_index = 0;
1084         }
1085
1086         hammer_simple_transaction(&trans, ip->hmp);
1087
1088         /*
1089          * Handle artificial entries
1090          */
1091         error = 0;
1092         if (saveoff == 0) {
1093                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1094                 if (r)
1095                         goto done;
1096                 if (cookies)
1097                         cookies[cookie_index] = saveoff;
1098                 ++saveoff;
1099                 ++cookie_index;
1100                 if (cookie_index == ncookies)
1101                         goto done;
1102         }
1103         if (saveoff == 1) {
1104                 if (ip->ino_data.parent_obj_id) {
1105                         r = vop_write_dirent(&error, uio,
1106                                              ip->ino_data.parent_obj_id,
1107                                              DT_DIR, 2, "..");
1108                 } else {
1109                         r = vop_write_dirent(&error, uio,
1110                                              ip->obj_id, DT_DIR, 2, "..");
1111                 }
1112                 if (r)
1113                         goto done;
1114                 if (cookies)
1115                         cookies[cookie_index] = saveoff;
1116                 ++saveoff;
1117                 ++cookie_index;
1118                 if (cookie_index == ncookies)
1119                         goto done;
1120         }
1121
1122         /*
1123          * Key range (begin and end inclusive) to scan.  Directory keys
1124          * directly translate to a 64 bit 'seek' position.
1125          */
1126         hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
1127         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
1128         cursor.key_beg.obj_id = ip->obj_id;
1129         cursor.key_beg.create_tid = 0;
1130         cursor.key_beg.delete_tid = 0;
1131         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1132         cursor.key_beg.obj_type = 0;
1133         cursor.key_beg.key = saveoff;
1134
1135         cursor.key_end = cursor.key_beg;
1136         cursor.key_end.key = HAMMER_MAX_KEY;
1137         cursor.asof = ip->obj_asof;
1138         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1139
1140         error = hammer_ip_first(&cursor);
1141
1142         while (error == 0) {
1143                 error = hammer_ip_resolve_data(&cursor);
1144                 if (error)
1145                         break;
1146                 base = &cursor.leaf->base;
1147                 saveoff = base->key;
1148                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1149
1150                 if (base->obj_id != ip->obj_id)
1151                         panic("readdir: bad record at %p", cursor.node);
1152
1153                 r = vop_write_dirent(
1154                              &error, uio, cursor.data->entry.obj_id,
1155                              hammer_get_dtype(cursor.leaf->base.obj_type),
1156                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1157                              (void *)cursor.data->entry.name);
1158                 if (r)
1159                         break;
1160                 ++saveoff;
1161                 if (cookies)
1162                         cookies[cookie_index] = base->key;
1163                 ++cookie_index;
1164                 if (cookie_index == ncookies)
1165                         break;
1166                 error = hammer_ip_next(&cursor);
1167         }
1168         hammer_done_cursor(&cursor);
1169
1170 done:
1171         hammer_done_transaction(&trans);
1172
1173         if (ap->a_eofflag)
1174                 *ap->a_eofflag = (error == ENOENT);
1175         uio->uio_offset = saveoff;
1176         if (error && cookie_index == 0) {
1177                 if (error == ENOENT)
1178                         error = 0;
1179                 if (cookies) {
1180                         kfree(cookies, M_TEMP);
1181                         *ap->a_ncookies = 0;
1182                         *ap->a_cookies = NULL;
1183                 }
1184         } else {
1185                 if (error == ENOENT)
1186                         error = 0;
1187                 if (cookies) {
1188                         *ap->a_ncookies = cookie_index;
1189                         *ap->a_cookies = cookies;
1190                 }
1191         }
1192         return(error);
1193 }
1194
1195 /*
1196  * hammer_vop_readlink { vp, uio, cred }
1197  */
1198 static
1199 int
1200 hammer_vop_readlink(struct vop_readlink_args *ap)
1201 {
1202         struct hammer_transaction trans;
1203         struct hammer_cursor cursor;
1204         struct hammer_inode *ip;
1205         int error;
1206
1207         ip = VTOI(ap->a_vp);
1208
1209         /*
1210          * Shortcut if the symlink data was stuffed into ino_data.
1211          */
1212         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1213                 error = uiomove(ip->ino_data.ext.symlink,
1214                                 ip->ino_data.size, ap->a_uio);
1215                 return(error);
1216         }
1217
1218         /*
1219          * Long version
1220          */
1221         hammer_simple_transaction(&trans, ip->hmp);
1222         hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
1223
1224         /*
1225          * Key range (begin and end inclusive) to scan.  Directory keys
1226          * directly translate to a 64 bit 'seek' position.
1227          */
1228         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; /* XXX */
1229         cursor.key_beg.obj_id = ip->obj_id;
1230         cursor.key_beg.create_tid = 0;
1231         cursor.key_beg.delete_tid = 0;
1232         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1233         cursor.key_beg.obj_type = 0;
1234         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1235         cursor.asof = ip->obj_asof;
1236         cursor.flags |= HAMMER_CURSOR_ASOF;
1237
1238         error = hammer_ip_lookup(&cursor);
1239         if (error == 0) {
1240                 error = hammer_ip_resolve_data(&cursor);
1241                 if (error == 0) {
1242                         KKASSERT(cursor.leaf->data_len >=
1243                                  HAMMER_SYMLINK_NAME_OFF);
1244                         error = uiomove(cursor.data->symlink.name,
1245                                         cursor.leaf->data_len -
1246                                                 HAMMER_SYMLINK_NAME_OFF,
1247                                         ap->a_uio);
1248                 }
1249         }
1250         hammer_done_cursor(&cursor);
1251         hammer_done_transaction(&trans);
1252         return(error);
1253 }
1254
1255 /*
1256  * hammer_vop_nremove { nch, dvp, cred }
1257  */
1258 static
1259 int
1260 hammer_vop_nremove(struct vop_nremove_args *ap)
1261 {
1262         struct hammer_transaction trans;
1263         struct hammer_inode *dip;
1264         int error;
1265
1266         dip = VTOI(ap->a_dvp);
1267
1268         if (hammer_nohistory(dip) == 0 &&
1269             (error = hammer_checkspace(dip->hmp)) != 0) {
1270                 return (error);
1271         }
1272
1273         hammer_start_transaction(&trans, dip->hmp);
1274         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1275         hammer_done_transaction(&trans);
1276
1277         return (error);
1278 }
1279
1280 /*
1281  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1282  */
1283 static
1284 int
1285 hammer_vop_nrename(struct vop_nrename_args *ap)
1286 {
1287         struct hammer_transaction trans;
1288         struct namecache *fncp;
1289         struct namecache *tncp;
1290         struct hammer_inode *fdip;
1291         struct hammer_inode *tdip;
1292         struct hammer_inode *ip;
1293         struct hammer_cursor cursor;
1294         int64_t namekey;
1295         int nlen, error;
1296
1297         fdip = VTOI(ap->a_fdvp);
1298         tdip = VTOI(ap->a_tdvp);
1299         fncp = ap->a_fnch->ncp;
1300         tncp = ap->a_tnch->ncp;
1301         ip = VTOI(fncp->nc_vp);
1302         KKASSERT(ip != NULL);
1303
1304         if (fdip->flags & HAMMER_INODE_RO)
1305                 return (EROFS);
1306         if (tdip->flags & HAMMER_INODE_RO)
1307                 return (EROFS);
1308         if (ip->flags & HAMMER_INODE_RO)
1309                 return (EROFS);
1310         if ((error = hammer_checkspace(fdip->hmp)) != 0)
1311                 return (error);
1312
1313         hammer_start_transaction(&trans, fdip->hmp);
1314
1315         /*
1316          * Remove tncp from the target directory and then link ip as
1317          * tncp. XXX pass trans to dounlink
1318          *
1319          * Force the inode sync-time to match the transaction so it is
1320          * in-sync with the creation of the target directory entry.
1321          */
1322         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
1323         if (error == 0 || error == ENOENT) {
1324                 error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
1325                 if (error == 0) {
1326                         ip->ino_data.parent_obj_id = tdip->obj_id;
1327                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1328                 }
1329         }
1330         if (error)
1331                 goto failed; /* XXX */
1332
1333         /*
1334          * Locate the record in the originating directory and remove it.
1335          *
1336          * Calculate the namekey and setup the key range for the scan.  This
1337          * works kinda like a chained hash table where the lower 32 bits
1338          * of the namekey synthesize the chain.
1339          *
1340          * The key range is inclusive of both key_beg and key_end.
1341          */
1342         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1343 retry:
1344         hammer_init_cursor(&trans, &cursor, &fdip->cache[0], fdip);
1345         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
1346         cursor.key_beg.obj_id = fdip->obj_id;
1347         cursor.key_beg.key = namekey;
1348         cursor.key_beg.create_tid = 0;
1349         cursor.key_beg.delete_tid = 0;
1350         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1351         cursor.key_beg.obj_type = 0;
1352
1353         cursor.key_end = cursor.key_beg;
1354         cursor.key_end.key |= 0xFFFFFFFFULL;
1355         cursor.asof = fdip->obj_asof;
1356         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1357
1358         /*
1359          * Scan all matching records (the chain), locate the one matching
1360          * the requested path component.
1361          *
1362          * The hammer_ip_*() functions merge in-memory records with on-disk
1363          * records for the purposes of the search.
1364          */
1365         error = hammer_ip_first(&cursor);
1366         while (error == 0) {
1367                 if (hammer_ip_resolve_data(&cursor) != 0)
1368                         break;
1369                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1370                 KKASSERT(nlen > 0);
1371                 if (fncp->nc_nlen == nlen &&
1372                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1373                         break;
1374                 }
1375                 error = hammer_ip_next(&cursor);
1376         }
1377
1378         /*
1379          * If all is ok we have to get the inode so we can adjust nlinks.
1380          *
1381          * WARNING: hammer_ip_del_directory() may have to terminate the
1382          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1383          * twice.
1384          */
1385         if (error == 0)
1386                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1387
1388         /*
1389          * XXX A deadlock here will break rename's atomicy for the purposes
1390          * of crash recovery.
1391          */
1392         if (error == EDEADLK) {
1393                 hammer_done_cursor(&cursor);
1394                 goto retry;
1395         }
1396
1397         /*
1398          * Cleanup and tell the kernel that the rename succeeded.
1399          */
1400         hammer_done_cursor(&cursor);
1401         if (error == 0)
1402                 cache_rename(ap->a_fnch, ap->a_tnch);
1403
1404 failed:
1405         hammer_done_transaction(&trans);
1406         return (error);
1407 }
1408
1409 /*
1410  * hammer_vop_nrmdir { nch, dvp, cred }
1411  */
1412 static
1413 int
1414 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1415 {
1416         struct hammer_transaction trans;
1417         struct hammer_inode *dip;
1418         int error;
1419
1420         dip = VTOI(ap->a_dvp);
1421
1422         if (hammer_nohistory(dip) == 0 &&
1423             (error = hammer_checkspace(dip->hmp)) != 0) {
1424                 return (error);
1425         }
1426
1427         hammer_start_transaction(&trans, dip->hmp);
1428         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1429         hammer_done_transaction(&trans);
1430
1431         return (error);
1432 }
1433
1434 /*
1435  * hammer_vop_setattr { vp, vap, cred }
1436  */
1437 static
1438 int
1439 hammer_vop_setattr(struct vop_setattr_args *ap)
1440 {
1441         struct hammer_transaction trans;
1442         struct vattr *vap;
1443         struct hammer_inode *ip;
1444         int modflags;
1445         int error;
1446         int truncating;
1447         off_t aligned_size;
1448         u_int32_t flags;
1449
1450         vap = ap->a_vap;
1451         ip = ap->a_vp->v_data;
1452         modflags = 0;
1453
1454         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1455                 return(EROFS);
1456         if (ip->flags & HAMMER_INODE_RO)
1457                 return (EROFS);
1458         if (hammer_nohistory(ip) == 0 &&
1459             (error = hammer_checkspace(ip->hmp)) != 0) {
1460                 return (error);
1461         }
1462
1463         hammer_start_transaction(&trans, ip->hmp);
1464         error = 0;
1465
1466         if (vap->va_flags != VNOVAL) {
1467                 flags = ip->ino_data.uflags;
1468                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1469                                          hammer_to_unix_xid(&ip->ino_data.uid),
1470                                          ap->a_cred);
1471                 if (error == 0) {
1472                         if (ip->ino_data.uflags != flags) {
1473                                 ip->ino_data.uflags = flags;
1474                                 modflags |= HAMMER_INODE_DDIRTY;
1475                         }
1476                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1477                                 error = 0;
1478                                 goto done;
1479                         }
1480                 }
1481                 goto done;
1482         }
1483         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1484                 error = EPERM;
1485                 goto done;
1486         }
1487         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
1488                 mode_t cur_mode = ip->ino_data.mode;
1489                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1490                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1491                 uuid_t uuid_uid;
1492                 uuid_t uuid_gid;
1493
1494                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
1495                                          ap->a_cred,
1496                                          &cur_uid, &cur_gid, &cur_mode);
1497                 if (error == 0) {
1498                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
1499                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
1500                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
1501                                  sizeof(uuid_uid)) ||
1502                             bcmp(&uuid_gid, &ip->ino_data.gid,
1503                                  sizeof(uuid_gid)) ||
1504                             ip->ino_data.mode != cur_mode
1505                         ) {
1506                                 ip->ino_data.uid = uuid_uid;
1507                                 ip->ino_data.gid = uuid_gid;
1508                                 ip->ino_data.mode = cur_mode;
1509                         }
1510                         modflags |= HAMMER_INODE_DDIRTY;
1511                 }
1512         }
1513         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
1514                 switch(ap->a_vp->v_type) {
1515                 case VREG:
1516                         if (vap->va_size == ip->ino_data.size)
1517                                 break;
1518                         /*
1519                          * XXX break atomicy, we can deadlock the backend
1520                          * if we do not release the lock.  Probably not a
1521                          * big deal here.
1522                          */
1523                         if (vap->va_size < ip->ino_data.size) {
1524                                 vtruncbuf(ap->a_vp, vap->va_size,
1525                                           HAMMER_BUFSIZE);
1526                                 truncating = 1;
1527                         } else {
1528                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1529                                 truncating = 0;
1530                         }
1531                         ip->ino_data.size = vap->va_size;
1532                         modflags |= HAMMER_INODE_DDIRTY;
1533                         aligned_size = (vap->va_size + HAMMER_BUFMASK) &
1534                                        ~HAMMER_BUFMASK64;
1535
1536                         /*
1537                          * on-media truncation is cached in the inode until
1538                          * the inode is synchronized.
1539                          */
1540                         if (truncating) {
1541                                 hammer_ip_frontend_trunc(ip, vap->va_size);
1542                                 hammer_update_rsv_databufs(ip);
1543 #ifdef DEBUG_TRUNCATE
1544                                 if (HammerTruncIp == NULL)
1545                                         HammerTruncIp = ip;
1546 #endif
1547                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1548                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1549                                         ip->trunc_off = vap->va_size;
1550 #ifdef DEBUG_TRUNCATE
1551                                         if (ip == HammerTruncIp)
1552                                         kprintf("truncate1 %016llx\n", ip->trunc_off);
1553 #endif
1554                                 } else if (ip->trunc_off > vap->va_size) {
1555                                         ip->trunc_off = vap->va_size;
1556 #ifdef DEBUG_TRUNCATE
1557                                         if (ip == HammerTruncIp)
1558                                         kprintf("truncate2 %016llx\n", ip->trunc_off);
1559 #endif
1560                                 } else {
1561 #ifdef DEBUG_TRUNCATE
1562                                         if (ip == HammerTruncIp)
1563                                         kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
1564 #endif
1565                                 }
1566                         }
1567
1568                         /*
1569                          * If truncating we have to clean out a portion of
1570                          * the last block on-disk.  We do this in the
1571                          * front-end buffer cache.
1572                          */
1573                         if (truncating && vap->va_size < aligned_size) {
1574                                 struct buf *bp;
1575                                 int offset;
1576
1577                                 aligned_size -= HAMMER_BUFSIZE;
1578
1579                                 offset = vap->va_size & HAMMER_BUFMASK;
1580                                 error = bread(ap->a_vp, aligned_size,
1581                                               HAMMER_BUFSIZE, &bp);
1582                                 hammer_ip_frontend_trunc(ip, aligned_size);
1583                                 if (error == 0) {
1584                                         bzero(bp->b_data + offset,
1585                                               HAMMER_BUFSIZE - offset);
1586                                         bdwrite(bp);
1587                                 } else {
1588                                         kprintf("ERROR %d\n", error);
1589                                         brelse(bp);
1590                                 }
1591                         }
1592                         break;
1593                 case VDATABASE:
1594                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1595                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1596                                 ip->trunc_off = vap->va_size;
1597                         } else if (ip->trunc_off > vap->va_size) {
1598                                 ip->trunc_off = vap->va_size;
1599                         }
1600                         hammer_ip_frontend_trunc(ip, vap->va_size);
1601                         ip->ino_data.size = vap->va_size;
1602                         modflags |= HAMMER_INODE_DDIRTY;
1603                         break;
1604                 default:
1605                         error = EINVAL;
1606                         goto done;
1607                 }
1608                 break;
1609         }
1610         if (vap->va_atime.tv_sec != VNOVAL) {
1611                 ip->ino_leaf.atime =
1612                         hammer_timespec_to_transid(&vap->va_atime);
1613                 modflags |= HAMMER_INODE_ITIMES;
1614         }
1615         if (vap->va_mtime.tv_sec != VNOVAL) {
1616                 ip->ino_data.mtime =
1617                         hammer_timespec_to_transid(&vap->va_mtime);
1618                 modflags |= HAMMER_INODE_ITIMES;
1619                 modflags |= HAMMER_INODE_DDIRTY;        /* XXX mtime */
1620         }
1621         if (vap->va_mode != (mode_t)VNOVAL) {
1622                 mode_t   cur_mode = ip->ino_data.mode;
1623                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1624                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1625
1626                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1627                                          cur_uid, cur_gid, &cur_mode);
1628                 if (error == 0 && ip->ino_data.mode != cur_mode) {
1629                         ip->ino_data.mode = cur_mode;
1630                         modflags |= HAMMER_INODE_DDIRTY;
1631                 }
1632         }
1633 done:
1634         if (error == 0)
1635                 hammer_modify_inode(ip, modflags);
1636         hammer_done_transaction(&trans);
1637         return (error);
1638 }
1639
1640 /*
1641  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1642  */
1643 static
1644 int
1645 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1646 {
1647         struct hammer_transaction trans;
1648         struct hammer_inode *dip;
1649         struct hammer_inode *nip;
1650         struct nchandle *nch;
1651         hammer_record_t record;
1652         int error;
1653         int bytes;
1654
1655         ap->a_vap->va_type = VLNK;
1656
1657         nch = ap->a_nch;
1658         dip = VTOI(ap->a_dvp);
1659
1660         if (dip->flags & HAMMER_INODE_RO)
1661                 return (EROFS);
1662         if ((error = hammer_checkspace(dip->hmp)) != 0)
1663                 return (error);
1664
1665         /*
1666          * Create a transaction to cover the operations we perform.
1667          */
1668         hammer_start_transaction(&trans, dip->hmp);
1669
1670         /*
1671          * Create a new filesystem object of the requested type.  The
1672          * returned inode will be referenced but not locked.
1673          */
1674
1675         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
1676         if (error) {
1677                 hammer_done_transaction(&trans);
1678                 *ap->a_vpp = NULL;
1679                 return (error);
1680         }
1681
1682         /*
1683          * Add a record representing the symlink.  symlink stores the link
1684          * as pure data, not a string, and is no \0 terminated.
1685          */
1686         if (error == 0) {
1687                 bytes = strlen(ap->a_target);
1688
1689                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1690                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1691                 } else {
1692                         record = hammer_alloc_mem_record(nip, bytes);
1693                         record->type = HAMMER_MEM_RECORD_GENERAL;
1694
1695                         record->leaf.base.localization = HAMMER_LOCALIZE_MISC;
1696                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1697                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1698                         record->leaf.data_len = bytes;
1699                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1700                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1701                         error = hammer_ip_add_record(&trans, record);
1702                 }
1703
1704                 /*
1705                  * Set the file size to the length of the link.
1706                  */
1707                 if (error == 0) {
1708                         nip->ino_data.size = bytes;
1709                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1710                 }
1711         }
1712         if (error == 0)
1713                 error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1714
1715         /*
1716          * Finish up.
1717          */
1718         if (error) {
1719                 hammer_rel_inode(nip, 0);
1720                 *ap->a_vpp = NULL;
1721         } else {
1722                 error = hammer_get_vnode(nip, ap->a_vpp);
1723                 hammer_rel_inode(nip, 0);
1724                 if (error == 0) {
1725                         cache_setunresolved(ap->a_nch);
1726                         cache_setvp(ap->a_nch, *ap->a_vpp);
1727                 }
1728         }
1729         hammer_done_transaction(&trans);
1730         return (error);
1731 }
1732
1733 /*
1734  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1735  */
1736 static
1737 int
1738 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1739 {
1740         struct hammer_transaction trans;
1741         struct hammer_inode *dip;
1742         int error;
1743
1744         dip = VTOI(ap->a_dvp);
1745
1746         if (hammer_nohistory(dip) == 0 &&
1747             (error = hammer_checkspace(dip->hmp)) != 0) {
1748                 return (error);
1749         }
1750
1751         hammer_start_transaction(&trans, dip->hmp);
1752         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1753                                 ap->a_cred, ap->a_flags);
1754         hammer_done_transaction(&trans);
1755
1756         return (error);
1757 }
1758
1759 /*
1760  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1761  */
1762 static
1763 int
1764 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1765 {
1766         struct hammer_inode *ip = ap->a_vp->v_data;
1767
1768         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1769                             ap->a_fflag, ap->a_cred));
1770 }
1771
1772 static
1773 int
1774 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1775 {
1776         struct mount *mp;
1777         int error;
1778
1779         mp = ap->a_head.a_ops->head.vv_mount;
1780
1781         switch(ap->a_op) {
1782         case MOUNTCTL_SET_EXPORT:
1783                 if (ap->a_ctllen != sizeof(struct export_args))
1784                         error = EINVAL;
1785                 error = hammer_vfs_export(mp, ap->a_op,
1786                                       (const struct export_args *)ap->a_ctl);
1787                 break;
1788         default:
1789                 error = journal_mountctl(ap);
1790                 break;
1791         }
1792         return(error);
1793 }
1794
1795 /*
1796  * hammer_vop_strategy { vp, bio }
1797  *
1798  * Strategy call, used for regular file read & write only.  Note that the
1799  * bp may represent a cluster.
1800  *
1801  * To simplify operation and allow better optimizations in the future,
1802  * this code does not make any assumptions with regards to buffer alignment
1803  * or size.
1804  */
1805 static
1806 int
1807 hammer_vop_strategy(struct vop_strategy_args *ap)
1808 {
1809         struct buf *bp;
1810         int error;
1811
1812         bp = ap->a_bio->bio_buf;
1813
1814         switch(bp->b_cmd) {
1815         case BUF_CMD_READ:
1816                 error = hammer_vop_strategy_read(ap);
1817                 break;
1818         case BUF_CMD_WRITE:
1819                 error = hammer_vop_strategy_write(ap);
1820                 break;
1821         default:
1822                 bp->b_error = error = EINVAL;
1823                 bp->b_flags |= B_ERROR;
1824                 biodone(ap->a_bio);
1825                 break;
1826         }
1827         return (error);
1828 }
1829
1830 /*
1831  * Read from a regular file.  Iterate the related records and fill in the
1832  * BIO/BUF.  Gaps are zero-filled.
1833  *
1834  * The support code in hammer_object.c should be used to deal with mixed
1835  * in-memory and on-disk records.
1836  *
1837  * XXX atime update
1838  */
1839 static
1840 int
1841 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1842 {
1843         struct hammer_transaction trans;
1844         struct hammer_inode *ip;
1845         struct hammer_cursor cursor;
1846         hammer_base_elm_t base;
1847         struct bio *bio;
1848         struct bio *nbio;
1849         struct buf *bp;
1850         int64_t rec_offset;
1851         int64_t ran_end;
1852         int64_t tmp64;
1853         int error;
1854         int boff;
1855         int roff;
1856         int n;
1857
1858         bio = ap->a_bio;
1859         bp = bio->bio_buf;
1860         ip = ap->a_vp->v_data;
1861
1862         /*
1863          * The zone-2 disk offset may have been set by the cluster code via
1864          * a BMAP operation.  Take care not to confuse it with the bio_offset
1865          * set by hammer_io_direct_write(), which is a device-relative offset.
1866          *
1867          * Checking the high bits should suffice.
1868          */
1869         nbio = push_bio(bio);
1870         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
1871             HAMMER_ZONE_RAW_BUFFER) {
1872                 error = hammer_io_direct_read(ip->hmp, nbio->bio_offset, bio);
1873                 return (error);
1874         }
1875
1876         /*
1877          * Hard way
1878          */
1879         hammer_simple_transaction(&trans, ip->hmp);
1880         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1881
1882         /*
1883          * Key range (begin and end inclusive) to scan.  Note that the key's
1884          * stored in the actual records represent BASE+LEN, not BASE.  The
1885          * first record containing bio_offset will have a key > bio_offset.
1886          */
1887         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
1888         cursor.key_beg.obj_id = ip->obj_id;
1889         cursor.key_beg.create_tid = 0;
1890         cursor.key_beg.delete_tid = 0;
1891         cursor.key_beg.obj_type = 0;
1892         cursor.key_beg.key = bio->bio_offset + 1;
1893         cursor.asof = ip->obj_asof;
1894         cursor.flags |= HAMMER_CURSOR_ASOF;
1895
1896         cursor.key_end = cursor.key_beg;
1897         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
1898 #if 0
1899         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1900                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1901                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1902                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1903         } else
1904 #endif
1905         {
1906                 ran_end = bio->bio_offset + bp->b_bufsize;
1907                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1908                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1909                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1910                 if (tmp64 < ran_end)
1911                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1912                 else
1913                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1914         }
1915         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1916
1917         error = hammer_ip_first(&cursor);
1918         boff = 0;
1919
1920         while (error == 0) {
1921                 /*
1922                  * Get the base file offset of the record.  The key for
1923                  * data records is (base + bytes) rather then (base).
1924                  */
1925                 base = &cursor.leaf->base;
1926                 rec_offset = base->key - cursor.leaf->data_len;
1927
1928                 /*
1929                  * Calculate the gap, if any, and zero-fill it.
1930                  *
1931                  * n is the offset of the start of the record verses our
1932                  * current seek offset in the bio.
1933                  */
1934                 n = (int)(rec_offset - (bio->bio_offset + boff));
1935                 if (n > 0) {
1936                         if (n > bp->b_bufsize - boff)
1937                                 n = bp->b_bufsize - boff;
1938                         bzero((char *)bp->b_data + boff, n);
1939                         boff += n;
1940                         n = 0;
1941                 }
1942
1943                 /*
1944                  * Calculate the data offset in the record and the number
1945                  * of bytes we can copy.
1946                  *
1947                  * There are two degenerate cases.  First, boff may already
1948                  * be at bp->b_bufsize.  Secondly, the data offset within
1949                  * the record may exceed the record's size.
1950                  */
1951                 roff = -n;
1952                 rec_offset += roff;
1953                 n = cursor.leaf->data_len - roff;
1954                 if (n <= 0) {
1955                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
1956                         n = 0;
1957                 } else if (n > bp->b_bufsize - boff) {
1958                         n = bp->b_bufsize - boff;
1959                 }
1960
1961                 /*
1962                  * Deal with cached truncations.  This cool bit of code
1963                  * allows truncate()/ftruncate() to avoid having to sync
1964                  * the file.
1965                  *
1966                  * If the frontend is truncated then all backend records are
1967                  * subject to the frontend's truncation.
1968                  *
1969                  * If the backend is truncated then backend records on-disk
1970                  * (but not in-memory) are subject to the backend's
1971                  * truncation.  In-memory records owned by the backend
1972                  * represent data written after the truncation point on the
1973                  * backend and must not be truncated.
1974                  *
1975                  * Truncate operations deal with frontend buffer cache
1976                  * buffers and frontend-owned in-memory records synchronously.
1977                  */
1978                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
1979                         if (hammer_cursor_ondisk(&cursor) ||
1980                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
1981                                 if (ip->trunc_off <= rec_offset)
1982                                         n = 0;
1983                                 else if (ip->trunc_off < rec_offset + n)
1984                                         n = (int)(ip->trunc_off - rec_offset);
1985                         }
1986                 }
1987                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
1988                         if (hammer_cursor_ondisk(&cursor)) {
1989                                 if (ip->sync_trunc_off <= rec_offset)
1990                                         n = 0;
1991                                 else if (ip->sync_trunc_off < rec_offset + n)
1992                                         n = (int)(ip->sync_trunc_off - rec_offset);
1993                         }
1994                 }
1995
1996                 /*
1997                  * Try to issue a direct read into our bio if possible,
1998                  * otherwise resolve the element data into a hammer_buffer
1999                  * and copy.
2000                  */
2001                 if (n && boff == 0 &&
2002                     ((cursor.leaf->data_offset + roff) & HAMMER_BUFMASK) == 0) {
2003                         error = hammer_io_direct_read(
2004                                         trans.hmp,
2005                                         cursor.leaf->data_offset + roff,
2006                                         bio);
2007                         goto done;
2008                 } else if (n) {
2009                         error = hammer_ip_resolve_data(&cursor);
2010                         if (error == 0) {
2011                                 bcopy((char *)cursor.data + roff,
2012                                       (char *)bp->b_data + boff, n);
2013                         }
2014                 }
2015                 if (error)
2016                         break;
2017
2018                 /*
2019                  * Iterate until we have filled the request.
2020                  */
2021                 boff += n;
2022                 if (boff == bp->b_bufsize)
2023                         break;
2024                 error = hammer_ip_next(&cursor);
2025         }
2026
2027         /*
2028          * There may have been a gap after the last record
2029          */
2030         if (error == ENOENT)
2031                 error = 0;
2032         if (error == 0 && boff != bp->b_bufsize) {
2033                 KKASSERT(boff < bp->b_bufsize);
2034                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2035                 /* boff = bp->b_bufsize; */
2036         }
2037         bp->b_resid = 0;
2038         bp->b_error = error;
2039         if (error)
2040                 bp->b_flags |= B_ERROR;
2041         biodone(ap->a_bio);
2042
2043 done:
2044         if (cursor.node)
2045                 hammer_cache_node(cursor.node, &ip->cache[1]);
2046         hammer_done_cursor(&cursor);
2047         hammer_done_transaction(&trans);
2048         return(error);
2049 }
2050
2051 /*
2052  * BMAP operation - used to support cluster_read() only.
2053  *
2054  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2055  *
2056  * This routine may return EOPNOTSUPP if the opration is not supported for
2057  * the specified offset.  The contents of the pointer arguments do not
2058  * need to be initialized in that case. 
2059  *
2060  * If a disk address is available and properly aligned return 0 with 
2061  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2062  * to the run-length relative to that offset.  Callers may assume that
2063  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2064  * large, so return EOPNOTSUPP if it is not sufficiently large.
2065  */
2066 static
2067 int
2068 hammer_vop_bmap(struct vop_bmap_args *ap)
2069 {
2070         struct hammer_transaction trans;
2071         struct hammer_inode *ip;
2072         struct hammer_cursor cursor;
2073         hammer_base_elm_t base;
2074         int64_t rec_offset;
2075         int64_t ran_end;
2076         int64_t tmp64;
2077         int64_t base_offset;
2078         int64_t base_disk_offset;
2079         int64_t last_offset;
2080         hammer_off_t last_disk_offset;
2081         hammer_off_t disk_offset;
2082         int     rec_len;
2083         int     error;
2084
2085         ip = ap->a_vp->v_data;
2086
2087         /*
2088          * We can only BMAP regular files.  We can't BMAP database files,
2089          * directories, etc.
2090          */
2091         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2092                 return(EOPNOTSUPP);
2093
2094         /*
2095          * bmap is typically called with runp/runb both NULL when used
2096          * for writing.  We do not support BMAP for writing atm.
2097          */
2098         if (ap->a_runp == NULL && ap->a_runb == NULL)
2099                 return(EOPNOTSUPP);
2100
2101         /*
2102          * Scan the B-Tree to acquire blockmap addresses, then translate
2103          * to raw addresses.
2104          */
2105         hammer_simple_transaction(&trans, ip->hmp);
2106         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2107
2108         /*
2109          * Key range (begin and end inclusive) to scan.  Note that the key's
2110          * stored in the actual records represent BASE+LEN, not BASE.  The
2111          * first record containing bio_offset will have a key > bio_offset.
2112          */
2113         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
2114         cursor.key_beg.obj_id = ip->obj_id;
2115         cursor.key_beg.create_tid = 0;
2116         cursor.key_beg.delete_tid = 0;
2117         cursor.key_beg.obj_type = 0;
2118         if (ap->a_runb)
2119                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2120         else
2121                 cursor.key_beg.key = ap->a_loffset + 1;
2122         if (cursor.key_beg.key < 0)
2123                 cursor.key_beg.key = 0;
2124         cursor.asof = ip->obj_asof;
2125         cursor.flags |= HAMMER_CURSOR_ASOF;
2126
2127         cursor.key_end = cursor.key_beg;
2128         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2129
2130         ran_end = ap->a_loffset + MAXPHYS;
2131         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2132         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2133         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2134         if (tmp64 < ran_end)
2135                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2136         else
2137                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2138
2139         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2140
2141         error = hammer_ip_first(&cursor);
2142         base_offset = last_offset = 0;
2143         base_disk_offset = last_disk_offset = 0;
2144
2145         while (error == 0) {
2146                 /*
2147                  * Get the base file offset of the record.  The key for
2148                  * data records is (base + bytes) rather then (base).
2149                  */
2150                 base = &cursor.leaf->base;
2151                 rec_offset = base->key - cursor.leaf->data_len;
2152                 rec_len    = cursor.leaf->data_len;
2153
2154                 /*
2155                  * Incorporate any cached truncation
2156                  */
2157                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2158                         if (hammer_cursor_ondisk(&cursor) ||
2159                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2160                                 if (ip->trunc_off <= rec_offset)
2161                                         rec_len = 0;
2162                                 else if (ip->trunc_off < rec_offset + rec_len)
2163                                         rec_len = (int)(ip->trunc_off - rec_offset);
2164                         }
2165                 }
2166                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2167                         if (hammer_cursor_ondisk(&cursor)) {
2168                                 if (ip->sync_trunc_off <= rec_offset)
2169                                         rec_len = 0;
2170                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2171                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2172                         }
2173                 }
2174
2175                 /*
2176                  * Accumulate information.  If we have hit a discontiguous
2177                  * block reset base_offset unless we are already beyond the
2178                  * requested offset.  If we are, that's it, we stop.
2179                  */
2180                 disk_offset = hammer_blockmap_lookup(trans.hmp,
2181                                                      cursor.leaf->data_offset,
2182                                                      &error);
2183                 if (error)
2184                         break;
2185                 if (rec_offset != last_offset ||
2186                     disk_offset != last_disk_offset) {
2187                         if (rec_offset > ap->a_loffset)
2188                                 break;
2189                         base_offset = rec_offset;
2190                         base_disk_offset = disk_offset;
2191                 }
2192                 last_offset = rec_offset + rec_len;
2193                 last_disk_offset = disk_offset + rec_len;
2194
2195                 error = hammer_ip_next(&cursor);
2196         }
2197
2198 #if 0
2199         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2200                 ap->a_loffset, base_offset, last_offset);
2201         kprintf("BMAP %16s:  %016llx - %016llx\n",
2202                 "", base_disk_offset, last_disk_offset);
2203 #endif
2204
2205         if (cursor.node)
2206                 hammer_cache_node(cursor.node, &ip->cache[1]);
2207         hammer_done_cursor(&cursor);
2208         hammer_done_transaction(&trans);
2209
2210         if (base_offset == 0 || base_offset > ap->a_loffset ||
2211             last_offset < ap->a_loffset) {
2212                 error = EOPNOTSUPP;
2213         } else {
2214                 disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2215
2216                 /*
2217                  * If doffsetp is not aligned or the forward run size does
2218                  * not cover a whole buffer, disallow the direct I/O.
2219                  */
2220                 if ((disk_offset & HAMMER_BUFMASK) ||
2221                     (last_offset - ap->a_loffset) < HAMMER_BUFSIZE) {
2222                         error = EOPNOTSUPP;
2223                 } else {
2224                         *ap->a_doffsetp = disk_offset;
2225                         if (ap->a_runb)
2226                                 *ap->a_runb = ap->a_loffset - base_offset;
2227                         if (ap->a_runp)
2228                                 *ap->a_runp = last_offset - ap->a_loffset;
2229                         error = 0;
2230                 }
2231         }
2232         return(error);
2233 }
2234
2235 /*
2236  * Write to a regular file.   Because this is a strategy call the OS is
2237  * trying to actually sync data to the media.   HAMMER can only flush
2238  * the entire inode (so the TID remains properly synchronized).
2239  *
2240  * Basically all we do here is place the bio on the inode's flush queue
2241  * and activate the flusher.
2242  */
2243 static
2244 int
2245 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2246 {
2247         hammer_record_t record;
2248         hammer_mount_t hmp;
2249         hammer_inode_t ip;
2250         struct bio *bio;
2251         struct buf *bp;
2252         int bytes;
2253         int error;
2254
2255         bio = ap->a_bio;
2256         bp = bio->bio_buf;
2257         ip = ap->a_vp->v_data;
2258         hmp = ip->hmp;
2259
2260         if (ip->flags & HAMMER_INODE_RO) {
2261                 bp->b_error = EROFS;
2262                 bp->b_flags |= B_ERROR;
2263                 biodone(ap->a_bio);
2264                 hammer_cleanup_write_io(ip);
2265                 return(EROFS);
2266         }
2267
2268         /*
2269          * Interlock with inode destruction (no in-kernel or directory
2270          * topology visibility).  If we queue new IO while trying to
2271          * destroy the inode we can deadlock the vtrunc call in
2272          * hammer_inode_unloadable_check().
2273          */
2274         if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2275                 bp->b_resid = 0;
2276                 biodone(ap->a_bio);
2277                 hammer_cleanup_write_io(ip);
2278                 return(0);
2279         }
2280
2281         /*
2282          * Reserve space and issue a direct-write from the front-end. 
2283          * NOTE: The direct_io code will hammer_bread/bcopy smaller
2284          * allocations.
2285          *
2286          * An in-memory record will be installed to reference the storage
2287          * until the flusher can get to it.
2288          *
2289          * Since we own the high level bio the front-end will not try to
2290          * do a direct-read until the write completes.
2291          *
2292          * NOTE: The only time we do not reserve a full-sized buffers
2293          * worth of data is if the file is small.  We do not try to
2294          * allocate a fragment (from the small-data zone) at the end of
2295          * an otherwise large file as this can lead to wildly separated
2296          * data.
2297          */
2298         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2299         KKASSERT(bio->bio_offset < ip->ino_data.size);
2300         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2301                 bytes = (bp->b_bufsize + HAMMER_BUFMASK) & ~HAMMER_BUFMASK;
2302         else
2303                 bytes = ((int)ip->ino_data.size + 15) & ~15;
2304
2305         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2306                                     bytes, &error);
2307         if (record) {
2308                 hammer_io_direct_write(hmp, &record->leaf, bio);
2309                 hammer_rel_mem_record(record);
2310                 if (hmp->rsv_recs > hammer_limit_recs &&
2311                     ip->rsv_recs > hammer_limit_irecs / 10) {
2312                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2313                 } else if (ip->rsv_recs > hammer_limit_irecs / 2) {
2314                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2315                 }
2316         } else {
2317                 bp->b_bio2.bio_offset = NOOFFSET;
2318                 bp->b_error = error;
2319                 bp->b_flags |= B_ERROR;
2320                 biodone(ap->a_bio);
2321         }
2322         hammer_cleanup_write_io(ip);
2323         return(error);
2324 }
2325
2326 /*
2327  * Clean-up after disposing of a dirty frontend buffer's data.
2328  * This is somewhat heuristical so try to be robust.
2329  */
2330 static void
2331 hammer_cleanup_write_io(hammer_inode_t ip)
2332 {
2333         if (ip->rsv_databufs) {
2334                 --ip->rsv_databufs;
2335                 --ip->hmp->rsv_databufs;
2336         }
2337 }
2338
2339 /*
2340  * We can lose track of dirty buffer cache buffers if we truncate, this
2341  * routine will resynchronize the count.
2342  */
2343 static
2344 void
2345 hammer_update_rsv_databufs(hammer_inode_t ip)
2346 {
2347         struct buf *bp;
2348         int delta;
2349         int n;
2350
2351         if (ip->vp) {
2352                 n = 0;
2353                 RB_FOREACH(bp, buf_rb_tree, &ip->vp->v_rbdirty_tree) {
2354                         ++n;
2355                 }
2356         } else {
2357                 n = 0;
2358         }
2359         delta = n - ip->rsv_databufs;
2360         ip->rsv_databufs += delta;
2361         ip->hmp->rsv_databufs += delta;
2362 }
2363
2364 /*
2365  * dounlink - disconnect a directory entry
2366  *
2367  * XXX whiteout support not really in yet
2368  */
2369 static int
2370 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2371                 struct vnode *dvp, struct ucred *cred, int flags)
2372 {
2373         struct namecache *ncp;
2374         hammer_inode_t dip;
2375         hammer_inode_t ip;
2376         struct hammer_cursor cursor;
2377         int64_t namekey;
2378         int nlen, error;
2379
2380         /*
2381          * Calculate the namekey and setup the key range for the scan.  This
2382          * works kinda like a chained hash table where the lower 32 bits
2383          * of the namekey synthesize the chain.
2384          *
2385          * The key range is inclusive of both key_beg and key_end.
2386          */
2387         dip = VTOI(dvp);
2388         ncp = nch->ncp;
2389
2390         if (dip->flags & HAMMER_INODE_RO)
2391                 return (EROFS);
2392
2393         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2394 retry:
2395         hammer_init_cursor(trans, &cursor, &dip->cache[0], dip);
2396         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
2397         cursor.key_beg.obj_id = dip->obj_id;
2398         cursor.key_beg.key = namekey;
2399         cursor.key_beg.create_tid = 0;
2400         cursor.key_beg.delete_tid = 0;
2401         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2402         cursor.key_beg.obj_type = 0;
2403
2404         cursor.key_end = cursor.key_beg;
2405         cursor.key_end.key |= 0xFFFFFFFFULL;
2406         cursor.asof = dip->obj_asof;
2407         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2408
2409         /*
2410          * Scan all matching records (the chain), locate the one matching
2411          * the requested path component.  info->last_error contains the
2412          * error code on search termination and could be 0, ENOENT, or
2413          * something else.
2414          *
2415          * The hammer_ip_*() functions merge in-memory records with on-disk
2416          * records for the purposes of the search.
2417          */
2418         error = hammer_ip_first(&cursor);
2419
2420         while (error == 0) {
2421                 error = hammer_ip_resolve_data(&cursor);
2422                 if (error)
2423                         break;
2424                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2425                 KKASSERT(nlen > 0);
2426                 if (ncp->nc_nlen == nlen &&
2427                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2428                         break;
2429                 }
2430                 error = hammer_ip_next(&cursor);
2431         }
2432
2433         /*
2434          * If all is ok we have to get the inode so we can adjust nlinks.
2435          * To avoid a deadlock with the flusher we must release the inode
2436          * lock on the directory when acquiring the inode for the entry.
2437          *
2438          * If the target is a directory, it must be empty.
2439          */
2440         if (error == 0) {
2441                 hammer_unlock(&cursor.ip->lock);
2442                 ip = hammer_get_inode(trans, &dip->cache[1],
2443                                       cursor.data->entry.obj_id,
2444                                       dip->hmp->asof, 0, &error);
2445                 hammer_lock_sh(&cursor.ip->lock);
2446                 if (error == ENOENT) {
2447                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2448                         Debugger("ENOENT unlinking object that should exist");
2449                 }
2450
2451                 /*
2452                  * If we are trying to remove a directory the directory must
2453                  * be empty.
2454                  *
2455                  * WARNING: hammer_ip_check_directory_empty() may have to
2456                  * terminate the cursor to avoid a deadlock.  It is ok to
2457                  * call hammer_done_cursor() twice.
2458                  */
2459                 if (error == 0 && ip->ino_data.obj_type ==
2460                                   HAMMER_OBJTYPE_DIRECTORY) {
2461                         error = hammer_ip_check_directory_empty(trans, ip);
2462                 }
2463
2464                 /*
2465                  * Delete the directory entry.
2466                  *
2467                  * WARNING: hammer_ip_del_directory() may have to terminate
2468                  * the cursor to avoid a deadlock.  It is ok to call
2469                  * hammer_done_cursor() twice.
2470                  */
2471                 if (error == 0) {
2472                         error = hammer_ip_del_directory(trans, &cursor,
2473                                                         dip, ip);
2474                 }
2475                 hammer_done_cursor(&cursor);
2476                 if (error == 0) {
2477                         cache_setunresolved(nch);
2478                         cache_setvp(nch, NULL);
2479                         /* XXX locking */
2480                         if (ip->vp)
2481                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2482                 }
2483                 if (ip)
2484                         hammer_rel_inode(ip, 0);
2485         } else {
2486                 hammer_done_cursor(&cursor);
2487         }
2488         if (error == EDEADLK)
2489                 goto retry;
2490
2491         return (error);
2492 }
2493
2494 /************************************************************************
2495  *                          FIFO AND SPECFS OPS                         *
2496  ************************************************************************
2497  *
2498  */
2499
2500 static int
2501 hammer_vop_fifoclose (struct vop_close_args *ap)
2502 {
2503         /* XXX update itimes */
2504         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2505 }
2506
2507 static int
2508 hammer_vop_fiforead (struct vop_read_args *ap)
2509 {
2510         int error;
2511
2512         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2513         /* XXX update access time */
2514         return (error);
2515 }
2516
2517 static int
2518 hammer_vop_fifowrite (struct vop_write_args *ap)
2519 {
2520         int error;
2521
2522         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2523         /* XXX update access time */
2524         return (error);
2525 }
2526
2527 static int
2528 hammer_vop_specclose (struct vop_close_args *ap)
2529 {
2530         /* XXX update itimes */
2531         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2532 }
2533
2534 static int
2535 hammer_vop_specread (struct vop_read_args *ap)
2536 {
2537         /* XXX update access time */
2538         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2539 }
2540
2541 static int
2542 hammer_vop_specwrite (struct vop_write_args *ap)
2543 {
2544         /* XXX update last change time */
2545         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2546 }
2547