/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.67 2008/06/12 00:16:10 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>
#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         hammer_vop_pathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl
};

struct vop_ops hammer_spec_vops = {
        .vop_default =          spec_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_specread,
        .vop_write =            hammer_vop_specwrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_specclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred, int flags);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
static void hammer_cleanup_write_io(hammer_inode_t ip);
static void hammer_update_rsv_databufs(hammer_inode_t ip);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (ap->a_waitfor == MNT_WAIT)
                hammer_wait_inode(ip);
        return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
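        /*
         * The upper 16 bits of a_ioflag carry the caller's sequential
         * access heuristic (the same scale as the IO_SEQMAX test in
         * hammer_vop_write()); it is fed to cluster_read() below.
         */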
        seqcount = ap->a_ioflag >> 16;

        hammer_start_transaction(&trans, ip->hmp);

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        uio = ap->a_uio;
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                offset = uio->uio_offset & HAMMER_BUFMASK;
                if (hammer_debug_cluster_enable) {
                        error = cluster_read(ap->a_vp, ip->ino_data.size,
                                             uio->uio_offset - offset,
                                             HAMMER_BUFSIZE,
                                             MAXBSIZE, seqcount, &bp);
                } else {
                        error = bread(ap->a_vp, uio->uio_offset - offset,
                                      HAMMER_BUFSIZE, &bp);
                }
                if (error) {
                        brelse(bp);
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                n = HAMMER_BUFSIZE - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);
                bqrelse(bp);
                if (error)
                        break;
        }
        if ((ip->flags & HAMMER_INODE_RO) == 0 &&
            (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                ip->ino_leaf.atime = trans.time;
                hammer_modify_inode(ip, HAMMER_INODE_ITIMES);
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        struct uio *uio;
        int rel_offset;
        off_t base_offset;
        struct buf *bp;
        int error;
        int n;
        int flags;
        int count;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, ip->hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_offset assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        count = 0;
        while (uio->uio_resid > 0) {
                int fixsize = 0;

                if ((error = hammer_checkspace(trans.hmp)) != 0)
                        break;

                /*
                 * Do not allow HAMMER to blow out the buffer cache.
                 *
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.  Records are decoupled
                 * from the buffer cache.
                 *
                 * Always check at the beginning so separate writes are
                 * not able to bypass this code.
                 *
                 * WARNING: Cannot unlock vp when doing a NOCOPY write as
                 * part of a putpages operation.  Doing so could cause us
                 * to deadlock against the VM system when we try to re-lock.
                 */
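                /*
                 * (count++ & 15) == 0 limits the checks below to every
                 * 16th buffer so the common write path stays cheap.
                 */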
                if ((count++ & 15) == 0) {
                        if (uio->uio_segflg != UIO_NOCOPY) {
                                vn_unlock(ap->a_vp);
                                if ((ap->a_ioflag & IO_NOBWILL) == 0)
                                        bwillwrite();
                        }
                        if (ip->rsv_recs > hammer_limit_irecs)
                                hammer_wait_inode_recs(ip);
                        if (uio->uio_segflg != UIO_NOCOPY)
                                vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY);
                }

                rel_offset = (int)(uio->uio_offset & HAMMER_BUFMASK);
                base_offset = uio->uio_offset & ~HAMMER_BUFMASK64;
                n = HAMMER_BUFSIZE - rel_offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (uio->uio_offset + n > ip->ino_data.size) {
                        vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
                        fixsize = 1;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              HAMMER_BUFSIZE, &bp);
                        }
                } else if (rel_offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset,
                                      HAMMER_BUFSIZE, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        error = uiomove((char *)bp->b_data + rel_offset,
                                        n, uio);
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                vtruncbuf(ap->a_vp, ip->ino_data.size,
                                          HAMMER_BUFSIZE);
                        }
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_DDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
                flags |= HAMMER_INODE_DDIRTY;   /* XXX mtime */
                hammer_modify_inode(ip, flags);

                /*
                 * Try to keep track of cached dirty data.
                 */
                if ((bp->b_flags & B_DIRTY) == 0) {
                        ++ip->rsv_databufs;
                        ++ip->hmp->rsv_databufs;
                }

                /*
                 * Final buffer disposition.
                 */
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
#if 1
                } else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
                           (uio->uio_offset & HAMMER_BUFMASK) == 0) {
                        /*
                         * If seqcount indicates sequential operation and
                         * we just finished filling a buffer, push it out
                         * now to prevent the buffer cache from becoming
                         * too full, which would trigger non-optimal
                         * flushes.
                         */
                        bawrite(bp);
#endif
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);

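        /*
         * A v_opencount of 1 means this is the last close of the vnode,
         * so give the inode reclaim backlog a chance to drain before
         * the close completes.
         */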
        if (ap->a_vp->v_opencount == 1)
                hammer_inode_waitreclaims(ip);

        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */

        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

#if 0
        if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
            (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
            ip->obj_asof == XXX
        ) {
                /* LAZYMOD XXX */
        }
        hammer_itimes(ap->a_vp);
#endif

        vap->va_fsid = ip->hmp->fsid_udev;
        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;
        if (ip->flags & HAMMER_INODE_RO)
                hammer_to_timespec(ip->ino_data.mtime, &vap->va_atime);
        else
                hammer_to_timespec(ip->ino_leaf.atime, &vap->va_atime);
        hammer_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
        vap->va_bytes = (ip->ino_data.size + 63) & ~63;
        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        /* mtime uniquely identifies any adjustments made to the file */
        vap->va_fsmid = ip->ino_data.mtime;
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }

        return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        u_int64_t obj_id;

        /*
         * Misc initialization, plus handle as-of name extensions.  Look for
         * the '@@' extension.  Note that as-of files and directories cannot
         * be modified.
         */
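        /*
         * For example (assumed syntax): resolving "foo@@0x00000001061a8ba0"
         * looks up "foo" as-of that transaction id; hammer_str_to_tid()
         * parses the hex string and the resulting inode is forced
         * read-only.
         */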
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        nlen = ncp->nc_nlen;
        flags = dip->flags;

        hammer_simple_transaction(&trans, dip->hmp);

        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        asof = hammer_str_to_tid(ncp->nc_name + i + 2);
                        flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;

        /*
         * If there is no path component the time extension is relative to
         * dip.
         */
        if (nlen == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id,
                                      asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
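        /*
         * Sketch of the assumed key layout: the name hash occupies the
         * upper 32 bits of the key and the low 32 bits iterate
         * collisions, so scanning [namekey, namekey | 0xFFFFFFFF]
         * visits every entry whose name hashes to the same value.
         */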
        namekey = hammer_directory_namekey(ncp->nc_name, nlen);

        error = hammer_init_cursor(&trans, &cursor, &dip->cache[0], dip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
        cursor.asof = asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        obj_id = 0;

        if (error == 0) {
                error = hammer_ip_first(&cursor);
                while (error == 0) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error)
                                break;
                        if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
                            bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                                obj_id = cursor.data->entry.obj_id;
                                break;
                        }
                        error = hammer_ip_next(&cursor);
                }
        }
        hammer_done_cursor(&cursor);
        if (error == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1],
                                      obj_id, asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
        } else if (error == ENOENT) {
                cache_setvp(ap->a_nch, NULL);
        }
done:
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.  I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        int64_t parent_obj_id;
        hammer_tid_t asof;
        int error;

        dip = VTOI(ap->a_dvp);
        asof = dip->obj_asof;
        parent_obj_id = dip->ino_data.parent_obj_id;

        if (parent_obj_id == 0) {
                if (dip->obj_id == HAMMER_OBJID_ROOT &&
                   asof != dip->hmp->asof) {
                        parent_obj_id = dip->obj_id;
                        asof = dip->hmp->asof;
                        *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
                        ksnprintf(*ap->a_fakename, 19, "0x%016llx",
                                   dip->obj_asof);
                } else {
                        *ap->a_vpp = NULL;
                        return ENOENT;
                }
        }

        hammer_simple_transaction(&trans, dip->hmp);

        ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id,
                              asof, dip->flags, &error);
        if (ip) {
                error = hammer_get_vnode(ip, ap->a_vpp);
                hammer_rel_inode(ip, 0);
        } else {
                *ap->a_vpp = NULL;
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        ip = VTOI(ap->a_vp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Add the filesystem object to the directory.  Note that neither
         * dip nor ip are referenced or locked, but their vnodes are
         * referenced.  This function will bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);

        /*
         * Finish up.
         */
        if (error == 0) {
                cache_setunresolved(nch);
                cache_setvp(nch, ap->a_vp);
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hkprintf("hammer_mkdir error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }
        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        if (error)
                hkprintf("hammer_mkdir (add) error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
        hammer_inode_t ip;

        ip = VTOI(ap->a_vp);

        if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
                return (EROFS);
        return(vop_stdopen(ap));
}

/*
 * hammer_vop_pathconf { vp, name, retval }
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
        return EOPNOTSUPP;
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
        return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        struct uio *uio;
        hammer_base_elm_t base;
        int error;
        int cookie_index;
        int ncookies;
        off_t *cookies;
        off_t saveoff;
        int r;

        ip = VTOI(ap->a_vp);
        uio = ap->a_uio;
        saveoff = uio->uio_offset;

        if (ap->a_ncookies) {
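                /*
                 * Estimate one cookie per 16 bytes of uio space, which
                 * is roughly the minimum size of a returned dirent,
                 * and cap the allocation at 1024 cookies.
                 */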
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
                cookie_index = 0;
        } else {
                ncookies = -1;
                cookies = NULL;
                cookie_index = 0;
        }

        hammer_simple_transaction(&trans, ip->hmp);

        /*
         * Handle artificial entries
         */
        error = 0;
        if (saveoff == 0) {
                r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }
        if (saveoff == 1) {
                if (ip->ino_data.parent_obj_id) {
                        r = vop_write_dirent(&error, uio,
                                             ip->ino_data.parent_obj_id,
                                             DT_DIR, 2, "..");
                } else {
                        r = vop_write_dirent(&error, uio,
                                             ip->obj_id, DT_DIR, 2, "..");
                }
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        /*
         * Key range (begin and end inclusive) to scan.  Directory keys
         * directly translate to a 64 bit 'seek' position.
         */
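        /*
         * uio_offset is therefore the directory key itself.  Offsets 0
         * and 1 are reserved for the artificial "." and ".." entries
         * emitted above; real entries resume the scan from saveoff.
         */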
        hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = saveoff;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key = HAMMER_MAX_KEY;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        error = hammer_ip_first(&cursor);

        while (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error)
                        break;
                base = &cursor.leaf->base;
                saveoff = base->key;
                KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

                if (base->obj_id != ip->obj_id)
                        panic("readdir: bad record at %p", cursor.node);

                r = vop_write_dirent(
                             &error, uio, cursor.data->entry.obj_id,
                             hammer_get_dtype(cursor.leaf->base.obj_type),
                             cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
                             (void *)cursor.data->entry.name);
                if (r)
                        break;
                ++saveoff;
                if (cookies)
                        cookies[cookie_index] = base->key;
                ++cookie_index;
                if (cookie_index == ncookies)
                        break;
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);

done:
        hammer_done_transaction(&trans);

        if (ap->a_eofflag)
                *ap->a_eofflag = (error == ENOENT);
        uio->uio_offset = saveoff;
        if (error && cookie_index == 0) {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        int error;

        ip = VTOI(ap->a_vp);

        /*
         * Shortcut if the symlink data was stuffed into ino_data: targets
         * of HAMMER_INODE_BASESYMLEN bytes or less are stored inline in
         * the inode and need no B-Tree lookup.
         */
        if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
                error = uiomove(ip->ino_data.ext.symlink,
                                ip->ino_data.size, ap->a_uio);
                return(error);
        }

        /*
         * Long version
         */
        hammer_simple_transaction(&trans, ip->hmp);
        hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);

        /*
         * Key range (begin and end inclusive) to scan.  Directory keys
         * directly translate to a 64 bit 'seek' position.
         */
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; /* XXX */
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        error = hammer_ip_lookup(&cursor);
        if (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error == 0) {
                        KKASSERT(cursor.leaf->data_len >=
                                 HAMMER_SYMLINK_NAME_OFF);
                        error = uiomove(cursor.data->symlink.name,
                                        cursor.leaf->data_len -
                                                HAMMER_SYMLINK_NAME_OFF,
                                        ap->a_uio);
                }
        }
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
        return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        int error;

        dip = VTOI(ap->a_dvp);

        if (hammer_nohistory(dip) == 0 &&
            (error = hammer_checkspace(dip->hmp)) != 0) {
                return (error);
        }

        hammer_start_transaction(&trans, dip->hmp);
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
        hammer_done_transaction(&trans);

        return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *fncp;
        struct namecache *tncp;
        struct hammer_inode *fdip;
        struct hammer_inode *tdip;
        struct hammer_inode *ip;
        struct hammer_cursor cursor;
        int64_t namekey;
        int nlen, error;

        fdip = VTOI(ap->a_fdvp);
        tdip = VTOI(ap->a_tdvp);
        fncp = ap->a_fnch->ncp;
        tncp = ap->a_tnch->ncp;
        ip = VTOI(fncp->nc_vp);
        KKASSERT(ip != NULL);

        if (fdip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (tdip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(fdip->hmp)) != 0)
                return (error);

        hammer_start_transaction(&trans, fdip->hmp);

        /*
         * Remove tncp from the target directory and then link ip as
         * tncp. XXX pass trans to dounlink
         *
         * Force the inode sync-time to match the transaction so it is
         * in-sync with the creation of the target directory entry.
         */
        error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
        if (error == 0 || error == ENOENT) {
                error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
                if (error == 0) {
                        ip->ino_data.parent_obj_id = tdip->obj_id;
                        hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
                }
        }
        if (error)
                goto failed; /* XXX */

        /*
         * Locate the record in the originating directory and remove it.
         *
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
        hammer_init_cursor(&trans, &cursor, &fdip->cache[0], fdip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = fdip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
        cursor.asof = fdip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        error = hammer_ip_first(&cursor);
        while (error == 0) {
                if (hammer_ip_resolve_data(&cursor) != 0)
                        break;
                nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
                KKASSERT(nlen > 0);
                if (fncp->nc_nlen == nlen &&
                    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                        break;
                }
                error = hammer_ip_next(&cursor);
        }

        /*
         * If all is ok we have to get the inode so we can adjust nlinks.
         *
         * WARNING: hammer_ip_del_directory() may have to terminate the
         * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
         * twice.
         */
        if (error == 0)
                error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

        /*
         * XXX A deadlock here will break rename's atomicity for the purposes
         * of crash recovery.
         */
        if (error == EDEADLK) {
                hammer_done_cursor(&cursor);
                goto retry;
        }

        /*
         * Cleanup and tell the kernel that the rename succeeded.
         */
        hammer_done_cursor(&cursor);
        if (error == 0)
                cache_rename(ap->a_fnch, ap->a_tnch);

failed:
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        int error;

        dip = VTOI(ap->a_dvp);

        if (hammer_nohistory(dip) == 0 &&
            (error = hammer_checkspace(dip->hmp)) != 0) {
                return (error);
        }

        hammer_start_transaction(&trans, dip->hmp);
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
        hammer_done_transaction(&trans);

        return (error);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
        struct hammer_transaction trans;
        struct vattr *vap;
        struct hammer_inode *ip;
        int modflags;
        int error;
        int truncating;
        off_t aligned_size;
        u_int32_t flags;

        vap = ap->a_vap;
        ip = ap->a_vp->v_data;
        modflags = 0;

        if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
                return(EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (hammer_nohistory(ip) == 0 &&
            (error = hammer_checkspace(ip->hmp)) != 0) {
                return (error);
        }

        hammer_start_transaction(&trans, ip->hmp);
        error = 0;

        if (vap->va_flags != VNOVAL) {
                flags = ip->ino_data.uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                         hammer_to_unix_xid(&ip->ino_data.uid),
                                         ap->a_cred);
                if (error == 0) {
                        if (ip->ino_data.uflags != flags) {
                                ip->ino_data.uflags = flags;
                                modflags |= HAMMER_INODE_DDIRTY;
                        }
                        if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }
        if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }
        if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
                mode_t cur_mode = ip->ino_data.mode;
                uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
                gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
                uuid_t uuid_uid;
                uuid_t uuid_gid;

                error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
                                         ap->a_cred,
                                         &cur_uid, &cur_gid, &cur_mode);
                if (error == 0) {
                        hammer_guid_to_uuid(&uuid_uid, cur_uid);
                        hammer_guid_to_uuid(&uuid_gid, cur_gid);
                        if (bcmp(&uuid_uid, &ip->ino_data.uid,
                                 sizeof(uuid_uid)) ||
                            bcmp(&uuid_gid, &ip->ino_data.gid,
                                 sizeof(uuid_gid)) ||
                            ip->ino_data.mode != cur_mode
                        ) {
                                ip->ino_data.uid = uuid_uid;
                                ip->ino_data.gid = uuid_gid;
                                ip->ino_data.mode = cur_mode;
                        }
                        modflags |= HAMMER_INODE_DDIRTY;
                }
        }
1514         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
1515                 switch(ap->a_vp->v_type) {
1516                 case VREG:
1517                         if (vap->va_size == ip->ino_data.size)
1518                                 break;
1519                         /*
1520                          * XXX break atomicity, we can deadlock the backend
1521                          * if we do not release the lock.  Probably not a
1522                          * big deal here.
1523                          */
1524                         if (vap->va_size < ip->ino_data.size) {
1525                                 vtruncbuf(ap->a_vp, vap->va_size,
1526                                           HAMMER_BUFSIZE);
1527                                 truncating = 1;
1528                         } else {
1529                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1530                                 truncating = 0;
1531                         }
1532                         ip->ino_data.size = vap->va_size;
1533                         modflags |= HAMMER_INODE_DDIRTY;
1534                         aligned_size = (vap->va_size + HAMMER_BUFMASK64) &
1535                                        ~HAMMER_BUFMASK64;
1536
1537                         /*
1538                          * On-media truncation is cached in the inode until
1539                          * the inode is synchronized.
1540                          */
1541                         if (truncating) {
1542                                 hammer_ip_frontend_trunc(ip, vap->va_size);
1543                                 hammer_update_rsv_databufs(ip);
1544 #ifdef DEBUG_TRUNCATE
1545                                 if (HammerTruncIp == NULL)
1546                                         HammerTruncIp = ip;
1547 #endif
1548                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1549                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1550                                         ip->trunc_off = vap->va_size;
1551 #ifdef DEBUG_TRUNCATE
1552                                         if (ip == HammerTruncIp)
1553                                                 kprintf("truncate1 %016llx\n", ip->trunc_off);
1554 #endif
1555                                 } else if (ip->trunc_off > vap->va_size) {
1556                                         ip->trunc_off = vap->va_size;
1557 #ifdef DEBUG_TRUNCATE
1558                                         if (ip == HammerTruncIp)
1559                                                 kprintf("truncate2 %016llx\n", ip->trunc_off);
1560 #endif
1561                                 } else {
1562 #ifdef DEBUG_TRUNCATE
1563                                         if (ip == HammerTruncIp)
1564                                                 kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
1565 #endif
1566                                 }
1567                         }
1568
1569                         /*
1570                          * If truncating we have to clean out a portion of
1571                          * the last block on-disk.  We do this in the
1572                          * front-end buffer cache.
1573                          */
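                             /*
                              * Worked example (editorial sketch, assuming the
                              * usual 16K HAMMER_BUFSIZE): truncating to
                              * va_size 20000 gives aligned_size 32768, so the
                              * code below reads the buffer at file offset
                              * 16384 back in and zeroes bytes [3616, 16384)
                              * within it, i.e. file offsets [20000, 32768).
                              */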
1574                         if (truncating && vap->va_size < aligned_size) {
1575                                 struct buf *bp;
1576                                 int offset;
1577
1578                                 aligned_size -= HAMMER_BUFSIZE;
1579
1580                                 offset = vap->va_size & HAMMER_BUFMASK;
1581                                 error = bread(ap->a_vp, aligned_size,
1582                                               HAMMER_BUFSIZE, &bp);
1583                                 hammer_ip_frontend_trunc(ip, aligned_size);
1584                                 if (error == 0) {
1585                                         bzero(bp->b_data + offset,
1586                                               HAMMER_BUFSIZE - offset);
1587                                         bdwrite(bp);
1588                                 } else {
1589                                         kprintf("hammer_vop_setattr: bread error %d\n", error);
1590                                         brelse(bp);
1591                                 }
1592                         }
1593                         break;
1594                 case VDATABASE:
1595                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1596                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1597                                 ip->trunc_off = vap->va_size;
1598                         } else if (ip->trunc_off > vap->va_size) {
1599                                 ip->trunc_off = vap->va_size;
1600                         }
1601                         hammer_ip_frontend_trunc(ip, vap->va_size);
1602                         ip->ino_data.size = vap->va_size;
1603                         modflags |= HAMMER_INODE_DDIRTY;
1604                         break;
1605                 default:
1606                         error = EINVAL;
1607                         goto done;
1608                 }
1609                 break;
1610         }
1611         if (vap->va_atime.tv_sec != VNOVAL) {
1612                 ip->ino_leaf.atime =
1613                         hammer_timespec_to_transid(&vap->va_atime);
1614                 modflags |= HAMMER_INODE_ITIMES;
1615         }
1616         if (vap->va_mtime.tv_sec != VNOVAL) {
1617                 ip->ino_data.mtime =
1618                         hammer_timespec_to_transid(&vap->va_mtime);
1619                 modflags |= HAMMER_INODE_ITIMES;
1620                 modflags |= HAMMER_INODE_DDIRTY;        /* XXX mtime */
1621         }
1622         if (vap->va_mode != (mode_t)VNOVAL) {
1623                 mode_t   cur_mode = ip->ino_data.mode;
1624                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1625                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1626
1627                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1628                                          cur_uid, cur_gid, &cur_mode);
1629                 if (error == 0 && ip->ino_data.mode != cur_mode) {
1630                         ip->ino_data.mode = cur_mode;
1631                         modflags |= HAMMER_INODE_DDIRTY;
1632                 }
1633         }
1634 done:
1635         if (error == 0)
1636                 hammer_modify_inode(ip, modflags);
1637         hammer_done_transaction(&trans);
1638         return (error);
1639 }
1640
1641 /*
1642  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1643  */
1644 static
1645 int
1646 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1647 {
1648         struct hammer_transaction trans;
1649         struct hammer_inode *dip;
1650         struct hammer_inode *nip;
1651         struct nchandle *nch;
1652         hammer_record_t record;
1653         int error;
1654         int bytes;
1655
1656         ap->a_vap->va_type = VLNK;
1657
1658         nch = ap->a_nch;
1659         dip = VTOI(ap->a_dvp);
1660
1661         if (dip->flags & HAMMER_INODE_RO)
1662                 return (EROFS);
1663         if ((error = hammer_checkspace(dip->hmp)) != 0)
1664                 return (error);
1665
1666         /*
1667          * Create a transaction to cover the operations we perform.
1668          */
1669         hammer_start_transaction(&trans, dip->hmp);
1670
1671         /*
1672          * Create a new filesystem object of the requested type.  The
1673          * returned inode will be referenced but not locked.
1674          */
1675
1676         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
1677         if (error) {
1678                 hammer_done_transaction(&trans);
1679                 *ap->a_vpp = NULL;
1680                 return (error);
1681         }
1682
1683         /*
1684          * Add a record representing the symlink.  symlink stores the link
1685          * as pure data, not a string, and is not \0 terminated.
1686          */
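             /*
              * Summary of the two cases below: a target short enough to
              * fit in the inode's embedded symlink area
              * (HAMMER_INODE_BASESYMLEN bytes) is stored in-line in the
              * inode data, while a longer target is stored as a separate
              * HAMMER_RECTYPE_FIX record keyed by HAMMER_FIXKEY_SYMLINK.
              */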
1687         if (error == 0) {
1688                 bytes = strlen(ap->a_target);
1689
1690                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1691                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1692                 } else {
1693                         record = hammer_alloc_mem_record(nip, bytes);
1694                         record->type = HAMMER_MEM_RECORD_GENERAL;
1695
1696                         record->leaf.base.localization = HAMMER_LOCALIZE_MISC;
1697                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1698                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1699                         record->leaf.data_len = bytes;
1700                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1701                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1702                         error = hammer_ip_add_record(&trans, record);
1703                 }
1704
1705                 /*
1706                  * Set the file size to the length of the link.
1707                  */
1708                 if (error == 0) {
1709                         nip->ino_data.size = bytes;
1710                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1711                 }
1712         }
1713         if (error == 0)
1714                 error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1715
1716         /*
1717          * Finish up.
1718          */
1719         if (error) {
1720                 hammer_rel_inode(nip, 0);
1721                 *ap->a_vpp = NULL;
1722         } else {
1723                 error = hammer_get_vnode(nip, ap->a_vpp);
1724                 hammer_rel_inode(nip, 0);
1725                 if (error == 0) {
1726                         cache_setunresolved(ap->a_nch);
1727                         cache_setvp(ap->a_nch, *ap->a_vpp);
1728                 }
1729         }
1730         hammer_done_transaction(&trans);
1731         return (error);
1732 }
1733
1734 /*
1735  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1736  */
1737 static
1738 int
1739 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1740 {
1741         struct hammer_transaction trans;
1742         struct hammer_inode *dip;
1743         int error;
1744
1745         dip = VTOI(ap->a_dvp);
1746
1747         if (hammer_nohistory(dip) == 0 &&
1748             (error = hammer_checkspace(dip->hmp)) != 0) {
1749                 return (error);
1750         }
1751
1752         hammer_start_transaction(&trans, dip->hmp);
1753         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1754                                 ap->a_cred, ap->a_flags);
1755         hammer_done_transaction(&trans);
1756
1757         return (error);
1758 }
1759
1760 /*
1761  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1762  */
1763 static
1764 int
1765 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1766 {
1767         struct hammer_inode *ip = ap->a_vp->v_data;
1768
1769         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1770                             ap->a_fflag, ap->a_cred));
1771 }
1772
1773 static
1774 int
1775 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1776 {
1777         struct mount *mp;
1778         int error;
1779
1780         mp = ap->a_head.a_ops->head.vv_mount;
1781
1782         switch(ap->a_op) {
1783         case MOUNTCTL_SET_EXPORT:
1784                 if (ap->a_ctllen != sizeof(struct export_args))
1785                         error = EINVAL;
1786                 else error = hammer_vfs_export(mp, ap->a_op,
1787                                       (const struct export_args *)ap->a_ctl);
1788                 break;
1789         default:
1790                 error = journal_mountctl(ap);
1791                 break;
1792         }
1793         return(error);
1794 }
1795
1796 /*
1797  * hammer_vop_strategy { vp, bio }
1798  *
1799  * Strategy call, used for regular file read & write only.  Note that the
1800  * bp may represent a cluster.
1801  *
1802  * To simplify operation and allow better optimizations in the future,
1803  * this code does not make any assumptions with regards to buffer alignment
1804  * or size.
1805  */
1806 static
1807 int
1808 hammer_vop_strategy(struct vop_strategy_args *ap)
1809 {
1810         struct buf *bp;
1811         int error;
1812
1813         bp = ap->a_bio->bio_buf;
1814
1815         switch(bp->b_cmd) {
1816         case BUF_CMD_READ:
1817                 error = hammer_vop_strategy_read(ap);
1818                 break;
1819         case BUF_CMD_WRITE:
1820                 error = hammer_vop_strategy_write(ap);
1821                 break;
1822         default:
1823                 bp->b_error = error = EINVAL;
1824                 bp->b_flags |= B_ERROR;
1825                 biodone(ap->a_bio);
1826                 break;
1827         }
1828         return (error);
1829 }
1830
1831 /*
1832  * Read from a regular file.  Iterate the related records and fill in the
1833  * BIO/BUF.  Gaps are zero-filled.
1834  *
1835  * The support code in hammer_object.c should be used to deal with mixed
1836  * in-memory and on-disk records.
1837  *
1838  * XXX atime update
1839  */
1840 static
1841 int
1842 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1843 {
1844         struct hammer_transaction trans;
1845         struct hammer_inode *ip;
1846         struct hammer_cursor cursor;
1847         hammer_base_elm_t base;
1848         struct bio *bio;
1849         struct bio *nbio;
1850         struct buf *bp;
1851         int64_t rec_offset;
1852         int64_t ran_end;
1853         int64_t tmp64;
1854         int error;
1855         int boff;
1856         int roff;
1857         int n;
1858
1859         bio = ap->a_bio;
1860         bp = bio->bio_buf;
1861         ip = ap->a_vp->v_data;
1862
1863         /*
1864          * The zone-2 disk offset may have been set by the cluster code via
1865          * a BMAP operation.  Take care not to confuse it with the bio_offset
1866          * set by hammer_io_direct_write(), which is a device-relative offset.
1867          *
1868          * Checking the high bits should suffice.
1869          */
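             /*
              * (Editorial note) HAMMER encodes the zone number in the
              * high bits of a 64 bit offset, so masking with
              * HAMMER_OFF_ZONE_MASK and comparing against
              * HAMMER_ZONE_RAW_BUFFER distinguishes a blockmap (zone-2)
              * offset from a device-relative one.
              */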
1870         nbio = push_bio(bio);
1871         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) == HAMMER_ZONE_RAW_BUFFER) {
1872                 error = hammer_io_direct_read(ip->hmp, nbio->bio_offset, bio);
1873                 return (error);
1874         }
1875
1876         /*
1877          * Hard way
1878          */
1879         hammer_simple_transaction(&trans, ip->hmp);
1880         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1881
1882         /*
1883          * Key range (begin and end inclusive) to scan.  Note that the keys
1884          * stored in the actual records represent BASE+LEN, not BASE.  The
1885          * first record containing bio_offset will have a key > bio_offset.
1886          */
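             /*
              * Illustration (editorial, assuming 16K data records): the
              * record covering file offsets [0, 16384) carries key 16384
              * and holds no data at offset 16384 itself, so a read with
              * bio_offset 16384 must begin scanning at key 16385
              * (bio_offset + 1) to skip it.
              */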
1887         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
1888         cursor.key_beg.obj_id = ip->obj_id;
1889         cursor.key_beg.create_tid = 0;
1890         cursor.key_beg.delete_tid = 0;
1891         cursor.key_beg.obj_type = 0;
1892         cursor.key_beg.key = bio->bio_offset + 1;
1893         cursor.asof = ip->obj_asof;
1894         cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
1895
1896         cursor.key_end = cursor.key_beg;
1897         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
1898 #if 0
1899         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1900                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1901                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1902                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1903         } else
1904 #endif
1905         {
1906                 ran_end = bio->bio_offset + bp->b_bufsize;
1907                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1908                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1909                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1910                 if (tmp64 < ran_end)
1911                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1912                 else
1913                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1914         }
1915         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1916
1917         error = hammer_ip_first(&cursor);
1918         boff = 0;
1919
1920         while (error == 0) {
1921                 /*
1922                  * Get the base file offset of the record.  The key for
1923                  * data records is (base + bytes) rather than (base).
1924                  */
1925                 base = &cursor.leaf->base;
1926                 rec_offset = base->key - cursor.leaf->data_len;
1927
1928                 /*
1929                  * Calculate the gap, if any, and zero-fill it.
1930                  *
1931                  * n is the offset of the start of the record versus our
1932                  * current seek offset in the bio.
1933                  */
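                     /*
                      * e.g. if our current position is file offset 8192
                      * (bio_offset + boff) and the next record starts at
                      * 12288, n is 4096 and that hole is zero-filled
                      * before any record data is copied.
                      */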
1934                 n = (int)(rec_offset - (bio->bio_offset + boff));
1935                 if (n > 0) {
1936                         if (n > bp->b_bufsize - boff)
1937                                 n = bp->b_bufsize - boff;
1938                         bzero((char *)bp->b_data + boff, n);
1939                         boff += n;
1940                         n = 0;
1941                 }
1942
1943                 /*
1944                  * Calculate the data offset in the record and the number
1945                  * of bytes we can copy.
1946                  *
1947                  * There are two degenerate cases.  First, boff may already
1948                  * be at bp->b_bufsize.  Second, the data offset within
1949                  * the record may exceed the record's size.
1950                  */
1951                 roff = -n;
1952                 rec_offset += roff;
1953                 n = cursor.leaf->data_len - roff;
1954                 if (n <= 0) {
1955                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
1956                         n = 0;
1957                 } else if (n > bp->b_bufsize - boff) {
1958                         n = bp->b_bufsize - boff;
1959                 }
1960
1961                 /*
1962                  * Deal with cached truncations.  This cool bit of code
1963                  * allows truncate()/ftruncate() to avoid having to sync
1964                  * the file.
1965                  *
1966                  * If the frontend is truncated then all backend records are
1967                  * subject to the frontend's truncation.
1968                  *
1969                  * If the backend is truncated then backend records on-disk
1970                  * (but not in-memory) are subject to the backend's
1971                  * truncation.  In-memory records owned by the backend
1972                  * represent data written after the truncation point on the
1973                  * backend and must not be truncated.
1974                  *
1975                  * Truncate operations deal with frontend buffer cache
1976                  * buffers and frontend-owned in-memory records synchronously.
1977                  */
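                     /*
                      * e.g. (editorial) a cached truncation to offset
                      * 4096 against a record covering [0, 16384) clips n
                      * to 4096 so stale data beyond the truncation point
                      * is never copied out.
                      */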
1978                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
1979                         if (hammer_cursor_ondisk(&cursor) ||
1980                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
1981                                 if (ip->trunc_off <= rec_offset)
1982                                         n = 0;
1983                                 else if (ip->trunc_off < rec_offset + n)
1984                                         n = (int)(ip->trunc_off - rec_offset);
1985                         }
1986                 }
1987                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
1988                         if (hammer_cursor_ondisk(&cursor)) {
1989                                 if (ip->sync_trunc_off <= rec_offset)
1990                                         n = 0;
1991                                 else if (ip->sync_trunc_off < rec_offset + n)
1992                                         n = (int)(ip->sync_trunc_off - rec_offset);
1993                         }
1994                 }
1995
1996                 /*
1997                  * Try to issue a direct read into our bio if possible,
1998                  * otherwise resolve the element data into a hammer_buffer
1999                  * and copy.
2000                  */
2001                 if (boff == 0 &&
2002                     ((cursor.leaf->data_offset + roff) & HAMMER_BUFMASK) == 0) {
2003                         error = hammer_io_direct_read(
2004                                         trans.hmp,
2005                                         cursor.leaf->data_offset + roff,
2006                                         bio);
2007                         goto done;
2008                 } else if (n) {
2009                         error = hammer_ip_resolve_data(&cursor);
2010                         if (error == 0) {
2011                                 bcopy((char *)cursor.data + roff,
2012                                       (char *)bp->b_data + boff, n);
2013                         }
2014                 }
2015                 if (error)
2016                         break;
2017
2018                 /*
2019                  * Iterate until we have filled the request.
2020                  */
2021                 boff += n;
2022                 if (boff == bp->b_bufsize)
2023                         break;
2024                 error = hammer_ip_next(&cursor);
2025         }
2026
2027         /*
2028          * There may have been a gap after the last record.
2029          */
2030         if (error == ENOENT)
2031                 error = 0;
2032         if (error == 0 && boff != bp->b_bufsize) {
2033                 KKASSERT(boff < bp->b_bufsize);
2034                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2035                 /* boff = bp->b_bufsize; */
2036         }
2037         bp->b_resid = 0;
2038         bp->b_error = error;
2039         if (error)
2040                 bp->b_flags |= B_ERROR;
2041         biodone(ap->a_bio);
2042
2043 done:
2044         if (cursor.node)
2045                 hammer_cache_node(cursor.node, &ip->cache[1]);
2046         hammer_done_cursor(&cursor);
2047         hammer_done_transaction(&trans);
2048         return(error);
2049 }
2050
2051 /*
2052  * BMAP operation - used to support cluster_read() only.
2053  *
2054  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2055  *
2056  * This routine may return EOPNOTSUPP if the operation is not supported
2057  * for the specified offset.  The contents of the pointer arguments do
2058  * not need to be initialized in that case.
2059  *
2060  * If a disk address is available and properly aligned, return 0 with
2061  * *doffsetp set to the zone-2 address and *runp / *runb set to the
2062  * run-length relative to that offset.  Callers assume that *doffsetp
2063  * is valid whenever 0 is returned, so we must return EOPNOTSUPP rather
2064  * than 0 when the forward run does not cover at least one full buffer.
2065  */
2066 static
2067 int
2068 hammer_vop_bmap(struct vop_bmap_args *ap)
2069 {
2070         struct hammer_transaction trans;
2071         struct hammer_inode *ip;
2072         struct hammer_cursor cursor;
2073         hammer_base_elm_t base;
2074         int64_t rec_offset;
2075         int64_t ran_end;
2076         int64_t tmp64;
2077         int64_t base_offset;
2078         int64_t base_disk_offset;
2079         int64_t last_offset;
2080         hammer_off_t last_disk_offset;
2081         hammer_off_t disk_offset;
2082         int     rec_len;
2083         int     error;
2084
2085         ip = ap->a_vp->v_data;
2086
2087         /*
2088          * We can only BMAP regular files.  We can't BMAP database files,
2089          * directories, etc.
2090          */
2091         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2092                 return(EOPNOTSUPP);
2093
2094         /*
2095          * bmap is typically called with runp/runb both NULL when used
2096          * for writing.  We do not support BMAP for writing at this time.
2097          */
2098         if (ap->a_runp == NULL && ap->a_runb == NULL)
2099                 return(EOPNOTSUPP);
2100
2101         /*
2102          * Scan the B-Tree to acquire blockmap addresses, then translate
2103          * to raw addresses.
2104          */
2105         hammer_simple_transaction(&trans, ip->hmp);
2106         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2107
2108         /*
2109          * Key range (begin and end inclusive) to scan.  Note that the keys
2110          * stored in the actual records represent BASE+LEN, not BASE.  The
2111          * first record containing bio_offset will have a key > bio_offset.
2112          */
2113         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
2114         cursor.key_beg.obj_id = ip->obj_id;
2115         cursor.key_beg.create_tid = 0;
2116         cursor.key_beg.delete_tid = 0;
2117         cursor.key_beg.obj_type = 0;
2118         if (ap->a_runb)
2119                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2120         else
2121                 cursor.key_beg.key = ap->a_loffset + 1;
2122         if (cursor.key_beg.key < 0)
2123                 cursor.key_beg.key = 0;
2124         cursor.asof = ip->obj_asof;
2125         cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
2126
2127         cursor.key_end = cursor.key_beg;
2128         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2129
2130         ran_end = ap->a_loffset + MAXPHYS;
2131         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2132         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2133         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2134         if (tmp64 < ran_end)
2135                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2136         else
2137                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2138
2139         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2140
2141         error = hammer_ip_first(&cursor);
2142         base_offset = last_offset = 0;
2143         base_disk_offset = last_disk_offset = 0;
2144
2145         while (error == 0) {
2146                 /*
2147                  * Get the base file offset of the record.  The key for
2148                  * data records is (base + bytes) rather than (base).
2149                  */
2150                 base = &cursor.leaf->base;
2151                 rec_offset = base->key - cursor.leaf->data_len;
2152                 rec_len    = cursor.leaf->data_len;
2153
2154                 /*
2155                  * Incorporate any cached truncation
2156                  */
2157                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2158                         if (hammer_cursor_ondisk(&cursor) ||
2159                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2160                                 if (ip->trunc_off <= rec_offset)
2161                                         rec_len = 0;
2162                                 else if (ip->trunc_off < rec_offset + rec_len)
2163                                         rec_len = (int)(ip->trunc_off - rec_offset);
2164                         }
2165                 }
2166                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2167                         if (hammer_cursor_ondisk(&cursor)) {
2168                                 if (ip->sync_trunc_off <= rec_offset)
2169                                         rec_len = 0;
2170                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2171                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2172                         }
2173                 }
2174
2175                 /*
2176                  * Accumulate information.  If we have hit a discontiguous
2177                  * block, reset base_offset unless we are already beyond the
2178                  * requested offset, in which case we stop.
2179                  */
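                     /*
                      * e.g. (editorial, assuming 16K records) records at
                      * file offsets 0 and 16384 whose blockmap
                      * translations are also 16K apart extend a single
                      * run; a seam in either the file or the disk
                      * offsets restarts the run at the current record.
                      */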
2180                 disk_offset = hammer_blockmap_lookup(trans.hmp,
2181                                                      cursor.leaf->data_offset,
2182                                                      &error);
2183                 if (error)
2184                         break;
2185                 if (rec_offset != last_offset ||
2186                     disk_offset != last_disk_offset) {
2187                         if (rec_offset > ap->a_loffset)
2188                                 break;
2189                         base_offset = rec_offset;
2190                         base_disk_offset = disk_offset;
2191                 }
2192                 last_offset = rec_offset + rec_len;
2193                 last_disk_offset = disk_offset + rec_len;
2194
2195                 error = hammer_ip_next(&cursor);
2196         }
2197
2198 #if 0
2199         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2200                 ap->a_loffset, base_offset, last_offset);
2201         kprintf("BMAP %16s:  %016llx - %016llx\n",
2202                 "", base_disk_offset, last_disk_offset);
2203 #endif
2204
2205         if (cursor.node)
2206                 hammer_cache_node(cursor.node, &ip->cache[1]);
2207         hammer_done_cursor(&cursor);
2208         hammer_done_transaction(&trans);
2209
2210         if (base_offset == 0 || base_offset > ap->a_loffset ||
2211             last_offset < ap->a_loffset) {
2212                 error = EOPNOTSUPP;
2213         } else {
2214                 disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2215
2216                 /*
2217                  * If doffsetp is not aligned or the forward run size does
2218                  * not cover a whole buffer, disallow the direct I/O.
2219                  */
2220                 if ((disk_offset & HAMMER_BUFMASK) ||
2221                     (last_offset - ap->a_loffset) < HAMMER_BUFSIZE) {
2222                         error = EOPNOTSUPP;
2223                 } else {
2224                         *ap->a_doffsetp = disk_offset;
2225                         if (ap->a_runb)
2226                                 *ap->a_runb = ap->a_loffset - base_offset;
2227                         if (ap->a_runp)
2228                                 *ap->a_runp = last_offset - ap->a_loffset;
2229                         error = 0;
2230                 }
2231         }
2232         return(error);
2233 }
2234
2235 /*
2236  * Write to a regular file.   Because this is a strategy call the OS is
2237  * trying to actually sync data to the media.   HAMMER can only flush
2238  * the entire inode (so the TID remains properly synchronized).
2239  *
2240  * Basically all we do here is place the bio on the inode's flush queue
2241  * and activate the flusher.
2242  */
2243 static
2244 int
2245 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2246 {
2247         hammer_record_t record;
2248         hammer_mount_t hmp;
2249         hammer_inode_t ip;
2250         struct bio *bio;
2251         struct buf *bp;
2252         int bytes;
2253         int error;
2254
2255         bio = ap->a_bio;
2256         bp = bio->bio_buf;
2257         ip = ap->a_vp->v_data;
2258         hmp = ip->hmp;
2259
2260         if (ip->flags & HAMMER_INODE_RO) {
2261                 bp->b_error = EROFS;
2262                 bp->b_flags |= B_ERROR;
2263                 biodone(ap->a_bio);
2264                 hammer_cleanup_write_io(ip);
2265                 return(EROFS);
2266         }
2267
2268         /*
2269          * Interlock with inode destruction (no in-kernel or directory
2270          * topology visibility).  If we queue new IO while trying to
2271          * destroy the inode we can deadlock the vtrunc call in
2272          * hammer_inode_unloadable_check().
2273          */
2274         if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2275                 bp->b_resid = 0;
2276                 biodone(ap->a_bio);
2277                 hammer_cleanup_write_io(ip);
2278                 return(0);
2279         }
2280
2281         /*
2282          * Reserve space and issue a direct-write from the front-end. 
2283          * NOTE: The direct_io code will hammer_bread/bcopy smaller
2284          * allocations.
2285          *
2286          * An in-memory record will be installed to reference the storage
2287          * until the flusher can get to it.
2288          *
2289          * Since we own the high level bio the front-end will not try to
2290          * do a direct-read until the write completes.
2291          *
2292          * NOTE: The only time we do not reserve a full-sized buffer's
2293          * worth of data is if the file is small.  We do not try to
2294          * allocate a fragment (from the small-data zone) at the end of
2295          * an otherwise large file as this can lead to wildly separated
2296          * data.
2297          */
2298         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2299         KKASSERT(bio->bio_offset < ip->ino_data.size);
2300         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2301                 bytes = (bp->b_bufsize + HAMMER_BUFMASK) & ~HAMMER_BUFMASK;
2302         else
2303                 bytes = ((int)ip->ino_data.size + 15) & ~15;
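             /*
              * Sizing sketch (editorial): with 16K buffers a lone
              * 100 byte file reserves a (100 + 15) & ~15 = 112 byte
              * fragment from the small-data zone, while any other write
              * rounds b_bufsize up to a full buffer.
              */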
2304
2305         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2306                                     bytes, &error);
2307         if (record) {
2308                 hammer_io_direct_write(hmp, &record->leaf, bio);
2309                 hammer_rel_mem_record(record);
2310                 if (hmp->rsv_recs > hammer_limit_recs &&
2311                     ip->rsv_recs > hammer_limit_irecs / 10) {
2312                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2313                 } else if (ip->rsv_recs > hammer_limit_irecs / 2) {
2314                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2315                 }
2316         } else {
2317                 bp->b_bio2.bio_offset = NOOFFSET;
2318                 bp->b_error = error;
2319                 bp->b_flags |= B_ERROR;
2320                 biodone(ap->a_bio);
2321         }
2322         hammer_cleanup_write_io(ip);
2323         return(error);
2324 }
2325
2326 /*
2327  * Clean-up after disposing of a dirty frontend buffer's data.
2328  * This is somewhat heuristic, so try to be robust.
2329  */
2330 static void
2331 hammer_cleanup_write_io(hammer_inode_t ip)
2332 {
2333         if (ip->rsv_databufs) {
2334                 --ip->rsv_databufs;
2335                 --ip->hmp->rsv_databufs;
2336         }
2337 }
2338
2339 /*
2340  * We can lose track of dirty buffer cache buffers if we truncate, so this
2341  * routine resynchronizes the count.
2342  */
2343 static
2344 void
2345 hammer_update_rsv_databufs(hammer_inode_t ip)
2346 {
2347         struct buf *bp;
2348         int delta;
2349         int n;
2350
2351         if (ip->vp) {
2352                 n = 0;
2353                 RB_FOREACH(bp, buf_rb_tree, &ip->vp->v_rbdirty_tree) {
2354                         ++n;
2355                 }
2356         } else {
2357                 n = 0;
2358         }
2359         delta = n - ip->rsv_databufs;
2360         ip->rsv_databufs += delta;
2361         ip->hmp->rsv_databufs += delta;
2362 }
2363
2364 /*
2365  * dounlink - disconnect a directory entry
2366  *
2367  * XXX whiteout support not really in yet
2368  */
2369 static int
2370 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2371                 struct vnode *dvp, struct ucred *cred, int flags)
2372 {
2373         struct namecache *ncp;
2374         hammer_inode_t dip;
2375         hammer_inode_t ip;
2376         struct hammer_cursor cursor;
2377         int64_t namekey;
2378         int nlen, error;
2379
2380         /*
2381          * Calculate the namekey and set up the key range for the scan.  This
2382          * works kinda like a chained hash table where the lower 32 bits
2383          * of the namekey synthesize the chain.
2384          *
2385          * The key range is inclusive of both key_beg and key_end.
2386          */
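             /*
              * (Editorial sketch) hammer_directory_namekey() places a
              * hash of the name in the high bits of the 64 bit key and
              * leaves the low 32 bits zero, so OR'ing 0xFFFFFFFF into
              * key_end below spans the entire collision chain for that
              * hash.
              */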
2387         dip = VTOI(dvp);
2388         ncp = nch->ncp;
2389
2390         if (dip->flags & HAMMER_INODE_RO)
2391                 return (EROFS);
2392
2393         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2394 retry:
2395         hammer_init_cursor(trans, &cursor, &dip->cache[0], dip);
2396         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
2397         cursor.key_beg.obj_id = dip->obj_id;
2398         cursor.key_beg.key = namekey;
2399         cursor.key_beg.create_tid = 0;
2400         cursor.key_beg.delete_tid = 0;
2401         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2402         cursor.key_beg.obj_type = 0;
2403
2404         cursor.key_end = cursor.key_beg;
2405         cursor.key_end.key |= 0xFFFFFFFFULL;
2406         cursor.asof = dip->obj_asof;
2407         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2408
2409         /*
2410          * Scan all matching records (the chain), locate the one matching
2411          * the requested path component.  The error code at loop
2412          * termination could be 0 (match found), ENOENT (chain
2413          * exhausted), or something else.
2414          *
2415          * The hammer_ip_*() functions merge in-memory records with on-disk
2416          * records for the purposes of the search.
2417          */
2418         error = hammer_ip_first(&cursor);
2419
2420         while (error == 0) {
2421                 error = hammer_ip_resolve_data(&cursor);
2422                 if (error)
2423                         break;
2424                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2425                 KKASSERT(nlen > 0);
2426                 if (ncp->nc_nlen == nlen &&
2427                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2428                         break;
2429                 }
2430                 error = hammer_ip_next(&cursor);
2431         }
2432
2433         /*
2434          * If all is ok we have to get the inode so we can adjust nlinks.
2435          * To avoid a deadlock with the flusher we must release the inode
2436          * lock on the directory when acquiring the inode for the entry.
2437          *
2438          * If the target is a directory, it must be empty.
2439          */
2440         if (error == 0) {
2441                 hammer_unlock(&cursor.ip->lock);
2442                 ip = hammer_get_inode(trans, &dip->cache[1],
2443                                       cursor.data->entry.obj_id,
2444                                       dip->hmp->asof, 0, &error);
2445                 hammer_lock_sh(&cursor.ip->lock);
2446                 if (error == ENOENT) {
2447                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2448                         Debugger("ENOENT unlinking object that should exist");
2449                 }
2450
2451                 /*
2452                  * If we are trying to remove a directory the directory must
2453                  * be empty.
2454                  *
2455                  * WARNING: hammer_ip_check_directory_empty() may have to
2456                  * terminate the cursor to avoid a deadlock.  It is ok to
2457                  * call hammer_done_cursor() twice.
2458                  */
2459                 if (error == 0 && ip->ino_data.obj_type ==
2460                                   HAMMER_OBJTYPE_DIRECTORY) {
2461                         error = hammer_ip_check_directory_empty(trans, ip);
2462                 }
2463
2464                 /*
2465                  * Delete the directory entry.
2466                  *
2467                  * WARNING: hammer_ip_del_directory() may have to terminate
2468                  * the cursor to avoid a deadlock.  It is ok to call
2469                  * hammer_done_cursor() twice.
2470                  */
2471                 if (error == 0) {
2472                         error = hammer_ip_del_directory(trans, &cursor,
2473                                                         dip, ip);
2474                 }
2475                 hammer_done_cursor(&cursor);
2476                 if (error == 0) {
2477                         cache_setunresolved(nch);
2478                         cache_setvp(nch, NULL);
2479                         /* XXX locking */
2480                         if (ip->vp)
2481                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2482                 }
2483                 if (ip)
2484                         hammer_rel_inode(ip, 0);
2485         } else {
2486                 hammer_done_cursor(&cursor);
2487         }
2488         if (error == EDEADLK)
2489                 goto retry;
2490
2491         return (error);
2492 }
2493
2494 /************************************************************************
2495  *                          FIFO AND SPECFS OPS                         *
2496  ************************************************************************
2497  *
2498  */
2499
2500 static int
2501 hammer_vop_fifoclose (struct vop_close_args *ap)
2502 {
2503         /* XXX update itimes */
2504         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2505 }
2506
2507 static int
2508 hammer_vop_fiforead (struct vop_read_args *ap)
2509 {
2510         int error;
2511
2512         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2513         /* XXX update access time */
2514         return (error);
2515 }
2516
2517 static int
2518 hammer_vop_fifowrite (struct vop_write_args *ap)
2519 {
2520         int error;
2521
2522         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2523         /* XXX update access time */
2524         return (error);
2525 }
2526
2527 static int
2528 hammer_vop_specclose (struct vop_close_args *ap)
2529 {
2530         /* XXX update itimes */
2531         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2532 }
2533
2534 static int
2535 hammer_vop_specread (struct vop_read_args *ap)
2536 {
2537         /* XXX update access time */
2538         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2539 }
2540
2541 static int
2542 hammer_vop_specwrite (struct vop_write_args *ap)
2543 {
2544         /* XXX update last change time */
2545         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2546 }
2547