/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.63 2008/06/10 05:06:20 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>
#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         hammer_vop_pathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_setattr =          hammer_vop_setattr,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl
};

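/*
 * Device and fifo vnodes share HAMMER's attribute, fsync, inactive and
 * reclaim operations but route I/O and close through the spec and fifo
 * layers; anything unlisted falls through to the vop_default handler.
 */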
struct vop_ops hammer_spec_vops = {
        .vop_default =          spec_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_specread,
        .vop_write =            hammer_vop_specwrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_specclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred, int flags);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
static void hammer_cleanup_write_io(hammer_inode_t ip);
static void hammer_update_rsv_databufs(hammer_inode_t ip);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
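 *
 * Signal an inode flush, synchronize the vnode's dirty buffers, and for
 * MNT_WAIT block until the flusher has disposed of the inode so any
 * error recorded by the flusher can be returned.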
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
        if (ap->a_waitfor == MNT_WAIT)
                hammer_wait_inode(ip);
        return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
        seqcount = ap->a_ioflag >> 16;

        hammer_start_transaction(&trans, ip->hmp);

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
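         * Each pass breads one block, then clips the copy length 'n'
         * against both the remaining uio_resid and the file EOF.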
         */
        uio = ap->a_uio;
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                offset = uio->uio_offset & HAMMER_BUFMASK;
#if 0
                error = cluster_read(ap->a_vp, ip->ino_data.size,
                                     uio->uio_offset - offset, HAMMER_BUFSIZE,
                                     MAXBSIZE, seqcount, &bp);
#endif
                error = bread(ap->a_vp, uio->uio_offset - offset,
                              HAMMER_BUFSIZE, &bp);
                if (error) {
                        brelse(bp);
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                n = HAMMER_BUFSIZE - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);
                if (error) {
                        bqrelse(bp);
                        break;
                }
                bqrelse(bp);
        }
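        /*
         * Update atime.  The atime field lives in the B-Tree leaf element
         * and may be updated without cycling the element, so only
         * HAMMER_INODE_ITIMES needs to be flagged.
         */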
        if ((ip->flags & HAMMER_INODE_RO) == 0 &&
            (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                ip->ino_leaf.atime = trans.time;
                hammer_modify_inode(ip, HAMMER_INODE_ITIMES);
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        struct uio *uio;
        int rel_offset;
        off_t base_offset;
        struct buf *bp;
        int error;
        int n;
        int flags;
        int count;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, ip->hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1
         */
        if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        count = 0;
        while (uio->uio_resid > 0) {
                int fixsize = 0;

                if ((error = hammer_checkspace(trans.hmp)) != 0)
                        break;

                /*
                 * Do not allow HAMMER to blow out the buffer cache.
                 *
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are decoupled
                 * from the buffer cache.
                 *
                 * Always check at the beginning so separate writes are
                 * not able to bypass this code.
                 *
                 * WARNING: Cannot unlock vp when doing a NOCOPY write as
                 * part of a putpages operation.  Doing so could cause us
                 * to deadlock against the VM system when we try to re-lock.
                 */
                if ((count++ & 15) == 0) {
                        if (uio->uio_segflg != UIO_NOCOPY) {
                                vn_unlock(ap->a_vp);
                                if ((ap->a_ioflag & IO_NOBWILL) == 0)
                                        bwillwrite();
                        }
                        if (ip->rsv_recs > hammer_limit_irecs) {
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                                hammer_wait_inode(ip);
                        }
                        if (uio->uio_segflg != UIO_NOCOPY)
                                vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY);
                }

                rel_offset = (int)(uio->uio_offset & HAMMER_BUFMASK);
                base_offset = uio->uio_offset & ~HAMMER_BUFMASK64;
                n = HAMMER_BUFSIZE - rel_offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (uio->uio_offset + n > ip->ino_data.size) {
                        vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
                        fixsize = 1;
                }

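                /*
                 * Acquire and prepare the buffer.  Four cases: NOCOPY
                 * writes (putpages) instantiate the buffer and read in
                 * any missing bits; full-block overwrites may still need
                 * to be zeroed for mmap visibility; blocks entirely
                 * beyond EOF are simply cleared; partial overwrites must
                 * be read in.
                 */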
                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              HAMMER_BUFSIZE, &bp);
                        }
                } else if (rel_offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset,
                                      HAMMER_BUFSIZE, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        error = uiomove((char *)bp->b_data + rel_offset,
                                        n, uio);
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                vtruncbuf(ap->a_vp, ip->ino_data.size,
                                          HAMMER_BUFSIZE);
                        }
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_DDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
                flags |= HAMMER_INODE_DDIRTY;   /* XXX mtime */
                hammer_modify_inode(ip, flags);

                /*
                 * Try to keep track of cached dirty data.
                 */
                if ((bp->b_flags & B_DIRTY) == 0) {
                        ++ip->rsv_databufs;
                        ++ip->hmp->rsv_databufs;
                }

                /*
                 * Final buffer disposition.
                 */
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
#if 1
                } else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
                           (uio->uio_offset & HAMMER_BUFMASK) == 0) {
                        /*
                         * If seqcount indicates sequential operation and
                         * we just finished filling a buffer, push it out
                         * now to prevent the buffer cache from becoming
                         * too full, which would trigger non-optimal
                         * flushes.
                         */
                        bawrite(bp);
#endif
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */

        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

#if 0
        if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
            (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
            ip->obj_asof == XXX
        ) {
                /* LAZYMOD XXX */
        }
        hammer_itimes(ap->a_vp);
#endif

        vap->va_fsid = ip->hmp->fsid_udev;
        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;
        if (ip->flags & HAMMER_INODE_RO)
                hammer_to_timespec(ip->ino_data.mtime, &vap->va_atime);
        else
                hammer_to_timespec(ip->ino_leaf.atime, &vap->va_atime);
        hammer_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
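        /* bytes used, rounded up to a 64 byte granularity */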
        vap->va_bytes = (ip->ino_data.size + 63) & ~63;
        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        /* mtime uniquely identifies any adjustments made to the file */
        vap->va_fsmid = ip->ino_data.mtime;
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }

        return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        u_int64_t obj_id;

        /*
         * Misc initialization, plus handle as-of name extensions.  Look for
         * the '@@' extension.  Note that as-of files and directories cannot
         * be modified.
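         *
         * e.g. "foo@@0x00000001061a8ba0" resolves "foo" as of the given
         * transaction id (the tid string after '@@' is parsed by
         * hammer_str_to_tid()) and forces the result read-only.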
         */
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        nlen = ncp->nc_nlen;
        flags = dip->flags;

        hammer_simple_transaction(&trans, dip->hmp);

        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        asof = hammer_str_to_tid(ncp->nc_name + i + 2);
                        flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;

        /*
         * If there is no path component the time extension is relative to
         * dip.
         */
        if (nlen == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id,
                                      asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
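         *
         * The upper 32 bits of the namekey are a hash of the path
         * component, so keys from namekey through (namekey | 0xFFFFFFFF)
         * cover every entry whose name hashes to the same chain.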
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(ncp->nc_name, nlen);

        error = hammer_init_cursor(&trans, &cursor, &dip->cache[0], dip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
        cursor.asof = asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        obj_id = 0;

        if (error == 0) {
                error = hammer_ip_first(&cursor);
                while (error == 0) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error)
                                break;
                        if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
                            bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                                obj_id = cursor.data->entry.obj_id;
                                break;
                        }
                        error = hammer_ip_next(&cursor);
                }
        }
        hammer_done_cursor(&cursor);
        if (error == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1],
                                      obj_id, asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
        } else if (error == ENOENT) {
                cache_setvp(ap->a_nch, NULL);
        }
done:
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        int64_t parent_obj_id;
        hammer_tid_t asof;
        int error;

        dip = VTOI(ap->a_dvp);
        asof = dip->obj_asof;
        parent_obj_id = dip->ino_data.parent_obj_id;

        if (parent_obj_id == 0) {
                if (dip->obj_id == HAMMER_OBJID_ROOT &&
                   asof != dip->hmp->asof) {
                        parent_obj_id = dip->obj_id;
                        asof = dip->hmp->asof;
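                        /* fake name "0x" + 16 hex digits + NUL = 19 bytes */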
                        *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
                        ksnprintf(*ap->a_fakename, 19, "0x%016llx",
                                   dip->obj_asof);
                } else {
                        *ap->a_vpp = NULL;
                        return ENOENT;
                }
        }

        hammer_simple_transaction(&trans, dip->hmp);

        ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id,
                              asof, dip->flags, &error);
        if (ip) {
                error = hammer_get_vnode(ip, ap->a_vpp);
                hammer_rel_inode(ip, 0);
        } else {
                *ap->a_vpp = NULL;
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        ip = VTOI(ap->a_vp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Add the filesystem object to the directory.  Note that neither
         * dip nor ip are referenced or locked, but their vnodes are
         * referenced.  This function will bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);

        /*
         * Finish up.
         */
        if (error == 0) {
                cache_setunresolved(nch);
                cache_setvp(nch, ap->a_vp);
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hkprintf("hammer_mkdir error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }
        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        if (error)
                hkprintf("hammer_mkdir (add) error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
        hammer_inode_t ip;

        ip = VTOI(ap->a_vp);

        if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
                return (EROFS);
        return(vop_stdopen(ap));
}

/*
 * hammer_vop_pathconf { vp, name, retval }
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
        return EOPNOTSUPP;
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
        return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        struct uio *uio;
        hammer_base_elm_t base;
        int error;
        int cookie_index;
        int ncookies;
        off_t *cookies;
        off_t saveoff;
        int r;

        ip = VTOI(ap->a_vp);
        uio = ap->a_uio;
        saveoff = uio->uio_offset;

        if (ap->a_ncookies) {
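                /*
                 * Allocate cookie space: roughly one cookie per 16 bytes
                 * of user buffer, capped at 1024 per call.
                 */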
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
                cookie_index = 0;
        } else {
                ncookies = -1;
                cookies = NULL;
                cookie_index = 0;
        }

        hammer_simple_transaction(&trans, ip->hmp);

        /*
         * Handle artificial entries
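         *
         * Offset 0 is the synthesized "." entry and offset 1 is "..";
         * the B-Tree scan below resumes at key 'saveoff'.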
         */
        error = 0;
        if (saveoff == 0) {
                r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }
        if (saveoff == 1) {
                if (ip->ino_data.parent_obj_id) {
                        r = vop_write_dirent(&error, uio,
                                             ip->ino_data.parent_obj_id,
                                             DT_DIR, 2, "..");
                } else {
                        r = vop_write_dirent(&error, uio,
                                             ip->obj_id, DT_DIR, 2, "..");
                }
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        /*
         * Key range (begin and end inclusive) to scan.  Directory keys
         * directly translate to a 64 bit 'seek' position.
         */
        hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = saveoff;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key = HAMMER_MAX_KEY;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        error = hammer_ip_first(&cursor);

        while (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error)
                        break;
                base = &cursor.leaf->base;
                saveoff = base->key;
                KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

                if (base->obj_id != ip->obj_id)
                        panic("readdir: bad record at %p", cursor.node);

                r = vop_write_dirent(
                             &error, uio, cursor.data->entry.obj_id,
                             hammer_get_dtype(cursor.leaf->base.obj_type),
                             cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
                             (void *)cursor.data->entry.name);
                if (r)
                        break;
                ++saveoff;
                if (cookies)
                        cookies[cookie_index] = base->key;
                ++cookie_index;
                if (cookie_index == ncookies)
                        break;
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);

done:
        hammer_done_transaction(&trans);

        if (ap->a_eofflag)
                *ap->a_eofflag = (error == ENOENT);
        uio->uio_offset = saveoff;
        if (error && cookie_index == 0) {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        int error;

        ip = VTOI(ap->a_vp);

        /*
         * Shortcut if the symlink data was stuffed into ino_data.
         */
        if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
                error = uiomove(ip->ino_data.ext.symlink,
                                ip->ino_data.size, ap->a_uio);
                return(error);
        }

        /*
         * Long version
         */
        hammer_simple_transaction(&trans, ip->hmp);
        hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);

        /*
         * Key range (begin and end inclusive) to scan.  Directory keys
         * directly translate to a 64 bit 'seek' position.
         */
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; /* XXX */
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        error = hammer_ip_lookup(&cursor);
        if (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error == 0) {
                        KKASSERT(cursor.leaf->data_len >=
                                 HAMMER_SYMLINK_NAME_OFF);
                        error = uiomove(cursor.data->symlink.name,
                                        cursor.leaf->data_len -
                                                HAMMER_SYMLINK_NAME_OFF,
                                        ap->a_uio);
                }
        }
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
        return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        int error;

        dip = VTOI(ap->a_dvp);

        if (hammer_nohistory(dip) == 0 &&
            (error = hammer_checkspace(dip->hmp)) != 0) {
                return (error);
        }

        hammer_start_transaction(&trans, dip->hmp);
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
        hammer_done_transaction(&trans);

        return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *fncp;
        struct namecache *tncp;
        struct hammer_inode *fdip;
        struct hammer_inode *tdip;
        struct hammer_inode *ip;
        struct hammer_cursor cursor;
        int64_t namekey;
        int nlen, error;

        fdip = VTOI(ap->a_fdvp);
        tdip = VTOI(ap->a_tdvp);
        fncp = ap->a_fnch->ncp;
        tncp = ap->a_tnch->ncp;
        ip = VTOI(fncp->nc_vp);
        KKASSERT(ip != NULL);

        if (fdip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (tdip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(fdip->hmp)) != 0)
                return (error);

        hammer_start_transaction(&trans, fdip->hmp);

        /*
         * Remove tncp from the target directory and then link ip as
         * tncp. XXX pass trans to dounlink
         *
         * Force the inode sync-time to match the transaction so it is
         * in-sync with the creation of the target directory entry.
         */
        error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
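        /* ENOENT simply means tncp did not already exist in the target */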
        if (error == 0 || error == ENOENT) {
                error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
                if (error == 0) {
                        ip->ino_data.parent_obj_id = tdip->obj_id;
                        hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
                }
        }
        if (error)
                goto failed; /* XXX */

        /*
         * Locate the record in the originating directory and remove it.
         *
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
        hammer_init_cursor(&trans, &cursor, &fdip->cache[0], fdip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = fdip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
        cursor.asof = fdip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        error = hammer_ip_first(&cursor);
        while (error == 0) {
                if (hammer_ip_resolve_data(&cursor) != 0)
                        break;
                nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
                KKASSERT(nlen > 0);
                if (fncp->nc_nlen == nlen &&
                    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                        break;
                }
                error = hammer_ip_next(&cursor);
        }

        /*
         * If all is ok we have to get the inode so we can adjust nlinks.
         *
         * WARNING: hammer_ip_del_directory() may have to terminate the
         * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
         * twice.
         */
        if (error == 0)
                error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

        /*
         * XXX A deadlock here will break rename's atomicity for the
         * purposes of crash recovery.
         */
        if (error == EDEADLK) {
                hammer_done_cursor(&cursor);
                goto retry;
        }

        /*
         * Cleanup and tell the kernel that the rename succeeded.
         */
        hammer_done_cursor(&cursor);
        if (error == 0)
                cache_rename(ap->a_fnch, ap->a_tnch);

failed:
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        int error;

        dip = VTOI(ap->a_dvp);

        if (hammer_nohistory(dip) == 0 &&
            (error = hammer_checkspace(dip->hmp)) != 0) {
                return (error);
        }

        hammer_start_transaction(&trans, dip->hmp);
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
        hammer_done_transaction(&trans);

        return (error);
}

/*
 * hammer_vop_setattr { vp, vap, cred }
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
        struct hammer_transaction trans;
        struct vattr *vap;
        struct hammer_inode *ip;
        int modflags;
        int error;
        int truncating;
        off_t aligned_size;
        u_int32_t flags;

        vap = ap->a_vap;
        ip = ap->a_vp->v_data;
        modflags = 0;

        if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
                return(EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (hammer_nohistory(ip) == 0 &&
            (error = hammer_checkspace(ip->hmp)) != 0) {
                return (error);
        }

        hammer_start_transaction(&trans, ip->hmp);
        error = 0;

        if (vap->va_flags != VNOVAL) {
                flags = ip->ino_data.uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                         hammer_to_unix_xid(&ip->ino_data.uid),
                                         ap->a_cred);
                if (error == 0) {
                        if (ip->ino_data.uflags != flags) {
                                ip->ino_data.uflags = flags;
                                modflags |= HAMMER_INODE_DDIRTY;
                        }
                        if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }
        if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }
        if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
                mode_t cur_mode = ip->ino_data.mode;
                uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
                gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
                uuid_t uuid_uid;
                uuid_t uuid_gid;

                error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
                                         ap->a_cred,
                                         &cur_uid, &cur_gid, &cur_mode);
                if (error == 0) {
                        hammer_guid_to_uuid(&uuid_uid, cur_uid);
                        hammer_guid_to_uuid(&uuid_gid, cur_gid);
                        if (bcmp(&uuid_uid, &ip->ino_data.uid,
                                 sizeof(uuid_uid)) ||
                            bcmp(&uuid_gid, &ip->ino_data.gid,
                                 sizeof(uuid_gid)) ||
                            ip->ino_data.mode != cur_mode
                        ) {
                                ip->ino_data.uid = uuid_uid;
                                ip->ino_data.gid = uuid_gid;
                                ip->ino_data.mode = cur_mode;
                        }
                        modflags |= HAMMER_INODE_DDIRTY;
                }
        }
1501         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
1502                 switch(ap->a_vp->v_type) {
1503                 case VREG:
1504                         if (vap->va_size == ip->ino_data.size)
1505                                 break;
			/*
			 * XXX breaks atomicity: we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
1511                         if (vap->va_size < ip->ino_data.size) {
1512                                 vtruncbuf(ap->a_vp, vap->va_size,
1513                                           HAMMER_BUFSIZE);
1514                                 truncating = 1;
1515                         } else {
1516                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1517                                 truncating = 0;
1518                         }
1519                         ip->ino_data.size = vap->va_size;
1520                         modflags |= HAMMER_INODE_DDIRTY;
1521                         aligned_size = (vap->va_size + HAMMER_BUFMASK) &
1522                                        ~HAMMER_BUFMASK64;
1523
1524                         /*
1525                          * on-media truncation is cached in the inode until
1526                          * the inode is synchronized.
1527                          */
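			/*
			 * Illustration (not in the original code): repeated
			 * truncations before a sync only ratchet trunc_off
			 * downwards, e.g. truncating to 8192 and then 4096
			 * leaves trunc_off at 4096, while a larger offset is
			 * ignored because the cached (lower) truncation
			 * already covers it -- the truncate1/2/3 cases below.
			 */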
1528                         if (truncating) {
1529                                 hammer_ip_frontend_trunc(ip, vap->va_size);
1530                                 hammer_update_rsv_databufs(ip);
1531 #ifdef DEBUG_TRUNCATE
1532                                 if (HammerTruncIp == NULL)
1533                                         HammerTruncIp = ip;
1534 #endif
1535                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1536                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1537                                         ip->trunc_off = vap->va_size;
1538 #ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate1 %016llx\n",
							ip->trunc_off);
1541 #endif
1542                                 } else if (ip->trunc_off > vap->va_size) {
1543                                         ip->trunc_off = vap->va_size;
1544 #ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate2 %016llx\n",
							ip->trunc_off);
1547 #endif
1548                                 } else {
1549 #ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
						kprintf("truncate3 %016llx (ignored)\n",
							vap->va_size);
1552 #endif
1553                                 }
1554                         }
1555
1556                         /*
1557                          * If truncating we have to clean out a portion of
1558                          * the last block on-disk.  We do this in the
1559                          * front-end buffer cache.
1560                          */
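			/*
			 * Worked example (assuming 16K HAMMER buffers):
			 * truncating to va_size 10000 gives aligned_size
			 * 16384 above; after backing aligned_size up by one
			 * buffer we bread() the block at offset 0 and
			 * bzero() bytes [10000, 16384) of it.
			 */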
1561                         if (truncating && vap->va_size < aligned_size) {
1562                                 struct buf *bp;
1563                                 int offset;
1564
1565                                 aligned_size -= HAMMER_BUFSIZE;
1566
1567                                 offset = vap->va_size & HAMMER_BUFMASK;
1568                                 error = bread(ap->a_vp, aligned_size,
1569                                               HAMMER_BUFSIZE, &bp);
1570                                 hammer_ip_frontend_trunc(ip, aligned_size);
1571                                 if (error == 0) {
1572                                         bzero(bp->b_data + offset,
1573                                               HAMMER_BUFSIZE - offset);
1574                                         bdwrite(bp);
1575                                 } else {
					kprintf("hammer: setattr bread error %d\n",
						error);
1577                                         brelse(bp);
1578                                 }
1579                         }
1580                         break;
1581                 case VDATABASE:
1582                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1583                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1584                                 ip->trunc_off = vap->va_size;
1585                         } else if (ip->trunc_off > vap->va_size) {
1586                                 ip->trunc_off = vap->va_size;
1587                         }
1588                         hammer_ip_frontend_trunc(ip, vap->va_size);
1589                         ip->ino_data.size = vap->va_size;
1590                         modflags |= HAMMER_INODE_DDIRTY;
1591                         break;
1592                 default:
1593                         error = EINVAL;
1594                         goto done;
1595                 }
1596                 break;
1597         }
1598         if (vap->va_atime.tv_sec != VNOVAL) {
1599                 ip->ino_leaf.atime =
1600                         hammer_timespec_to_transid(&vap->va_atime);
1601                 modflags |= HAMMER_INODE_ITIMES;
1602         }
1603         if (vap->va_mtime.tv_sec != VNOVAL) {
1604                 ip->ino_data.mtime =
1605                         hammer_timespec_to_transid(&vap->va_mtime);
1606                 modflags |= HAMMER_INODE_ITIMES;
1607                 modflags |= HAMMER_INODE_DDIRTY;        /* XXX mtime */
1608         }
1609         if (vap->va_mode != (mode_t)VNOVAL) {
1610                 mode_t   cur_mode = ip->ino_data.mode;
1611                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1612                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1613
1614                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1615                                          cur_uid, cur_gid, &cur_mode);
1616                 if (error == 0 && ip->ino_data.mode != cur_mode) {
1617                         ip->ino_data.mode = cur_mode;
1618                         modflags |= HAMMER_INODE_DDIRTY;
1619                 }
1620         }
1621 done:
1622         if (error == 0)
1623                 hammer_modify_inode(ip, modflags);
1624         hammer_done_transaction(&trans);
1625         return (error);
1626 }
1627
1628 /*
1629  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1630  */
1631 static
1632 int
1633 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1634 {
1635         struct hammer_transaction trans;
1636         struct hammer_inode *dip;
1637         struct hammer_inode *nip;
1638         struct nchandle *nch;
1639         hammer_record_t record;
1640         int error;
1641         int bytes;
1642
1643         ap->a_vap->va_type = VLNK;
1644
1645         nch = ap->a_nch;
1646         dip = VTOI(ap->a_dvp);
1647
1648         if (dip->flags & HAMMER_INODE_RO)
1649                 return (EROFS);
1650         if ((error = hammer_checkspace(dip->hmp)) != 0)
1651                 return (error);
1652
1653         /*
1654          * Create a transaction to cover the operations we perform.
1655          */
1656         hammer_start_transaction(&trans, dip->hmp);
1657
1658         /*
1659          * Create a new filesystem object of the requested type.  The
1660          * returned inode will be referenced but not locked.
1661          */
1662
1663         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
1664         if (error) {
1665                 hammer_done_transaction(&trans);
1666                 *ap->a_vpp = NULL;
1667                 return (error);
1668         }
1669
	/*
	 * Add a record representing the symlink.  The symlink is stored
	 * as pure data, not as a string, and is not \0-terminated.
	 */
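	/*
	 * For illustration: a target short enough to fit in the inline
	 * ino_data.ext.symlink area (HAMMER_INODE_BASESYMLEN bytes) is
	 * copied directly into the inode; anything longer is stored as a
	 * HAMMER_RECTYPE_FIX record keyed by HAMMER_FIXKEY_SYMLINK, as
	 * set up below.
	 */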
1674         if (error == 0) {
1675                 bytes = strlen(ap->a_target);
1676
1677                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1678                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1679                 } else {
1680                         record = hammer_alloc_mem_record(nip, bytes);
1681                         record->type = HAMMER_MEM_RECORD_GENERAL;
1682
1683                         record->leaf.base.localization = HAMMER_LOCALIZE_MISC;
1684                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1685                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1686                         record->leaf.data_len = bytes;
1687                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1688                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1689                         error = hammer_ip_add_record(&trans, record);
1690                 }
1691
1692                 /*
1693                  * Set the file size to the length of the link.
1694                  */
1695                 if (error == 0) {
1696                         nip->ino_data.size = bytes;
1697                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1698                 }
1699         }
1700         if (error == 0)
1701                 error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1702
1703         /*
1704          * Finish up.
1705          */
1706         if (error) {
1707                 hammer_rel_inode(nip, 0);
1708                 *ap->a_vpp = NULL;
1709         } else {
1710                 error = hammer_get_vnode(nip, ap->a_vpp);
1711                 hammer_rel_inode(nip, 0);
1712                 if (error == 0) {
1713                         cache_setunresolved(ap->a_nch);
1714                         cache_setvp(ap->a_nch, *ap->a_vpp);
1715                 }
1716         }
1717         hammer_done_transaction(&trans);
1718         return (error);
1719 }
1720
1721 /*
1722  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1723  */
1724 static
1725 int
1726 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1727 {
1728         struct hammer_transaction trans;
1729         struct hammer_inode *dip;
1730         int error;
1731
1732         dip = VTOI(ap->a_dvp);
1733
1734         if (hammer_nohistory(dip) == 0 &&
1735             (error = hammer_checkspace(dip->hmp)) != 0) {
1736                 return (error);
1737         }
1738
1739         hammer_start_transaction(&trans, dip->hmp);
1740         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1741                                 ap->a_cred, ap->a_flags);
1742         hammer_done_transaction(&trans);
1743
1744         return (error);
1745 }
1746
1747 /*
1748  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1749  */
1750 static
1751 int
1752 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1753 {
1754         struct hammer_inode *ip = ap->a_vp->v_data;
1755
1756         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1757                             ap->a_fflag, ap->a_cred));
1758 }
1759
1760 static
1761 int
1762 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1763 {
1764         struct mount *mp;
1765         int error;
1766
1767         mp = ap->a_head.a_ops->head.vv_mount;
1768
1769         switch(ap->a_op) {
1770         case MOUNTCTL_SET_EXPORT:
		if (ap->a_ctllen != sizeof(struct export_args))
			error = EINVAL;
		else
			error = hammer_vfs_export(mp, ap->a_op,
				      (const struct export_args *)ap->a_ctl);
1775                 break;
1776         default:
1777                 error = journal_mountctl(ap);
1778                 break;
1779         }
1780         return(error);
1781 }
1782
1783 /*
1784  * hammer_vop_strategy { vp, bio }
1785  *
1786  * Strategy call, used for regular file read & write only.  Note that the
1787  * bp may represent a cluster.
1788  *
1789  * To simplify operation and allow better optimizations in the future,
1790  * this code does not make any assumptions with regards to buffer alignment
1791  * or size.
1792  */
1793 static
1794 int
1795 hammer_vop_strategy(struct vop_strategy_args *ap)
1796 {
1797         struct buf *bp;
1798         int error;
1799
1800         bp = ap->a_bio->bio_buf;
1801
1802         switch(bp->b_cmd) {
1803         case BUF_CMD_READ:
1804                 error = hammer_vop_strategy_read(ap);
1805                 break;
1806         case BUF_CMD_WRITE:
1807                 error = hammer_vop_strategy_write(ap);
1808                 break;
1809         default:
1810                 bp->b_error = error = EINVAL;
1811                 bp->b_flags |= B_ERROR;
1812                 biodone(ap->a_bio);
1813                 break;
1814         }
1815         return (error);
1816 }
1817
1818 /*
1819  * Read from a regular file.  Iterate the related records and fill in the
1820  * BIO/BUF.  Gaps are zero-filled.
1821  *
1822  * The support code in hammer_object.c should be used to deal with mixed
1823  * in-memory and on-disk records.
1824  *
1825  * XXX atime update
1826  */
1827 static
1828 int
1829 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1830 {
1831         struct hammer_transaction trans;
1832         struct hammer_inode *ip;
1833         struct hammer_cursor cursor;
1834         hammer_base_elm_t base;
1835         struct bio *bio;
1836         struct buf *bp;
1837         int64_t rec_offset;
1838         int64_t ran_end;
1839         int64_t tmp64;
1840         int error;
1841         int boff;
1842         int roff;
1843         int n;
1844
1845         bio = ap->a_bio;
1846         bp = bio->bio_buf;
1847         ip = ap->a_vp->v_data;
1848
1849         hammer_simple_transaction(&trans, ip->hmp);
1850         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1851
	/*
	 * Key range (begin and end inclusive) to scan.  Note that the keys
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
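	/*
	 * e.g. a 16K data record covering file offsets [16384, 32768)
	 * is keyed at 32768 (BASE+LEN), so a scan starting at key
	 * bio_offset + 1 = 16385 locates it as the first record.
	 */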
1857         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
1858         cursor.key_beg.obj_id = ip->obj_id;
1859         cursor.key_beg.create_tid = 0;
1860         cursor.key_beg.delete_tid = 0;
1861         cursor.key_beg.obj_type = 0;
1862         cursor.key_beg.key = bio->bio_offset + 1;
1863         cursor.asof = ip->obj_asof;
1864         cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
1865
1866         cursor.key_end = cursor.key_beg;
1867         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
1868 #if 0
1869         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1870                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1871                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1872                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1873         } else
1874 #endif
1875         {
1876                 ran_end = bio->bio_offset + bp->b_bufsize;
1877                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1878                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1879                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1880                 if (tmp64 < ran_end)
1881                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1882                 else
1883                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1884         }
1885         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1886
1887         error = hammer_ip_first(&cursor);
1888         boff = 0;
1889
1890         while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 */
1895                 base = &cursor.leaf->base;
1896                 rec_offset = base->key - cursor.leaf->data_len;
1897
		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record versus our
		 * current seek offset in the bio.
		 */
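		/*
		 * e.g. if the record begins at file offset 8192 but our
		 * cursor (bio_offset + boff) is at 4096, n is 4096 and
		 * that part of the buffer is zero-filled (a hole).
		 */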
1904                 n = (int)(rec_offset - (bio->bio_offset + boff));
1905                 if (n > 0) {
1906                         if (n > bp->b_bufsize - boff)
1907                                 n = bp->b_bufsize - boff;
1908                         bzero((char *)bp->b_data + boff, n);
1909                         boff += n;
1910                         n = 0;
1911                 }
1912
1913                 /*
1914                  * Calculate the data offset in the record and the number
1915                  * of bytes we can copy.
1916                  *
1917                  * There are two degenerate cases.  First, boff may already
1918                  * be at bp->b_bufsize.  Secondly, the data offset within
1919                  * the record may exceed the record's size.
1920                  */
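		/*
		 * e.g. if the record starts 2048 bytes before the cursor,
		 * n is -2048 above, so roff becomes 2048 and the copy
		 * begins that far into the record's data.
		 */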
1921                 roff = -n;
1922                 rec_offset += roff;
1923                 n = cursor.leaf->data_len - roff;
1924                 if (n <= 0) {
1925                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
1926                         n = 0;
1927                 } else if (n > bp->b_bufsize - boff) {
1928                         n = bp->b_bufsize - boff;
1929                 }
1930
1931                 /*
1932                  * Deal with cached truncations.  This cool bit of code
1933                  * allows truncate()/ftruncate() to avoid having to sync
1934                  * the file.
1935                  *
1936                  * If the frontend is truncated then all backend records are
1937                  * subject to the frontend's truncation.
1938                  *
1939                  * If the backend is truncated then backend records on-disk
1940                  * (but not in-memory) are subject to the backend's
1941                  * truncation.  In-memory records owned by the backend
1942                  * represent data written after the truncation point on the
1943                  * backend and must not be truncated.
1944                  *
1945                  * Truncate operations deal with frontend buffer cache
1946                  * buffers and frontend-owned in-memory records synchronously.
1947                  */
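		/*
		 * Illustration: with a cached trunc_off of 8192, a record
		 * covering [4096, 12288) is clipped to n = 4096, while a
		 * record at or beyond offset 8192 contributes nothing
		 * (n = 0).
		 */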
1948                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
1949                         if (hammer_cursor_ondisk(&cursor) ||
1950                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
1951                                 if (ip->trunc_off <= rec_offset)
1952                                         n = 0;
1953                                 else if (ip->trunc_off < rec_offset + n)
1954                                         n = (int)(ip->trunc_off - rec_offset);
1955                         }
1956                 }
1957                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
1958                         if (hammer_cursor_ondisk(&cursor)) {
1959                                 if (ip->sync_trunc_off <= rec_offset)
1960                                         n = 0;
1961                                 else if (ip->sync_trunc_off < rec_offset + n)
1962                                         n = (int)(ip->sync_trunc_off - rec_offset);
1963                         }
1964                 }
1965
		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * WARNING: If we hit the else clause the data must be
		 * copied out of a hammer_buffer, which is considerably
		 * slower than the direct-read path.
		 */
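		/*
		 * e.g. a buffer-aligned 16K request with roff 0, boff 0,
		 * and n == bp->b_bufsize qualifies for the direct read.
		 */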
1973                 if (roff == 0 && boff == 0 && n == bp->b_bufsize &&
1974                     (rec_offset & HAMMER_BUFMASK) == 0) {
1975                         error = hammer_io_direct_read(trans.hmp, cursor.leaf,
1976                                                       bio);
1977                         goto done;
1978                 } else if (n) {
1979                         error = hammer_ip_resolve_data(&cursor);
1980                         if (error == 0) {
1981                                 bcopy((char *)cursor.data + roff,
1982                                       (char *)bp->b_data + boff, n);
1983                         }
1984                 }
1985                 if (error)
1986                         break;
1987
1988                 /*
1989                  * Iterate until we have filled the request.
1990                  */
1991                 boff += n;
1992                 if (boff == bp->b_bufsize)
1993                         break;
1994                 error = hammer_ip_next(&cursor);
1995         }
1996
	/*
	 * There may have been a gap after the last record.
	 */
2000         if (error == ENOENT)
2001                 error = 0;
2002         if (error == 0 && boff != bp->b_bufsize) {
2003                 KKASSERT(boff < bp->b_bufsize);
2004                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2005                 /* boff = bp->b_bufsize; */
2006         }
2007         bp->b_resid = 0;
2008         bp->b_error = error;
2009         if (error)
2010                 bp->b_flags |= B_ERROR;
2011         biodone(ap->a_bio);
2012
2013 done:
2014         if (cursor.node)
2015                 hammer_cache_node(cursor.node, &ip->cache[1]);
2016         hammer_done_cursor(&cursor);
2017         hammer_done_transaction(&trans);
2018         return(error);
2019 }
2020
2021 /*
2022  * Write to a regular file.   Because this is a strategy call the OS is
2023  * trying to actually sync data to the media.   HAMMER can only flush
2024  * the entire inode (so the TID remains properly synchronized).
2025  *
2026  * Basically all we do here is place the bio on the inode's flush queue
2027  * and activate the flusher.
2028  */
2029 static
2030 int
2031 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2032 {
2033         hammer_record_t record;
2034         hammer_inode_t ip;
2035         struct bio *bio;
2036         struct buf *bp;
2037         int bytes;
2038         int error;
2039
2040         bio = ap->a_bio;
2041         bp = bio->bio_buf;
2042         ip = ap->a_vp->v_data;
2043
2044         if (ip->flags & HAMMER_INODE_RO) {
2045                 bp->b_error = EROFS;
2046                 bp->b_flags |= B_ERROR;
2047                 biodone(ap->a_bio);
2048                 hammer_cleanup_write_io(ip);
2049                 return(EROFS);
2050         }
2051
2052         /*
2053          * Interlock with inode destruction (no in-kernel or directory
2054          * topology visibility).  If we queue new IO while trying to
2055          * destroy the inode we can deadlock the vtrunc call in
2056          * hammer_inode_unloadable_check().
2057          */
2058         if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2059                 bp->b_resid = 0;
2060                 biodone(ap->a_bio);
2061                 hammer_cleanup_write_io(ip);
2062                 return(0);
2063         }
2064
	/*
	 * Attempt to reserve space and issue a direct-write from the
	 * front-end.  If we can't, we queue the BIO to the flusher.
	 * The bulk/direct-write code will still bcopy when writing less
	 * than full-sized blocks (at the end of a file).
	 *
	 * If we can, the I/O is issued directly and an in-memory record
	 * is installed to reference the storage until the flusher can
	 * get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 */
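	/*
	 * Illustration: with ino_data.size 20000 and bio_offset 16384,
	 * bytes is clipped to 3616 below and the bulk code bcopy's the
	 * partial final block.
	 */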
2078         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2079         KKASSERT(bio->bio_offset < ip->ino_data.size);
2080         if (bio->bio_offset + bp->b_bufsize <= ip->ino_data.size)
2081                 bytes = bp->b_bufsize;
2082         else
2083                 bytes = (int)(ip->ino_data.size - bio->bio_offset);
2084
2085         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2086                                     bytes, &error);
2087         if (record) {
2088                 hammer_io_direct_write(ip->hmp, &record->leaf, bio);
2089                 hammer_rel_mem_record(record);
2090                 if (ip->rsv_recs > hammer_limit_irecs / 2)
2091                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2092                 else
2093                         hammer_flush_inode(ip, 0);
2094         } else {
2095                 bp->b_error = error;
2096                 bp->b_flags |= B_ERROR;
2097                 biodone(ap->a_bio);
2098         }
2099         hammer_cleanup_write_io(ip);
2100         return(error);
2101 }
2102
/*
 * Clean-up after disposing of a dirty frontend buffer's data.
 * This is somewhat heuristic, so try to be robust.
 */
2107 static void
2108 hammer_cleanup_write_io(hammer_inode_t ip)
2109 {
2110         if (ip->rsv_databufs) {
2111                 --ip->rsv_databufs;
2112                 --ip->hmp->rsv_databufs;
2113         }
2114 }
2115
/*
 * We can lose track of dirty buffer cache buffers if we truncate; this
 * routine resynchronizes the count.
 */
2120 static
2121 void
2122 hammer_update_rsv_databufs(hammer_inode_t ip)
2123 {
2124         struct buf *bp;
2125         int delta;
2126         int n;
2127
2128         if (ip->vp) {
2129                 n = 0;
2130                 RB_FOREACH(bp, buf_rb_tree, &ip->vp->v_rbdirty_tree) {
2131                         ++n;
2132                 }
2133         } else {
2134                 n = 0;
2135         }
2136         delta = n - ip->rsv_databufs;
2137         ip->rsv_databufs += delta;
2138         ip->hmp->rsv_databufs += delta;
2139 }
2140
2141 /*
2142  * dounlink - disconnect a directory entry
2143  *
2144  * XXX whiteout support not really in yet
2145  */
2146 static int
2147 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2148                 struct vnode *dvp, struct ucred *cred, int flags)
2149 {
2150         struct namecache *ncp;
2151         hammer_inode_t dip;
2152         hammer_inode_t ip;
2153         struct hammer_cursor cursor;
2154         int64_t namekey;
2155         int nlen, error;
2156
	/*
	 * Calculate the namekey and set up the key range for the scan.
	 * This works kinda like a chained hash table where the lower
	 * 32 bits of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
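	/*
	 * Illustration: hammer_directory_namekey() places a hash of the
	 * name in the upper 32 bits and leaves the lower 32 bits zero,
	 * so OR'ing 0xFFFFFFFF into key_end below spans every directory
	 * entry chained under the same hash.
	 */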
2164         dip = VTOI(dvp);
2165         ncp = nch->ncp;
2166
2167         if (dip->flags & HAMMER_INODE_RO)
2168                 return (EROFS);
2169
2170         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2171 retry:
2172         hammer_init_cursor(trans, &cursor, &dip->cache[0], dip);
2173         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
2174         cursor.key_beg.obj_id = dip->obj_id;
2175         cursor.key_beg.key = namekey;
2176         cursor.key_beg.create_tid = 0;
2177         cursor.key_beg.delete_tid = 0;
2178         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2179         cursor.key_beg.obj_type = 0;
2180
2181         cursor.key_end = cursor.key_beg;
2182         cursor.key_end.key |= 0xFFFFFFFFULL;
2183         cursor.asof = dip->obj_asof;
2184         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2185
	/*
	 * Scan all matching records (the chain) and locate the one
	 * matching the requested path component.  The error on search
	 * termination may be 0, ENOENT, or something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
2195         error = hammer_ip_first(&cursor);
2196
2197         while (error == 0) {
2198                 error = hammer_ip_resolve_data(&cursor);
2199                 if (error)
2200                         break;
2201                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2202                 KKASSERT(nlen > 0);
2203                 if (ncp->nc_nlen == nlen &&
2204                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2205                         break;
2206                 }
2207                 error = hammer_ip_next(&cursor);
2208         }
2209
2210         /*
2211          * If all is ok we have to get the inode so we can adjust nlinks.
2212          *
2213          * If the target is a directory, it must be empty.
2214          */
2215         if (error == 0) {
2216                 ip = hammer_get_inode(trans, &dip->cache[1],
2217                                       cursor.data->entry.obj_id,
2218                                       dip->hmp->asof, 0, &error);
2219                 if (error == ENOENT) {
2220                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2221                         Debugger("ENOENT unlinking object that should exist");
2222                 }
2223
2224                 /*
2225                  * If we are trying to remove a directory the directory must
2226                  * be empty.
2227                  *
2228                  * WARNING: hammer_ip_check_directory_empty() may have to
2229                  * terminate the cursor to avoid a deadlock.  It is ok to
2230                  * call hammer_done_cursor() twice.
2231                  */
2232                 if (error == 0 && ip->ino_data.obj_type ==
2233                                   HAMMER_OBJTYPE_DIRECTORY) {
2234                         error = hammer_ip_check_directory_empty(trans, ip);
2235                 }
2236
2237                 /*
2238                  * Delete the directory entry.
2239                  *
2240                  * WARNING: hammer_ip_del_directory() may have to terminate
2241                  * the cursor to avoid a deadlock.  It is ok to call
2242                  * hammer_done_cursor() twice.
2243                  */
2244                 if (error == 0) {
2245                         error = hammer_ip_del_directory(trans, &cursor,
2246                                                         dip, ip);
2247                 }
2248                 if (error == 0) {
2249                         cache_setunresolved(nch);
2250                         cache_setvp(nch, NULL);
2251                         /* XXX locking */
2252                         if (ip->vp)
2253                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2254                 }
2255                 hammer_rel_inode(ip, 0);
2256         }
2257         hammer_done_cursor(&cursor);
2258         if (error == EDEADLK)
2259                 goto retry;
2260
2261         return (error);
2262 }
2263
2264 /************************************************************************
2265  *                          FIFO AND SPECFS OPS                         *
2266  ************************************************************************
2267  *
2268  */
2269
2270 static int
2271 hammer_vop_fifoclose (struct vop_close_args *ap)
2272 {
2273         /* XXX update itimes */
2274         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2275 }
2276
2277 static int
2278 hammer_vop_fiforead (struct vop_read_args *ap)
2279 {
2280         int error;
2281
2282         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2283         /* XXX update access time */
2284         return (error);
2285 }
2286
2287 static int
2288 hammer_vop_fifowrite (struct vop_write_args *ap)
2289 {
2290         int error;
2291
2292         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2293         /* XXX update access time */
2294         return (error);
2295 }
2296
2297 static int
2298 hammer_vop_specclose (struct vop_close_args *ap)
2299 {
2300         /* XXX update itimes */
2301         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2302 }
2303
2304 static int
2305 hammer_vop_specread (struct vop_read_args *ap)
2306 {
2307         /* XXX update access time */
2308         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2309 }
2310
2311 static int
2312 hammer_vop_specwrite (struct vop_write_args *ap)
2313 {
2314         /* XXX update last change time */
2315         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2316 }
2317