/*
 * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.61 2008/06/09 04:19:10 dillon Exp $
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/namecache.h>
#include <sys/vnode.h>
#include <sys/lockf.h>
#include <sys/event.h>
#include <sys/stat.h>
#include <sys/dirent.h>
#include <vm/vm_extern.h>
#include <vfs/fifofs/fifo.h>
#include "hammer.h"

/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_pathconf(struct vop_pathconf_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);

static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);

static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);

struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         hammer_vop_pathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_setattr =          hammer_vop_setattr,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl
};

struct vop_ops hammer_spec_vops = {
        .vop_default =          spec_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_specread,
        .vop_write =            hammer_vop_specwrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_specclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};

struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};
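
/*
 * The spec and fifo tables delegate unlisted operations to the
 * spec/fifo layers via .vop_default and override only the vops that
 * must see HAMMER inode state (fsync, access, getattr, setattr,
 * inactive, reclaim); their read/write/close entries are thin
 * wrappers defined later in this file.
 */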

#ifdef DEBUG_TRUNCATE
struct hammer_inode *HammerTruncIp;
#endif

static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred, int flags);
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
static void hammer_cleanup_write_io(hammer_inode_t ip);
static void hammer_update_rsv_databufs(hammer_inode_t ip);

#if 0
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif

/*
 * hammer_vop_fsync { vp, waitfor }
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

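        /*
         * The flush happens in two parts: hammer_flush_inode() queues
         * the inode's in-memory records to the backend flusher (the
         * HAMMER_FLUSH_SIGNAL flag wakes it immediately), while
         * vfsync() writes out the dirty buffer cache buffers.  For
         * MNT_WAIT the caller then blocks until the flusher has
         * disposed of the inode.
         */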
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
        if (ap->a_waitfor == MNT_WAIT)
                hammer_wait_inode(ip);
        return (ip->error);
}

/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        off_t offset;
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;
        int seqcount;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
        seqcount = ap->a_ioflag >> 16;

        hammer_start_transaction(&trans, ip->hmp);

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
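        /*
         * Per-iteration arithmetic: 'offset' is the byte offset within
         * the HAMMER_BUFSIZE-aligned buffer, so (uio_offset - offset)
         * is the aligned buffer address handed to bread().  The copy
         * length n is clamped against the buffer remainder, the uio
         * residual, and the distance to EOF.  E.g. with 16K buffers a
         * read at file offset 0x4100 breads the buffer at 0x4000 and
         * copies starting 0x100 bytes into it.
         */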
        uio = ap->a_uio;
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                offset = uio->uio_offset & HAMMER_BUFMASK;
#if 0
                error = cluster_read(ap->a_vp, ip->ino_data.size,
                                     uio->uio_offset - offset, HAMMER_BUFSIZE,
                                     MAXBSIZE, seqcount, &bp);
#endif
                error = bread(ap->a_vp, uio->uio_offset - offset,
                              HAMMER_BUFSIZE, &bp);
                if (error) {
                        brelse(bp);
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                n = HAMMER_BUFSIZE - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);
                if (error) {
                        bqrelse(bp);
                        break;
                }
                bqrelse(bp);
        }
        if ((ip->flags & HAMMER_INODE_RO) == 0 &&
            (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                ip->ino_leaf.atime = trans.time;
                hammer_modify_inode(ip, HAMMER_INODE_ITIMES);
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        struct uio *uio;
        int rel_offset;
        off_t base_offset;
        struct buf *bp;
        int error;
        int n;
        int flags;
        int count;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, ip->hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1
         */
        if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }

        /*
         * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
         */
        count = 0;
        while (uio->uio_resid > 0) {
                int fixsize = 0;

                if ((error = hammer_checkspace(trans.hmp)) != 0)
                        break;

                /*
                 * Do not allow HAMMER to blow out the buffer cache.
                 *
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are decoupled
                 * from the buffer cache.
                 *
                 * Always check at the beginning so separate writes are
                 * not able to bypass this code.
                 *
                 * WARNING: Cannot unlock vp when doing a NOCOPY write as
                 * part of a putpages operation.  Doing so could cause us
                 * to deadlock against the VM system when we try to re-lock.
                 */
                if ((count++ & 15) == 0) {
                        if (uio->uio_segflg != UIO_NOCOPY) {
                                vn_unlock(ap->a_vp);
                                if ((ap->a_ioflag & IO_NOBWILL) == 0)
                                        bwillwrite();
                        }
                        if (ip->rsv_recs > hammer_limit_irecs) {
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                                hammer_wait_inode(ip);
                        }
                        if (uio->uio_segflg != UIO_NOCOPY)
                                vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY);
                }

                rel_offset = (int)(uio->uio_offset & HAMMER_BUFMASK);
                base_offset = uio->uio_offset & ~HAMMER_BUFMASK64;
                n = HAMMER_BUFSIZE - rel_offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (uio->uio_offset + n > ip->ino_data.size) {
                        vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
                        fixsize = 1;
                }
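
                /*
                 * Acquire the buffer, using one of four strategies
                 * chosen below: NOCOPY rewrites instantiate the buffer
                 * and read in any missing pieces; full-buffer
                 * overwrites and buffers entirely beyond EOF skip the
                 * read and at most zero the buffer; partial overwrites
                 * must read-before-write.
                 */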

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              HAMMER_BUFSIZE, &bp);
                        }
                } else if (rel_offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset,
                                      HAMMER_BUFSIZE, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        error = uiomove((char *)bp->b_data + rel_offset,
                                        n, uio);
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                vtruncbuf(ap->a_vp, ip->ino_data.size,
                                          HAMMER_BUFSIZE);
                        }
                        break;
                }
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_DDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
                flags |= HAMMER_INODE_DDIRTY;   /* XXX mtime */
                hammer_modify_inode(ip, flags);

                /*
                 * Try to keep track of cached dirty data.
                 */
                if ((bp->b_flags & B_DIRTY) == 0) {
                        ++ip->rsv_databufs;
                        ++ip->hmp->rsv_databufs;
                }

                /*
                 * Final buffer disposition.
                 */
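                /*
                 * bwrite() writes the buffer synchronously, bawrite()
                 * kicks off an asynchronous write immediately, and
                 * bdwrite() just marks the buffer dirty for a later
                 * flush.  The sequential-write heuristic below uses
                 * bawrite() to keep freshly completed buffers moving
                 * to the media.
                 */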
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
#if 1
                } else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
                           (uio->uio_offset & HAMMER_BUFMASK) == 0) {
                        /*
                         * If seqcount indicates sequential operation and
                         * we just finished filling a buffer, push it out
                         * now to prevent the buffer cache from becoming
                         * too full, which would trigger non-optimal
                         * flushes.
                         */
                        bawrite(bp);
#endif
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_access { vp, mode, cred }
 */
static
int
hammer_vop_access(struct vop_access_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        uid = hammer_to_unix_xid(&ip->ino_data.uid);
        gid = hammer_to_unix_xid(&ip->ino_data.gid);

        error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
                                  ip->ino_data.uflags);
        return (error);
}

/*
 * hammer_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer_vop_advlock(struct vop_advlock_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);

        return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
}

/*
 * hammer_vop_close { vp, fflag }
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
        return (vop_stdclose(ap));
}

/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */

        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        return (error);
}

/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

#if 0
        if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
            (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
            ip->obj_asof == XXX
        ) {
                /* LAZYMOD XXX */
        }
        hammer_itimes(ap->a_vp);
#endif

        vap->va_fsid = ip->hmp->fsid_udev;
        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;
        if (ip->flags & HAMMER_INODE_RO)
                hammer_to_timespec(ip->ino_data.mtime, &vap->va_atime);
        else
                hammer_to_timespec(ip->ino_leaf.atime, &vap->va_atime);
        hammer_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
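        /*
         * va_bytes: the logical file size rounded up to a 64 byte
         * boundary, presumably reflecting HAMMER's minimum data
         * record alignment.
         */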
        vap->va_bytes = (ip->ino_data.size + 63) & ~63;
        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        /* mtime uniquely identifies any adjustments made to the file */
        vap->va_fsmid = ip->ino_data.mtime;
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }

        return(0);
}

/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *ncp;
        hammer_inode_t dip;
        hammer_inode_t ip;
        hammer_tid_t asof;
        struct hammer_cursor cursor;
        struct vnode *vp;
        int64_t namekey;
        int error;
        int i;
        int nlen;
        int flags;
        u_int64_t obj_id;

        /*
         * Misc initialization, plus handle as-of name extensions.  Look for
         * the '@@' extension.  Note that as-of files and directories cannot
         * be modified.
         */
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        asof = dip->obj_asof;
        nlen = ncp->nc_nlen;
        flags = dip->flags;

        hammer_simple_transaction(&trans, dip->hmp);

        for (i = 0; i < nlen; ++i) {
                if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
                        asof = hammer_str_to_tid(ncp->nc_name + i + 2);
                        flags |= HAMMER_INODE_RO;
                        break;
                }
        }
        nlen = i;
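        /*
         * nlen now covers only the real path component.  An "@@" as-of
         * extension (e.g. "foo@@0x10af", with the transaction id parsed
         * by hammer_str_to_tid()) selects a historical view and forces
         * the result read-only, since as-of files and directories
         * cannot be modified.
         */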

        /*
         * If there is no path component the time extension is relative to
         * dip.
         */
        if (nlen == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id,
                                      asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
                goto done;
        }

        /*
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
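        /*
         * Concretely: the hashed name occupies the upper bits of the
         * 64 bit directory key and colliding entries are disambiguated
         * in the low 32 bits, which is why key_end below extends
         * key_beg by 0xFFFFFFFF and the scan still compares the actual
         * names.
         */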
        namekey = hammer_directory_namekey(ncp->nc_name, nlen);

        error = hammer_init_cursor(&trans, &cursor, &dip->cache[0], dip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = dip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
        cursor.asof = asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        obj_id = 0;

        if (error == 0) {
                error = hammer_ip_first(&cursor);
                while (error == 0) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error)
                                break;
                        if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
                            bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                                obj_id = cursor.data->entry.obj_id;
                                break;
                        }
                        error = hammer_ip_next(&cursor);
                }
        }
        hammer_done_cursor(&cursor);
        if (error == 0) {
                ip = hammer_get_inode(&trans, &dip->cache[1],
                                      obj_id, asof, flags, &error);
                if (error == 0) {
                        error = hammer_get_vnode(ip, &vp);
                        hammer_rel_inode(ip, 0);
                } else {
                        vp = NULL;
                }
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                        vrele(vp);
                }
        } else if (error == ENOENT) {
                cache_setvp(ap->a_nch, NULL);
        }
done:
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof than the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        int64_t parent_obj_id;
        hammer_tid_t asof;
        int error;

        dip = VTOI(ap->a_dvp);
        asof = dip->obj_asof;
        parent_obj_id = dip->ino_data.parent_obj_id;

        if (parent_obj_id == 0) {
                if (dip->obj_id == HAMMER_OBJID_ROOT &&
                   asof != dip->hmp->asof) {
                        parent_obj_id = dip->obj_id;
                        asof = dip->hmp->asof;
                        *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
                        ksnprintf(*ap->a_fakename, 19, "0x%016llx",
                                   dip->obj_asof);
                } else {
                        *ap->a_vpp = NULL;
                        return ENOENT;
                }
        }

        hammer_simple_transaction(&trans, dip->hmp);

        ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id,
                              asof, dip->flags, &error);
        if (ip) {
                error = hammer_get_vnode(ip, ap->a_vpp);
                hammer_rel_inode(ip, 0);
        } else {
                *ap->a_vpp = NULL;
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nlink { nch, dvp, vp, cred }
 */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *ip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);
        ip = VTOI(ap->a_vp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Add the filesystem object to the directory.  Note that neither
         * dip nor ip are referenced or locked, but their vnodes are
         * referenced.  This function will bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);

        /*
         * Finish up.
         */
        if (error == 0) {
                cache_setunresolved(nch);
                cache_setvp(nch, ap->a_vp);
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hkprintf("hammer_mkdir error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }
        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
        if (error)
                hkprintf("hammer_mkdir (add) error %d\n", error);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        struct hammer_inode *nip;
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
        if (error) {
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);

        /*
         * Finish up.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_open { vp, mode, cred, fp }
 */
static
int
hammer_vop_open(struct vop_open_args *ap)
{
        hammer_inode_t ip;

        ip = VTOI(ap->a_vp);

        if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
                return (EROFS);

        /*
         * It is possible for the backend to get behind disposing
         * of modified inodes.  Do not let the number increase into
         * infinity.
         *
         * This bit of code can go almost anywhere but it is best
         * to put it somewhere where it will not hang up other
         * processes.  The inode create/release paths typically hold
         * other locks, like directory vnode locks or namecache locks.
         */
        if (ip->hmp->inode_reclaims > HAMMER_RECLAIM_MIN)
                hammer_inode_waitreclaims(ip->hmp);

        return(vop_stdopen(ap));
}

/*
 * hammer_vop_pathconf { vp, name, retval }
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
        return EOPNOTSUPP;
}

/*
 * hammer_vop_print { vp }
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
        return EOPNOTSUPP;
}

/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        struct uio *uio;
        hammer_base_elm_t base;
        int error;
        int cookie_index;
        int ncookies;
        off_t *cookies;
        off_t saveoff;
        int r;

        ip = VTOI(ap->a_vp);
        uio = ap->a_uio;
        saveoff = uio->uio_offset;

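        /*
         * The cookie count is only an estimate: uio_resid / 16 guesses
         * how many dirents might fit in the caller's buffer (16 bytes
         * being roughly a minimal dirent), capped at 1024 per call.
         */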
        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
                cookie_index = 0;
        } else {
                ncookies = -1;
                cookies = NULL;
                cookie_index = 0;
        }

        hammer_simple_transaction(&trans, ip->hmp);

        /*
         * Handle artificial entries
         */
        error = 0;
        if (saveoff == 0) {
                r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }
        if (saveoff == 1) {
                if (ip->ino_data.parent_obj_id) {
                        r = vop_write_dirent(&error, uio,
                                             ip->ino_data.parent_obj_id,
                                             DT_DIR, 2, "..");
                } else {
                        r = vop_write_dirent(&error, uio,
                                             ip->obj_id, DT_DIR, 2, "..");
                }
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        /*
         * Key range (begin and end inclusive) to scan.  Directory keys
         * directly translate to a 64 bit 'seek' position.
         */
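        /*
         * saveoff doubles as the B-Tree key: offsets 0 and 1 were
         * consumed by the artificial "." and ".." entries above, and
         * each real entry's directory key is handed back as its seek
         * cookie, so a later readdir can resume exactly where this
         * scan stopped.
         */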
        hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = saveoff;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key = HAMMER_MAX_KEY;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        error = hammer_ip_first(&cursor);

        while (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error)
                        break;
                base = &cursor.leaf->base;
                saveoff = base->key;
                KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

                if (base->obj_id != ip->obj_id)
                        panic("readdir: bad record at %p", cursor.node);

                r = vop_write_dirent(
                             &error, uio, cursor.data->entry.obj_id,
                             hammer_get_dtype(cursor.leaf->base.obj_type),
                             cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF,
                             (void *)cursor.data->entry.name);
                if (r)
                        break;
                ++saveoff;
                if (cookies)
                        cookies[cookie_index] = base->key;
                ++cookie_index;
                if (cookie_index == ncookies)
                        break;
                error = hammer_ip_next(&cursor);
        }
        hammer_done_cursor(&cursor);

done:
        hammer_done_transaction(&trans);

        if (ap->a_eofflag)
                *ap->a_eofflag = (error == ENOENT);
        uio->uio_offset = saveoff;
        if (error && cookie_index == 0) {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (error == ENOENT)
                        error = 0;
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return(error);
}

/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_cursor cursor;
        struct hammer_inode *ip;
        int error;

        ip = VTOI(ap->a_vp);

        /*
         * Shortcut if the symlink data was stuffed into ino_data.
         */
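        /*
         * Targets of up to HAMMER_INODE_BASESYMLEN bytes are stored
         * inline in the inode's extended data union, so no B-Tree
         * lookup is needed to resolve them.
         */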
        if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
                error = uiomove(ip->ino_data.ext.symlink,
                                ip->ino_data.size, ap->a_uio);
                return(error);
        }

        /*
         * Long version
         */
        hammer_simple_transaction(&trans, ip->hmp);
        hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);

        /*
         * The symlink target is stored as a FIX record keyed by
         * HAMMER_FIXKEY_SYMLINK; look it up directly.
         */
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; /* XXX */
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        error = hammer_ip_lookup(&cursor);
        if (error == 0) {
                error = hammer_ip_resolve_data(&cursor);
                if (error == 0) {
                        KKASSERT(cursor.leaf->data_len >=
                                 HAMMER_SYMLINK_NAME_OFF);
                        error = uiomove(cursor.data->symlink.name,
                                        cursor.leaf->data_len -
                                                HAMMER_SYMLINK_NAME_OFF,
                                        ap->a_uio);
                }
        }
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
        return(error);
}

/*
 * hammer_vop_nremove { nch, dvp, cred }
 */
static
int
hammer_vop_nremove(struct vop_nremove_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        int error;

        dip = VTOI(ap->a_dvp);

        if (hammer_nohistory(dip) == 0 &&
            (error = hammer_checkspace(dip->hmp)) != 0) {
                return (error);
        }

        hammer_start_transaction(&trans, dip->hmp);
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
        hammer_done_transaction(&trans);

        return (error);
}

/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
        struct hammer_transaction trans;
        struct namecache *fncp;
        struct namecache *tncp;
        struct hammer_inode *fdip;
        struct hammer_inode *tdip;
        struct hammer_inode *ip;
        struct hammer_cursor cursor;
        int64_t namekey;
        int nlen, error;

        fdip = VTOI(ap->a_fdvp);
        tdip = VTOI(ap->a_tdvp);
        fncp = ap->a_fnch->ncp;
        tncp = ap->a_tnch->ncp;
        ip = VTOI(fncp->nc_vp);
        KKASSERT(ip != NULL);

        if (fdip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (tdip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(fdip->hmp)) != 0)
                return (error);

        hammer_start_transaction(&trans, fdip->hmp);

        /*
         * Remove tncp from the target directory and then link ip as
         * tncp. XXX pass trans to dounlink
         *
         * Force the inode sync-time to match the transaction so it is
         * in-sync with the creation of the target directory entry.
         */
        error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
        if (error == 0 || error == ENOENT) {
                error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
                if (error == 0) {
                        ip->ino_data.parent_obj_id = tdip->obj_id;
                        hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
                }
        }
        if (error)
                goto failed; /* XXX */

        /*
         * Locate the record in the originating directory and remove it.
         *
         * Calculate the namekey and setup the key range for the scan.  This
         * works kinda like a chained hash table where the lower 32 bits
         * of the namekey synthesize the chain.
         *
         * The key range is inclusive of both key_beg and key_end.
         */
        namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
retry:
        hammer_init_cursor(&trans, &cursor, &fdip->cache[0], fdip);
        cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = fdip->obj_id;
        cursor.key_beg.key = namekey;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
        cursor.key_beg.obj_type = 0;

        cursor.key_end = cursor.key_beg;
        cursor.key_end.key |= 0xFFFFFFFFULL;
        cursor.asof = fdip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

        /*
         * Scan all matching records (the chain), locate the one matching
         * the requested path component.
         *
         * The hammer_ip_*() functions merge in-memory records with on-disk
         * records for the purposes of the search.
         */
        error = hammer_ip_first(&cursor);
        while (error == 0) {
                if (hammer_ip_resolve_data(&cursor) != 0)
                        break;
                nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
                KKASSERT(nlen > 0);
                if (fncp->nc_nlen == nlen &&
                    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
                        break;
                }
                error = hammer_ip_next(&cursor);
        }

        /*
         * If all is ok we have to get the inode so we can adjust nlinks.
         *
         * WARNING: hammer_ip_del_directory() may have to terminate the
         * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
         * twice.
         */
        if (error == 0)
                error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

        /*
         * XXX A deadlock here will break rename's atomicity for the
         * purposes of crash recovery.
         */
        if (error == EDEADLK) {
                hammer_done_cursor(&cursor);
                goto retry;
        }

        /*
         * Cleanup and tell the kernel that the rename succeeded.
         */
        hammer_done_cursor(&cursor);
        if (error == 0)
                cache_rename(ap->a_fnch, ap->a_tnch);

failed:
        hammer_done_transaction(&trans);
        return (error);
}

/*
 * hammer_vop_nrmdir { nch, dvp, cred }
 */
static
int
hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;
        int error;

        dip = VTOI(ap->a_dvp);

        if (hammer_nohistory(dip) == 0 &&
            (error = hammer_checkspace(dip->hmp)) != 0) {
                return (error);
        }

        hammer_start_transaction(&trans, dip->hmp);
        error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
        hammer_done_transaction(&trans);

        return (error);
}
1435
1436 /*
1437  * hammer_vop_setattr { vp, vap, cred }
1438  */
1439 static
1440 int
1441 hammer_vop_setattr(struct vop_setattr_args *ap)
1442 {
1443         struct hammer_transaction trans;
1444         struct vattr *vap;
1445         struct hammer_inode *ip;
1446         int modflags;
1447         int error;
1448         int truncating;
1449         off_t aligned_size;
1450         u_int32_t flags;
1451
1452         vap = ap->a_vap;
1453         ip = ap->a_vp->v_data;
1454         modflags = 0;
1455
1456         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1457                 return(EROFS);
1458         if (ip->flags & HAMMER_INODE_RO)
1459                 return (EROFS);
1460         if (hammer_nohistory(ip) == 0 &&
1461             (error = hammer_checkspace(ip->hmp)) != 0) {
1462                 return (error);
1463         }
1464
1465         hammer_start_transaction(&trans, ip->hmp);
1466         error = 0;
1467
1468         if (vap->va_flags != VNOVAL) {
1469                 flags = ip->ino_data.uflags;
1470                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1471                                          hammer_to_unix_xid(&ip->ino_data.uid),
1472                                          ap->a_cred);
1473                 if (error == 0) {
1474                         if (ip->ino_data.uflags != flags) {
1475                                 ip->ino_data.uflags = flags;
1476                                 modflags |= HAMMER_INODE_DDIRTY;
1477                         }
1478                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1479                                 error = 0;
1480                                 goto done;
1481                         }
1482                 }
1483                 goto done;
1484         }
1485         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1486                 error = EPERM;
1487                 goto done;
1488         }
1489         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
1490                 mode_t cur_mode = ip->ino_data.mode;
1491                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1492                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1493                 uuid_t uuid_uid;
1494                 uuid_t uuid_gid;
1495
1496                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
1497                                          ap->a_cred,
1498                                          &cur_uid, &cur_gid, &cur_mode);
1499                 if (error == 0) {
1500                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
1501                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
1502                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
1503                                  sizeof(uuid_uid)) ||
1504                             bcmp(&uuid_gid, &ip->ino_data.gid,
1505                                  sizeof(uuid_gid)) ||
1506                             ip->ino_data.mode != cur_mode
1507                         ) {
1508                                 ip->ino_data.uid = uuid_uid;
1509                                 ip->ino_data.gid = uuid_gid;
1510                                 ip->ino_data.mode = cur_mode;
1511                         }
1512                         modflags |= HAMMER_INODE_DDIRTY;
1513                 }
1514         }
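             /*
              * Resize the file if requested.  The while below is used
              * only as a breakable block; the unconditional break at
              * its bottom means it executes at most once.
              */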
1515         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
1516                 switch(ap->a_vp->v_type) {
1517                 case VREG:
1518                         if (vap->va_size == ip->ino_data.size)
1519                                 break;
1520                         /*
1521                          * XXX breaks atomicity; we can deadlock the backend
1522                          * if we do not release the lock.  Probably not a
1523                          * big deal here.
1524                          */
1525                         if (vap->va_size < ip->ino_data.size) {
1526                                 vtruncbuf(ap->a_vp, vap->va_size,
1527                                           HAMMER_BUFSIZE);
1528                                 truncating = 1;
1529                         } else {
1530                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1531                                 truncating = 0;
1532                         }
1533                         ip->ino_data.size = vap->va_size;
1534                         modflags |= HAMMER_INODE_DDIRTY;
1535                         aligned_size = (vap->va_size + HAMMER_BUFMASK) &
1536                                        ~HAMMER_BUFMASK64;
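                             /*
                              * (illustrative: assuming HAMMER's 16K buffer
                              * size, a va_size of 10000 rounds aligned_size
                              * up to 16384)
                              */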
1537
1538                         /*
1539                          * On-media truncation is cached in the inode until
1540                          * the inode is synchronized.
1541                          */
1542                         if (truncating) {
1543                                 hammer_ip_frontend_trunc(ip, vap->va_size);
1544                                 hammer_update_rsv_databufs(ip);
1545 #ifdef DEBUG_TRUNCATE
1546                                 if (HammerTruncIp == NULL)
1547                                         HammerTruncIp = ip;
1548 #endif
1549                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1550                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1551                                         ip->trunc_off = vap->va_size;
1552 #ifdef DEBUG_TRUNCATE
1553                                         if (ip == HammerTruncIp)
1554                                                 kprintf("truncate1 %016llx\n", ip->trunc_off);
1555 #endif
1556                                 } else if (ip->trunc_off > vap->va_size) {
1557                                         ip->trunc_off = vap->va_size;
1558 #ifdef DEBUG_TRUNCATE
1559                                         if (ip == HammerTruncIp)
1560                                                 kprintf("truncate2 %016llx\n", ip->trunc_off);
1561 #endif
1562                                 } else {
1563 #ifdef DEBUG_TRUNCATE
1564                                         if (ip == HammerTruncIp)
1565                                                 kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
1566 #endif
1567                                 }
1568                         }
1569
1570                         /*
1571                          * If truncating we have to clean out a portion of
1572                          * the last block on-disk.  We do this in the
1573                          * front-end buffer cache.
1574                          */
1575                         if (truncating && vap->va_size < aligned_size) {
1576                                 struct buf *bp;
1577                                 int offset;
1578
1579                                 aligned_size -= HAMMER_BUFSIZE;
1580
1581                                 offset = vap->va_size & HAMMER_BUFMASK;
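                     /*
                      * illustrative: truncating to 10000 bytes with 16K
                      * buffers (assumed) re-reads the block at offset 0
                      * and zeros bytes 10000-16383 below
                      */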
1582                                 error = bread(ap->a_vp, aligned_size,
1583                                               HAMMER_BUFSIZE, &bp);
1584                                 hammer_ip_frontend_trunc(ip, aligned_size);
1585                                 if (error == 0) {
1586                                         bzero(bp->b_data + offset,
1587                                               HAMMER_BUFSIZE - offset);
1588                                         bdwrite(bp);
1589                                 } else {
1590                                         kprintf("ERROR %d\n", error);
1591                                         brelse(bp);
1592                                 }
1593                         }
1594                         break;
1595                 case VDATABASE:
1596                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1597                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1598                                 ip->trunc_off = vap->va_size;
1599                         } else if (ip->trunc_off > vap->va_size) {
1600                                 ip->trunc_off = vap->va_size;
1601                         }
1602                         hammer_ip_frontend_trunc(ip, vap->va_size);
1603                         ip->ino_data.size = vap->va_size;
1604                         modflags |= HAMMER_INODE_DDIRTY;
1605                         break;
1606                 default:
1607                         error = EINVAL;
1608                         goto done;
1609                 }
1610                 break;
1611         }
1612         if (vap->va_atime.tv_sec != VNOVAL) {
1613                 ip->ino_leaf.atime =
1614                         hammer_timespec_to_transid(&vap->va_atime);
1615                 modflags |= HAMMER_INODE_ITIMES;
1616         }
1617         if (vap->va_mtime.tv_sec != VNOVAL) {
1618                 ip->ino_data.mtime =
1619                         hammer_timespec_to_transid(&vap->va_mtime);
1620                 modflags |= HAMMER_INODE_ITIMES;
1621                 modflags |= HAMMER_INODE_DDIRTY;        /* XXX mtime */
1622         }
1623         if (vap->va_mode != (mode_t)VNOVAL) {
1624                 mode_t   cur_mode = ip->ino_data.mode;
1625                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1626                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1627
1628                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1629                                          cur_uid, cur_gid, &cur_mode);
1630                 if (error == 0 && ip->ino_data.mode != cur_mode) {
1631                         ip->ino_data.mode = cur_mode;
1632                         modflags |= HAMMER_INODE_DDIRTY;
1633                 }
1634         }
1635 done:
1636         if (error == 0)
1637                 hammer_modify_inode(ip, modflags);
1638         hammer_done_transaction(&trans);
1639         return (error);
1640 }
1641
1642 /*
1643  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1644  */
1645 static
1646 int
1647 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1648 {
1649         struct hammer_transaction trans;
1650         struct hammer_inode *dip;
1651         struct hammer_inode *nip;
1652         struct nchandle *nch;
1653         hammer_record_t record;
1654         int error;
1655         int bytes;
1656
1657         ap->a_vap->va_type = VLNK;
1658
1659         nch = ap->a_nch;
1660         dip = VTOI(ap->a_dvp);
1661
1662         if (dip->flags & HAMMER_INODE_RO)
1663                 return (EROFS);
1664         if ((error = hammer_checkspace(dip->hmp)) != 0)
1665                 return (error);
1666
1667         /*
1668          * Create a transaction to cover the operations we perform.
1669          */
1670         hammer_start_transaction(&trans, dip->hmp);
1671
1672         /*
1673          * Create a new filesystem object of the requested type.  The
1674          * returned inode will be referenced but not locked.
1675          */
1676
1677         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
1678         if (error) {
1679                 hammer_done_transaction(&trans);
1680                 *ap->a_vpp = NULL;
1681                 return (error);
1682         }
1683
1684         /*
1685          * Add a record representing the symlink.  The symlink stores the
1686          * link as pure data, not a string, and is not \0-terminated.
1687          */
1688         if (error == 0) {
1689                 bytes = strlen(ap->a_target);
1690
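                 /*
                  * Short targets are stored inline in the inode's extended
                  * data area; longer targets get a separate fixed-key
                  * record below.
                  */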
1691                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1692                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1693                 } else {
1694                         record = hammer_alloc_mem_record(nip, bytes);
1695                         record->type = HAMMER_MEM_RECORD_GENERAL;
1696
1697                         record->leaf.base.localization = HAMMER_LOCALIZE_MISC;
1698                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1699                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1700                         record->leaf.data_len = bytes;
1701                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1702                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1703                         error = hammer_ip_add_record(&trans, record);
1704                 }
1705
1706                 /*
1707                  * Set the file size to the length of the link.
1708                  */
1709                 if (error == 0) {
1710                         nip->ino_data.size = bytes;
1711                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1712                 }
1713         }
1714         if (error == 0)
1715                 error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1716
1717         /*
1718          * Finish up.
1719          */
1720         if (error) {
1721                 hammer_rel_inode(nip, 0);
1722                 *ap->a_vpp = NULL;
1723         } else {
1724                 error = hammer_get_vnode(nip, ap->a_vpp);
1725                 hammer_rel_inode(nip, 0);
1726                 if (error == 0) {
1727                         cache_setunresolved(ap->a_nch);
1728                         cache_setvp(ap->a_nch, *ap->a_vpp);
1729                 }
1730         }
1731         hammer_done_transaction(&trans);
1732         return (error);
1733 }
1734
1735 /*
1736  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1737  */
1738 static
1739 int
1740 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1741 {
1742         struct hammer_transaction trans;
1743         struct hammer_inode *dip;
1744         int error;
1745
1746         dip = VTOI(ap->a_dvp);
1747
1748         if (hammer_nohistory(dip) == 0 &&
1749             (error = hammer_checkspace(dip->hmp)) != 0) {
1750                 return (error);
1751         }
1752
1753         hammer_start_transaction(&trans, dip->hmp);
1754         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1755                                 ap->a_cred, ap->a_flags);
1756         hammer_done_transaction(&trans);
1757
1758         return (error);
1759 }
1760
1761 /*
1762  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1763  */
1764 static
1765 int
1766 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1767 {
1768         struct hammer_inode *ip = ap->a_vp->v_data;
1769
1770         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1771                             ap->a_fflag, ap->a_cred));
1772 }
1773
1774 static
1775 int
1776 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1777 {
1778         struct mount *mp;
1779         int error;
1780
1781         mp = ap->a_head.a_ops->head.vv_mount;
1782
1783         switch(ap->a_op) {
1784         case MOUNTCTL_SET_EXPORT:
1785                 if (ap->a_ctllen != sizeof(struct export_args))
1786                         error = EINVAL;
1787                 else error = hammer_vfs_export(mp, ap->a_op,
1788                                       (const struct export_args *)ap->a_ctl);
1789                 break;
1790         default:
1791                 error = journal_mountctl(ap);
1792                 break;
1793         }
1794         return(error);
1795 }
1796
1797 /*
1798  * hammer_vop_strategy { vp, bio }
1799  *
1800  * Strategy call, used for regular file read & write only.  Note that the
1801  * bp may represent a cluster.
1802  *
1803  * To simplify operation and allow better optimizations in the future,
1804  * this code does not make any assumptions regarding buffer alignment
1805  * or size.
1806  */
1807 static
1808 int
1809 hammer_vop_strategy(struct vop_strategy_args *ap)
1810 {
1811         struct buf *bp;
1812         int error;
1813
1814         bp = ap->a_bio->bio_buf;
1815
1816         switch(bp->b_cmd) {
1817         case BUF_CMD_READ:
1818                 error = hammer_vop_strategy_read(ap);
1819                 break;
1820         case BUF_CMD_WRITE:
1821                 error = hammer_vop_strategy_write(ap);
1822                 break;
1823         default:
1824                 bp->b_error = error = EINVAL;
1825                 bp->b_flags |= B_ERROR;
1826                 biodone(ap->a_bio);
1827                 break;
1828         }
1829         return (error);
1830 }
1831
1832 /*
1833  * Read from a regular file.  Iterate the related records and fill in the
1834  * BIO/BUF.  Gaps are zero-filled.
1835  *
1836  * The support code in hammer_object.c should be used to deal with mixed
1837  * in-memory and on-disk records.
1838  *
1839  * XXX atime update
1840  */
1841 static
1842 int
1843 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1844 {
1845         struct hammer_transaction trans;
1846         struct hammer_inode *ip;
1847         struct hammer_cursor cursor;
1848         hammer_base_elm_t base;
1849         struct bio *bio;
1850         struct buf *bp;
1851         int64_t rec_offset;
1852         int64_t ran_end;
1853         int64_t tmp64;
1854         int error;
1855         int boff;
1856         int roff;
1857         int n;
1858
1859         bio = ap->a_bio;
1860         bp = bio->bio_buf;
1861         ip = ap->a_vp->v_data;
1862
1863         hammer_simple_transaction(&trans, ip->hmp);
1864         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1865
1866         /*
1867          * Key range (begin and end inclusive) to scan.  Note that the keys
1868          * stored in the actual records represent BASE+LEN, not BASE.  The
1869          * first record containing bio_offset will have a key > bio_offset.
1870          */
1871         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
1872         cursor.key_beg.obj_id = ip->obj_id;
1873         cursor.key_beg.create_tid = 0;
1874         cursor.key_beg.delete_tid = 0;
1875         cursor.key_beg.obj_type = 0;
1876         cursor.key_beg.key = bio->bio_offset + 1;
1877         cursor.asof = ip->obj_asof;
1878         cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
1879
1880         cursor.key_end = cursor.key_beg;
1881         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
1882 #if 0
1883         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1884                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1885                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1886                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1887         } else
1888 #endif
1889         {
1890                 ran_end = bio->bio_offset + bp->b_bufsize;
1891                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1892                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1893                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
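                 /* if the addition wrapped past the maximum key, clamp it */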
1894                 if (tmp64 < ran_end)
1895                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1896                 else
1897                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1898         }
1899         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1900
1901         error = hammer_ip_first(&cursor);
1902         boff = 0;
1903
1904         while (error == 0) {
1905                 /*
1906                  * Get the base file offset of the record.  The key for
1907          * data records is (base + bytes) rather than (base).
1908                  */
1909                 base = &cursor.leaf->base;
1910                 rec_offset = base->key - cursor.leaf->data_len;
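                 /* e.g. a 16K record covering [32K,48K) carries the key 48K */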
1911
1912                 /*
1913                  * Calculate the gap, if any, and zero-fill it.
1914                  *
1915                  * n is the offset of the start of the record versus our
1916                  * current seek offset in the bio.
1917                  */
1918                 n = (int)(rec_offset - (bio->bio_offset + boff));
1919                 if (n > 0) {
1920                         if (n > bp->b_bufsize - boff)
1921                                 n = bp->b_bufsize - boff;
1922                         bzero((char *)bp->b_data + boff, n);
1923                         boff += n;
1924                         n = 0;
1925                 }
1926
1927                 /*
1928                  * Calculate the data offset in the record and the number
1929                  * of bytes we can copy.
1930                  *
1931                  * There are two degenerate cases.  First, boff may already
1932                  * be at bp->b_bufsize.  Second, the data offset within
1933                  * the record may exceed the record's size.
1934                  */
1935                 roff = -n;
1936                 rec_offset += roff;
1937                 n = cursor.leaf->data_len - roff;
1938                 if (n <= 0) {
1939                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
1940                         n = 0;
1941                 } else if (n > bp->b_bufsize - boff) {
1942                         n = bp->b_bufsize - boff;
1943                 }
1944
1945                 /*
1946                  * Deal with cached truncations.  This cool bit of code
1947                  * allows truncate()/ftruncate() to avoid having to sync
1948                  * the file.
1949                  *
1950                  * If the frontend is truncated then all backend records are
1951                  * subject to the frontend's truncation.
1952                  *
1953                  * If the backend is truncated then backend records on-disk
1954                  * (but not in-memory) are subject to the backend's
1955                  * truncation.  In-memory records owned by the backend
1956                  * represent data written after the truncation point on the
1957                  * backend and must not be truncated.
1958                  *
1959                  * Truncate operations deal with frontend buffer cache
1960                  * buffers and frontend-owned in-memory records synchronously.
1961                  */
1962                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
1963                         if (hammer_cursor_ondisk(&cursor) ||
1964                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
1965                                 if (ip->trunc_off <= rec_offset)
1966                                         n = 0;
1967                                 else if (ip->trunc_off < rec_offset + n)
1968                                         n = (int)(ip->trunc_off - rec_offset);
1969                         }
1970                 }
1971                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
1972                         if (hammer_cursor_ondisk(&cursor)) {
1973                                 if (ip->sync_trunc_off <= rec_offset)
1974                                         n = 0;
1975                                 else if (ip->sync_trunc_off < rec_offset + n)
1976                                         n = (int)(ip->sync_trunc_off - rec_offset);
1977                         }
1978                 }
1979
1980                 /*
1981                  * Try to issue a direct read into our bio if possible,
1982                  * otherwise resolve the element data into a hammer_buffer
1983                  * and copy.
1984                  *
1985                  * WARNING: the direct-read case completes the bio itself (goto done).
1986                  */
1987                 if (roff == 0 && n == bp->b_bufsize &&
1988                     (rec_offset & HAMMER_BUFMASK) == 0) {
1989                         error = hammer_io_direct_read(trans.hmp, cursor.leaf,
1990                                                       bio);
1991                         goto done;
1992                 } else if (n) {
1993                         error = hammer_ip_resolve_data(&cursor);
1994                         if (error == 0) {
1995                                 bcopy((char *)cursor.data + roff,
1996                                       (char *)bp->b_data + boff, n);
1997                         }
1998                 }
1999                 if (error)
2000                         break;
2001
2002                 /*
2003                  * Iterate until we have filled the request.
2004                  */
2005                 boff += n;
2006                 if (boff == bp->b_bufsize)
2007                         break;
2008                 error = hammer_ip_next(&cursor);
2009         }
2010
2011         /*
2012          * There may have been a gap after the last record.
2013          */
2014         if (error == ENOENT)
2015                 error = 0;
2016         if (error == 0 && boff != bp->b_bufsize) {
2017                 KKASSERT(boff < bp->b_bufsize);
2018                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2019                 /* boff = bp->b_bufsize; */
2020         }
2021         bp->b_resid = 0;
2022         bp->b_error = error;
2023         if (error)
2024                 bp->b_flags |= B_ERROR;
2025         biodone(ap->a_bio);
2026
2027 done:
2028         if (cursor.node)
2029                 hammer_cache_node(cursor.node, &ip->cache[1]);
2030         hammer_done_cursor(&cursor);
2031         hammer_done_transaction(&trans);
2032         return(error);
2033 }
2034
2035 /*
2036  * Write to a regular file.  Because this is a strategy call the OS is
2037  * trying to actually sync data to the media.  HAMMER can only flush
2038  * the entire inode (so the TID remains properly synchronized).
2039  *
2040  * Basically all we do here is place the bio on the inode's flush queue
2041  * and activate the flusher.
2042  */
2043 static
2044 int
2045 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2046 {
2047         hammer_record_t record;
2048         hammer_inode_t ip;
2049         struct bio *bio;
2050         struct buf *bp;
2051         int bytes;
2052         int error;
2053
2054         bio = ap->a_bio;
2055         bp = bio->bio_buf;
2056         ip = ap->a_vp->v_data;
2057
2058         if (ip->flags & HAMMER_INODE_RO) {
2059                 bp->b_error = EROFS;
2060                 bp->b_flags |= B_ERROR;
2061                 biodone(ap->a_bio);
2062                 hammer_cleanup_write_io(ip);
2063                 return(EROFS);
2064         }
2065
2066         /*
2067          * Interlock with inode destruction (no in-kernel or directory
2068          * topology visibility).  If we queue new IO while trying to
2069          * destroy the inode we can deadlock the vtrunc call in
2070          * hammer_inode_unloadable_check().
2071          */
2072         if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2073                 bp->b_resid = 0;
2074                 biodone(ap->a_bio);
2075                 hammer_cleanup_write_io(ip);
2076                 return(0);
2077         }
2078
2079         /*
2080          * Attempt to reserve space and issue a direct-write from the
2081          * front-end.  If we can't we will queue the BIO to the flusher.
2082          * The bulk/direct-write code will still bcopy if writing less
2083          * then full-sized blocks (at the end of a file).
2084          * than full-sized blocks (at the end of a file).
2085          *
2086          * If we can, the I/O is issued and an in-memory record is
2087          * installed to reference the storage until the flusher can
2088          * get to it.
2089          * Since we own the high level bio the front-end will not try to
2090          * do a direct-read until the write completes.
2091          */
2092         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2093         KKASSERT(bio->bio_offset < ip->ino_data.size);
2094         if (bio->bio_offset + bp->b_bufsize <= ip->ino_data.size)
2095                 bytes = bp->b_bufsize;
2096         else
2097                 bytes = (int)(ip->ino_data.size - bio->bio_offset);
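         /* (a write may cover a partial final block; clip it to EOF) */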
2098
2099         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2100                                     bytes, &error);
2101         if (record) {
2102                 hammer_io_direct_write(ip->hmp, &record->leaf, bio);
2103                 hammer_rel_mem_record(record);
2104                 if (ip->rsv_recs > hammer_limit_irecs / 2)
2105                         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
2106                 else
2107                         hammer_flush_inode(ip, 0);
2108         } else {
2109                 bp->b_error = error;
2110                 bp->b_flags |= B_ERROR;
2111                 biodone(ap->a_bio);
2112         }
2113         hammer_cleanup_write_io(ip);
2114         return(error);
2115 }
2116
2117 /*
2118  * Clean-up after disposing of a dirty frontend buffer's data.
2119  * This is somewhat heuristic, so try to be robust.
2120  */
2121 static void
2122 hammer_cleanup_write_io(hammer_inode_t ip)
2123 {
2124         if (ip->rsv_databufs) {
2125                 --ip->rsv_databufs;
2126                 --ip->hmp->rsv_databufs;
2127         }
2128 }
2129
2130 /*
2131  * We can lose track of dirty buffer cache buffers if we truncate; this
2132  * routine resynchronizes the count.
2133  */
2134 static
2135 void
2136 hammer_update_rsv_databufs(hammer_inode_t ip)
2137 {
2138         struct buf *bp;
2139         int delta;
2140         int n;
2141
2142         if (ip->vp) {
2143                 n = 0;
2144                 RB_FOREACH(bp, buf_rb_tree, &ip->vp->v_rbdirty_tree) {
2145                         ++n;
2146                 }
2147         } else {
2148                 n = 0;
2149         }
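         /* apply the correction to both the per-inode and per-mount counts */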
2150         delta = n - ip->rsv_databufs;
2151         ip->rsv_databufs += delta;
2152         ip->hmp->rsv_databufs += delta;
2153 }
2154
2155 /*
2156  * dounlink - disconnect a directory entry
2157  *
2158  * XXX whiteout support not really in yet
2159  */
2160 static int
2161 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2162                 struct vnode *dvp, struct ucred *cred, int flags)
2163 {
2164         struct namecache *ncp;
2165         hammer_inode_t dip;
2166         hammer_inode_t ip;
2167         struct hammer_cursor cursor;
2168         int64_t namekey;
2169         int nlen, error;
2170
2171         /*
2172          * Calculate the namekey and setup the key range for the scan.  This
2173          * works much like a chained hash table where the lower 32 bits
2174          * of the namekey synthesize the chain.
2175          *
2176          * The key range is inclusive of both key_beg and key_end.
2177          */
2178         dip = VTOI(dvp);
2179         ncp = nch->ncp;
2180
2181         if (dip->flags & HAMMER_INODE_RO)
2182                 return (EROFS);
2183
2184         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2185 retry:
2186         hammer_init_cursor(trans, &cursor, &dip->cache[0], dip);
2187         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
2188         cursor.key_beg.obj_id = dip->obj_id;
2189         cursor.key_beg.key = namekey;
2190         cursor.key_beg.create_tid = 0;
2191         cursor.key_beg.delete_tid = 0;
2192         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2193         cursor.key_beg.obj_type = 0;
2194
2195         cursor.key_end = cursor.key_beg;
2196         cursor.key_end.key |= 0xFFFFFFFFULL;
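         /*
          * Wildcarding the low 32 bits makes the scan cover every entry
          * whose namekey shares the same upper 32 bit hash code.
          */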
2197         cursor.asof = dip->obj_asof;
2198         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2199
2200         /*
2201          * Scan all matching records (the chain), locate the one matching
2202          * the requested path component.  info->last_error contains the
2203          * the requested path component.  The error variable holds the
2204          * error code at search termination, which could be 0, ENOENT,
2205          * or something else.
2206          * The hammer_ip_*() functions merge in-memory records with on-disk
2207          * records for the purposes of the search.
2208          */
2209         error = hammer_ip_first(&cursor);
2210
2211         while (error == 0) {
2212                 error = hammer_ip_resolve_data(&cursor);
2213                 if (error)
2214                         break;
2215                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
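                 /*
                  * Stored entry names carry no terminating NUL; the name
                  * length is implied by data_len.
                  */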
2216                 KKASSERT(nlen > 0);
2217                 if (ncp->nc_nlen == nlen &&
2218                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2219                         break;
2220                 }
2221                 error = hammer_ip_next(&cursor);
2222         }
2223
2224         /*
2225          * If all is ok we have to get the inode so we can adjust nlinks.
2226          *
2227          * If the target is a directory, it must be empty.
2228          */
2229         if (error == 0) {
2230                 ip = hammer_get_inode(trans, &dip->cache[1],
2231                                       cursor.data->entry.obj_id,
2232                                       dip->hmp->asof, 0, &error);
2233                 if (error == ENOENT) {
2234                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2235                         Debugger("ENOENT unlinking object that should exist");
2236                 }
2237
2238                 /*
2239                  * If we are trying to remove a directory the directory must
2240                  * be empty.
2241                  *
2242                  * WARNING: hammer_ip_check_directory_empty() may have to
2243                  * terminate the cursor to avoid a deadlock.  It is ok to
2244                  * call hammer_done_cursor() twice.
2245                  */
2246                 if (error == 0 && ip->ino_data.obj_type ==
2247                                   HAMMER_OBJTYPE_DIRECTORY) {
2248                         error = hammer_ip_check_directory_empty(trans, ip);
2249                 }
2250
2251                 /*
2252                  * Delete the directory entry.
2253                  *
2254                  * WARNING: hammer_ip_del_directory() may have to terminate
2255                  * the cursor to avoid a deadlock.  It is ok to call
2256                  * hammer_done_cursor() twice.
2257                  */
2258                 if (error == 0) {
2259                         error = hammer_ip_del_directory(trans, &cursor,
2260                                                         dip, ip);
2261                 }
2262                 if (error == 0) {
2263                         cache_setunresolved(nch);
2264                         cache_setvp(nch, NULL);
2265                         /* XXX locking */
2266                         if (ip->vp)
2267                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2268                 }
2269                 hammer_rel_inode(ip, 0);
2270         }
2271         hammer_done_cursor(&cursor);
2272         if (error == EDEADLK)
2273                 goto retry;
2274
2275         return (error);
2276 }
2277
2278 /************************************************************************
2279  *                          FIFO AND SPECFS OPS                         *
2280  ************************************************************************
2281  *
2282  */
2283
2284 static int
2285 hammer_vop_fifoclose (struct vop_close_args *ap)
2286 {
2287         /* XXX update itimes */
2288         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2289 }
2290
2291 static int
2292 hammer_vop_fiforead (struct vop_read_args *ap)
2293 {
2294         int error;
2295
2296         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2297         /* XXX update access time */
2298         return (error);
2299 }
2300
2301 static int
2302 hammer_vop_fifowrite (struct vop_write_args *ap)
2303 {
2304         int error;
2305
2306         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2307         /* XXX update access time */
2308         return (error);
2309 }
2310
2311 static int
2312 hammer_vop_specclose (struct vop_close_args *ap)
2313 {
2314         /* XXX update itimes */
2315         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2316 }
2317
2318 static int
2319 hammer_vop_specread (struct vop_read_args *ap)
2320 {
2321         /* XXX update access time */
2322         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2323 }
2324
2325 static int
2326 hammer_vop_specwrite (struct vop_write_args *ap)
2327 {
2328         /* XXX update last change time */
2329         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2330 }
2331