Merge branch 'master' of ssh://crater.dragonflybsd.org/repository/git/dragonfly
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vfs/fifofs/fifo.h>
50 #include "hammer.h"
51
/*
 * USERFS VNOPS
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/

/* Regular file / directory vnode operations. */
static int hammer_vop_fsync(struct vop_fsync_args *);
static int hammer_vop_read(struct vop_read_args *);
static int hammer_vop_write(struct vop_write_args *);
static int hammer_vop_access(struct vop_access_args *);
static int hammer_vop_advlock(struct vop_advlock_args *);
static int hammer_vop_close(struct vop_close_args *);
static int hammer_vop_ncreate(struct vop_ncreate_args *);
static int hammer_vop_getattr(struct vop_getattr_args *);
static int hammer_vop_nresolve(struct vop_nresolve_args *);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
static int hammer_vop_nlink(struct vop_nlink_args *);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
static int hammer_vop_nmknod(struct vop_nmknod_args *);
static int hammer_vop_open(struct vop_open_args *);
static int hammer_vop_print(struct vop_print_args *);
static int hammer_vop_readdir(struct vop_readdir_args *);
static int hammer_vop_readlink(struct vop_readlink_args *);
static int hammer_vop_nremove(struct vop_nremove_args *);
static int hammer_vop_nrename(struct vop_nrename_args *);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
static int hammer_vop_markatime(struct vop_markatime_args *);
static int hammer_vop_setattr(struct vop_setattr_args *);
static int hammer_vop_strategy(struct vop_strategy_args *);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
static int hammer_vop_ioctl(struct vop_ioctl_args *);
static int hammer_vop_mountctl(struct vop_mountctl_args *);
static int hammer_vop_kqfilter (struct vop_kqfilter_args *);

/* FIFO-specific wrappers, installed in hammer_fifo_vops. */
static int hammer_vop_fifoclose (struct vop_close_args *);
static int hammer_vop_fiforead (struct vop_read_args *);
static int hammer_vop_fifowrite (struct vop_write_args *);
static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);

/* Device-special-file wrappers, installed in hammer_spec_vops. */
static int hammer_vop_specclose (struct vop_close_args *);
static int hammer_vop_specread (struct vop_read_args *);
static int hammer_vop_specwrite (struct vop_write_args *);
static int hammer_vop_specgetattr (struct vop_getattr_args *);
/*
 * Vnode operations vector for regular HAMMER files and directories.
 * Any operation not explicitly listed falls through to vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         vop_stdpathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_markatime =        hammer_vop_markatime,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl,
        .vop_kqfilter =         hammer_vop_kqfilter
};
133
/*
 * Vnode operations vector for device special files on a HAMMER
 * filesystem.  HAMMER handles attributes and metadata; actual device
 * I/O falls through to spec_vnoperate.
 */
struct vop_ops hammer_spec_vops = {
        .vop_default =          spec_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_specread,
        .vop_write =            hammer_vop_specwrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_specclose,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_specgetattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};
147
/*
 * Vnode operations vector for FIFOs on a HAMMER filesystem.  HAMMER
 * handles attributes and metadata; FIFO semantics fall through to
 * fifo_vnoperate.
 */
struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr,
        .vop_kqfilter =         hammer_vop_fifokqfilter
};
162
163 static __inline
164 void
165 hammer_knote(struct vnode *vp, int flags)
166 {
167         if (flags)
168                 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
169 }
170
#ifdef DEBUG_TRUNCATE
/* Inode being watched by truncation debug code (DEBUG_TRUNCATE builds only). */
struct hammer_inode *HammerTruncIp;
#endif

/*
 * Common unlink helper; isdir selects file vs. directory semantics.
 */
static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
                           struct vnode *dvp, struct ucred *cred,
                           int flags, int isdir);
/* Strategy implementation is split into read-side and write-side paths. */
static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
180
#if 0
/*
 * Generic catch-all vnode operation: forward any VOP call through the
 * hammer_vnode_vops vector.  Currently compiled out; vop_defaultop is
 * installed as .vop_default instead.
 *
 * Fix: the parameter was declared without a name while the body uses
 * `ap`, so this would not compile if the #if 0 were ever removed.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
189
/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        ++hammer_count_fsyncs;
        /* Push dirty buffer cache buffers for this vnode first. */
        vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
        /* Queue the inode to the flusher and signal it to run. */
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (ap->a_waitfor == MNT_WAIT) {
                /*
                 * NOTE(review): the vnode lock is dropped across the
                 * wait, presumably so the flusher can make progress on
                 * this vnode without deadlocking against us — confirm.
                 */
                vn_unlock(ap->a_vp);
                hammer_wait_inode(ip);
                vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
        }
        /* Report any error recorded on the inode by the flush. */
        return (ip->error);
}
213
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read from a regular file through the buffer cache.  Runs without the
 * MP lock for as long as every needed buffer is already cached; on the
 * first cache miss it acquires the MP lock and opens a HAMMER
 * transaction before doing real I/O.
 *
 * MPALMOSTSAFE
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        off_t offset;           /* byte offset within the current block */
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;                  /* bytes to copy from the current block */
        int seqcount;
        int ioseqcount;
        int blksize;
        int got_mplock;         /* -1: already held, 0: not held, 1: we took it */

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
        uio = ap->a_uio;

        /*
         * Allow the UIO's size to override the sequential heuristic.
         */
        blksize = hammer_blocksize(uio->uio_offset);
        seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
        ioseqcount = ap->a_ioflag >> 16;        /* caller's sequential hint */
        if (seqcount < ioseqcount)
                seqcount = ioseqcount;

        /*
         * If this thread already holds the MP lock there is nothing to
         * gain from deferring; start the transaction immediately.
         */
        if (curthread->td_mpcount) {
                got_mplock = -1;
                hammer_start_transaction(&trans, ip->hmp);
        } else {
                got_mplock = 0;
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         *
         * XXX Temporary hack, delay the start transaction while we remain
         *     MPSAFE.  NOTE: ino_data.size cannot change while vnode is
         *     locked-shared.
         */
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                int64_t base_offset;
                int64_t file_limit;

                blksize = hammer_blocksize(uio->uio_offset);
                offset = (int)uio->uio_offset & (blksize - 1);
                base_offset = uio->uio_offset - offset;

                /*
                 * MPSAFE - try to obtain the buffer from the cache.
                 */
                bp = getcacheblk(ap->a_vp, base_offset);
                if (bp) {
                        error = 0;
                        goto skip;
                }

                /*
                 * MPUNSAFE - cache miss: take the MP lock and open the
                 * transaction (first miss only) before issuing I/O.
                 */
                if (got_mplock == 0) {
                        got_mplock = 1;
                        get_mplock();
                        hammer_start_transaction(&trans, ip->hmp);
                }

                if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
                         * creating buffers of the wrong block size past
                         * the demarc.
                         */
                        file_limit = ip->ino_data.size;
                        if (base_offset < HAMMER_XDEMARC &&
                            file_limit > HAMMER_XDEMARC) {
                                file_limit = HAMMER_XDEMARC;
                        }
                        error = cluster_read(ap->a_vp,
                                             file_limit, base_offset,
                                             blksize, MAXPHYS,
                                             seqcount, &bp);
                } else {
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                }
                if (error) {
                        kprintf("error %d\n", error);
                        brelse(bp);
                        break;
                }
skip:

                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                /*
                 * Clamp the copy to the remaining request and to EOF.
                 */
                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);

                /* data has a lower priority then meta-data */
                bp->b_flags |= B_AGE;
                bqrelse(bp);
                if (error)
                        break;
                hammer_stats_file_read += n;
        }

        /*
         * XXX only update the atime if we had to get the MP lock.
         * XXX hack hack hack, fixme.
         */
        if (got_mplock) {
                if ((ip->flags & HAMMER_INODE_RO) == 0 &&
                    (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                        ip->ino_data.atime = trans.time;
                        hammer_modify_inode(ip, HAMMER_INODE_ATIME);
                }
                hammer_done_transaction(&trans);
                if (got_mplock > 0)
                        rel_mplock();
        }
        return (error);
}
349
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Write to a regular file through the buffer cache under a HAMMER
 * transaction.  Applies flow control against the reserved-record
 * backlog so the frontend cannot outrun the backend flusher.
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        struct uio *uio;
        int offset;             /* byte offset within the current block */
        off_t base_offset;      /* block-aligned file offset */
        struct buf *bp;
        int kflags;             /* accumulated kqueue event flags */
        int error;
        int n;                  /* bytes to copy into the current block */
        int flags;              /* inode modification flags */
        int delta;
        int seqcount;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        kflags = 0;
        seqcount = ap->a_ioflag >> 16;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_off assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= 0) {
                /* end offset wrapped past 2^63-1 */
                hammer_done_transaction(&trans);
                return (EFBIG);
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;        /* set if we grew the VM object size */
                int blksize;
                int blkmask;

                /* Bail out of the loop if the filesystem is out of space. */
                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;

                blksize = hammer_blocksize(uio->uio_offset);

                /*
                 * Do not allow HAMMER to blow out the buffer cache.  Very
                 * large UIOs can lockout other processes due to bwillwrite()
                 * mechanics.
                 *
                 * The hammer inode is not locked during these operations.
                 * The vnode is locked which can interfere with the pageout
                 * daemon for non-UIO_NOCOPY writes but should not interfere
                 * with the buffer cache.  Even so, we cannot afford to
                 * allow the pageout daemon to build up too many dirty buffer
                 * cache buffers.
                 *
                 * Only call this if we aren't being recursively called from
                 * a virtual disk device (vn), else we may deadlock.
                 */
                if ((ap->a_ioflag & IO_RECURSE) == 0)
                        bwillwrite(blksize);

                /*
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are so well
                 * decoupled from the buffer cache that it is possible
                 * for userland to push data out to the media via
                 * direct-write, but build up the records queued to the
                 * backend faster then the backend can flush them out.
                 * HAMMER has hit its write limit but the frontend has
                 * no pushback to slow it down.
                 */
                if (hmp->rsv_recs > hammer_limit_recs / 2) {
                        /*
                         * Get the inode on the flush list
                         */
                        if (ip->rsv_recs >= 64)
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        else if (ip->rsv_recs >= 16)
                                hammer_flush_inode(ip, 0);

                        /*
                         * Keep the flusher going if the system keeps
                         * queueing records.
                         */
                        delta = hmp->count_newrecords -
                                hmp->last_newrecords;
                        if (delta < 0 || delta > hammer_limit_recs / 2) {
                                hmp->last_newrecords = hmp->count_newrecords;
                                hammer_sync_hmp(hmp, MNT_NOWAIT);
                        }

                        /*
                         * If we have gotten behind start slowing
                         * down the writers.
                         */
                        delta = (hmp->rsv_recs - hammer_limit_recs) *
                                hz / hammer_limit_recs;
                        if (delta > 0)
                                tsleep(&trans, 0, "hmrslo", delta);
                }

                /*
                 * Calculate the blocksize at the current offset and figure
                 * out how much we can actually write.
                 */
                blkmask = blksize - 1;
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (uio->uio_offset + n > ip->ino_data.size) {
                        /* Extending the file: grow the VM object first. */
                        vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              blksize, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= blksize) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        error = uiomove((char *)bp->b_data + offset,
                                        n, uio);
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                vtruncbuf(ap->a_vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size));
                        }
                        break;
                }
                kflags |= NOTE_WRITE;
                hammer_stats_file_write += n;
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        /* The write extended the file; record the new size. */
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_DDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(ip, flags);

                /*
                 * Once we dirty the buffer any cached zone-X offset
                 * becomes invalid.  HAMMER NOTE: no-history mode cannot
                 * allow overwriting over the same data sector unless
                 * we provide UNDOs for the old data, which we don't.
                 */
                bp->b_bio2.bio_offset = NOOFFSET;

                /*
                 * Final buffer disposition: synchronous, async, or
                 * delayed write depending on the caller's I/O flags.
                 */
                bp->b_flags |= B_AGE;
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
        return (error);
}
595
596 /*
597  * hammer_vop_access { vp, mode, cred }
598  */
599 static
600 int
601 hammer_vop_access(struct vop_access_args *ap)
602 {
603         struct hammer_inode *ip = VTOI(ap->a_vp);
604         uid_t uid;
605         gid_t gid;
606         int error;
607
608         ++hammer_stats_file_iopsr;
609         uid = hammer_to_unix_xid(&ip->ino_data.uid);
610         gid = hammer_to_unix_xid(&ip->ino_data.gid);
611
612         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
613                                   ip->ino_data.uflags);
614         return (error);
615 }
616
617 /*
618  * hammer_vop_advlock { vp, id, op, fl, flags }
619  */
620 static
621 int
622 hammer_vop_advlock(struct vop_advlock_args *ap)
623 {
624         hammer_inode_t ip = VTOI(ap->a_vp);
625
626         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
627 }
628
/*
 * hammer_vop_close { vp, fflag }
 *
 * No HAMMER-specific processing is required on close; use the
 * standard close implementation.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
        return (vop_stdclose(ap));
}
639
/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;       /* directory inode */
        struct hammer_inode *nip;       /* newly created inode */
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        /* Refuse the create if the filesystem is low on space. */
        if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */
        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
                                    NULL, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.  On error drop our inode reference and return a
         * NULL vnode; on success obtain a vnode for the new inode and
         * resolve the namecache entry to it.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
                /* Notify kqueue watchers of the directory modification. */
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
        return (error);
}
714
715 /*
716  * hammer_vop_getattr { vp, vap }
717  *
718  * Retrieve an inode's attribute information.  When accessing inodes
719  * historically we fake the atime field to ensure consistent results.
720  * The atime field is stored in the B-Tree element and allowed to be
721  * updated without cycling the element.
722  *
723  * MPSAFE
724  */
725 static
726 int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
	struct hammer_inode *ip = VTOI(ap->a_vp);
	struct vattr *vap = ap->a_vap;

	/*
	 * We want the fsid to be different when accessing a filesystem
	 * with different as-of's so programs like diff don't think
	 * the files are the same.
	 *
	 * We also want the fsid to be the same when comparing snapshots,
	 * or when comparing mirrors (which might be backed by different
	 * physical devices).  HAMMER fsids are based on the PFS's
	 * shared_uuid field.
	 *
	 * XXX there is a chance of collision here.  The va_fsid reported
	 * by stat is different from the more involved fsid used in the
	 * mount structure.
	 */
	++hammer_stats_file_iopsr;

	/*
	 * Hold the inode shared while copying the fields out so the
	 * returned attribute snapshot is self-consistent.
	 */
	hammer_lock_sh(&ip->lock);
	vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
		       (u_int32_t)(ip->obj_asof >> 32);

	vap->va_fileid = ip->ino_leaf.base.obj_id;
	vap->va_mode = ip->ino_data.mode;
	vap->va_nlink = ip->ino_data.nlinks;
	vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
	vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
	vap->va_rmajor = 0;
	vap->va_rminor = 0;
	vap->va_size = ip->ino_data.size;

	/*
	 * Special case for @@PFS softlinks.  The actual size of the
	 * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
	 * or for MAX_TID is    "@@-1:%05d" == 10 bytes.
	 */
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
	    ip->ino_data.size == 10 &&
	    ip->obj_asof == HAMMER_MAX_TID &&
	    ip->obj_localization == 0 &&
	    strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
		    if (ip->pfsm->pfsd.mirror_flags & HAMMER_PFSD_SLAVE)
			    vap->va_size = 26;
		    else
			    vap->va_size = 10;
	}

	/*
	 * We must provide a consistent atime and mtime for snapshots
	 * so people can do a 'tar cf - ... | md5' on them and get
	 * consistent results.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
	} else {
		hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
		hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
	}
	hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
	vap->va_flags = ip->ino_data.uflags;
	vap->va_gen = 1;	/* hammer inums are unique for all time */
	vap->va_blocksize = HAMMER_BUFSIZE;
	/*
	 * Estimate backing storage: round the file size up to the
	 * appropriate blocking boundary (extended-buffer granularity for
	 * large files, buffer granularity for medium files, 16 bytes
	 * for very small files).
	 */
	if (ip->ino_data.size >= HAMMER_XDEMARC) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
				~HAMMER_XBUFMASK64;
	} else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
		vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
				~HAMMER_BUFMASK64;
	} else {
		vap->va_bytes = (ip->ino_data.size + 15) & ~15;
	}

	vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
	vap->va_filerev = 0;	/* XXX */
	/* mtime uniquely identifies any adjustments made to the file XXX */
	vap->va_fsmid = ip->ino_data.mtime;
	vap->va_uid_uuid = ip->ino_data.uid;
	vap->va_gid_uuid = ip->ino_data.gid;
	vap->va_fsid_uuid = ip->hmp->fsid;
	vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
			  VA_FSID_UUID_VALID;

	/*
	 * Only device nodes carry meaningful rmajor/rminor fields.
	 */
	switch (ip->ino_data.obj_type) {
	case HAMMER_OBJTYPE_CDEV:
	case HAMMER_OBJTYPE_BDEV:
		vap->va_rmajor = ip->ino_data.rmajor;
		vap->va_rminor = ip->ino_data.rminor;
		break;
	default:
		break;
	}
	hammer_unlock(&ip->lock);
	return(0);
}
824
825 /*
826  * hammer_vop_nresolve { nch, dvp, cred }
827  *
828  * Locate the requested directory entry.
829  */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			/* historical (non-"now") accesses are read-only */
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;	/* truncate the name at the '@@' extension */

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to dip.
	 * e.g. "fubar/@@<snapshot>"
	 *
	 * "." is handled by the kernel, but ".@@<snapshot>" is not.
	 * e.g. "fubar/.@@<snapshot>"
	 *
	 * ".." is handled by the kernel.  We do not currently handle
	 * "..@@<snapshot>".
	 */
	if (nlen == 0 || (nlen == 1 && ncp->nc_name[0] == '.')) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      hammer_dir_localization(dip);
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			/* match on both name length and name bytes */
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Lookup the obj_id.  This should always succeed.  If it does not
	 * the filesystem may be damaged and we return a dummy inode.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == ENOENT) {
			kprintf("HAMMER: WARNING: Missing "
				"inode for dirent \"%s\"\n"
				"\tobj_id = %016llx\n",
				ncp->nc_name, (long long)obj_id);
			error = 0;
			ip = hammer_get_dummy_inode(&trans, dip, obj_id,
						    asof, localization,
						    flags, &error);
		}
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* name not found - record a negative cache entry */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}
1019
1020 /*
1021  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
1022  *
1023  * Locate the parent directory of a directory vnode.
1024  *
1025  * dvp is referenced but not locked.  *vpp must be returned referenced and
1026  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
1027  * at the root, instead it could indicate that the directory we were in was
1028  * removed.
1029  *
1030  * NOTE: as-of sequences are not linked into the directory structure.  If
1031  * we are at the root with a different asof then the mount point, reload
1032  * the same directory with the mount point's asof.   I'm not sure what this
1033  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
1034  * get confused, but it hasn't been tested.
1035  */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != dip->hmp->asof) {
			/*
			 * We are at the root of an as-of access.  Reload
			 * the same directory with the mount point's asof
			 * and hand the kernel a fake "0x<asof>" name for
			 * the dotdot entry (see NOTE in header comment).
			 */
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				  (long long)dip->obj_asof);
		} else {
			/* parent directory was removed or does not exist */
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}
1090
1091 /*
1092  * hammer_vop_nlink { nch, dvp, vp, cred }
1093  */
1094 static
1095 int
1096 hammer_vop_nlink(struct vop_nlink_args *ap)
1097 {
1098         struct hammer_transaction trans;
1099         struct hammer_inode *dip;
1100         struct hammer_inode *ip;
1101         struct nchandle *nch;
1102         int error;
1103
1104         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
1105                 return(EXDEV);
1106
1107         nch = ap->a_nch;
1108         dip = VTOI(ap->a_dvp);
1109         ip = VTOI(ap->a_vp);
1110
1111         if (dip->obj_localization != ip->obj_localization)
1112                 return(EXDEV);
1113
1114         if (dip->flags & HAMMER_INODE_RO)
1115                 return (EROFS);
1116         if (ip->flags & HAMMER_INODE_RO)
1117                 return (EROFS);
1118         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1119                 return (error);
1120
1121         /*
1122          * Create a transaction to cover the operations we perform.
1123          */
1124         hammer_start_transaction(&trans, dip->hmp);
1125         ++hammer_stats_file_iopsw;
1126
1127         /*
1128          * Add the filesystem object to the directory.  Note that neither
1129          * dip nor ip are referenced or locked, but their vnodes are
1130          * referenced.  This function will bump the inode's link count.
1131          */
1132         error = hammer_ip_add_directory(&trans, dip,
1133                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1134                                         ip);
1135
1136         /*
1137          * Finish up.
1138          */
1139         if (error == 0) {
1140                 cache_setunresolved(nch);
1141                 cache_setvp(nch, ap->a_vp);
1142         }
1143         hammer_done_transaction(&trans);
1144         hammer_knote(ap->a_vp, NOTE_LINK);
1145         hammer_knote(ap->a_dvp, NOTE_WRITE);
1146         return (error);
1147 }
1148
1149 /*
1150  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1151  *
1152  * The operating system has already ensured that the directory entry
1153  * does not exist and done all appropriate namespace locking.
1154  */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	/*
	 * No creation in read-only (snapshot / as-of) directories, and
	 * verify sufficient free space before starting.
	 */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.  On error drop the new inode; on success return its
	 * vnode in *a_vpp and resolve the namecache entry against it.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	return (error);
}
1221
1222 /*
1223  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1224  *
1225  * The operating system has already ensured that the directory entry
1226  * does not exist and done all appropriate namespace locking.
1227  */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	/*
	 * No creation in read-only (snapshot / as-of) directories, and
	 * verify sufficient free space before starting.
	 */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.  On error drop the new inode; on success return its
	 * vnode in *a_vpp and resolve the namecache entry against it.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}
1294
1295 /*
1296  * hammer_vop_open { vp, mode, cred, fp }
1297  */
1298 static
1299 int
1300 hammer_vop_open(struct vop_open_args *ap)
1301 {
1302         hammer_inode_t ip;
1303
1304         ++hammer_stats_file_iopsr;
1305         ip = VTOI(ap->a_vp);
1306
1307         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1308                 return (EROFS);
1309         return(vop_stdopen(ap));
1310 }
1311
1312 /*
1313  * hammer_vop_print { vp }
1314  */
1315 static
1316 int
1317 hammer_vop_print(struct vop_print_args *ap)
1318 {
1319         return EOPNOTSUPP;
1320 }
1321
1322 /*
1323  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1324  */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * If the caller wants seek cookies, size the array from the
	 * uio residual (capped at 1024 entries).
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 *
	 * It should be noted that the minimum value for a directory
	 * hash key on-media is 0x0000000100000000, so we can use anything
	 * less then that to represent our 'special' key space.
	 */
	error = 0;
	if (saveoff == 0) {
		/* synthesize "." at offset 0 */
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/*
		 * Synthesize ".." at offset 1.  A parent_obj_id of 0
		 * means we have no parent; point ".." back at ourselves.
		 */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      hammer_dir_localization(ip);
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/*
	 * ENOENT from the scan means we ran off the end of the
	 * directory (EOF), not a hard error.
	 */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			/* hand the cookie array to the caller to free */
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
1476
1477 /*
1478  * hammer_vop_readlink { vp, uio, cred }
1479  */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			/* extract the 5-digit PFS id following "@@PFS" */
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					/* vap->va_size == 26 */
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					/* vap->va_size == 10 */
					ksnprintf(buf, sizeof(buf),
						  "@@-1:%05d",
						  localization >> 16);
#if 0
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  (long long)HAMMER_MAX_TID,
						  localization >> 16);
#endif
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version - the symlink target is stored in a separate
	 * FIX record and must be looked up with a cursor.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
1586
1587 /*
1588  * hammer_vop_nremove { nch, dvp, cred }
1589  */
1590 static
1591 int
1592 hammer_vop_nremove(struct vop_nremove_args *ap)
1593 {
1594         struct hammer_transaction trans;
1595         struct hammer_inode *dip;
1596         int error;
1597
1598         dip = VTOI(ap->a_dvp);
1599
1600         if (hammer_nohistory(dip) == 0 &&
1601             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1602                 return (error);
1603         }
1604
1605         hammer_start_transaction(&trans, dip->hmp);
1606         ++hammer_stats_file_iopsw;
1607         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1608         hammer_done_transaction(&trans);
1609         if (error == 0)
1610                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1611         return (error);
1612 }
1613
/*
 * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
 *
 * Rename the object referenced by fnch to the name held by tnch.  The
 * target entry, if it exists, is removed first.  Cross-mount and
 * cross-PFS (localization mismatch) renames are refused with EXDEV so
 * the caller can fall back to a copy/remove sequence.
 */
static
int
hammer_vop_nrename(struct vop_nrename_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *fncp;		/* source name */
	struct namecache *tncp;		/* target name */
	struct hammer_inode *fdip;	/* source directory */
	struct hammer_inode *tdip;	/* target directory */
	struct hammer_inode *ip;	/* inode being renamed */
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/* Both directories and the source vnode must share one mount */
	if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
		return(EXDEV);
	if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
		return(EXDEV);

	fdip = VTOI(ap->a_fdvp);
	tdip = VTOI(ap->a_tdvp);
	fncp = ap->a_fnch->ncp;
	tncp = ap->a_tnch->ncp;
	ip = VTOI(fncp->nc_vp);
	KKASSERT(ip != NULL);

	/* Renames may not cross a PFS boundary */
	if (fdip->obj_localization != tdip->obj_localization)
		return(EXDEV);
	if (fdip->obj_localization != ip->obj_localization)
		return(EXDEV);

	if (fdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (tdip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	hammer_start_transaction(&trans, fdip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Remove tncp from the target directory and then link ip as
	 * tncp. XXX pass trans to dounlink
	 *
	 * Force the inode sync-time to match the transaction so it is
	 * in-sync with the creation of the target directory entry.
	 *
	 * ENOENT from the unlink simply means the target name did not
	 * exist, which is fine for a rename.
	 */
	error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
				ap->a_cred, 0, -1);
	if (error == 0 || error == ENOENT) {
		error = hammer_ip_add_directory(&trans, tdip,
						tncp->nc_name, tncp->nc_nlen,
						ip);
		if (error == 0) {
			ip->ino_data.parent_obj_id = tdip->obj_id;
			ip->ino_data.ctime = trans.time;
			hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error)
		goto failed; /* XXX */

	/*
	 * Locate the record in the originating directory and remove it.
	 *
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
	cursor.key_beg.localization = fdip->obj_localization +
				      hammer_dir_localization(fdip);
	cursor.key_beg.obj_id = fdip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = fdip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	while (error == 0) {
		if (hammer_ip_resolve_data(&cursor) != 0)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		if (fncp->nc_nlen == nlen &&
		    bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 *
	 * WARNING: hammer_ip_del_directory() may have to terminate the
	 * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
	 * twice.
	 */
	if (error == 0)
		error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);

	/*
	 * XXX A deadlock here will break rename's atomicy for the purposes
	 * of crash recovery.
	 */
	if (error == EDEADLK) {
		hammer_done_cursor(&cursor);
		goto retry;
	}

	/*
	 * Cleanup and tell the kernel that the rename succeeded.
	 */
	hammer_done_cursor(&cursor);
	if (error == 0) {
		cache_rename(ap->a_fnch, ap->a_tnch);
		hammer_knote(ap->a_fdvp, NOTE_WRITE);
		hammer_knote(ap->a_tdvp, NOTE_WRITE);
		if (ip->vp)
			hammer_knote(ip->vp, NOTE_RENAME);
	}

failed:
	hammer_done_transaction(&trans);
	return (error);
}
1765
1766 /*
1767  * hammer_vop_nrmdir { nch, dvp, cred }
1768  */
1769 static
1770 int
1771 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1772 {
1773         struct hammer_transaction trans;
1774         struct hammer_inode *dip;
1775         int error;
1776
1777         dip = VTOI(ap->a_dvp);
1778
1779         if (hammer_nohistory(dip) == 0 &&
1780             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1781                 return (error);
1782         }
1783
1784         hammer_start_transaction(&trans, dip->hmp);
1785         ++hammer_stats_file_iopsw;
1786         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
1787         hammer_done_transaction(&trans);
1788         if (error == 0)
1789                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1790         return (error);
1791 }
1792
1793 /*
1794  * hammer_vop_markatime { vp, cred }
1795  */
1796 static
1797 int
1798 hammer_vop_markatime(struct vop_markatime_args *ap)
1799 {
1800         struct hammer_transaction trans;
1801         struct hammer_inode *ip;
1802
1803         ip = VTOI(ap->a_vp);
1804         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1805                 return (EROFS);
1806         if (ip->flags & HAMMER_INODE_RO)
1807                 return (EROFS);
1808         if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
1809                 return (0);
1810         hammer_start_transaction(&trans, ip->hmp);
1811         ++hammer_stats_file_iopsw;
1812
1813         ip->ino_data.atime = trans.time;
1814         hammer_modify_inode(ip, HAMMER_INODE_ATIME);
1815         hammer_done_transaction(&trans);
1816         hammer_knote(ap->a_vp, NOTE_ATTRIB);
1817         return (0);
1818 }
1819
/*
 * hammer_vop_setattr { vp, vap, cred }
 *
 * Apply the attribute changes requested in *vap (flags, ownership, size,
 * times, mode).  Fields left at VNOVAL are not modified.  Dirty state is
 * accumulated in modflags and flushed via hammer_modify_inode() at the
 * end; kqueue notifications accumulate in kflags.
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;		/* HAMMER_INODE_* dirty bits to apply */
	int error;
	int truncating;		/* non-zero when shrinking the file */
	int blksize;
	int kflags;		/* NOTE_* bits for kqueue notification */
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	/*
	 * chflags-style request.  Note that a va_flags request is handled
	 * exclusively: the function jumps to done afterwards and does not
	 * process the other attribute fields.
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}

	/* Immutable/append-only files reject all other attribute changes */
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}

	/*
	 * chown/chgrp request.  uid/gid are stored as uuids in the inode.
	 */
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
				ip->ino_data.ctime = trans.time;
				modflags |= HAMMER_INODE_DDIRTY;
			}
			kflags |= NOTE_ATTRIB;
		}
	}

	/*
	 * truncate/extend request.  The while() is a one-shot break-able
	 * scope -- every path through the body ends in a break.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX break atomicy, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			blksize = hammer_blocksize(vap->va_size);
			if (vap->va_size < ip->ino_data.size) {
				vtruncbuf(ap->a_vp, vap->va_size, blksize);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.  trunc_off only ever
			 * moves downward here.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate1 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate2 %016llx\n",
						(long long)ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate3 %016llx (ignored)\n",
						(long long)vap->va_size);
#endif
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				aligned_size -= blksize;

				offset = (int)vap->va_size & (blksize - 1);
				error = bread(ap->a_vp, aligned_size,
					      blksize, &bp);
				hammer_ip_frontend_trunc(ip, aligned_size);
				if (error == 0) {
					bzero(bp->b_data + offset,
					      blksize - offset);
					/* must de-cache direct-io offset */
					bp->b_bio2.bio_offset = NOOFFSET;
					bdwrite(bp);
				} else {
					kprintf("ERROR %d\n", error);
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			ip->ino_data.mtime = trans.time;
			modflags |= HAMMER_INODE_MTIME | HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			/* size change on other vnode types is not supported */
			error = EINVAL;
			goto done;
		}
		break;
	}

	/* utimes-style requests */
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime = hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime = hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}

	/* chmod request */
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t   cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			ip->ino_data.ctime = trans.time;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	if (error == 0)
		hammer_modify_inode(ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}
2044
/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 *
 * Create a symlink.  Short targets (<= HAMMER_INODE_BASESYMLEN) are
 * stored inline in the inode's extended data, longer ones get a
 * separate HAMMER_RECTYPE_FIX record.
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory */
	struct hammer_inode *nip;	/* newly created symlink inode */
	struct nchandle *nch;
	hammer_record_t record;
	int error;
	int bytes;			/* length of the link target */

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, nch->ncp->nc_name, nch->ncp->nc_nlen,
				    NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is not \0 terminated.
	 *
	 * (error is always 0 here -- the failure case returned above.)
	 */
	if (error == 0) {
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER_INODE_BASESYMLEN) {
			/* short link: store inline in the inode */
			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
		} else {
			/* long link: emit a fix-key record holding the data */
			record = hammer_alloc_mem_record(nip, bytes);
			record->type = HAMMER_MEM_RECORD_GENERAL;

			record->leaf.base.localization = nip->obj_localization +
							 HAMMER_LOCALIZE_MISC;
			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
			record->leaf.data_len = bytes;
			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
			bcopy(ap->a_target, record->data->symlink.name, bytes);
			error = hammer_ip_add_record(&trans, record);
		}

		/*
		 * Set the file size to the length of the link.
		 */
		if (error == 0) {
			nip->ino_data.size = bytes;
			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
		}
	}
	if (error == 0)
		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
						nch->ncp->nc_nlen, nip);

	/*
	 * Finish up.  On success resolve the namecache entry and return
	 * the new vnode; on failure release the inode reference.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
			hammer_knote(ap->a_dvp, NOTE_WRITE);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}
2143
2144 /*
2145  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2146  */
2147 static
2148 int
2149 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2150 {
2151         struct hammer_transaction trans;
2152         struct hammer_inode *dip;
2153         int error;
2154
2155         dip = VTOI(ap->a_dvp);
2156
2157         if (hammer_nohistory(dip) == 0 &&
2158             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2159                 return (error);
2160         }
2161
2162         hammer_start_transaction(&trans, dip->hmp);
2163         ++hammer_stats_file_iopsw;
2164         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2165                                 ap->a_cred, ap->a_flags, -1);
2166         hammer_done_transaction(&trans);
2167
2168         return (error);
2169 }
2170
2171 /*
2172  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2173  */
2174 static
2175 int
2176 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2177 {
2178         struct hammer_inode *ip = ap->a_vp->v_data;
2179
2180         ++hammer_stats_file_iopsr;
2181         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2182                             ap->a_fflag, ap->a_cred));
2183 }
2184
2185 static
2186 int
2187 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2188 {
2189         struct mount *mp;
2190         int error;
2191
2192         mp = ap->a_head.a_ops->head.vv_mount;
2193
2194         switch(ap->a_op) {
2195         case MOUNTCTL_SET_EXPORT:
2196                 if (ap->a_ctllen != sizeof(struct export_args))
2197                         error = EINVAL;
2198                 else
2199                         error = hammer_vfs_export(mp, ap->a_op,
2200                                       (const struct export_args *)ap->a_ctl);
2201                 break;
2202         default:
2203                 error = journal_mountctl(ap);
2204                 break;
2205         }
2206         return(error);
2207 }
2208
2209 /*
2210  * hammer_vop_strategy { vp, bio }
2211  *
2212  * Strategy call, used for regular file read & write only.  Note that the
2213  * bp may represent a cluster.
2214  *
2215  * To simplify operation and allow better optimizations in the future,
2216  * this code does not make any assumptions with regards to buffer alignment
2217  * or size.
2218  */
2219 static
2220 int
2221 hammer_vop_strategy(struct vop_strategy_args *ap)
2222 {
2223         struct buf *bp;
2224         int error;
2225
2226         bp = ap->a_bio->bio_buf;
2227
2228         switch(bp->b_cmd) {
2229         case BUF_CMD_READ:
2230                 error = hammer_vop_strategy_read(ap);
2231                 break;
2232         case BUF_CMD_WRITE:
2233                 error = hammer_vop_strategy_write(ap);
2234                 break;
2235         default:
2236                 bp->b_error = error = EINVAL;
2237                 bp->b_flags |= B_ERROR;
2238                 biodone(ap->a_bio);
2239                 break;
2240         }
2241         return (error);
2242 }
2243
2244 /*
2245  * Read from a regular file.  Iterate the related records and fill in the
2246  * BIO/BUF.  Gaps are zero-filled.
2247  *
2248  * The support code in hammer_object.c should be used to deal with mixed
2249  * in-memory and on-disk records.
2250  *
2251  * NOTE: Can be called from the cluster code with an oversized buf.
2252  *
2253  * XXX atime update
2254  */
2255 static
2256 int
2257 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2258 {
2259         struct hammer_transaction trans;
2260         struct hammer_inode *ip;
2261         struct hammer_inode *dip;
2262         struct hammer_cursor cursor;
2263         hammer_base_elm_t base;
2264         hammer_off_t disk_offset;
2265         struct bio *bio;
2266         struct bio *nbio;
2267         struct buf *bp;
2268         int64_t rec_offset;
2269         int64_t ran_end;
2270         int64_t tmp64;
2271         int error;
2272         int boff;
2273         int roff;
2274         int n;
2275
2276         bio = ap->a_bio;
2277         bp = bio->bio_buf;
2278         ip = ap->a_vp->v_data;
2279
2280         /*
2281          * The zone-2 disk offset may have been set by the cluster code via
2282          * a BMAP operation, or else should be NOOFFSET.
2283          *
2284          * Checking the high bits for a match against zone-2 should suffice.
2285          */
2286         nbio = push_bio(bio);
2287         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2288             HAMMER_ZONE_LARGE_DATA) {
2289                 error = hammer_io_direct_read(ip->hmp, nbio, NULL);
2290                 return (error);
2291         }
2292
2293         /*
2294          * Well, that sucked.  Do it the hard way.  If all the stars are
2295          * aligned we may still be able to issue a direct-read.
2296          */
2297         hammer_simple_transaction(&trans, ip->hmp);
2298         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2299
2300         /*
2301          * Key range (begin and end inclusive) to scan.  Note that the key's
2302          * stored in the actual records represent BASE+LEN, not BASE.  The
2303          * first record containing bio_offset will have a key > bio_offset.
2304          */
2305         cursor.key_beg.localization = ip->obj_localization +
2306                                       HAMMER_LOCALIZE_MISC;
2307         cursor.key_beg.obj_id = ip->obj_id;
2308         cursor.key_beg.create_tid = 0;
2309         cursor.key_beg.delete_tid = 0;
2310         cursor.key_beg.obj_type = 0;
2311         cursor.key_beg.key = bio->bio_offset + 1;
2312         cursor.asof = ip->obj_asof;
2313         cursor.flags |= HAMMER_CURSOR_ASOF;
2314
2315         cursor.key_end = cursor.key_beg;
2316         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2317 #if 0
2318         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2319                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2320                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2321                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2322         } else
2323 #endif
2324         {
2325                 ran_end = bio->bio_offset + bp->b_bufsize;
2326                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2327                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2328                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2329                 if (tmp64 < ran_end)
2330                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2331                 else
2332                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2333         }
2334         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2335
2336         error = hammer_ip_first(&cursor);
2337         boff = 0;
2338
2339         while (error == 0) {
2340                 /*
2341                  * Get the base file offset of the record.  The key for
2342                  * data records is (base + bytes) rather then (base).
2343                  */
2344                 base = &cursor.leaf->base;
2345                 rec_offset = base->key - cursor.leaf->data_len;
2346
2347                 /*
2348                  * Calculate the gap, if any, and zero-fill it.
2349                  *
2350                  * n is the offset of the start of the record verses our
2351                  * current seek offset in the bio.
2352                  */
2353                 n = (int)(rec_offset - (bio->bio_offset + boff));
2354                 if (n > 0) {
2355                         if (n > bp->b_bufsize - boff)
2356                                 n = bp->b_bufsize - boff;
2357                         bzero((char *)bp->b_data + boff, n);
2358                         boff += n;
2359                         n = 0;
2360                 }
2361
2362                 /*
2363                  * Calculate the data offset in the record and the number
2364                  * of bytes we can copy.
2365                  *
2366                  * There are two degenerate cases.  First, boff may already
2367                  * be at bp->b_bufsize.  Secondly, the data offset within
2368                  * the record may exceed the record's size.
2369                  */
2370                 roff = -n;
2371                 rec_offset += roff;
2372                 n = cursor.leaf->data_len - roff;
2373                 if (n <= 0) {
2374                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2375                         n = 0;
2376                 } else if (n > bp->b_bufsize - boff) {
2377                         n = bp->b_bufsize - boff;
2378                 }
2379
2380                 /*
2381                  * Deal with cached truncations.  This cool bit of code
2382                  * allows truncate()/ftruncate() to avoid having to sync
2383                  * the file.
2384                  *
2385                  * If the frontend is truncated then all backend records are
2386                  * subject to the frontend's truncation.
2387                  *
2388                  * If the backend is truncated then backend records on-disk
2389                  * (but not in-memory) are subject to the backend's
2390                  * truncation.  In-memory records owned by the backend
2391                  * represent data written after the truncation point on the
2392                  * backend and must not be truncated.
2393                  *
2394                  * Truncate operations deal with frontend buffer cache
2395                  * buffers and frontend-owned in-memory records synchronously.
2396                  */
2397                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2398                         if (hammer_cursor_ondisk(&cursor) ||
2399                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2400                                 if (ip->trunc_off <= rec_offset)
2401                                         n = 0;
2402                                 else if (ip->trunc_off < rec_offset + n)
2403                                         n = (int)(ip->trunc_off - rec_offset);
2404                         }
2405                 }
2406                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2407                         if (hammer_cursor_ondisk(&cursor)) {
2408                                 if (ip->sync_trunc_off <= rec_offset)
2409                                         n = 0;
2410                                 else if (ip->sync_trunc_off < rec_offset + n)
2411                                         n = (int)(ip->sync_trunc_off - rec_offset);
2412                         }
2413                 }
2414
2415                 /*
2416                  * Try to issue a direct read into our bio if possible,
2417                  * otherwise resolve the element data into a hammer_buffer
2418                  * and copy.
2419                  *
2420                  * The buffer on-disk should be zerod past any real
2421                  * truncation point, but may not be for any synthesized
2422                  * truncation point from above.
2423                  */
2424                 disk_offset = cursor.leaf->data_offset + roff;
2425                 if (boff == 0 && n == bp->b_bufsize &&
2426                     hammer_cursor_ondisk(&cursor) &&
2427                     (disk_offset & HAMMER_BUFMASK) == 0) {
2428                         KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
2429                                  HAMMER_ZONE_LARGE_DATA);
2430                         nbio->bio_offset = disk_offset;
2431                         error = hammer_io_direct_read(trans.hmp, nbio,
2432                                                       cursor.leaf);
2433                         goto done;
2434                 } else if (n) {
2435                         error = hammer_ip_resolve_data(&cursor);
2436                         if (error == 0) {
2437                                 bcopy((char *)cursor.data + roff,
2438                                       (char *)bp->b_data + boff, n);
2439                         }
2440                 }
2441                 if (error)
2442                         break;
2443
2444                 /*
2445                  * Iterate until we have filled the request.
2446                  */
2447                 boff += n;
2448                 if (boff == bp->b_bufsize)
2449                         break;
2450                 error = hammer_ip_next(&cursor);
2451         }
2452
2453         /*
2454          * There may have been a gap after the last record
2455          */
2456         if (error == ENOENT)
2457                 error = 0;
2458         if (error == 0 && boff != bp->b_bufsize) {
2459                 KKASSERT(boff < bp->b_bufsize);
2460                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2461                 /* boff = bp->b_bufsize; */
2462         }
2463         bp->b_resid = 0;
2464         bp->b_error = error;
2465         if (error)
2466                 bp->b_flags |= B_ERROR;
2467         biodone(ap->a_bio);
2468
2469 done:
2470         /*
2471          * Cache the b-tree node for the last data read in cache[1].
2472          *
2473          * If we hit the file EOF then also cache the node in the
2474          * governing director's cache[3], it will be used to initialize
2475          * the inode's cache[1] for any inodes looked up via the directory.
2476          *
2477          * This doesn't reduce disk accesses since the B-Tree chain is
2478          * likely cached, but it does reduce cpu overhead when looking
2479          * up file offsets for cpdup/tar/cpio style iterations.
2480          */
2481         if (cursor.node)
2482                 hammer_cache_node(&ip->cache[1], cursor.node);
2483         if (ran_end >= ip->ino_data.size) {
2484                 dip = hammer_find_inode(&trans, ip->ino_data.parent_obj_id,
2485                                         ip->obj_asof, ip->obj_localization);
2486                 if (dip) {
2487                         hammer_cache_node(&dip->cache[3], cursor.node);
2488                         hammer_rel_inode(dip, 0);
2489                 }
2490         }
2491         hammer_done_cursor(&cursor);
2492         hammer_done_transaction(&trans);
2493         return(error);
2494 }
2495
2496 /*
2497  * BMAP operation - used to support cluster_read() only.
2498  *
2499  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2500  *
 * This routine may return EOPNOTSUPP if the operation is not supported for
2502  * the specified offset.  The contents of the pointer arguments do not
2503  * need to be initialized in that case. 
2504  *
2505  * If a disk address is available and properly aligned return 0 with 
2506  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2507  * to the run-length relative to that offset.  Callers may assume that
2508  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2509  * large, so return EOPNOTSUPP if it is not sufficiently large.
2510  */
2511 static
2512 int
2513 hammer_vop_bmap(struct vop_bmap_args *ap)
2514 {
2515         struct hammer_transaction trans;
2516         struct hammer_inode *ip;
2517         struct hammer_cursor cursor;
2518         hammer_base_elm_t base;
2519         int64_t rec_offset;
2520         int64_t ran_end;
2521         int64_t tmp64;
2522         int64_t base_offset;
2523         int64_t base_disk_offset;
2524         int64_t last_offset;
2525         hammer_off_t last_disk_offset;
2526         hammer_off_t disk_offset;
2527         int     rec_len;
2528         int     error;
2529         int     blksize;
2530
2531         ++hammer_stats_file_iopsr;
2532         ip = ap->a_vp->v_data;
2533
2534         /*
2535          * We can only BMAP regular files.  We can't BMAP database files,
2536          * directories, etc.
2537          */
2538         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2539                 return(EOPNOTSUPP);
2540
2541         /*
2542          * bmap is typically called with runp/runb both NULL when used
2543          * for writing.  We do not support BMAP for writing atm.
2544          */
2545         if (ap->a_cmd != BUF_CMD_READ)
2546                 return(EOPNOTSUPP);
2547
2548         /*
2549          * Scan the B-Tree to acquire blockmap addresses, then translate
2550          * to raw addresses.
2551          */
2552         hammer_simple_transaction(&trans, ip->hmp);
2553 #if 0
2554         kprintf("bmap_beg %016llx ip->cache %p\n",
2555                 (long long)ap->a_loffset, ip->cache[1]);
2556 #endif
2557         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2558
2559         /*
2560          * Key range (begin and end inclusive) to scan.  Note that the key's
2561          * stored in the actual records represent BASE+LEN, not BASE.  The
2562          * first record containing bio_offset will have a key > bio_offset.
2563          */
2564         cursor.key_beg.localization = ip->obj_localization +
2565                                       HAMMER_LOCALIZE_MISC;
2566         cursor.key_beg.obj_id = ip->obj_id;
2567         cursor.key_beg.create_tid = 0;
2568         cursor.key_beg.delete_tid = 0;
2569         cursor.key_beg.obj_type = 0;
2570         if (ap->a_runb)
2571                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2572         else
2573                 cursor.key_beg.key = ap->a_loffset + 1;
2574         if (cursor.key_beg.key < 0)
2575                 cursor.key_beg.key = 0;
2576         cursor.asof = ip->obj_asof;
2577         cursor.flags |= HAMMER_CURSOR_ASOF;
2578
2579         cursor.key_end = cursor.key_beg;
2580         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2581
2582         ran_end = ap->a_loffset + MAXPHYS;
2583         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2584         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2585         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2586         if (tmp64 < ran_end)
2587                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2588         else
2589                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2590
2591         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2592
2593         error = hammer_ip_first(&cursor);
2594         base_offset = last_offset = 0;
2595         base_disk_offset = last_disk_offset = 0;
2596
2597         while (error == 0) {
2598                 /*
2599                  * Get the base file offset of the record.  The key for
2600                  * data records is (base + bytes) rather then (base).
2601                  *
2602                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
2603                  * The extra bytes should be zero on-disk and the BMAP op
2604                  * should still be ok.
2605                  */
2606                 base = &cursor.leaf->base;
2607                 rec_offset = base->key - cursor.leaf->data_len;
2608                 rec_len    = cursor.leaf->data_len;
2609
2610                 /*
2611                  * Incorporate any cached truncation.
2612                  *
2613                  * NOTE: Modifications to rec_len based on synthesized
2614                  * truncation points remove the guarantee that any extended
2615                  * data on disk is zero (since the truncations may not have
2616                  * taken place on-media yet).
2617                  */
2618                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2619                         if (hammer_cursor_ondisk(&cursor) ||
2620                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2621                                 if (ip->trunc_off <= rec_offset)
2622                                         rec_len = 0;
2623                                 else if (ip->trunc_off < rec_offset + rec_len)
2624                                         rec_len = (int)(ip->trunc_off - rec_offset);
2625                         }
2626                 }
2627                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2628                         if (hammer_cursor_ondisk(&cursor)) {
2629                                 if (ip->sync_trunc_off <= rec_offset)
2630                                         rec_len = 0;
2631                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2632                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2633                         }
2634                 }
2635
2636                 /*
2637                  * Accumulate information.  If we have hit a discontiguous
2638                  * block reset base_offset unless we are already beyond the
2639                  * requested offset.  If we are, that's it, we stop.
2640                  */
2641                 if (error)
2642                         break;
2643                 if (hammer_cursor_ondisk(&cursor)) {
2644                         disk_offset = cursor.leaf->data_offset;
2645                         if (rec_offset != last_offset ||
2646                             disk_offset != last_disk_offset) {
2647                                 if (rec_offset > ap->a_loffset)
2648                                         break;
2649                                 base_offset = rec_offset;
2650                                 base_disk_offset = disk_offset;
2651                         }
2652                         last_offset = rec_offset + rec_len;
2653                         last_disk_offset = disk_offset + rec_len;
2654                 }
2655                 error = hammer_ip_next(&cursor);
2656         }
2657
2658 #if 0
2659         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2660                 (long long)ap->a_loffset,
2661                 (long long)base_offset,
2662                 (long long)last_offset);
2663         kprintf("BMAP %16s:  %016llx - %016llx\n", "",
2664                 (long long)base_disk_offset,
2665                 (long long)last_disk_offset);
2666 #endif
2667
2668         if (cursor.node) {
2669                 hammer_cache_node(&ip->cache[1], cursor.node);
2670 #if 0
2671                 kprintf("bmap_end2 %016llx ip->cache %p\n",
2672                         (long long)ap->a_loffset, ip->cache[1]);
2673 #endif
2674         }
2675         hammer_done_cursor(&cursor);
2676         hammer_done_transaction(&trans);
2677
2678         /*
2679          * If we couldn't find any records or the records we did find were
2680          * all behind the requested offset, return failure.  A forward
2681          * truncation can leave a hole w/ no on-disk records.
2682          */
2683         if (last_offset == 0 || last_offset < ap->a_loffset)
2684                 return (EOPNOTSUPP);
2685
2686         /*
2687          * Figure out the block size at the requested offset and adjust
2688          * our limits so the cluster_read() does not create inappropriately
2689          * sized buffer cache buffers.
2690          */
2691         blksize = hammer_blocksize(ap->a_loffset);
2692         if (hammer_blocksize(base_offset) != blksize) {
2693                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2694         }
2695         if (last_offset != ap->a_loffset &&
2696             hammer_blocksize(last_offset - 1) != blksize) {
2697                 last_offset = hammer_blockdemarc(ap->a_loffset,
2698                                                  last_offset - 1);
2699         }
2700
2701         /*
2702          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2703          * from occuring.
2704          */
2705         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2706
2707         if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2708                 /*
2709                  * Only large-data zones can be direct-IOd
2710                  */
2711                 error = EOPNOTSUPP;
2712         } else if ((disk_offset & HAMMER_BUFMASK) ||
2713                    (last_offset - ap->a_loffset) < blksize) {
2714                 /*
2715                  * doffsetp is not aligned or the forward run size does
2716                  * not cover a whole buffer, disallow the direct I/O.
2717                  */
2718                 error = EOPNOTSUPP;
2719         } else {
2720                 /*
2721                  * We're good.
2722                  */
2723                 *ap->a_doffsetp = disk_offset;
2724                 if (ap->a_runb) {
2725                         *ap->a_runb = ap->a_loffset - base_offset;
2726                         KKASSERT(*ap->a_runb >= 0);
2727                 }
2728                 if (ap->a_runp) {
2729                         *ap->a_runp = last_offset - ap->a_loffset;
2730                         KKASSERT(*ap->a_runp >= 0);
2731                 }
2732                 error = 0;
2733         }
2734         return(error);
2735 }
2736
2737 /*
2738  * Write to a regular file.   Because this is a strategy call the OS is
2739  * trying to actually get data onto the media.
2740  */
2741 static
2742 int
2743 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2744 {
2745         hammer_record_t record;
2746         hammer_mount_t hmp;
2747         hammer_inode_t ip;
2748         struct bio *bio;
2749         struct buf *bp;
2750         int blksize;
2751         int bytes;
2752         int error;
2753
2754         bio = ap->a_bio;
2755         bp = bio->bio_buf;
2756         ip = ap->a_vp->v_data;
2757         hmp = ip->hmp;
2758
2759         blksize = hammer_blocksize(bio->bio_offset);
2760         KKASSERT(bp->b_bufsize == blksize);
2761
2762         if (ip->flags & HAMMER_INODE_RO) {
2763                 bp->b_error = EROFS;
2764                 bp->b_flags |= B_ERROR;
2765                 biodone(ap->a_bio);
2766                 return(EROFS);
2767         }
2768
2769         /*
2770          * Interlock with inode destruction (no in-kernel or directory
2771          * topology visibility).  If we queue new IO while trying to
2772          * destroy the inode we can deadlock the vtrunc call in
2773          * hammer_inode_unloadable_check().
2774          *
2775          * Besides, there's no point flushing a bp associated with an
2776          * inode that is being destroyed on-media and has no kernel
2777          * references.
2778          */
2779         if ((ip->flags | ip->sync_flags) &
2780             (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2781                 bp->b_resid = 0;
2782                 biodone(ap->a_bio);
2783                 return(0);
2784         }
2785
2786         /*
2787          * Reserve space and issue a direct-write from the front-end. 
2788          * NOTE: The direct_io code will hammer_bread/bcopy smaller
2789          * allocations.
2790          *
2791          * An in-memory record will be installed to reference the storage
2792          * until the flusher can get to it.
2793          *
2794          * Since we own the high level bio the front-end will not try to
2795          * do a direct-read until the write completes.
2796          *
2797          * NOTE: The only time we do not reserve a full-sized buffers
2798          * worth of data is if the file is small.  We do not try to
2799          * allocate a fragment (from the small-data zone) at the end of
2800          * an otherwise large file as this can lead to wildly separated
2801          * data.
2802          */
2803         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2804         KKASSERT(bio->bio_offset < ip->ino_data.size);
2805         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2806                 bytes = bp->b_bufsize;
2807         else
2808                 bytes = ((int)ip->ino_data.size + 15) & ~15;
2809
2810         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2811                                     bytes, &error);
2812         if (record) {
2813                 hammer_io_direct_write(hmp, record, bio);
2814                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2815                         hammer_flush_inode(ip, 0);
2816         } else {
2817                 bp->b_bio2.bio_offset = NOOFFSET;
2818                 bp->b_error = error;
2819                 bp->b_flags |= B_ERROR;
2820                 biodone(ap->a_bio);
2821         }
2822         return(error);
2823 }
2824
2825 /*
2826  * dounlink - disconnect a directory entry
2827  *
2828  * XXX whiteout support not really in yet
2829  */
2830 static int
2831 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2832                 struct vnode *dvp, struct ucred *cred, 
2833                 int flags, int isdir)
2834 {
2835         struct namecache *ncp;
2836         hammer_inode_t dip;
2837         hammer_inode_t ip;
2838         struct hammer_cursor cursor;
2839         int64_t namekey;
2840         u_int32_t max_iterations;
2841         int nlen, error;
2842
2843         /*
2844          * Calculate the namekey and setup the key range for the scan.  This
2845          * works kinda like a chained hash table where the lower 32 bits
2846          * of the namekey synthesize the chain.
2847          *
2848          * The key range is inclusive of both key_beg and key_end.
2849          */
2850         dip = VTOI(dvp);
2851         ncp = nch->ncp;
2852
2853         if (dip->flags & HAMMER_INODE_RO)
2854                 return (EROFS);
2855
2856         namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
2857                                            &max_iterations);
2858 retry:
2859         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2860         cursor.key_beg.localization = dip->obj_localization +
2861                                       hammer_dir_localization(dip);
2862         cursor.key_beg.obj_id = dip->obj_id;
2863         cursor.key_beg.key = namekey;
2864         cursor.key_beg.create_tid = 0;
2865         cursor.key_beg.delete_tid = 0;
2866         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2867         cursor.key_beg.obj_type = 0;
2868
2869         cursor.key_end = cursor.key_beg;
2870         cursor.key_end.key += max_iterations;
2871         cursor.asof = dip->obj_asof;
2872         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2873
2874         /*
2875          * Scan all matching records (the chain), locate the one matching
2876          * the requested path component.  info->last_error contains the
2877          * error code on search termination and could be 0, ENOENT, or
2878          * something else.
2879          *
2880          * The hammer_ip_*() functions merge in-memory records with on-disk
2881          * records for the purposes of the search.
2882          */
2883         error = hammer_ip_first(&cursor);
2884
2885         while (error == 0) {
2886                 error = hammer_ip_resolve_data(&cursor);
2887                 if (error)
2888                         break;
2889                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2890                 KKASSERT(nlen > 0);
2891                 if (ncp->nc_nlen == nlen &&
2892                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2893                         break;
2894                 }
2895                 error = hammer_ip_next(&cursor);
2896         }
2897
2898         /*
2899          * If all is ok we have to get the inode so we can adjust nlinks.
2900          * To avoid a deadlock with the flusher we must release the inode
2901          * lock on the directory when acquiring the inode for the entry.
2902          *
2903          * If the target is a directory, it must be empty.
2904          */
2905         if (error == 0) {
2906                 hammer_unlock(&cursor.ip->lock);
2907                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2908                                       dip->hmp->asof,
2909                                       cursor.data->entry.localization,
2910                                       0, &error);
2911                 hammer_lock_sh(&cursor.ip->lock);
2912                 if (error == ENOENT) {
2913                         kprintf("HAMMER: WARNING: Removing "
2914                                 "dirent w/missing inode \"%s\"\n"
2915                                 "\tobj_id = %016llx\n",
2916                                 ncp->nc_name,
2917                                 (long long)cursor.data->entry.obj_id);
2918                         error = 0;
2919                 }
2920
2921                 /*
2922                  * If isdir >= 0 we validate that the entry is or is not a
2923                  * directory.  If isdir < 0 we don't care.
2924                  */
2925                 if (error == 0 && isdir >= 0 && ip) {
2926                         if (isdir &&
2927                             ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
2928                                 error = ENOTDIR;
2929                         } else if (isdir == 0 &&
2930                             ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
2931                                 error = EISDIR;
2932                         }
2933                 }
2934
2935                 /*
2936                  * If we are trying to remove a directory the directory must
2937                  * be empty.
2938                  *
2939                  * The check directory code can loop and deadlock/retry.  Our
2940                  * own cursor's node locks must be released to avoid a 3-way
2941                  * deadlock with the flusher if the check directory code
2942                  * blocks.
2943                  *
2944                  * If any changes whatsoever have been made to the cursor
2945                  * set EDEADLK and retry.
2946                  */
2947                 if (error == 0 && ip && ip->ino_data.obj_type ==
2948                                         HAMMER_OBJTYPE_DIRECTORY) {
2949                         hammer_unlock_cursor(&cursor);
2950                         error = hammer_ip_check_directory_empty(trans, ip);
2951                         hammer_lock_cursor(&cursor);
2952                         if (cursor.flags & HAMMER_CURSOR_RETEST) {
2953                                 kprintf("HAMMER: Warning: avoided deadlock "
2954                                         "on rmdir '%s'\n",
2955                                         ncp->nc_name);
2956                                 error = EDEADLK;
2957                         }
2958                 }
2959
2960                 /*
2961                  * Delete the directory entry.
2962                  *
2963                  * WARNING: hammer_ip_del_directory() may have to terminate
2964                  * the cursor to avoid a deadlock.  It is ok to call
2965                  * hammer_done_cursor() twice.
2966                  */
2967                 if (error == 0) {
2968                         error = hammer_ip_del_directory(trans, &cursor,
2969                                                         dip, ip);
2970                 }
2971                 hammer_done_cursor(&cursor);
2972                 if (error == 0) {
2973                         cache_setunresolved(nch);
2974                         cache_setvp(nch, NULL);
2975                         /* XXX locking */
2976                         if (ip && ip->vp) {
2977                                 hammer_knote(ip->vp, NOTE_DELETE);
2978                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2979                         }
2980                 }
2981                 if (ip)
2982                         hammer_rel_inode(ip, 0);
2983         } else {
2984                 hammer_done_cursor(&cursor);
2985         }
2986         if (error == EDEADLK)
2987                 goto retry;
2988
2989         return (error);
2990 }
2991
2992 /************************************************************************
2993  *                          FIFO AND SPECFS OPS                         *
2994  ************************************************************************
2995  *
2996  */
2997
2998 static int
2999 hammer_vop_fifoclose (struct vop_close_args *ap)
3000 {
3001         /* XXX update itimes */
3002         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
3003 }
3004
3005 static int
3006 hammer_vop_fiforead (struct vop_read_args *ap)
3007 {
3008         int error;
3009
3010         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3011         /* XXX update access time */
3012         return (error);
3013 }
3014
3015 static int
3016 hammer_vop_fifowrite (struct vop_write_args *ap)
3017 {
3018         int error;
3019
3020         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3021         /* XXX update access time */
3022         return (error);
3023 }
3024
3025 static
3026 int
3027 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
3028 {
3029         int error;
3030
3031         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
3032         if (error)
3033                 error = hammer_vop_kqfilter(ap);
3034         return(error);
3035 }
3036
3037 static int
3038 hammer_vop_specclose (struct vop_close_args *ap)
3039 {
3040         /* XXX update itimes */
3041         return (VOCALL(&spec_vnode_vops, &ap->a_head));
3042 }
3043
3044 static int
3045 hammer_vop_specread (struct vop_read_args *ap)
3046 {
3047         /* XXX update access time */
3048         return (VOCALL(&spec_vnode_vops, &ap->a_head));
3049 }
3050
3051 static int
3052 hammer_vop_specwrite (struct vop_write_args *ap)
3053 {
3054         /* XXX update last change time */
3055         return (VOCALL(&spec_vnode_vops, &ap->a_head));
3056 }
3057
3058 /*
3059  * SPECFS's getattr will override fields as necessary, but does not fill
3060  *          stuff in from scratch.
3061  */
3062 static
3063 int
3064 hammer_vop_specgetattr (struct vop_getattr_args *ap)
3065 {
3066         int error;
3067
3068         error = hammer_vop_getattr(ap);
3069         if (error == 0)
3070                 VOCALL(&spec_vnode_vops, &ap->a_head);
3071         return (error);
3072 }
3073
3074
3075 /************************************************************************
3076  *                          KQFILTER OPS                                *
3077  ************************************************************************
3078  *
3079  */
3080 static void filt_hammerdetach(struct knote *kn);
3081 static int filt_hammerread(struct knote *kn, long hint);
3082 static int filt_hammerwrite(struct knote *kn, long hint);
3083 static int filt_hammervnode(struct knote *kn, long hint);
3084
3085 static struct filterops hammerread_filtops =
3086         { 1, NULL, filt_hammerdetach, filt_hammerread };
3087 static struct filterops hammerwrite_filtops =
3088         { 1, NULL, filt_hammerdetach, filt_hammerwrite };
3089 static struct filterops hammervnode_filtops =
3090         { 1, NULL, filt_hammerdetach, filt_hammervnode };
3091
3092 static
3093 int
3094 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
3095 {
3096         struct vnode *vp = ap->a_vp;
3097         struct knote *kn = ap->a_kn;
3098         lwkt_tokref vlock;
3099
3100         switch (kn->kn_filter) {
3101         case EVFILT_READ:
3102                 kn->kn_fop = &hammerread_filtops;
3103                 break;
3104         case EVFILT_WRITE:
3105                 kn->kn_fop = &hammerwrite_filtops;
3106                 break;
3107         case EVFILT_VNODE:
3108                 kn->kn_fop = &hammervnode_filtops;
3109                 break;
3110         default:
3111                 return (1);
3112         }
3113
3114         kn->kn_hook = (caddr_t)vp;
3115
3116         lwkt_gettoken(&vlock, &vp->v_token);
3117         SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
3118         lwkt_reltoken(&vlock);
3119
3120         return(0);
3121 }
3122
3123 static void
3124 filt_hammerdetach(struct knote *kn)
3125 {
3126         struct vnode *vp = (void *)kn->kn_hook;
3127         lwkt_tokref vlock;
3128
3129         lwkt_gettoken(&vlock, &vp->v_token);
3130         SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
3131                      kn, knote, kn_selnext);
3132         lwkt_reltoken(&vlock);
3133 }
3134
3135 static int
3136 filt_hammerread(struct knote *kn, long hint)
3137 {
3138         struct vnode *vp = (void *)kn->kn_hook;
3139         hammer_inode_t ip = VTOI(vp);
3140
3141         if (hint == NOTE_REVOKE) {
3142                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3143                 return(1);
3144         }
3145         kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
3146         return (kn->kn_data != 0);
3147 }
3148
3149 static int
3150 filt_hammerwrite(struct knote *kn, long hint)
3151 {
3152         if (hint == NOTE_REVOKE)
3153                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3154         kn->kn_data = 0;
3155         return (1);
3156 }
3157
3158 static int
3159 filt_hammervnode(struct knote *kn, long hint)
3160 {
3161         if (kn->kn_sfflags & hint)
3162                 kn->kn_fflags |= hint;
3163         if (hint == NOTE_REVOKE) {
3164                 kn->kn_flags |= EV_EOF;
3165                 return (1);
3166         }
3167         return (kn->kn_fflags != 0);
3168 }
3169