Merge branch 'misc'
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vfs/fifofs/fifo.h>
50 #include "hammer.h"
51
52 /*
53  * USERFS VNOPS
54  */
55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
56 static int hammer_vop_fsync(struct vop_fsync_args *);
57 static int hammer_vop_read(struct vop_read_args *);
58 static int hammer_vop_write(struct vop_write_args *);
59 static int hammer_vop_access(struct vop_access_args *);
60 static int hammer_vop_advlock(struct vop_advlock_args *);
61 static int hammer_vop_close(struct vop_close_args *);
62 static int hammer_vop_ncreate(struct vop_ncreate_args *);
63 static int hammer_vop_getattr(struct vop_getattr_args *);
64 static int hammer_vop_nresolve(struct vop_nresolve_args *);
65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
66 static int hammer_vop_nlink(struct vop_nlink_args *);
67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
68 static int hammer_vop_nmknod(struct vop_nmknod_args *);
69 static int hammer_vop_open(struct vop_open_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_setattr(struct vop_setattr_args *);
77 static int hammer_vop_strategy(struct vop_strategy_args *);
78 static int hammer_vop_bmap(struct vop_bmap_args *ap);
79 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
80 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
81 static int hammer_vop_ioctl(struct vop_ioctl_args *);
82 static int hammer_vop_mountctl(struct vop_mountctl_args *);
83 static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
84
85 static int hammer_vop_fifoclose (struct vop_close_args *);
86 static int hammer_vop_fiforead (struct vop_read_args *);
87 static int hammer_vop_fifowrite (struct vop_write_args *);
88 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
89
90 static int hammer_vop_specclose (struct vop_close_args *);
91 static int hammer_vop_specread (struct vop_read_args *);
92 static int hammer_vop_specwrite (struct vop_write_args *);
93
/*
 * Vnode operations vector for regular HAMMER files and directories.
 * Any operation not listed here falls through to vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         vop_stdpathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl,
        .vop_kqfilter =         hammer_vop_kqfilter
};
130
/*
 * Vnode operations vector for character/block device nodes stored on
 * a HAMMER filesystem.  Device I/O is delegated to the spec layer;
 * metadata operations remain HAMMER's.
 */
struct vop_ops hammer_spec_vops = {
        .vop_default =          spec_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_specread,
        .vop_write =            hammer_vop_specwrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_specclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};
143
/*
 * Vnode operations vector for FIFOs stored on a HAMMER filesystem.
 * Pipe I/O is delegated to fifofs; metadata operations remain HAMMER's.
 */
struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr,
        .vop_kqfilter =         hammer_vop_fifokqfilter
};
157
158 static __inline
159 void
160 hammer_knote(struct vnode *vp, int flags)
161 {
162         if (flags)
163                 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
164 }
165
166 #ifdef DEBUG_TRUNCATE
167 struct hammer_inode *HammerTruncIp;
168 #endif
169
170 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
171                            struct vnode *dvp, struct ucred *cred,
172                            int flags, int isdir);
173 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
174 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
175
#if 0
/*
 * Generic fall-through vnode operation: dispatch through the HAMMER
 * vnode vop vector.  Currently compiled out; .vop_default handles this.
 *
 * Fix: the parameter was declared without a name while the body
 * referenced 'ap', so this would not compile if the #if 0 were enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
184
/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 *
 * Returns the inode's accumulated error status.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        ++hammer_count_fsyncs;
        /* Flush dirty buffers for the vnode, then signal the flusher. */
        vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (ap->a_waitfor == MNT_WAIT) {
                /*
                 * Drop the vnode lock while waiting for the flush to
                 * complete — presumably so the flusher is not blocked
                 * against our lock (NOTE(review): confirm).
                 */
                vn_unlock(ap->a_vp);
                hammer_wait_inode(ip);
                vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
        }
        return (ip->error);
}
208
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read from a regular file through the buffer cache.  Returns 0 or an
 * errno.  On a successful read the inode's atime is updated unless the
 * inode is read-only (e.g. an as-of/snapshot view) or the mount is
 * MNT_NOATIME.
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        off_t offset;           /* byte offset within the current block */
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;                  /* bytes to copy from the current block */
        int seqcount;
        int ioseqcount;
        int blksize;

        /* Only regular files are read through this path. */
        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
        uio = ap->a_uio;

        /*
         * Allow the UIO's size to override the sequential heuristic.
         */
        blksize = hammer_blocksize(uio->uio_offset);
        seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
        ioseqcount = ap->a_ioflag >> 16;  /* caller hint in high ioflag bits */
        if (seqcount < ioseqcount)
                seqcount = ioseqcount;

        hammer_start_transaction(&trans, ip->hmp);

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                int64_t base_offset;
                int64_t file_limit;

                /* Block size can change across the demarc; recompute. */
                blksize = hammer_blocksize(uio->uio_offset);
                offset = (int)uio->uio_offset & (blksize - 1);
                base_offset = uio->uio_offset - offset;

                if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
                         * creating buffers of the wrong block size past
                         * the demarc.
                         */
                        file_limit = ip->ino_data.size;
                        if (base_offset < HAMMER_XDEMARC &&
                            file_limit > HAMMER_XDEMARC) {
                                file_limit = HAMMER_XDEMARC;
                        }
                        error = cluster_read(ap->a_vp,
                                             file_limit, base_offset,
                                             blksize, MAXPHYS,
                                             seqcount, &bp);
                } else {
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                }
                if (error) {
                        kprintf("error %d\n", error);
                        brelse(bp);
                        break;
                }

                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                /* Clip the copy to the block, the uio, and file EOF. */
                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);

                /* data has a lower priority than meta-data */
                bp->b_flags |= B_AGE;
                bqrelse(bp);
                if (error)
                        break;
                hammer_stats_file_read += n;
        }

        /*
         * Record atime unless the inode is RO or the mount is noatime.
         */
        if ((ip->flags & HAMMER_INODE_RO) == 0 &&
            (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                ip->ino_data.atime = trans.time;
                hammer_modify_inode(ip, HAMMER_INODE_ATIME);
        }
        hammer_done_transaction(&trans);
        return (error);
}
304
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Write to a regular file through the buffer cache.  Returns 0 or an
 * errno.  Updates the inode's size and mtime and posts NOTE_WRITE /
 * NOTE_EXTEND kqueue notifications as appropriate.
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        struct uio *uio;
        int offset;             /* byte offset within the current block */
        off_t base_offset;      /* block-aligned offset of current block */
        struct buf *bp;
        int kflags;             /* accumulated kqueue event flags */
        int error;
        int n;                  /* bytes to copy into the current block */
        int flags;              /* flags for hammer_modify_inode() */
        int delta;
        int seqcount;

        /* Only regular files are written through this path. */
        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        kflags = 0;
        seqcount = ap->a_ioflag >> 16;  /* caller hint in high ioflag bits */

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_off assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= 0) {
                /* offset + resid wrapped past 2^63-1 */
                hammer_done_transaction(&trans);
                return (EFBIG);
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;        /* set if we grew the VM object */
                int blksize;
                int blkmask;

                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;

                blksize = hammer_blocksize(uio->uio_offset);

                /*
                 * Do not allow HAMMER to blow out the buffer cache.  Very
                 * large UIOs can lockout other processes due to bwillwrite()
                 * mechanics.
                 *
                 * The hammer inode is not locked during these operations.
                 * The vnode is locked which can interfere with the pageout
                 * daemon for non-UIO_NOCOPY writes but should not interfere
                 * with the buffer cache.  Even so, we cannot afford to
                 * allow the pageout daemon to build up too many dirty buffer
                 * cache buffers.
                 */
                /*if (((int)uio->uio_offset & (blksize - 1)) == 0)*/
                bwillwrite(blksize);

                /*
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are so well
                 * decoupled from the buffer cache that it is possible
                 * for userland to push data out to the media via
                 * direct-write, but build up the records queued to the
                 * backend faster than the backend can flush them out.
                 * HAMMER has hit its write limit but the frontend has
                 * no pushback to slow it down.
                 */
                if (hmp->rsv_recs > hammer_limit_recs / 2) {
                        /*
                         * Get the inode on the flush list
                         */
                        if (ip->rsv_recs >= 64)
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        else if (ip->rsv_recs >= 16)
                                hammer_flush_inode(ip, 0);

                        /*
                         * Keep the flusher going if the system keeps
                         * queueing records.
                         */
                        delta = hmp->count_newrecords -
                                hmp->last_newrecords;
                        if (delta < 0 || delta > hammer_limit_recs / 2) {
                                hmp->last_newrecords = hmp->count_newrecords;
                                hammer_sync_hmp(hmp, MNT_NOWAIT);
                        }

                        /*
                         * If we have gotten behind start slowing
                         * down the writers.
                         */
                        delta = (hmp->rsv_recs - hammer_limit_recs) *
                                hz / hammer_limit_recs;
                        if (delta > 0)
                                tsleep(&trans, 0, "hmrslo", delta);
                }

                /*
                 * Calculate the blocksize at the current offset and figure
                 * out how much we can actually write.
                 */
                blkmask = blksize - 1;
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (uio->uio_offset + n > ip->ino_data.size) {
                        /* Extending the file; grow the VM object first. */
                        vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              blksize, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= blksize) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a 
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        error = uiomove((char *)bp->b_data + offset,
                                        n, uio);
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                vtruncbuf(ap->a_vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size));
                        }
                        break;
                }
                kflags |= NOTE_WRITE;
                hammer_stats_file_write += n;
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        /* The write extended the file; record new size. */
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_DDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(ip, flags);

                /*
                 * Once we dirty the buffer any cached zone-X offset
                 * becomes invalid.  HAMMER NOTE: no-history mode cannot 
                 * allow overwriting over the same data sector unless
                 * we provide UNDOs for the old data, which we don't.
                 */
                bp->b_bio2.bio_offset = NOOFFSET;

                /*
                 * Final buffer disposition: synchronous, async, or
                 * delayed write depending on the caller's I/O flags.
                 */
                bp->b_flags |= B_AGE;
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
        return (error);
}
547
548 /*
549  * hammer_vop_access { vp, mode, cred }
550  */
551 static
552 int
553 hammer_vop_access(struct vop_access_args *ap)
554 {
555         struct hammer_inode *ip = VTOI(ap->a_vp);
556         uid_t uid;
557         gid_t gid;
558         int error;
559
560         ++hammer_stats_file_iopsr;
561         uid = hammer_to_unix_xid(&ip->ino_data.uid);
562         gid = hammer_to_unix_xid(&ip->ino_data.gid);
563
564         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
565                                   ip->ino_data.uflags);
566         return (error);
567 }
568
569 /*
570  * hammer_vop_advlock { vp, id, op, fl, flags }
571  */
572 static
573 int
574 hammer_vop_advlock(struct vop_advlock_args *ap)
575 {
576         hammer_inode_t ip = VTOI(ap->a_vp);
577
578         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
579 }
580
/*
 * hammer_vop_close { vp, fflag }
 *
 * HAMMER requires no per-close bookkeeping; defer entirely to the
 * standard close handler.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
        return (vop_stdclose(ap));
}
591
/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * Create a regular file in directory dvp.  The operating system has
 * already ensured that the directory entry does not exist and done all
 * appropriate namespace locking.
 *
 * On success *a_vpp holds the new vnode and the namecache entry is
 * resolved; on failure *a_vpp is NULL and an errno is returned.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;       /* parent directory inode */
        struct hammer_inode *nip;       /* newly created inode */
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        /* Cannot create in a read-only (as-of/snapshot) directory. */
        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        /* Refuse to create when the filesystem is too low on space. */
        if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */

        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, NULL, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.  On error drop the inode reference; on success get
         * a vnode for it and resolve the namecache entry.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
                /* Notify kqueue watchers that the directory changed. */
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
        return (error);
}
666
667 /*
668  * hammer_vop_getattr { vp, vap }
669  *
670  * Retrieve an inode's attribute information.  When accessing inodes
671  * historically we fake the atime field to ensure consistent results.
672  * The atime field is stored in the B-Tree element and allowed to be
673  * updated without cycling the element.
674  */
675 static
676 int
677 hammer_vop_getattr(struct vop_getattr_args *ap)
678 {
679         struct hammer_inode *ip = VTOI(ap->a_vp);
680         struct vattr *vap = ap->a_vap;
681
682         /*
683          * We want the fsid to be different when accessing a filesystem
684          * with different as-of's so programs like diff don't think
685          * the files are the same.
686          *
687          * We also want the fsid to be the same when comparing snapshots,
688          * or when comparing mirrors (which might be backed by different
689          * physical devices).  HAMMER fsids are based on the PFS's
690          * shared_uuid field.
691          *
692          * XXX there is a chance of collision here.  The va_fsid reported
693          * by stat is different from the more involved fsid used in the
694          * mount structure.
695          */
696         ++hammer_stats_file_iopsr;
697         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
698                        (u_int32_t)(ip->obj_asof >> 32);
699
700         vap->va_fileid = ip->ino_leaf.base.obj_id;
701         vap->va_mode = ip->ino_data.mode;
702         vap->va_nlink = ip->ino_data.nlinks;
703         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
704         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
705         vap->va_rmajor = 0;
706         vap->va_rminor = 0;
707         vap->va_size = ip->ino_data.size;
708
709         /*
710          * Special case for @@PFS softlinks.  The actual size of the
711          * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
712          */
713         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
714             ip->ino_data.size == 10 &&
715             ip->obj_asof == HAMMER_MAX_TID &&
716             ip->obj_localization == 0 &&
717             strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
718                     vap->va_size = 26;
719         }
720
721         /*
722          * We must provide a consistent atime and mtime for snapshots
723          * so people can do a 'tar cf - ... | md5' on them and get
724          * consistent results.
725          */
726         if (ip->flags & HAMMER_INODE_RO) {
727                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
728                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
729         } else {
730                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
731                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
732         }
733         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
734         vap->va_flags = ip->ino_data.uflags;
735         vap->va_gen = 1;        /* hammer inums are unique for all time */
736         vap->va_blocksize = HAMMER_BUFSIZE;
737         if (ip->ino_data.size >= HAMMER_XDEMARC) {
738                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
739                                 ~HAMMER_XBUFMASK64;
740         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
741                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
742                                 ~HAMMER_BUFMASK64;
743         } else {
744                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
745         }
746
747         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
748         vap->va_filerev = 0;    /* XXX */
749         /* mtime uniquely identifies any adjustments made to the file XXX */
750         vap->va_fsmid = ip->ino_data.mtime;
751         vap->va_uid_uuid = ip->ino_data.uid;
752         vap->va_gid_uuid = ip->ino_data.gid;
753         vap->va_fsid_uuid = ip->hmp->fsid;
754         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
755                           VA_FSID_UUID_VALID;
756
757         switch (ip->ino_data.obj_type) {
758         case HAMMER_OBJTYPE_CDEV:
759         case HAMMER_OBJTYPE_BDEV:
760                 vap->va_rmajor = ip->ino_data.rmajor;
761                 vap->va_rminor = ip->ino_data.rminor;
762                 break;
763         default:
764                 break;
765         }
766         return(0);
767 }
768
/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.  Also parses HAMMER's '@@'
 * as-of (historical) name extensions and dives into PFS softlinks.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;		/* directory being searched */
	hammer_inode_t ip;		/* resolved inode */
	hammer_tid_t asof;		/* as-of TID governing the lookup */
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;			/* name length, excluding any @@ext */
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan the name for a '@@' extension.  On a successful parse the
	 * extension supplies asof (and possibly a PFS localization) and
	 * nlen is truncated to the leading path component.  On a parse
	 * failure i is forced to nlen so the whole string is treated as
	 * an ordinary name.
	 *
	 * NOTE(review): reads nc_name[i+1] when i == nlen-1; assumes
	 * nc_name is NUL-terminated — confirm against namecache code.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			/* non-current (historical) access is read-only */
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);

	/*
	 * Instantiate the inode we located and resolve the namecache
	 * entry to its (unlocked, referenced) vnode.  An ENOENT from
	 * the scan installs a negative namecache entry instead.
	 */
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}
942
/*
 * hammer_vop_nlookupdotdot { dvp, vpp, cred }
 *
 * Locate the parent directory of a directory vnode.
 *
 * dvp is referenced but not locked.  *vpp must be returned referenced and
 * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
 * at the root, instead it could indicate that the directory we were in was
 * removed.
 *
 * NOTE: as-of sequences are not linked into the directory structure.  If
 * we are at the root with a different asof then the mount point, reload
 * the same directory with the mount point's asof.   I'm not sure what this
 * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
 * get confused, but it hasn't been tested.
 */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	if (parent_obj_id == 0) {
		/*
		 * No recorded parent.  If this is the root of an as-of
		 * (historical) view, re-enter the same directory at the
		 * mount's asof and synthesize a fake ".." name encoding
		 * the historical TID; otherwise the parent is gone.
		 */
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != dip->hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			/* 19 bytes: "0x" + 16 hex digits + NUL */
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				   dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	/* get_inode returns a referenced, unlocked inode or NULL */
	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}
1013
1014 /*
1015  * hammer_vop_nlink { nch, dvp, vp, cred }
1016  */
1017 static
1018 int
1019 hammer_vop_nlink(struct vop_nlink_args *ap)
1020 {
1021         struct hammer_transaction trans;
1022         struct hammer_inode *dip;
1023         struct hammer_inode *ip;
1024         struct nchandle *nch;
1025         int error;
1026
1027         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
1028                 return(EXDEV);
1029
1030         nch = ap->a_nch;
1031         dip = VTOI(ap->a_dvp);
1032         ip = VTOI(ap->a_vp);
1033
1034         if (dip->obj_localization != ip->obj_localization)
1035                 return(EXDEV);
1036
1037         if (dip->flags & HAMMER_INODE_RO)
1038                 return (EROFS);
1039         if (ip->flags & HAMMER_INODE_RO)
1040                 return (EROFS);
1041         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1042                 return (error);
1043
1044         /*
1045          * Create a transaction to cover the operations we perform.
1046          */
1047         hammer_start_transaction(&trans, dip->hmp);
1048         ++hammer_stats_file_iopsw;
1049
1050         /*
1051          * Add the filesystem object to the directory.  Note that neither
1052          * dip nor ip are referenced or locked, but their vnodes are
1053          * referenced.  This function will bump the inode's link count.
1054          */
1055         error = hammer_ip_add_directory(&trans, dip,
1056                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1057                                         ip);
1058
1059         /*
1060          * Finish up.
1061          */
1062         if (error == 0) {
1063                 cache_setunresolved(nch);
1064                 cache_setvp(nch, ap->a_vp);
1065         }
1066         hammer_done_transaction(&trans);
1067         hammer_knote(ap->a_vp, NOTE_LINK);
1068         hammer_knote(ap->a_dvp, NOTE_WRITE);
1069         return (error);
1070 }
1071
/*
 * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	/* cannot create entries in a read-only directory */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.  On failure drop our reference on the new inode;
	 * on success obtain its vnode and resolve the namecache entry.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	return (error);
}
1143
/*
 * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	/* cannot create entries in a read-only directory */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.  On failure drop our reference on the new inode;
	 * on success obtain its vnode and resolve the namecache entry.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}
1215
1216 /*
1217  * hammer_vop_open { vp, mode, cred, fp }
1218  */
1219 static
1220 int
1221 hammer_vop_open(struct vop_open_args *ap)
1222 {
1223         hammer_inode_t ip;
1224
1225         ++hammer_stats_file_iopsr;
1226         ip = VTOI(ap->a_vp);
1227
1228         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1229                 return (EROFS);
1230         return(vop_stdopen(ap));
1231 }
1232
1233 /*
1234  * hammer_vop_print { vp }
1235  */
1236 static
1237 int
1238 hammer_vop_print(struct vop_print_args *ap)
1239 {
1240         return EOPNOTSUPP;
1241 }
1242
/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;		/* directory seek position (B-Tree key) */
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * If the caller wants seek cookies, size the array heuristically
	 * from the uio (one cookie per ~16 bytes of buffer), capped at
	 * 1024 entries.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries.  "." is emitted at offset 0 and
	 * ".." at offset 1; real directory records follow.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/* with no recorded parent, report ourselves as ".." */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	/*
	 * Emit one dirent per directory record until the uio fills
	 * (r != 0), the cookie array fills, or the scan ends.
	 */
	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/*
	 * ENOENT from the scan simply means end-of-directory.  Hand the
	 * cookie array to the caller only if at least one entry was
	 * emitted, otherwise free it.
	 */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
1393
/*
 * hammer_vop_readlink { vp, uio, cred }
 */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		/*
		 * "@@PFS%05d" is exactly 10 bytes.  Decode the decimal
		 * PFS id, load the PFS, and rewrite the target as a
		 * full "@@<tid>:<pfsid>" softlink.  Slaves expand to
		 * their sync_end_tid; masters to HAMMER_MAX_TID
		 * (current view).  On load failure the raw link text
		 * is returned unexpanded.
		 */
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  HAMMER_MAX_TID,
						  localization >> 16);
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version: the target is stored in a separate
	 * HAMMER_RECTYPE_FIX record keyed by HAMMER_FIXKEY_SYMLINK.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
1496
1497 /*
1498  * hammer_vop_nremove { nch, dvp, cred }
1499  */
1500 static
1501 int
1502 hammer_vop_nremove(struct vop_nremove_args *ap)
1503 {
1504         struct hammer_transaction trans;
1505         struct hammer_inode *dip;
1506         int error;
1507
1508         dip = VTOI(ap->a_dvp);
1509
1510         if (hammer_nohistory(dip) == 0 &&
1511             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1512                 return (error);
1513         }
1514
1515         hammer_start_transaction(&trans, dip->hmp);
1516         ++hammer_stats_file_iopsw;
1517         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1518         hammer_done_transaction(&trans);
1519         if (error == 0)
1520                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1521         return (error);
1522 }
1523
1524 /*
1525  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1526  */
1527 static
1528 int
1529 hammer_vop_nrename(struct vop_nrename_args *ap)
1530 {
1531         struct hammer_transaction trans;
1532         struct namecache *fncp;
1533         struct namecache *tncp;
1534         struct hammer_inode *fdip;
1535         struct hammer_inode *tdip;
1536         struct hammer_inode *ip;
1537         struct hammer_cursor cursor;
1538         int64_t namekey;
1539         u_int32_t max_iterations;
1540         int nlen, error;
1541
1542         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 
1543                 return(EXDEV);
1544         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1545                 return(EXDEV);
1546
1547         fdip = VTOI(ap->a_fdvp);
1548         tdip = VTOI(ap->a_tdvp);
1549         fncp = ap->a_fnch->ncp;
1550         tncp = ap->a_tnch->ncp;
1551         ip = VTOI(fncp->nc_vp);
1552         KKASSERT(ip != NULL);
1553
1554         if (fdip->obj_localization != tdip->obj_localization)
1555                 return(EXDEV);
1556         if (fdip->obj_localization != ip->obj_localization)
1557                 return(EXDEV);
1558
1559         if (fdip->flags & HAMMER_INODE_RO)
1560                 return (EROFS);
1561         if (tdip->flags & HAMMER_INODE_RO)
1562                 return (EROFS);
1563         if (ip->flags & HAMMER_INODE_RO)
1564                 return (EROFS);
1565         if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1566                 return (error);
1567
1568         hammer_start_transaction(&trans, fdip->hmp);
1569         ++hammer_stats_file_iopsw;
1570
1571         /*
1572          * Remove tncp from the target directory and then link ip as
1573          * tncp. XXX pass trans to dounlink
1574          *
1575          * Force the inode sync-time to match the transaction so it is
1576          * in-sync with the creation of the target directory entry.
1577          */
1578         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1579                                 ap->a_cred, 0, -1);
1580         if (error == 0 || error == ENOENT) {
1581                 error = hammer_ip_add_directory(&trans, tdip,
1582                                                 tncp->nc_name, tncp->nc_nlen,
1583                                                 ip);
1584                 if (error == 0) {
1585                         ip->ino_data.parent_obj_id = tdip->obj_id;
1586                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1587                 }
1588         }
1589         if (error)
1590                 goto failed; /* XXX */
1591
1592         /*
1593          * Locate the record in the originating directory and remove it.
1594          *
1595          * Calculate the namekey and setup the key range for the scan.  This
1596          * works kinda like a chained hash table where the lower 32 bits
1597          * of the namekey synthesize the chain.
1598          *
1599          * The key range is inclusive of both key_beg and key_end.
1600          */
1601         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1602                                            &max_iterations);
1603 retry:
1604         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1605         cursor.key_beg.localization = fdip->obj_localization +
1606                                       HAMMER_LOCALIZE_MISC;
1607         cursor.key_beg.obj_id = fdip->obj_id;
1608         cursor.key_beg.key = namekey;
1609         cursor.key_beg.create_tid = 0;
1610         cursor.key_beg.delete_tid = 0;
1611         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1612         cursor.key_beg.obj_type = 0;
1613
1614         cursor.key_end = cursor.key_beg;
1615         cursor.key_end.key += max_iterations;
1616         cursor.asof = fdip->obj_asof;
1617         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1618
1619         /*
1620          * Scan all matching records (the chain), locate the one matching
1621          * the requested path component.
1622          *
1623          * The hammer_ip_*() functions merge in-memory records with on-disk
1624          * records for the purposes of the search.
1625          */
1626         error = hammer_ip_first(&cursor);
1627         while (error == 0) {
1628                 if (hammer_ip_resolve_data(&cursor) != 0)
1629                         break;
1630                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1631                 KKASSERT(nlen > 0);
1632                 if (fncp->nc_nlen == nlen &&
1633                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1634                         break;
1635                 }
1636                 error = hammer_ip_next(&cursor);
1637         }
1638
1639         /*
1640          * If all is ok we have to get the inode so we can adjust nlinks.
1641          *
1642          * WARNING: hammer_ip_del_directory() may have to terminate the
1643          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1644          * twice.
1645          */
1646         if (error == 0)
1647                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1648
1649         /*
1650          * XXX A deadlock here will break rename's atomicy for the purposes
1651          * of crash recovery.
1652          */
1653         if (error == EDEADLK) {
1654                 hammer_done_cursor(&cursor);
1655                 goto retry;
1656         }
1657
1658         /*
1659          * Cleanup and tell the kernel that the rename succeeded.
1660          */
1661         hammer_done_cursor(&cursor);
1662         if (error == 0) {
1663                 cache_rename(ap->a_fnch, ap->a_tnch);
1664                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
1665                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
1666                 if (ip->vp)
1667                         hammer_knote(ip->vp, NOTE_RENAME);
1668         }
1669
1670 failed:
1671         hammer_done_transaction(&trans);
1672         return (error);
1673 }
1674
1675 /*
1676  * hammer_vop_nrmdir { nch, dvp, cred }
1677  */
1678 static
1679 int
1680 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1681 {
1682         struct hammer_transaction trans;
1683         struct hammer_inode *dip;
1684         int error;
1685
1686         dip = VTOI(ap->a_dvp);
1687
1688         if (hammer_nohistory(dip) == 0 &&
1689             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1690                 return (error);
1691         }
1692
1693         hammer_start_transaction(&trans, dip->hmp);
1694         ++hammer_stats_file_iopsw;
1695         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
1696         hammer_done_transaction(&trans);
1697         if (error == 0)
1698                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1699         return (error);
1700 }
1701
/*
 * hammer_vop_setattr { vp, vap, cred }
 *
 * Change inode attributes: flags (chflags), ownership (chown), size
 * (truncate/extend), access/modify times, and mode (chmod).  Fields in
 * *vap equal to VNOVAL are left untouched.  Inode dirty bits are
 * accumulated in modflags and applied via a single hammer_modify_inode()
 * on success; kqueue events accumulate in kflags and are posted on the
 * way out regardless of error.
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;		/* HAMMER_INODE_* dirty bits to apply at done: */
	int error;
	int truncating;		/* 1 if shrinking a VREG file */
	int blksize;
	int kflags;		/* NOTE_* events for hammer_knote() */
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;

	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	/* require free space unless history retention is off for this inode */
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	/*
	 * chflags.  A flags change is handled exclusively of the other
	 * attributes -- the function jumps to done: whether or not the
	 * helper succeeded.
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	/* immutable/append-only inodes reject all other attribute changes */
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	/*
	 * chown/chgrp.  uid/gid are stored as uuids on-media, so convert
	 * both ways and compare the uuid representations.
	 */
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
			}
			/* NOTE: dirtied/notified even if nothing changed */
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
	/*
	 * truncate/extend.  The while+break construct executes at most
	 * once; it exists so the cases can 'break' out early.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX break atomicy, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			blksize = hammer_blocksize(vap->va_size);
			if (vap->va_size < ip->ino_data.size) {
				vtruncbuf(ap->a_vp, vap->va_size, blksize);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.  Only move trunc_off
			 * downward; a later, larger truncation point is
			 * already covered by the cached one.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate1 %016llx\n", ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate2 %016llx\n", ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
#endif
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				aligned_size -= blksize;

				offset = (int)vap->va_size & (blksize - 1);
				error = bread(ap->a_vp, aligned_size,
					      blksize, &bp);
				hammer_ip_frontend_trunc(ip, aligned_size);
				if (error == 0) {
					bzero(bp->b_data + offset,
					      blksize - offset);
					/* must de-cache direct-io offset */
					bp->b_bio2.bio_offset = NOOFFSET;
					bdwrite(bp);
				} else {
					kprintf("ERROR %d\n", error);
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			/* database files have no backing vm/buffer pages */
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	/* atime/mtime updates use dedicated dirty bits, not DDIRTY */
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime =
			hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime =
			hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}
	/* chmod */
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t   cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	/* apply accumulated dirty bits only on full success */
	if (error == 0)
		hammer_modify_inode(ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}
1920
/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 *
 * Create a symbolic link.  Short targets are embedded directly in the
 * inode; longer ones are stored as a separate fixed-key data record.
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory inode */
	struct hammer_inode *nip;	/* newly created symlink inode */
	struct nchandle *nch;
	hammer_record_t record;
	int error;
	int bytes;			/* length of the link target */

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is not \0-terminated.
	 *
	 * Targets that fit in the inode's extended data area are embedded
	 * there; longer targets get a general record keyed with
	 * HAMMER_FIXKEY_SYMLINK.
	 */
	if (error == 0) {
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER_INODE_BASESYMLEN) {
			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
		} else {
			record = hammer_alloc_mem_record(nip, bytes);
			record->type = HAMMER_MEM_RECORD_GENERAL;

			record->leaf.base.localization = nip->obj_localization +
							 HAMMER_LOCALIZE_MISC;
			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
			record->leaf.data_len = bytes;
			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
			bcopy(ap->a_target, record->data->symlink.name, bytes);
			error = hammer_ip_add_record(&trans, record);
		}

		/*
		 * Set the file size to the length of the link.
		 */
		if (error == 0) {
			nip->ino_data.size = bytes;
			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
		}
	}
	/* link the new inode into the parent directory */
	if (error == 0)
		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
						nch->ncp->nc_nlen, nip);

	/*
	 * Finish up.  On success resolve the namecache entry and return
	 * the new vnode; the inode reference is dropped either way.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
			hammer_knote(ap->a_dvp, NOTE_WRITE);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}
2018
2019 /*
2020  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2021  */
2022 static
2023 int
2024 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2025 {
2026         struct hammer_transaction trans;
2027         struct hammer_inode *dip;
2028         int error;
2029
2030         dip = VTOI(ap->a_dvp);
2031
2032         if (hammer_nohistory(dip) == 0 &&
2033             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2034                 return (error);
2035         }
2036
2037         hammer_start_transaction(&trans, dip->hmp);
2038         ++hammer_stats_file_iopsw;
2039         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2040                                 ap->a_cred, ap->a_flags, -1);
2041         hammer_done_transaction(&trans);
2042
2043         return (error);
2044 }
2045
2046 /*
2047  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2048  */
2049 static
2050 int
2051 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2052 {
2053         struct hammer_inode *ip = ap->a_vp->v_data;
2054
2055         ++hammer_stats_file_iopsr;
2056         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2057                             ap->a_fflag, ap->a_cred));
2058 }
2059
2060 static
2061 int
2062 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2063 {
2064         struct mount *mp;
2065         int error;
2066
2067         mp = ap->a_head.a_ops->head.vv_mount;
2068
2069         switch(ap->a_op) {
2070         case MOUNTCTL_SET_EXPORT:
2071                 if (ap->a_ctllen != sizeof(struct export_args))
2072                         error = EINVAL;
2073                 else
2074                         error = hammer_vfs_export(mp, ap->a_op,
2075                                       (const struct export_args *)ap->a_ctl);
2076                 break;
2077         default:
2078                 error = journal_mountctl(ap);
2079                 break;
2080         }
2081         return(error);
2082 }
2083
2084 /*
2085  * hammer_vop_strategy { vp, bio }
2086  *
2087  * Strategy call, used for regular file read & write only.  Note that the
2088  * bp may represent a cluster.
2089  *
2090  * To simplify operation and allow better optimizations in the future,
2091  * this code does not make any assumptions with regards to buffer alignment
2092  * or size.
2093  */
2094 static
2095 int
2096 hammer_vop_strategy(struct vop_strategy_args *ap)
2097 {
2098         struct buf *bp;
2099         int error;
2100
2101         bp = ap->a_bio->bio_buf;
2102
2103         switch(bp->b_cmd) {
2104         case BUF_CMD_READ:
2105                 error = hammer_vop_strategy_read(ap);
2106                 break;
2107         case BUF_CMD_WRITE:
2108                 error = hammer_vop_strategy_write(ap);
2109                 break;
2110         default:
2111                 bp->b_error = error = EINVAL;
2112                 bp->b_flags |= B_ERROR;
2113                 biodone(ap->a_bio);
2114                 break;
2115         }
2116         return (error);
2117 }
2118
/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;	/* logical file offset of the record's data */
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;		/* current fill offset within the buffer */
	int roff;		/* offset into the current record's data */
	int n;			/* byte count for the current copy/zero */

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		error = hammer_io_direct_read(ip->hmp, nbio, NULL);
		return (error);
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the key's
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather then (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record verses our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zerod past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 *
		 * NOTE: the direct-read path hands the bio to the device
		 * layer and jumps straight to done: -- biodone() is issued
		 * by the direct-io code, not here.
		 */
		disk_offset = cursor.leaf->data_offset + roff;
		if (boff == 0 && n == bp->b_bufsize &&
		    hammer_cursor_ondisk(&cursor) &&
		    (disk_offset & HAMMER_BUFMASK) == 0) {
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(trans.hmp, nbio,
						      cursor.leaf);
			goto done;
		} else if (n) {
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	/* cache the cursor position for the next read on this inode */
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
2350
2351 /*
2352  * BMAP operation - used to support cluster_read() only.
2353  *
2354  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2355  *
 * This routine may return EOPNOTSUPP if the operation is not supported for
2357  * the specified offset.  The contents of the pointer arguments do not
2358  * need to be initialized in that case. 
2359  *
2360  * If a disk address is available and properly aligned return 0 with 
2361  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2362  * to the run-length relative to that offset.  Callers may assume that
2363  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2364  * large, so return EOPNOTSUPP if it is not sufficiently large.
2365  */
2366 static
2367 int
2368 hammer_vop_bmap(struct vop_bmap_args *ap)
2369 {
2370         struct hammer_transaction trans;
2371         struct hammer_inode *ip;
2372         struct hammer_cursor cursor;
2373         hammer_base_elm_t base;
2374         int64_t rec_offset;
2375         int64_t ran_end;
2376         int64_t tmp64;
2377         int64_t base_offset;
2378         int64_t base_disk_offset;
2379         int64_t last_offset;
2380         hammer_off_t last_disk_offset;
2381         hammer_off_t disk_offset;
2382         int     rec_len;
2383         int     error;
2384         int     blksize;
2385
2386         ++hammer_stats_file_iopsr;
2387         ip = ap->a_vp->v_data;
2388
2389         /*
2390          * We can only BMAP regular files.  We can't BMAP database files,
2391          * directories, etc.
2392          */
2393         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2394                 return(EOPNOTSUPP);
2395
2396         /*
2397          * bmap is typically called with runp/runb both NULL when used
2398          * for writing.  We do not support BMAP for writing atm.
2399          */
2400         if (ap->a_cmd != BUF_CMD_READ)
2401                 return(EOPNOTSUPP);
2402
2403         /*
2404          * Scan the B-Tree to acquire blockmap addresses, then translate
2405          * to raw addresses.
2406          */
2407         hammer_simple_transaction(&trans, ip->hmp);
2408 #if 0
2409         kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2410 #endif
2411         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2412
2413         /*
2414          * Key range (begin and end inclusive) to scan.  Note that the key's
2415          * stored in the actual records represent BASE+LEN, not BASE.  The
2416          * first record containing bio_offset will have a key > bio_offset.
2417          */
2418         cursor.key_beg.localization = ip->obj_localization +
2419                                       HAMMER_LOCALIZE_MISC;
2420         cursor.key_beg.obj_id = ip->obj_id;
2421         cursor.key_beg.create_tid = 0;
2422         cursor.key_beg.delete_tid = 0;
2423         cursor.key_beg.obj_type = 0;
2424         if (ap->a_runb)
2425                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2426         else
2427                 cursor.key_beg.key = ap->a_loffset + 1;
2428         if (cursor.key_beg.key < 0)
2429                 cursor.key_beg.key = 0;
2430         cursor.asof = ip->obj_asof;
2431         cursor.flags |= HAMMER_CURSOR_ASOF;
2432
2433         cursor.key_end = cursor.key_beg;
2434         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2435
2436         ran_end = ap->a_loffset + MAXPHYS;
2437         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2438         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2439         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2440         if (tmp64 < ran_end)
2441                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2442         else
2443                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2444
2445         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2446
2447         error = hammer_ip_first(&cursor);
2448         base_offset = last_offset = 0;
2449         base_disk_offset = last_disk_offset = 0;
2450
2451         while (error == 0) {
2452                 /*
2453                  * Get the base file offset of the record.  The key for
2454                  * data records is (base + bytes) rather then (base).
2455                  *
2456                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
2457                  * The extra bytes should be zero on-disk and the BMAP op
2458                  * should still be ok.
2459                  */
2460                 base = &cursor.leaf->base;
2461                 rec_offset = base->key - cursor.leaf->data_len;
2462                 rec_len    = cursor.leaf->data_len;
2463
2464                 /*
2465                  * Incorporate any cached truncation.
2466                  *
2467                  * NOTE: Modifications to rec_len based on synthesized
2468                  * truncation points remove the guarantee that any extended
2469                  * data on disk is zero (since the truncations may not have
2470                  * taken place on-media yet).
2471                  */
2472                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2473                         if (hammer_cursor_ondisk(&cursor) ||
2474                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2475                                 if (ip->trunc_off <= rec_offset)
2476                                         rec_len = 0;
2477                                 else if (ip->trunc_off < rec_offset + rec_len)
2478                                         rec_len = (int)(ip->trunc_off - rec_offset);
2479                         }
2480                 }
2481                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2482                         if (hammer_cursor_ondisk(&cursor)) {
2483                                 if (ip->sync_trunc_off <= rec_offset)
2484                                         rec_len = 0;
2485                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2486                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2487                         }
2488                 }
2489
2490                 /*
2491                  * Accumulate information.  If we have hit a discontiguous
2492                  * block reset base_offset unless we are already beyond the
2493                  * requested offset.  If we are, that's it, we stop.
2494                  */
2495                 if (error)
2496                         break;
2497                 if (hammer_cursor_ondisk(&cursor)) {
2498                         disk_offset = cursor.leaf->data_offset;
2499                         if (rec_offset != last_offset ||
2500                             disk_offset != last_disk_offset) {
2501                                 if (rec_offset > ap->a_loffset)
2502                                         break;
2503                                 base_offset = rec_offset;
2504                                 base_disk_offset = disk_offset;
2505                         }
2506                         last_offset = rec_offset + rec_len;
2507                         last_disk_offset = disk_offset + rec_len;
2508                 }
2509                 error = hammer_ip_next(&cursor);
2510         }
2511
2512 #if 0
2513         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2514                 ap->a_loffset, base_offset, last_offset);
2515         kprintf("BMAP %16s:  %016llx - %016llx\n",
2516                 "", base_disk_offset, last_disk_offset);
2517 #endif
2518
2519         if (cursor.node) {
2520                 hammer_cache_node(&ip->cache[1], cursor.node);
2521 #if 0
2522                 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2523 #endif
2524         }
2525         hammer_done_cursor(&cursor);
2526         hammer_done_transaction(&trans);
2527
2528         /*
2529          * If we couldn't find any records or the records we did find were
2530          * all behind the requested offset, return failure.  A forward
2531          * truncation can leave a hole w/ no on-disk records.
2532          */
2533         if (last_offset == 0 || last_offset < ap->a_loffset)
2534                 return (EOPNOTSUPP);
2535
2536         /*
2537          * Figure out the block size at the requested offset and adjust
2538          * our limits so the cluster_read() does not create inappropriately
2539          * sized buffer cache buffers.
2540          */
2541         blksize = hammer_blocksize(ap->a_loffset);
2542         if (hammer_blocksize(base_offset) != blksize) {
2543                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2544         }
2545         if (last_offset != ap->a_loffset &&
2546             hammer_blocksize(last_offset - 1) != blksize) {
2547                 last_offset = hammer_blockdemarc(ap->a_loffset,
2548                                                  last_offset - 1);
2549         }
2550
2551         /*
2552          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2553          * from occuring.
2554          */
2555         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2556
2557         if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
2558                 /*
2559                  * Only large-data zones can be direct-IOd
2560                  */
2561                 error = EOPNOTSUPP;
2562         } else if ((disk_offset & HAMMER_BUFMASK) ||
2563                    (last_offset - ap->a_loffset) < blksize) {
2564                 /*
2565                  * doffsetp is not aligned or the forward run size does
2566                  * not cover a whole buffer, disallow the direct I/O.
2567                  */
2568                 error = EOPNOTSUPP;
2569         } else {
2570                 /*
2571                  * We're good.
2572                  */
2573                 *ap->a_doffsetp = disk_offset;
2574                 if (ap->a_runb) {
2575                         *ap->a_runb = ap->a_loffset - base_offset;
2576                         KKASSERT(*ap->a_runb >= 0);
2577                 }
2578                 if (ap->a_runp) {
2579                         *ap->a_runp = last_offset - ap->a_loffset;
2580                         KKASSERT(*ap->a_runp >= 0);
2581                 }
2582                 error = 0;
2583         }
2584         return(error);
2585 }
2586
2587 /*
2588  * Write to a regular file.   Because this is a strategy call the OS is
2589  * trying to actually get data onto the media.
2590  */
2591 static
2592 int
2593 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2594 {
2595         hammer_record_t record;
2596         hammer_mount_t hmp;
2597         hammer_inode_t ip;
2598         struct bio *bio;
2599         struct buf *bp;
2600         int blksize;
2601         int bytes;
2602         int error;
2603
2604         bio = ap->a_bio;
2605         bp = bio->bio_buf;
2606         ip = ap->a_vp->v_data;
2607         hmp = ip->hmp;
2608
2609         blksize = hammer_blocksize(bio->bio_offset);
2610         KKASSERT(bp->b_bufsize == blksize);
2611
2612         if (ip->flags & HAMMER_INODE_RO) {
2613                 bp->b_error = EROFS;
2614                 bp->b_flags |= B_ERROR;
2615                 biodone(ap->a_bio);
2616                 return(EROFS);
2617         }
2618
2619         /*
2620          * Interlock with inode destruction (no in-kernel or directory
2621          * topology visibility).  If we queue new IO while trying to
2622          * destroy the inode we can deadlock the vtrunc call in
2623          * hammer_inode_unloadable_check().
2624          *
2625          * Besides, there's no point flushing a bp associated with an
2626          * inode that is being destroyed on-media and has no kernel
2627          * references.
2628          */
2629         if ((ip->flags | ip->sync_flags) &
2630             (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2631                 bp->b_resid = 0;
2632                 biodone(ap->a_bio);
2633                 return(0);
2634         }
2635
2636         /*
2637          * Reserve space and issue a direct-write from the front-end. 
2638          * NOTE: The direct_io code will hammer_bread/bcopy smaller
2639          * allocations.
2640          *
2641          * An in-memory record will be installed to reference the storage
2642          * until the flusher can get to it.
2643          *
2644          * Since we own the high level bio the front-end will not try to
2645          * do a direct-read until the write completes.
2646          *
2647          * NOTE: The only time we do not reserve a full-sized buffers
2648          * worth of data is if the file is small.  We do not try to
2649          * allocate a fragment (from the small-data zone) at the end of
2650          * an otherwise large file as this can lead to wildly separated
2651          * data.
2652          */
2653         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2654         KKASSERT(bio->bio_offset < ip->ino_data.size);
2655         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2656                 bytes = bp->b_bufsize;
2657         else
2658                 bytes = ((int)ip->ino_data.size + 15) & ~15;
2659
2660         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2661                                     bytes, &error);
2662         if (record) {
2663                 hammer_io_direct_write(hmp, record, bio);
2664                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2665                         hammer_flush_inode(ip, 0);
2666         } else {
2667                 bp->b_bio2.bio_offset = NOOFFSET;
2668                 bp->b_error = error;
2669                 bp->b_flags |= B_ERROR;
2670                 biodone(ap->a_bio);
2671         }
2672         return(error);
2673 }
2674
2675 /*
2676  * dounlink - disconnect a directory entry
2677  *
2678  * XXX whiteout support not really in yet
2679  */
2680 static int
2681 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2682                 struct vnode *dvp, struct ucred *cred, 
2683                 int flags, int isdir)
2684 {
2685         struct namecache *ncp;
2686         hammer_inode_t dip;
2687         hammer_inode_t ip;
2688         struct hammer_cursor cursor;
2689         int64_t namekey;
2690         u_int32_t max_iterations;
2691         int nlen, error;
2692
2693         /*
2694          * Calculate the namekey and setup the key range for the scan.  This
2695          * works kinda like a chained hash table where the lower 32 bits
2696          * of the namekey synthesize the chain.
2697          *
2698          * The key range is inclusive of both key_beg and key_end.
2699          */
2700         dip = VTOI(dvp);
2701         ncp = nch->ncp;
2702
2703         if (dip->flags & HAMMER_INODE_RO)
2704                 return (EROFS);
2705
2706         namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
2707                                            &max_iterations);
2708 retry:
2709         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2710         cursor.key_beg.localization = dip->obj_localization +
2711                                       HAMMER_LOCALIZE_MISC;
2712         cursor.key_beg.obj_id = dip->obj_id;
2713         cursor.key_beg.key = namekey;
2714         cursor.key_beg.create_tid = 0;
2715         cursor.key_beg.delete_tid = 0;
2716         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2717         cursor.key_beg.obj_type = 0;
2718
2719         cursor.key_end = cursor.key_beg;
2720         cursor.key_end.key += max_iterations;
2721         cursor.asof = dip->obj_asof;
2722         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2723
2724         /*
2725          * Scan all matching records (the chain), locate the one matching
2726          * the requested path component.  info->last_error contains the
2727          * error code on search termination and could be 0, ENOENT, or
2728          * something else.
2729          *
2730          * The hammer_ip_*() functions merge in-memory records with on-disk
2731          * records for the purposes of the search.
2732          */
2733         error = hammer_ip_first(&cursor);
2734
2735         while (error == 0) {
2736                 error = hammer_ip_resolve_data(&cursor);
2737                 if (error)
2738                         break;
2739                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2740                 KKASSERT(nlen > 0);
2741                 if (ncp->nc_nlen == nlen &&
2742                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2743                         break;
2744                 }
2745                 error = hammer_ip_next(&cursor);
2746         }
2747
2748         /*
2749          * If all is ok we have to get the inode so we can adjust nlinks.
2750          * To avoid a deadlock with the flusher we must release the inode
2751          * lock on the directory when acquiring the inode for the entry.
2752          *
2753          * If the target is a directory, it must be empty.
2754          */
2755         if (error == 0) {
2756                 hammer_unlock(&cursor.ip->lock);
2757                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2758                                       dip->hmp->asof,
2759                                       cursor.data->entry.localization,
2760                                       0, &error);
2761                 hammer_lock_sh(&cursor.ip->lock);
2762                 if (error == ENOENT) {
2763                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2764                         Debugger("ENOENT unlinking object that should exist");
2765                 }
2766
2767                 /*
2768                  * If isdir >= 0 we validate that the entry is or is not a
2769                  * directory.  If isdir < 0 we don't care.
2770                  */
2771                 if (error == 0 && isdir >= 0) {
2772                         if (isdir &&
2773                             ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
2774                                 error = ENOTDIR;
2775                         } else if (isdir == 0 &&
2776                             ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
2777                                 error = EISDIR;
2778                         }
2779                 }
2780
2781                 /*
2782                  * If we are trying to remove a directory the directory must
2783                  * be empty.
2784                  *
2785                  * WARNING: hammer_ip_check_directory_empty() may have to
2786                  * terminate the cursor to avoid a deadlock.  It is ok to
2787                  * call hammer_done_cursor() twice.
2788                  */
2789                 if (error == 0 && ip->ino_data.obj_type ==
2790                                   HAMMER_OBJTYPE_DIRECTORY) {
2791                         error = hammer_ip_check_directory_empty(trans, ip);
2792                 }
2793
2794                 /*
2795                  * Delete the directory entry.
2796                  *
2797                  * WARNING: hammer_ip_del_directory() may have to terminate
2798                  * the cursor to avoid a deadlock.  It is ok to call
2799                  * hammer_done_cursor() twice.
2800                  */
2801                 if (error == 0) {
2802                         error = hammer_ip_del_directory(trans, &cursor,
2803                                                         dip, ip);
2804                 }
2805                 hammer_done_cursor(&cursor);
2806                 if (error == 0) {
2807                         cache_setunresolved(nch);
2808                         cache_setvp(nch, NULL);
2809                         /* XXX locking */
2810                         if (ip->vp) {
2811                                 hammer_knote(ip->vp, NOTE_DELETE);
2812                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2813                         }
2814                 }
2815                 if (ip)
2816                         hammer_rel_inode(ip, 0);
2817         } else {
2818                 hammer_done_cursor(&cursor);
2819         }
2820         if (error == EDEADLK)
2821                 goto retry;
2822
2823         return (error);
2824 }
2825
2826 /************************************************************************
2827  *                          FIFO AND SPECFS OPS                         *
2828  ************************************************************************
2829  *
2830  */
2831
2832 static int
2833 hammer_vop_fifoclose (struct vop_close_args *ap)
2834 {
2835         /* XXX update itimes */
2836         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2837 }
2838
2839 static int
2840 hammer_vop_fiforead (struct vop_read_args *ap)
2841 {
2842         int error;
2843
2844         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2845         /* XXX update access time */
2846         return (error);
2847 }
2848
2849 static int
2850 hammer_vop_fifowrite (struct vop_write_args *ap)
2851 {
2852         int error;
2853
2854         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2855         /* XXX update access time */
2856         return (error);
2857 }
2858
2859 static
2860 int
2861 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
2862 {
2863         int error;
2864
2865         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2866         if (error)
2867                 error = hammer_vop_kqfilter(ap);
2868         return(error);
2869 }
2870
2871 static int
2872 hammer_vop_specclose (struct vop_close_args *ap)
2873 {
2874         /* XXX update itimes */
2875         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2876 }
2877
2878 static int
2879 hammer_vop_specread (struct vop_read_args *ap)
2880 {
2881         /* XXX update access time */
2882         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2883 }
2884
2885 static int
2886 hammer_vop_specwrite (struct vop_write_args *ap)
2887 {
2888         /* XXX update last change time */
2889         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2890 }
2891
2892 /************************************************************************
2893  *                          KQFILTER OPS                                *
2894  ************************************************************************
2895  *
2896  */
2897 static void filt_hammerdetach(struct knote *kn);
2898 static int filt_hammerread(struct knote *kn, long hint);
2899 static int filt_hammerwrite(struct knote *kn, long hint);
2900 static int filt_hammervnode(struct knote *kn, long hint);
2901
2902 static struct filterops hammerread_filtops =
2903         { 1, NULL, filt_hammerdetach, filt_hammerread };
2904 static struct filterops hammerwrite_filtops =
2905         { 1, NULL, filt_hammerdetach, filt_hammerwrite };
2906 static struct filterops hammervnode_filtops =
2907         { 1, NULL, filt_hammerdetach, filt_hammervnode };
2908
2909 static
2910 int
2911 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
2912 {
2913         struct vnode *vp = ap->a_vp;
2914         struct knote *kn = ap->a_kn;
2915         lwkt_tokref ilock;
2916
2917         switch (kn->kn_filter) {
2918         case EVFILT_READ:
2919                 kn->kn_fop = &hammerread_filtops;
2920                 break;
2921         case EVFILT_WRITE:
2922                 kn->kn_fop = &hammerwrite_filtops;
2923                 break;
2924         case EVFILT_VNODE:
2925                 kn->kn_fop = &hammervnode_filtops;
2926                 break;
2927         default:
2928                 return (1);
2929         }
2930
2931         kn->kn_hook = (caddr_t)vp;
2932
2933         lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
2934         SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
2935         lwkt_reltoken(&ilock);
2936
2937         return(0);
2938 }
2939
2940 static void
2941 filt_hammerdetach(struct knote *kn)
2942 {
2943         struct vnode *vp = (void *)kn->kn_hook;
2944         lwkt_tokref ilock;
2945
2946         lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
2947         SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
2948                      kn, knote, kn_selnext);
2949         lwkt_reltoken(&ilock);
2950 }
2951
2952 static int
2953 filt_hammerread(struct knote *kn, long hint)
2954 {
2955         struct vnode *vp = (void *)kn->kn_hook;
2956         hammer_inode_t ip = VTOI(vp);
2957
2958         if (hint == NOTE_REVOKE) {
2959                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2960                 return(1);
2961         }
2962         kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
2963         return (kn->kn_data != 0);
2964 }
2965
2966 static int
2967 filt_hammerwrite(struct knote *kn, long hint)
2968 {
2969         if (hint == NOTE_REVOKE)
2970                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
2971         kn->kn_data = 0;
2972         return (1);
2973 }
2974
2975 static int
2976 filt_hammervnode(struct knote *kn, long hint)
2977 {
2978         if (kn->kn_sfflags & hint)
2979                 kn->kn_fflags |= hint;
2980         if (hint == NOTE_REVOKE) {
2981                 kn->kn_flags |= EV_EOF;
2982                 return (1);
2983         }
2984         return (kn->kn_fflags != 0);
2985 }
2986