Merge branch 'master' of git://venus/dragonfly
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.102 2008/10/16 17:24:16 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <sys/file.h>
48 #include <vm/vm_extern.h>
49 #include <vfs/fifofs/fifo.h>
50 #include "hammer.h"
51
52 /*
53  * USERFS VNOPS
54  */
55 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
56 static int hammer_vop_fsync(struct vop_fsync_args *);
57 static int hammer_vop_read(struct vop_read_args *);
58 static int hammer_vop_write(struct vop_write_args *);
59 static int hammer_vop_access(struct vop_access_args *);
60 static int hammer_vop_advlock(struct vop_advlock_args *);
61 static int hammer_vop_close(struct vop_close_args *);
62 static int hammer_vop_ncreate(struct vop_ncreate_args *);
63 static int hammer_vop_getattr(struct vop_getattr_args *);
64 static int hammer_vop_nresolve(struct vop_nresolve_args *);
65 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
66 static int hammer_vop_nlink(struct vop_nlink_args *);
67 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
68 static int hammer_vop_nmknod(struct vop_nmknod_args *);
69 static int hammer_vop_open(struct vop_open_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_markatime(struct vop_markatime_args *);
77 static int hammer_vop_setattr(struct vop_setattr_args *);
78 static int hammer_vop_strategy(struct vop_strategy_args *);
79 static int hammer_vop_bmap(struct vop_bmap_args *ap);
80 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
81 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
82 static int hammer_vop_ioctl(struct vop_ioctl_args *);
83 static int hammer_vop_mountctl(struct vop_mountctl_args *);
84 static int hammer_vop_kqfilter (struct vop_kqfilter_args *);
85
86 static int hammer_vop_fifoclose (struct vop_close_args *);
87 static int hammer_vop_fiforead (struct vop_read_args *);
88 static int hammer_vop_fifowrite (struct vop_write_args *);
89 static int hammer_vop_fifokqfilter (struct vop_kqfilter_args *);
90
91 static int hammer_vop_specclose (struct vop_close_args *);
92 static int hammer_vop_specread (struct vop_read_args *);
93 static int hammer_vop_specwrite (struct vop_write_args *);
94
/*
 * Vnode operations vector for regular HAMMER files and directories.
 * Operations not explicitly listed fall through to vop_defaultop.
 */
struct vop_ops hammer_vnode_vops = {
        .vop_default =          vop_defaultop,
        .vop_fsync =            hammer_vop_fsync,
        .vop_getpages =         vop_stdgetpages,
        .vop_putpages =         vop_stdputpages,
        .vop_read =             hammer_vop_read,
        .vop_write =            hammer_vop_write,
        .vop_access =           hammer_vop_access,
        .vop_advlock =          hammer_vop_advlock,
        .vop_close =            hammer_vop_close,
        .vop_ncreate =          hammer_vop_ncreate,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_nresolve =         hammer_vop_nresolve,
        .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
        .vop_nlink =            hammer_vop_nlink,
        .vop_nmkdir =           hammer_vop_nmkdir,
        .vop_nmknod =           hammer_vop_nmknod,
        .vop_open =             hammer_vop_open,
        .vop_pathconf =         vop_stdpathconf,
        .vop_print =            hammer_vop_print,
        .vop_readdir =          hammer_vop_readdir,
        .vop_readlink =         hammer_vop_readlink,
        .vop_nremove =          hammer_vop_nremove,
        .vop_nrename =          hammer_vop_nrename,
        .vop_nrmdir =           hammer_vop_nrmdir,
        .vop_markatime =        hammer_vop_markatime,
        .vop_setattr =          hammer_vop_setattr,
        .vop_bmap =             hammer_vop_bmap,
        .vop_strategy =         hammer_vop_strategy,
        .vop_nsymlink =         hammer_vop_nsymlink,
        .vop_nwhiteout =        hammer_vop_nwhiteout,
        .vop_ioctl =            hammer_vop_ioctl,
        .vop_mountctl =         hammer_vop_mountctl,
        .vop_kqfilter =         hammer_vop_kqfilter
};
132
/*
 * Vnode operations vector for device special files (VCHR/VBLK) backed
 * by HAMMER inodes.  I/O is routed through the spec layer; attribute
 * and lifecycle operations still go to HAMMER.
 */
struct vop_ops hammer_spec_vops = {
        .vop_default =          spec_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_specread,
        .vop_write =            hammer_vop_specwrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_specclose,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr
};
146
/*
 * Vnode operations vector for FIFOs (named pipes) backed by HAMMER
 * inodes.  Data flow goes through the fifofs layer; attribute and
 * lifecycle operations still go to HAMMER.
 */
struct vop_ops hammer_fifo_vops = {
        .vop_default =          fifo_vnoperate,
        .vop_fsync =            hammer_vop_fsync,
        .vop_read =             hammer_vop_fiforead,
        .vop_write =            hammer_vop_fifowrite,
        .vop_access =           hammer_vop_access,
        .vop_close =            hammer_vop_fifoclose,
        .vop_markatime =        hammer_vop_markatime,
        .vop_getattr =          hammer_vop_getattr,
        .vop_inactive =         hammer_vop_inactive,
        .vop_reclaim =          hammer_vop_reclaim,
        .vop_setattr =          hammer_vop_setattr,
        .vop_kqfilter =         hammer_vop_fifokqfilter
};
161
162 static __inline
163 void
164 hammer_knote(struct vnode *vp, int flags)
165 {
166         if (flags)
167                 KNOTE(&vp->v_pollinfo.vpi_selinfo.si_note, flags);
168 }
169
170 #ifdef DEBUG_TRUNCATE
171 struct hammer_inode *HammerTruncIp;
172 #endif
173
174 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
175                            struct vnode *dvp, struct ucred *cred,
176                            int flags, int isdir);
177 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
178 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
179
#if 0
/*
 * Generic catch-all: forward any vnode operation to the HAMMER vnode
 * operations vector.  Currently compiled out.
 *
 * NOTE: the parameter name 'ap' was missing from the original
 * definition (invalid C) while the body referenced it; fixed so the
 * function compiles if this block is ever re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
188
/*
 * hammer_vop_fsync { vp, waitfor }
 *
 * fsync() an inode to disk and wait for it to be completely committed
 * such that the information would not be undone if a crash occurred after
 * return.
 */
static
int
hammer_vop_fsync(struct vop_fsync_args *ap)
{
        hammer_inode_t ip = VTOI(ap->a_vp);

        ++hammer_count_fsyncs;
        /* Flush dirty buffers to the media, then queue the inode to the
         * flusher so its meta-data is committed as well. */
        vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
        hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
        if (ap->a_waitfor == MNT_WAIT) {
                /*
                 * Drop the vnode lock while waiting for the flush to
                 * complete so the flusher can make progress, then
                 * reacquire it before returning.
                 */
                vn_unlock(ap->a_vp);
                hammer_wait_inode(ip);
                vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
        }
        return (ip->error);
}
212
/*
 * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * Read data from a regular file via the buffer cache.  Updates the
 * inode's atime on success unless the mount is read-only or NOATIME.
 */
static
int
hammer_vop_read(struct vop_read_args *ap)
{
        struct hammer_transaction trans;
        hammer_inode_t ip;
        off_t offset;           /* byte offset within the current buffer */
        struct buf *bp;
        struct uio *uio;
        int error;
        int n;                  /* bytes to copy from the current buffer */
        int seqcount;
        int ioseqcount;
        int blksize;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        error = 0;
        uio = ap->a_uio;

        /*
         * Allow the UIO's size to override the sequential heuristic.
         */
        blksize = hammer_blocksize(uio->uio_offset);
        seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
        ioseqcount = ap->a_ioflag >> 16;
        if (seqcount < ioseqcount)
                seqcount = ioseqcount;

        /* Transaction covers the atime update at the bottom. */
        hammer_start_transaction(&trans, ip->hmp);

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
                int64_t base_offset;
                int64_t file_limit;

                blksize = hammer_blocksize(uio->uio_offset);
                offset = (int)uio->uio_offset & (blksize - 1);
                base_offset = uio->uio_offset - offset;

                if (hammer_cluster_enable) {
                        /*
                         * Use file_limit to prevent cluster_read() from
                         * creating buffers of the wrong block size past
                         * the demarc.
                         */
                        file_limit = ip->ino_data.size;
                        if (base_offset < HAMMER_XDEMARC &&
                            file_limit > HAMMER_XDEMARC) {
                                file_limit = HAMMER_XDEMARC;
                        }
                        error = cluster_read(ap->a_vp,
                                             file_limit, base_offset,
                                             blksize, MAXPHYS,
                                             seqcount, &bp);
                } else {
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                }
                if (error) {
                        /* bp is still valid on a bread/cluster_read error
                         * and must be released. */
                        kprintf("error %d\n", error);
                        brelse(bp);
                        break;
                }

                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                /*
                 * Clip the copy length to the buffer, the remaining UIO,
                 * and the file EOF.
                 */
                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > ip->ino_data.size - uio->uio_offset)
                        n = (int)(ip->ino_data.size - uio->uio_offset);
                error = uiomove((char *)bp->b_data + offset, n, uio);

                /* data has a lower priority than meta-data */
                bp->b_flags |= B_AGE;
                bqrelse(bp);
                if (error)
                        break;
                hammer_stats_file_read += n;
        }
        /*
         * Update the atime unless the inode is read-only (snapshot
         * access) or the mount disables atime updates.
         */
        if ((ip->flags & HAMMER_INODE_RO) == 0 &&
            (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
                ip->ino_data.atime = trans.time;
                hammer_modify_inode(ip, HAMMER_INODE_ATIME);
        }
        hammer_done_transaction(&trans);
        return (error);
}
308
/*
 * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Write data to a regular file via the buffer cache.  Handles append
 * mode, variable block sizes, backend flusher backpressure, and kqueue
 * notification (NOTE_WRITE/NOTE_EXTEND).
 */
static
int
hammer_vop_write(struct vop_write_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        hammer_mount_t hmp;
        struct uio *uio;
        int offset;             /* byte offset within the current buffer */
        off_t base_offset;      /* block-aligned offset of the buffer */
        struct buf *bp;
        int kflags;             /* accumulated kqueue notification flags */
        int error;
        int n;                  /* bytes to copy into the current buffer */
        int flags;              /* hammer_modify_inode flags */
        int delta;
        int seqcount;

        if (ap->a_vp->v_type != VREG)
                return (EINVAL);
        ip = VTOI(ap->a_vp);
        hmp = ip->hmp;
        error = 0;
        kflags = 0;
        seqcount = ap->a_ioflag >> 16;

        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, hmp);
        uio = ap->a_uio;

        /*
         * Check append mode
         */
        if (ap->a_ioflag & IO_APPEND)
                uio->uio_offset = ip->ino_data.size;

        /*
         * Check for illegal write offsets.  Valid range is 0...2^63-1.
         *
         * NOTE: the base_off assignment is required to work around what
         * I consider to be a GCC-4 optimization bug.
         */
        if (uio->uio_offset < 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }
        base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
        if (uio->uio_resid > 0 && base_offset <= 0) {
                hammer_done_transaction(&trans);
                return (EFBIG);
        }

        /*
         * Access the data typically in HAMMER_BUFSIZE blocks via the
         * buffer cache, but HAMMER may use a variable block size based
         * on the offset.
         */
        while (uio->uio_resid > 0) {
                int fixsize = 0;        /* set if we extended the VM size */
                int blksize;
                int blkmask;

                if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
                        break;

                blksize = hammer_blocksize(uio->uio_offset);

                /*
                 * Do not allow HAMMER to blow out the buffer cache.  Very
                 * large UIOs can lockout other processes due to bwillwrite()
                 * mechanics.
                 *
                 * The hammer inode is not locked during these operations.
                 * The vnode is locked which can interfere with the pageout
                 * daemon for non-UIO_NOCOPY writes but should not interfere
                 * with the buffer cache.  Even so, we cannot afford to
                 * allow the pageout daemon to build up too many dirty buffer
                 * cache buffers.
                 *
                 * Only call this if we aren't being recursively called from
                 * a virtual disk device (vn), else we may deadlock.
                 */
                if ((ap->a_ioflag & IO_RECURSE) == 0)
                        bwillwrite(blksize);

                /*
                 * Do not allow HAMMER to blow out system memory by
                 * accumulating too many records.   Records are so well
                 * decoupled from the buffer cache that it is possible
                 * for userland to push data out to the media via
                 * direct-write, but build up the records queued to the
                 * backend faster than the backend can flush them out.
                 * HAMMER has hit its write limit but the frontend has
                 * no pushback to slow it down.
                 */
                if (hmp->rsv_recs > hammer_limit_recs / 2) {
                        /*
                         * Get the inode on the flush list
                         */
                        if (ip->rsv_recs >= 64)
                                hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
                        else if (ip->rsv_recs >= 16)
                                hammer_flush_inode(ip, 0);

                        /*
                         * Keep the flusher going if the system keeps
                         * queueing records.
                         */
                        delta = hmp->count_newrecords -
                                hmp->last_newrecords;
                        if (delta < 0 || delta > hammer_limit_recs / 2) {
                                hmp->last_newrecords = hmp->count_newrecords;
                                hammer_sync_hmp(hmp, MNT_NOWAIT);
                        }

                        /*
                         * If we have gotten behind start slowing
                         * down the writers.
                         */
                        delta = (hmp->rsv_recs - hammer_limit_recs) *
                                hz / hammer_limit_recs;
                        if (delta > 0)
                                tsleep(&trans, 0, "hmrslo", delta);
                }

                /*
                 * Calculate the blocksize at the current offset and figure
                 * out how much we can actually write.
                 */
                blkmask = blksize - 1;
                offset = (int)uio->uio_offset & blkmask;
                base_offset = uio->uio_offset & ~(int64_t)blkmask;
                n = blksize - offset;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (uio->uio_offset + n > ip->ino_data.size) {
                        /* Extending the file; grow the VM object first. */
                        vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
                        fixsize = 1;
                        kflags |= NOTE_EXTEND;
                }

                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ap->a_vp, base_offset,
                                              blksize, &bp);
                        }
                } else if (offset == 0 && uio->uio_resid >= blksize) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a 
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else if (base_offset >= ip->ino_data.size) {
                        /*
                         * If the base offset of the buffer is beyond the
                         * file EOF, we don't have to issue a read.
                         */
                        bp = getblk(ap->a_vp, base_offset,
                                    blksize, GETBLK_BHEAVY, 0);
                        vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         */
                        error = bread(ap->a_vp, base_offset, blksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }
                if (error == 0) {
                        error = uiomove((char *)bp->b_data + offset,
                                        n, uio);
                }

                /*
                 * If we screwed up we have to undo any VM size changes we
                 * made.
                 */
                if (error) {
                        brelse(bp);
                        if (fixsize) {
                                vtruncbuf(ap->a_vp, ip->ino_data.size,
                                          hammer_blocksize(ip->ino_data.size));
                        }
                        break;
                }
                kflags |= NOTE_WRITE;
                hammer_stats_file_write += n;
                /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                if (ip->ino_data.size < uio->uio_offset) {
                        /* File grew; record the new size and flag DDIRTY. */
                        ip->ino_data.size = uio->uio_offset;
                        flags = HAMMER_INODE_DDIRTY;
                        vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
                } else {
                        flags = 0;
                }
                ip->ino_data.mtime = trans.time;
                flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
                hammer_modify_inode(ip, flags);

                /*
                 * Once we dirty the buffer any cached zone-X offset
                 * becomes invalid.  HAMMER NOTE: no-history mode cannot 
                 * allow overwriting over the same data sector unless
                 * we provide UNDOs for the old data, which we don't.
                 */
                bp->b_bio2.bio_offset = NOOFFSET;

                /*
                 * Final buffer disposition.
                 */
                bp->b_flags |= B_AGE;
                if (ap->a_ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if (ap->a_ioflag & IO_DIRECT) {
                        bawrite(bp);
                } else {
                        bdwrite(bp);
                }
        }
        hammer_done_transaction(&trans);
        hammer_knote(ap->a_vp, kflags);
        return (error);
}
554
555 /*
556  * hammer_vop_access { vp, mode, cred }
557  */
558 static
559 int
560 hammer_vop_access(struct vop_access_args *ap)
561 {
562         struct hammer_inode *ip = VTOI(ap->a_vp);
563         uid_t uid;
564         gid_t gid;
565         int error;
566
567         ++hammer_stats_file_iopsr;
568         uid = hammer_to_unix_xid(&ip->ino_data.uid);
569         gid = hammer_to_unix_xid(&ip->ino_data.gid);
570
571         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
572                                   ip->ino_data.uflags);
573         return (error);
574 }
575
576 /*
577  * hammer_vop_advlock { vp, id, op, fl, flags }
578  */
579 static
580 int
581 hammer_vop_advlock(struct vop_advlock_args *ap)
582 {
583         hammer_inode_t ip = VTOI(ap->a_vp);
584
585         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
586 }
587
/*
 * hammer_vop_close { vp, fflag }
 *
 * HAMMER requires no special close processing; use the standard
 * close handler.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
        return (vop_stdclose(ap));
}
598
/*
 * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
 *
 * The operating system has already ensured that the directory entry
 * does not exist and done all appropriate namespace locking.
 */
static
int
hammer_vop_ncreate(struct vop_ncreate_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;       /* parent directory inode */
        struct hammer_inode *nip;       /* newly created inode */
        struct nchandle *nch;
        int error;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        /* Cannot create in a read-only (snapshot) directory. */
        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced and shared-locked to prevent
         * it from being moved to the flusher.
         */

        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, NULL, &nip);
        if (error) {
                hkprintf("hammer_create_inode error %d\n", error);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add the new filesystem object to the directory.  This will also
         * bump the inode's link count.
         */
        error = hammer_ip_add_directory(&trans, dip,
                                        nch->ncp->nc_name, nch->ncp->nc_nlen,
                                        nip);
        if (error)
                hkprintf("hammer_ip_add_directory error %d\n", error);

        /*
         * Finish up.  On success, hand a vnode for the new inode back to
         * the caller and resolve the namecache entry; on failure drop
         * the inode reference and tear down the transaction.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_done_transaction(&trans);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
                hammer_knote(ap->a_dvp, NOTE_WRITE);
        }
        return (error);
}
673
/*
 * hammer_vop_getattr { vp, vap }
 *
 * Retrieve an inode's attribute information.  When accessing inodes
 * historically we fake the atime field to ensure consistent results.
 * The atime field is stored in the B-Tree element and allowed to be
 * updated without cycling the element.
 */
static
int
hammer_vop_getattr(struct vop_getattr_args *ap)
{
        struct hammer_inode *ip = VTOI(ap->a_vp);
        struct vattr *vap = ap->a_vap;

        /*
         * We want the fsid to be different when accessing a filesystem
         * with different as-of's so programs like diff don't think
         * the files are the same.
         *
         * We also want the fsid to be the same when comparing snapshots,
         * or when comparing mirrors (which might be backed by different
         * physical devices).  HAMMER fsids are based on the PFS's
         * shared_uuid field.
         *
         * XXX there is a chance of collision here.  The va_fsid reported
         * by stat is different from the more involved fsid used in the
         * mount structure.
         */
        ++hammer_stats_file_iopsr;
        vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
                       (u_int32_t)(ip->obj_asof >> 32);

        vap->va_fileid = ip->ino_leaf.base.obj_id;
        vap->va_mode = ip->ino_data.mode;
        vap->va_nlink = ip->ino_data.nlinks;
        vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
        vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->ino_data.size;

        /*
         * Special case for @@PFS softlinks.  The actual size of the
         * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
         */
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
            ip->ino_data.size == 10 &&
            ip->obj_asof == HAMMER_MAX_TID &&
            ip->obj_localization == 0 &&
            strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
                    vap->va_size = 26;
        }

        /*
         * We must provide a consistent atime and mtime for snapshots
         * so people can do a 'tar cf - ... | md5' on them and get
         * consistent results.  Read-only (historical) inodes report
         * ctime for both.
         */
        if (ip->flags & HAMMER_INODE_RO) {
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
        } else {
                hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
                hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
        }
        hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
        vap->va_flags = ip->ino_data.uflags;
        vap->va_gen = 1;        /* hammer inums are unique for all time */
        vap->va_blocksize = HAMMER_BUFSIZE;
        /*
         * Report allocated bytes rounded up to the block size in effect
         * for the file's size: extra-large blocks past the demarc, full
         * buffers for mid-size files, 16-byte granularity for tiny ones.
         */
        if (ip->ino_data.size >= HAMMER_XDEMARC) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
                                ~HAMMER_XBUFMASK64;
        } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
                vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
                                ~HAMMER_BUFMASK64;
        } else {
                vap->va_bytes = (ip->ino_data.size + 15) & ~15;
        }

        vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
        vap->va_filerev = 0;    /* XXX */
        /* mtime uniquely identifies any adjustments made to the file XXX */
        vap->va_fsmid = ip->ino_data.mtime;
        vap->va_uid_uuid = ip->ino_data.uid;
        vap->va_gid_uuid = ip->ino_data.gid;
        vap->va_fsid_uuid = ip->hmp->fsid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        /* Device nodes carry their major/minor numbers in the inode. */
        switch (ip->ino_data.obj_type) {
        case HAMMER_OBJTYPE_CDEV:
        case HAMMER_OBJTYPE_BDEV:
                vap->va_rmajor = ip->ino_data.rmajor;
                vap->va_rminor = ip->ino_data.rminor;
                break;
        default:
                break;
        }
        return(0);
}
775
776 /*
777  * hammer_vop_nresolve { nch, dvp, cred }
778  *
779  * Locate the requested directory entry.
780  */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;
	u_int32_t max_iterations;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;
	localization = dip->obj_localization;	/* for code consistency */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan the name for a '@@' as-of extension.  When found, parse the
	 * transaction id (and possible PFS designation) that follows.  Any
	 * access with an explicit non-current asof is forced read-only.
	 * On a parse error skip the name entirely (i = nlen).
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			error = hammer_str_to_tid(ncp->nc_name + i + 2,
						  &ispfs, &asof, &localization);
			if (error != 0) {
				i = nlen;
				break;
			}
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;	/* effective name length before any '@@' extension */

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			/* return vnode to namecache unlocked/unreferenced */
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(dip, ncp->nc_name, nlen,
					   &max_iterations);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			/* match requires equal length AND equal bytes */
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);
	if (error == 0) {
		/*
		 * Found the entry - resolve its inode, obtain the vnode,
		 * and hand the (unlocked, unreferenced) vnode to the
		 * namecache.
		 */
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* record a negative namecache hit */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}
949
950 /*
951  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
952  *
953  * Locate the parent directory of a directory vnode.
954  *
955  * dvp is referenced but not locked.  *vpp must be returned referenced and
956  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
957  * at the root, instead it could indicate that the directory we were in was
958  * removed.
959  *
960  * NOTE: as-of sequences are not linked into the directory structure.  If
961  * we are at the root with a different asof then the mount point, reload
962  * the same directory with the mount point's asof.   I'm not sure what this
963  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
964  * get confused, but it hasn't been tested.
965  */
static
int
hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *ip;
	int64_t parent_obj_id;
	u_int32_t parent_obj_localization;
	hammer_tid_t asof;
	int error;

	dip = VTOI(ap->a_dvp);
	asof = dip->obj_asof;

	/*
	 * Who is our parent?  This could be the root of a pseudo-filesystem
	 * whose parent is in another localization domain.
	 */
	parent_obj_id = dip->ino_data.parent_obj_id;
	if (dip->obj_id == HAMMER_OBJID_ROOT)
		parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
	else
		parent_obj_localization = dip->obj_localization;

	/*
	 * parent_obj_id == 0: either we are at the true root or the
	 * directory was removed.  If we are the root of an as-of view,
	 * reload the same directory using the mount's asof and synthesize
	 * a fake name encoding the original asof stamp; otherwise there
	 * is no parent to return.
	 */
	if (parent_obj_id == 0) {
		if (dip->obj_id == HAMMER_OBJID_ROOT &&
		   asof != dip->hmp->asof) {
			parent_obj_id = dip->obj_id;
			asof = dip->hmp->asof;
			/* 19 bytes: "0x" + 16 hex digits + NUL */
			*ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
			ksnprintf(*ap->a_fakename, 19, "0x%016llx",
				   dip->obj_asof);
		} else {
			*ap->a_vpp = NULL;
			return ENOENT;
		}
	}

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Resolve the parent inode and return a referenced, locked vnode
	 * for it in *a_vpp, per the nlookupdotdot contract.
	 */
	ip = hammer_get_inode(&trans, dip, parent_obj_id,
			      asof, parent_obj_localization,
			      dip->flags, &error);
	if (ip) {
		error = hammer_get_vnode(ip, ap->a_vpp);
		hammer_rel_inode(ip, 0);
	} else {
		*ap->a_vpp = NULL;
	}
	hammer_done_transaction(&trans);
	return (error);
}
1020
1021 /*
1022  * hammer_vop_nlink { nch, dvp, vp, cred }
1023  */
static
int
hammer_vop_nlink(struct vop_nlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* directory receiving the new entry */
	struct hammer_inode *ip;	/* existing inode being hard-linked */
	struct nchandle *nch;
	int error;

	/* hard links may not cross mounts */
	if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
		return(EXDEV);

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);
	ip = VTOI(ap->a_vp);

	/* hard links may not cross PFS (localization) boundaries either */
	if (dip->obj_localization != ip->obj_localization)
		return(EXDEV);

	/*
	 * Both the directory and the target must be writable (not as-of
	 * accesses) and enough free space must be available for the new
	 * directory entry.
	 */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Add the filesystem object to the directory.  Note that neither
	 * dip nor ip are referenced or locked, but their vnodes are
	 * referenced.  This function will bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					ip);

	/*
	 * Finish up.  On success resolve the namecache entry to the
	 * linked vnode.  The kqueue notes are posted after the
	 * transaction completes.
	 */
	if (error == 0) {
		cache_setunresolved(nch);
		cache_setvp(nch, ap->a_vp);
	}
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, NOTE_LINK);
	hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}
1078
1079 /*
1080  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1081  *
1082  * The operating system has already ensured that the directory entry
1083  * does not exist and done all appropriate namespace locking.
1084  */
static
int
hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory */
	struct hammer_inode *nip;	/* newly created directory inode */
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	/*
	 * The parent must be writable (not an as-of access) and enough
	 * free space must be available.
	 */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hkprintf("hammer_mkdir error %d\n", error);
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}
	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);
	if (error)
		hkprintf("hammer_mkdir (add) error %d\n", error);

	/*
	 * Finish up.  Our temporary inode reference is dropped on both
	 * paths; on success we first obtain the vnode and resolve the
	 * namecache entry to it.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
	return (error);
}
1150
1151 /*
1152  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1153  *
1154  * The operating system has already ensured that the directory entry
1155  * does not exist and done all appropriate namespace locking.
1156  */
static
int
hammer_vop_nmknod(struct vop_nmknod_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;	/* parent directory */
	struct hammer_inode *nip;	/* newly created inode */
	struct nchandle *nch;
	int error;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	/*
	 * The parent must be writable (not an as-of access) and enough
	 * free space must be available.
	 */
	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 *
	 * If mknod specifies a directory a pseudo-fs is created.
	 */
	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add the new filesystem object to the directory.  This will also
	 * bump the inode's link count.
	 */
	error = hammer_ip_add_directory(&trans, dip,
					nch->ncp->nc_name, nch->ncp->nc_nlen,
					nip);

	/*
	 * Finish up.  Our temporary inode reference is dropped on both
	 * paths; on success we first obtain the vnode and resolve the
	 * namecache entry to it.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
		}
	}
	hammer_done_transaction(&trans);
	if (error == 0)
		hammer_knote(ap->a_dvp, NOTE_WRITE);
	return (error);
}
1222
1223 /*
1224  * hammer_vop_open { vp, mode, cred, fp }
1225  */
1226 static
1227 int
1228 hammer_vop_open(struct vop_open_args *ap)
1229 {
1230         hammer_inode_t ip;
1231
1232         ++hammer_stats_file_iopsr;
1233         ip = VTOI(ap->a_vp);
1234
1235         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1236                 return (EROFS);
1237         return(vop_stdopen(ap));
1238 }
1239
1240 /*
1241  * hammer_vop_print { vp }
1242  */
1243 static
1244 int
1245 hammer_vop_print(struct vop_print_args *ap)
1246 {
1247         return EOPNOTSUPP;
1248 }
1249
1250 /*
1251  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1252  */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;		/* current directory seek position */
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * Allocate the seek-cookie array if the caller asked for one.
	 * Estimate one cookie per 16 bytes of uio space, capped at 1024.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries: "." at seek offset 0 and ".." at
	 * seek offset 1.  These are not stored as directory records.
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/* a parent_obj_id of 0: report ourselves as ".." */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/*
	 * ENOENT from the scan just means we ran off the end of the
	 * directory; it is not an error from the caller's point of view.
	 * Hand the cookie array to the caller, or free it if we failed
	 * before emitting any entry.
	 */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
1400
1401 /*
1402  * hammer_vop_readlink { vp, uio, cred }
1403  */
static
int
hammer_vop_readlink(struct vop_readlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	char buf[32];
	u_int32_t localization;
	hammer_pseudofs_inmem_t pfsm;
	int error;

	ip = VTOI(ap->a_vp);

	/*
	 * Shortcut if the symlink data was stuffed into ino_data.
	 *
	 * Also expand special "@@PFS%05d" softlinks (expansion only
	 * occurs for non-historical (current) accesses made from the
	 * primary filesystem).
	 */
	if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
		char *ptr;
		int bytes;

		ptr = ip->ino_data.ext.symlink;
		bytes = (int)ip->ino_data.size;
		if (bytes == 10 &&
		    ip->obj_asof == HAMMER_MAX_TID &&
		    ip->obj_localization == 0 &&
		    strncmp(ptr, "@@PFS", 5) == 0) {
			hammer_simple_transaction(&trans, ip->hmp);
			/* extract the 5-digit decimal PFS id */
			bcopy(ptr + 5, buf, 5);
			buf[5] = 0;
			localization = strtoul(buf, NULL, 10) << 16;
			pfsm = hammer_load_pseudofs(&trans, localization,
						    &error);
			if (error == 0) {
				/*
				 * Slave PFSs expand to their last synced
				 * tid; masters expand to the current
				 * (HAMMER_MAX_TID) view.
				 */
				if (pfsm->pfsd.mirror_flags &
				    HAMMER_PFSD_SLAVE) {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  pfsm->pfsd.sync_end_tid,
						  localization >> 16);
				} else {
					ksnprintf(buf, sizeof(buf),
						  "@@0x%016llx:%05d",
						  HAMMER_MAX_TID,
						  localization >> 16);
				}
				ptr = buf;
				bytes = strlen(buf);
			}
			if (pfsm)
				hammer_rel_pseudofs(trans.hmp, pfsm);
			hammer_done_transaction(&trans);
		}
		error = uiomove(ptr, bytes, ap->a_uio);
		return(error);
	}

	/*
	 * Long version - the link target is stored in a separate
	 * FIX record which we look up with a B-Tree cursor.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsr;
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	error = hammer_ip_lookup(&cursor);
	if (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error == 0) {
			KKASSERT(cursor.leaf->data_len >=
				 HAMMER_SYMLINK_NAME_OFF);
			error = uiomove(cursor.data->symlink.name,
					cursor.leaf->data_len -
						HAMMER_SYMLINK_NAME_OFF,
					ap->a_uio);
		}
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
1503
1504 /*
1505  * hammer_vop_nremove { nch, dvp, cred }
1506  */
1507 static
1508 int
1509 hammer_vop_nremove(struct vop_nremove_args *ap)
1510 {
1511         struct hammer_transaction trans;
1512         struct hammer_inode *dip;
1513         int error;
1514
1515         dip = VTOI(ap->a_dvp);
1516
1517         if (hammer_nohistory(dip) == 0 &&
1518             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1519                 return (error);
1520         }
1521
1522         hammer_start_transaction(&trans, dip->hmp);
1523         ++hammer_stats_file_iopsw;
1524         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1525         hammer_done_transaction(&trans);
1526         if (error == 0)
1527                 hammer_knote(ap->a_dvp, NOTE_WRITE);
1528         return (error);
1529 }
1530
1531 /*
1532  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1533  */
1534 static
1535 int
1536 hammer_vop_nrename(struct vop_nrename_args *ap)
1537 {
1538         struct hammer_transaction trans;
1539         struct namecache *fncp;
1540         struct namecache *tncp;
1541         struct hammer_inode *fdip;
1542         struct hammer_inode *tdip;
1543         struct hammer_inode *ip;
1544         struct hammer_cursor cursor;
1545         int64_t namekey;
1546         u_int32_t max_iterations;
1547         int nlen, error;
1548
1549         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 
1550                 return(EXDEV);
1551         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1552                 return(EXDEV);
1553
1554         fdip = VTOI(ap->a_fdvp);
1555         tdip = VTOI(ap->a_tdvp);
1556         fncp = ap->a_fnch->ncp;
1557         tncp = ap->a_tnch->ncp;
1558         ip = VTOI(fncp->nc_vp);
1559         KKASSERT(ip != NULL);
1560
1561         if (fdip->obj_localization != tdip->obj_localization)
1562                 return(EXDEV);
1563         if (fdip->obj_localization != ip->obj_localization)
1564                 return(EXDEV);
1565
1566         if (fdip->flags & HAMMER_INODE_RO)
1567                 return (EROFS);
1568         if (tdip->flags & HAMMER_INODE_RO)
1569                 return (EROFS);
1570         if (ip->flags & HAMMER_INODE_RO)
1571                 return (EROFS);
1572         if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1573                 return (error);
1574
1575         hammer_start_transaction(&trans, fdip->hmp);
1576         ++hammer_stats_file_iopsw;
1577
1578         /*
1579          * Remove tncp from the target directory and then link ip as
1580          * tncp. XXX pass trans to dounlink
1581          *
1582          * Force the inode sync-time to match the transaction so it is
1583          * in-sync with the creation of the target directory entry.
1584          */
1585         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1586                                 ap->a_cred, 0, -1);
1587         if (error == 0 || error == ENOENT) {
1588                 error = hammer_ip_add_directory(&trans, tdip,
1589                                                 tncp->nc_name, tncp->nc_nlen,
1590                                                 ip);
1591                 if (error == 0) {
1592                         ip->ino_data.parent_obj_id = tdip->obj_id;
1593                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1594                 }
1595         }
1596         if (error)
1597                 goto failed; /* XXX */
1598
1599         /*
1600          * Locate the record in the originating directory and remove it.
1601          *
1602          * Calculate the namekey and setup the key range for the scan.  This
1603          * works kinda like a chained hash table where the lower 32 bits
1604          * of the namekey synthesize the chain.
1605          *
1606          * The key range is inclusive of both key_beg and key_end.
1607          */
1608         namekey = hammer_directory_namekey(fdip, fncp->nc_name, fncp->nc_nlen,
1609                                            &max_iterations);
1610 retry:
1611         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1612         cursor.key_beg.localization = fdip->obj_localization +
1613                                       HAMMER_LOCALIZE_MISC;
1614         cursor.key_beg.obj_id = fdip->obj_id;
1615         cursor.key_beg.key = namekey;
1616         cursor.key_beg.create_tid = 0;
1617         cursor.key_beg.delete_tid = 0;
1618         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1619         cursor.key_beg.obj_type = 0;
1620
1621         cursor.key_end = cursor.key_beg;
1622         cursor.key_end.key += max_iterations;
1623         cursor.asof = fdip->obj_asof;
1624         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1625
1626         /*
1627          * Scan all matching records (the chain), locate the one matching
1628          * the requested path component.
1629          *
1630          * The hammer_ip_*() functions merge in-memory records with on-disk
1631          * records for the purposes of the search.
1632          */
1633         error = hammer_ip_first(&cursor);
1634         while (error == 0) {
1635                 if (hammer_ip_resolve_data(&cursor) != 0)
1636                         break;
1637                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1638                 KKASSERT(nlen > 0);
1639                 if (fncp->nc_nlen == nlen &&
1640                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1641                         break;
1642                 }
1643                 error = hammer_ip_next(&cursor);
1644         }
1645
1646         /*
1647          * If all is ok we have to get the inode so we can adjust nlinks.
1648          *
1649          * WARNING: hammer_ip_del_directory() may have to terminate the
1650          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1651          * twice.
1652          */
1653         if (error == 0)
1654                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1655
1656         /*
1657          * XXX A deadlock here will break rename's atomicy for the purposes
1658          * of crash recovery.
1659          */
1660         if (error == EDEADLK) {
1661                 hammer_done_cursor(&cursor);
1662                 goto retry;
1663         }
1664
1665         /*
1666          * Cleanup and tell the kernel that the rename succeeded.
1667          */
1668         hammer_done_cursor(&cursor);
1669         if (error == 0) {
1670                 cache_rename(ap->a_fnch, ap->a_tnch);
1671                 hammer_knote(ap->a_fdvp, NOTE_WRITE);
1672                 hammer_knote(ap->a_tdvp, NOTE_WRITE);
1673                 if (ip->vp)
1674                         hammer_knote(ip->vp, NOTE_RENAME);
1675         }
1676
1677 failed:
1678         hammer_done_transaction(&trans);
1679         return (error);
1680 }
1681
1682 /*
1683  * hammer_vop_nrmdir { nch, dvp, cred }
1684  */
1685 static
1686 int
1687 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1688 {
1689         struct hammer_transaction trans;
1690         struct hammer_inode *dip;
1691         int error;
1692
1693         dip = VTOI(ap->a_dvp);
1694
1695         if (hammer_nohistory(dip) == 0 &&
1696             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1697                 return (error);
1698         }
1699
1700         hammer_start_transaction(&trans, dip->hmp);
1701         ++hammer_stats_file_iopsw;
1702         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
1703         hammer_done_transaction(&trans);
1704         if (error == 0)
1705                 hammer_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1706         return (error);
1707 }
1708
1709 /*
1710  * hammer_vop_markatime { vp, cred }
1711  */
1712 static
1713 int
1714 hammer_vop_markatime(struct vop_markatime_args *ap)
1715 {
1716         struct hammer_transaction trans;
1717         struct hammer_inode *ip;
1718
1719         ip = VTOI(ap->a_vp);
1720         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1721                 return (EROFS);
1722         if (ip->flags & HAMMER_INODE_RO)
1723                 return (EROFS);
1724         if (ip->hmp->mp->mnt_flag & MNT_NOATIME)
1725                 return (0);
1726         hammer_start_transaction(&trans, ip->hmp);
1727         ++hammer_stats_file_iopsw;
1728
1729         ip->ino_data.atime = trans.time;
1730         hammer_modify_inode(ip, HAMMER_INODE_ATIME);
1731         hammer_done_transaction(&trans);
1732         hammer_knote(ap->a_vp, NOTE_ATTRIB);
1733         return (0);
1734 }
1735
/*
 * hammer_vop_setattr { vp, vap, cred }
 *
 * Apply the attribute changes requested in *ap->a_vap to the inode.
 * Fields left at VNOVAL are untouched.  Handles chflags, chown/chgrp,
 * truncate/extend, atime/mtime and chmod, accumulating the dirty bits
 * in modflags and the kqueue notification bits in kflags, both of which
 * are applied once at the end.
 *
 * Returns 0 on success or a POSIX error (EROFS, EPERM, EINVAL, ...).
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
	struct hammer_transaction trans;
	struct vattr *vap;
	struct hammer_inode *ip;
	int modflags;		/* accumulated HAMMER_INODE_* dirty bits */
	int error;
	int truncating;
	int blksize;
	int kflags;		/* accumulated NOTE_* bits for hammer_knote */
	int64_t aligned_size;
	u_int32_t flags;

	vap = ap->a_vap;
	ip = ap->a_vp->v_data;
	modflags = 0;
	kflags = 0;

	/*
	 * Read-only checks, then make sure removal-class space exists on
	 * history-retaining mounts before opening the transaction.
	 */
	if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
		return(EROFS);
	if (ip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if (hammer_nohistory(ip) == 0 &&
	    (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
		return (error);
	}

	hammer_start_transaction(&trans, ip->hmp);
	++hammer_stats_file_iopsw;
	error = 0;

	/*
	 * chflags is exclusive: when va_flags is specified no other
	 * attribute is processed (note the unconditional goto done).
	 */
	if (vap->va_flags != VNOVAL) {
		flags = ip->ino_data.uflags;
		error = vop_helper_setattr_flags(&flags, vap->va_flags,
					 hammer_to_unix_xid(&ip->ino_data.uid),
					 ap->a_cred);
		if (error == 0) {
			if (ip->ino_data.uflags != flags) {
				ip->ino_data.uflags = flags;
				modflags |= HAMMER_INODE_DDIRTY;
				kflags |= NOTE_ATTRIB;
			}
			if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
				error = 0;
				goto done;
			}
		}
		goto done;
	}
	/* Immutable/append-only files refuse all other attribute changes. */
	if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
		error = EPERM;
		goto done;
	}
	/*
	 * chown/chgrp.  The unix ids are converted to/from the uuid form
	 * stored in the inode data.
	 */
	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
		mode_t cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
		uuid_t uuid_uid;
		uuid_t uuid_gid;

		error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
					 ap->a_cred,
					 &cur_uid, &cur_gid, &cur_mode);
		if (error == 0) {
			hammer_guid_to_uuid(&uuid_uid, cur_uid);
			hammer_guid_to_uuid(&uuid_gid, cur_gid);
			if (bcmp(&uuid_uid, &ip->ino_data.uid,
				 sizeof(uuid_uid)) ||
			    bcmp(&uuid_gid, &ip->ino_data.gid,
				 sizeof(uuid_gid)) ||
			    ip->ino_data.mode != cur_mode
			) {
				ip->ino_data.uid = uuid_uid;
				ip->ino_data.gid = uuid_gid;
				ip->ino_data.mode = cur_mode;
			}
			/*
			 * NOTE(review): dirty/notify bits are set even when
			 * nothing actually changed -- harmless but slightly
			 * pessimistic.
			 */
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
	/*
	 * Size changes.  The while+break construct is used as a structured
	 * goto: the loop body executes at most once.
	 */
	while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
		switch(ap->a_vp->v_type) {
		case VREG:
			if (vap->va_size == ip->ino_data.size)
				break;
			/*
			 * XXX break atomicy, we can deadlock the backend
			 * if we do not release the lock.  Probably not a
			 * big deal here.
			 */
			blksize = hammer_blocksize(vap->va_size);
			if (vap->va_size < ip->ino_data.size) {
				vtruncbuf(ap->a_vp, vap->va_size, blksize);
				truncating = 1;
				kflags |= NOTE_WRITE;
			} else {
				vnode_pager_setsize(ap->a_vp, vap->va_size);
				truncating = 0;
				kflags |= NOTE_WRITE | NOTE_EXTEND;
			}
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;

			/*
			 * on-media truncation is cached in the inode until
			 * the inode is synchronized.  Only the lowest
			 * truncation offset is retained.
			 */
			if (truncating) {
				hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
				if (HammerTruncIp == NULL)
					HammerTruncIp = ip;
#endif
				if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
					ip->flags |= HAMMER_INODE_TRUNCATED;
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate1 %016llx\n", ip->trunc_off);
#endif
				} else if (ip->trunc_off > vap->va_size) {
					ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate2 %016llx\n", ip->trunc_off);
#endif
				} else {
#ifdef DEBUG_TRUNCATE
					if (ip == HammerTruncIp)
					kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
#endif
				}
			}

			/*
			 * If truncating we have to clean out a portion of
			 * the last block on-disk.  We do this in the
			 * front-end buffer cache.
			 */
			aligned_size = (vap->va_size + (blksize - 1)) &
				       ~(int64_t)(blksize - 1);
			if (truncating && vap->va_size < aligned_size) {
				struct buf *bp;
				int offset;

				aligned_size -= blksize;

				offset = (int)vap->va_size & (blksize - 1);
				error = bread(ap->a_vp, aligned_size,
					      blksize, &bp);
				hammer_ip_frontend_trunc(ip, aligned_size);
				if (error == 0) {
					/* zero the tail of the last block */
					bzero(bp->b_data + offset,
					      blksize - offset);
					/* must de-cache direct-io offset */
					bp->b_bio2.bio_offset = NOOFFSET;
					bdwrite(bp);
				} else {
					kprintf("ERROR %d\n", error);
					brelse(bp);
				}
			}
			break;
		case VDATABASE:
			/* database files: cache the truncation, no bufcache */
			if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
				ip->flags |= HAMMER_INODE_TRUNCATED;
				ip->trunc_off = vap->va_size;
			} else if (ip->trunc_off > vap->va_size) {
				ip->trunc_off = vap->va_size;
			}
			hammer_ip_frontend_trunc(ip, vap->va_size);
			ip->ino_data.size = vap->va_size;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
			break;
		default:
			error = EINVAL;
			goto done;
		}
		break;
	}
	if (vap->va_atime.tv_sec != VNOVAL) {
		ip->ino_data.atime =
			hammer_timespec_to_time(&vap->va_atime);
		modflags |= HAMMER_INODE_ATIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mtime.tv_sec != VNOVAL) {
		ip->ino_data.mtime =
			hammer_timespec_to_time(&vap->va_mtime);
		modflags |= HAMMER_INODE_MTIME;
		kflags |= NOTE_ATTRIB;
	}
	if (vap->va_mode != (mode_t)VNOVAL) {
		mode_t   cur_mode = ip->ino_data.mode;
		uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
		gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

		error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
					 cur_uid, cur_gid, &cur_mode);
		if (error == 0 && ip->ino_data.mode != cur_mode) {
			ip->ino_data.mode = cur_mode;
			modflags |= HAMMER_INODE_DDIRTY;
			kflags |= NOTE_ATTRIB;
		}
	}
done:
	/* Apply accumulated dirty bits and kqueue notifications. */
	if (error == 0)
		hammer_modify_inode(ip, modflags);
	hammer_done_transaction(&trans);
	hammer_knote(ap->a_vp, kflags);
	return (error);
}
1954
/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 *
 * Create a symbolic link.  Short targets are embedded directly in the
 * inode data; longer targets get a dedicated fixed-key record.  On
 * success *ap->a_vpp holds the new vnode and the namecache entry is
 * resolved; on failure *ap->a_vpp is NULL.
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *dip;
	struct hammer_inode *nip;
	struct nchandle *nch;
	hammer_record_t record;
	int error;
	int bytes;

	ap->a_vap->va_type = VLNK;

	nch = ap->a_nch;
	dip = VTOI(ap->a_dvp);

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);
	if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
		return (error);

	/*
	 * Create a transaction to cover the operations we perform.
	 */
	hammer_start_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsw;

	/*
	 * Create a new filesystem object of the requested type.  The
	 * returned inode will be referenced but not locked.
	 */

	error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
				    dip, NULL, &nip);
	if (error) {
		hammer_done_transaction(&trans);
		*ap->a_vpp = NULL;
		return (error);
	}

	/*
	 * Add a record representing the symlink.  symlink stores the link
	 * as pure data, not a string, and is not \0 terminated.
	 */
	if (error == 0) {
		bytes = strlen(ap->a_target);

		if (bytes <= HAMMER_INODE_BASESYMLEN) {
			/* short target: embed it in the inode itself */
			bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
		} else {
			/* long target: store it as a FIX-type record */
			record = hammer_alloc_mem_record(nip, bytes);
			record->type = HAMMER_MEM_RECORD_GENERAL;

			record->leaf.base.localization = nip->obj_localization +
							 HAMMER_LOCALIZE_MISC;
			record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
			record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
			record->leaf.data_len = bytes;
			KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
			bcopy(ap->a_target, record->data->symlink.name, bytes);
			error = hammer_ip_add_record(&trans, record);
		}

		/*
		 * Set the file size to the length of the link.
		 */
		if (error == 0) {
			nip->ino_data.size = bytes;
			hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
		}
	}
	/* Link the new inode into the parent directory. */
	if (error == 0)
		error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
						nch->ncp->nc_nlen, nip);

	/*
	 * Finish up.  The inode reference is dropped in both paths; on
	 * success hammer_get_vnode() takes its own reference first.
	 */
	if (error) {
		hammer_rel_inode(nip, 0);
		*ap->a_vpp = NULL;
	} else {
		error = hammer_get_vnode(nip, ap->a_vpp);
		hammer_rel_inode(nip, 0);
		if (error == 0) {
			cache_setunresolved(ap->a_nch);
			cache_setvp(ap->a_nch, *ap->a_vpp);
			hammer_knote(ap->a_dvp, NOTE_WRITE);
		}
	}
	hammer_done_transaction(&trans);
	return (error);
}
2052
2053 /*
2054  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
2055  */
2056 static
2057 int
2058 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
2059 {
2060         struct hammer_transaction trans;
2061         struct hammer_inode *dip;
2062         int error;
2063
2064         dip = VTOI(ap->a_dvp);
2065
2066         if (hammer_nohistory(dip) == 0 &&
2067             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
2068                 return (error);
2069         }
2070
2071         hammer_start_transaction(&trans, dip->hmp);
2072         ++hammer_stats_file_iopsw;
2073         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
2074                                 ap->a_cred, ap->a_flags, -1);
2075         hammer_done_transaction(&trans);
2076
2077         return (error);
2078 }
2079
2080 /*
2081  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2082  */
2083 static
2084 int
2085 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2086 {
2087         struct hammer_inode *ip = ap->a_vp->v_data;
2088
2089         ++hammer_stats_file_iopsr;
2090         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2091                             ap->a_fflag, ap->a_cred));
2092 }
2093
2094 static
2095 int
2096 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2097 {
2098         struct mount *mp;
2099         int error;
2100
2101         mp = ap->a_head.a_ops->head.vv_mount;
2102
2103         switch(ap->a_op) {
2104         case MOUNTCTL_SET_EXPORT:
2105                 if (ap->a_ctllen != sizeof(struct export_args))
2106                         error = EINVAL;
2107                 else
2108                         error = hammer_vfs_export(mp, ap->a_op,
2109                                       (const struct export_args *)ap->a_ctl);
2110                 break;
2111         default:
2112                 error = journal_mountctl(ap);
2113                 break;
2114         }
2115         return(error);
2116 }
2117
2118 /*
2119  * hammer_vop_strategy { vp, bio }
2120  *
2121  * Strategy call, used for regular file read & write only.  Note that the
2122  * bp may represent a cluster.
2123  *
2124  * To simplify operation and allow better optimizations in the future,
2125  * this code does not make any assumptions with regards to buffer alignment
2126  * or size.
2127  */
2128 static
2129 int
2130 hammer_vop_strategy(struct vop_strategy_args *ap)
2131 {
2132         struct buf *bp;
2133         int error;
2134
2135         bp = ap->a_bio->bio_buf;
2136
2137         switch(bp->b_cmd) {
2138         case BUF_CMD_READ:
2139                 error = hammer_vop_strategy_read(ap);
2140                 break;
2141         case BUF_CMD_WRITE:
2142                 error = hammer_vop_strategy_write(ap);
2143                 break;
2144         default:
2145                 bp->b_error = error = EINVAL;
2146                 bp->b_flags |= B_ERROR;
2147                 biodone(ap->a_bio);
2148                 break;
2149         }
2150         return (error);
2151 }
2152
/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * Fast path: if the cluster code already resolved a zone-2 offset via
 * BMAP, a direct read is issued without any B-Tree scan.  Otherwise the
 * key range covering the buffer is scanned and each record is either
 * direct-read (when perfectly aligned) or copied through a hammer_buffer.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	hammer_off_t disk_offset;
	struct bio *bio;
	struct bio *nbio;
	struct buf *bp;
	int64_t rec_offset;	/* file offset of the current record's base */
	int64_t ran_end;
	int64_t tmp64;
	int error;
	int boff;		/* current fill offset within the buffer */
	int roff;		/* offset into the record's data */
	int n;			/* byte count for the current operation */

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;

	/*
	 * The zone-2 disk offset may have been set by the cluster code via
	 * a BMAP operation, or else should be NOOFFSET.
	 *
	 * Checking the high bits for a match against zone-2 should suffice.
	 */
	nbio = push_bio(bio);
	if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
	    HAMMER_ZONE_LARGE_DATA) {
		error = hammer_io_direct_read(ip->hmp, nbio, NULL);
		return (error);
	}

	/*
	 * Well, that sucked.  Do it the hard way.  If all the stars are
	 * aligned we may still be able to issue a direct-read.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the key's
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = bio->bio_offset + 1;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
	if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	} else
#endif
	{
		ran_end = bio->bio_offset + bp->b_bufsize;
		cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
		cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
		tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
		if (tmp64 < ran_end)
			cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
		else
			cursor.key_end.key = ran_end + MAXPHYS + 1;
	}
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);
	boff = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather then (base).
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;

		/*
		 * Calculate the gap, if any, and zero-fill it.
		 *
		 * n is the offset of the start of the record verses our
		 * current seek offset in the bio.
		 */
		n = (int)(rec_offset - (bio->bio_offset + boff));
		if (n > 0) {
			if (n > bp->b_bufsize - boff)
				n = bp->b_bufsize - boff;
			bzero((char *)bp->b_data + boff, n);
			boff += n;
			n = 0;
		}

		/*
		 * Calculate the data offset in the record and the number
		 * of bytes we can copy.
		 *
		 * There are two degenerate cases.  First, boff may already
		 * be at bp->b_bufsize.  Secondly, the data offset within
		 * the record may exceed the record's size.
		 */
		roff = -n;
		rec_offset += roff;
		n = cursor.leaf->data_len - roff;
		if (n <= 0) {
			/* should not happen; warn and skip the record */
			kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
			n = 0;
		} else if (n > bp->b_bufsize - boff) {
			n = bp->b_bufsize - boff;
		}

		/*
		 * Deal with cached truncations.  This cool bit of code
		 * allows truncate()/ftruncate() to avoid having to sync
		 * the file.
		 *
		 * If the frontend is truncated then all backend records are
		 * subject to the frontend's truncation.
		 *
		 * If the backend is truncated then backend records on-disk
		 * (but not in-memory) are subject to the backend's
		 * truncation.  In-memory records owned by the backend
		 * represent data written after the truncation point on the
		 * backend and must not be truncated.
		 *
		 * Truncate operations deal with frontend buffer cache
		 * buffers and frontend-owned in-memory records synchronously.
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					n = 0;
				else if (ip->trunc_off < rec_offset + n)
					n = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					n = 0;
				else if (ip->sync_trunc_off < rec_offset + n)
					n = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Try to issue a direct read into our bio if possible,
		 * otherwise resolve the element data into a hammer_buffer
		 * and copy.
		 *
		 * The buffer on-disk should be zerod past any real
		 * truncation point, but may not be for any synthesized
		 * truncation point from above.
		 */
		disk_offset = cursor.leaf->data_offset + roff;
		if (boff == 0 && n == bp->b_bufsize &&
		    hammer_cursor_ondisk(&cursor) &&
		    (disk_offset & HAMMER_BUFMASK) == 0) {
			KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
				 HAMMER_ZONE_LARGE_DATA);
			nbio->bio_offset = disk_offset;
			error = hammer_io_direct_read(trans.hmp, nbio,
						      cursor.leaf);
			/* direct read completes the bio; skip biodone */
			goto done;
		} else if (n) {
			error = hammer_ip_resolve_data(&cursor);
			if (error == 0) {
				bcopy((char *)cursor.data + roff,
				      (char *)bp->b_data + boff, n);
			}
		}
		if (error)
			break;

		/*
		 * Iterate until we have filled the request.
		 */
		boff += n;
		if (boff == bp->b_bufsize)
			break;
		error = hammer_ip_next(&cursor);
	}

	/*
	 * There may have been a gap after the last record
	 */
	if (error == ENOENT)
		error = 0;
	if (error == 0 && boff != bp->b_bufsize) {
		KKASSERT(boff < bp->b_bufsize);
		bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
		/* boff = bp->b_bufsize; */
	}
	bp->b_resid = 0;
	bp->b_error = error;
	if (error)
		bp->b_flags |= B_ERROR;
	biodone(ap->a_bio);

done:
	if (cursor.node)
		hammer_cache_node(&ip->cache[1], cursor.node);
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);
	return(error);
}
2384
2385 /*
2386  * BMAP operation - used to support cluster_read() only.
2387  *
2388  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2389  *
 * This routine may return EOPNOTSUPP if the operation is not supported for
2391  * the specified offset.  The contents of the pointer arguments do not
2392  * need to be initialized in that case. 
2393  *
2394  * If a disk address is available and properly aligned return 0 with 
2395  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2396  * to the run-length relative to that offset.  Callers may assume that
2397  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2398  * large, so return EOPNOTSUPP if it is not sufficiently large.
2399  */
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;		/* logical file offset of current record */
	int64_t ran_end;		/* inclusive end of forward scan range */
	int64_t tmp64;
	int64_t base_offset;		/* logical start of contiguous run */
	int64_t base_disk_offset;	/* media (zone-2) start of run */
	int64_t last_offset;		/* logical end of contiguous run */
	hammer_off_t last_disk_offset;	/* media end of contiguous run */
	hammer_off_t disk_offset;
	int	rec_len;
	int	error;
	int	blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the key's
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 *
	 * If the caller wants the backward run (a_runb) we back the scan
	 * start up by MAXPHYS so records preceding a_loffset are included.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	/*
	 * End key covers the requested offset plus a MAXPHYS forward run,
	 * clamped at the maximum positive key if the addition overflows.
	 */
	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len    = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 *
		 * NOTE: in-memory records (not on-disk) have no media
		 * address yet and are simply skipped here.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;
		}
		error = hammer_ip_next(&cursor);
	}

#if 0
	kprintf("BMAP %016llx:  %016llx - %016llx\n",
		ap->a_loffset, base_offset, last_offset);
	kprintf("BMAP %16s:  %016llx - %016llx\n",
		"", base_disk_offset, last_disk_offset);
#endif

	if (cursor.node) {
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.  Return the zone-2 address and the run
		 * lengths relative to a_loffset.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}
2620
2621 /*
2622  * Write to a regular file.   Because this is a strategy call the OS is
2623  * trying to actually get data onto the media.
2624  */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/*
	 * Read-only inodes cannot accept writes; complete the bio with
	 * EROFS.
	 */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end. 
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffers
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15;	/* round up to 16 */

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);
	if (record) {
		hammer_io_direct_write(hmp, record, bio);
		/* kick the flusher if reserved records pile up */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/* bulk add failed; complete the bio with the error */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	return(error);
}
2708
2709 /*
2710  * dounlink - disconnect a directory entry
2711  *
2712  * XXX whiteout support not really in yet
2713  */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred,
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;		/* directory inode */
	hammer_inode_t ip;		/* inode of entry being removed */
	struct hammer_cursor cursor;
	int64_t namekey;
	u_int32_t max_iterations;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(dip, ncp->nc_name, ncp->nc_nlen,
					   &max_iterations);
retry:
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key += max_iterations;
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  info->last_error contains the
	 * error code on search termination and could be 0, ENOENT, or
	 * something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		/* compare the stored name against the path component */
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      dip->hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		if (error == ENOENT) {
			/* dirent references an inode that should exist */
			kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
			Debugger("ENOENT unlinking object that should exist");
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * The check directory code can loop and deadlock/retry.  Our
		 * own cursor's node locks must be released to avoid a 3-way
		 * deadlock with the flusher if the check directory code
		 * blocks.
		 *
		 * If any changes whatsoever have been made to the cursor
		 * set EDEADLK and retry.
		 */
		if (error == 0 && ip->ino_data.obj_type ==
				  HAMMER_OBJTYPE_DIRECTORY) {
			hammer_unlock_cursor(&cursor);
			error = hammer_ip_check_directory_empty(trans, ip);
			hammer_lock_cursor(&cursor);
			if (cursor.flags & HAMMER_CURSOR_RETEST) {
				kprintf("HAMMER: Warning: avoided deadlock "
					"on rmdir '%s'\n",
					ncp->nc_name);
				error = EDEADLK;
			}
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/* invalidate the namecache entry for the removed name */
			cache_setunresolved(nch);
			cache_setvp(nch, NULL);
			/* XXX locking */
			if (ip->vp) {
				hammer_knote(ip->vp, NOTE_DELETE);
				cache_inval_vp(ip->vp, CINV_DESTROY);
			}
		}
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	/* cursor-level deadlock: restart the whole lookup/delete sequence */
	if (error == EDEADLK)
		goto retry;

	return (error);
}
2871
2872 /************************************************************************
2873  *                          FIFO AND SPECFS OPS                         *
2874  ************************************************************************
2875  *
2876  */
2877
2878 static int
2879 hammer_vop_fifoclose (struct vop_close_args *ap)
2880 {
2881         /* XXX update itimes */
2882         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2883 }
2884
2885 static int
2886 hammer_vop_fiforead (struct vop_read_args *ap)
2887 {
2888         int error;
2889
2890         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2891         /* XXX update access time */
2892         return (error);
2893 }
2894
2895 static int
2896 hammer_vop_fifowrite (struct vop_write_args *ap)
2897 {
2898         int error;
2899
2900         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2901         /* XXX update access time */
2902         return (error);
2903 }
2904
2905 static
2906 int
2907 hammer_vop_fifokqfilter(struct vop_kqfilter_args *ap)
2908 {
2909         int error;
2910
2911         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2912         if (error)
2913                 error = hammer_vop_kqfilter(ap);
2914         return(error);
2915 }
2916
2917 static int
2918 hammer_vop_specclose (struct vop_close_args *ap)
2919 {
2920         /* XXX update itimes */
2921         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2922 }
2923
2924 static int
2925 hammer_vop_specread (struct vop_read_args *ap)
2926 {
2927         /* XXX update access time */
2928         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2929 }
2930
2931 static int
2932 hammer_vop_specwrite (struct vop_write_args *ap)
2933 {
2934         /* XXX update last change time */
2935         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2936 }
2937
2938 /************************************************************************
2939  *                          KQFILTER OPS                                *
2940  ************************************************************************
2941  *
2942  */
static void filt_hammerdetach(struct knote *kn);
static int filt_hammerread(struct knote *kn, long hint);
static int filt_hammerwrite(struct knote *kn, long hint);
static int filt_hammervnode(struct knote *kn, long hint);

/*
 * Filter ops tables for EVFILT_READ / EVFILT_WRITE / EVFILT_VNODE.
 * Fields: f_isfd (1), f_attach (none), f_detach, f_event.
 */
static struct filterops hammerread_filtops =
	{ 1, NULL, filt_hammerdetach, filt_hammerread };
static struct filterops hammerwrite_filtops =
	{ 1, NULL, filt_hammerdetach, filt_hammerwrite };
static struct filterops hammervnode_filtops =
	{ 1, NULL, filt_hammerdetach, filt_hammervnode };
2954
2955 static
2956 int
2957 hammer_vop_kqfilter(struct vop_kqfilter_args *ap)
2958 {
2959         struct vnode *vp = ap->a_vp;
2960         struct knote *kn = ap->a_kn;
2961         lwkt_tokref ilock;
2962
2963         switch (kn->kn_filter) {
2964         case EVFILT_READ:
2965                 kn->kn_fop = &hammerread_filtops;
2966                 break;
2967         case EVFILT_WRITE:
2968                 kn->kn_fop = &hammerwrite_filtops;
2969                 break;
2970         case EVFILT_VNODE:
2971                 kn->kn_fop = &hammervnode_filtops;
2972                 break;
2973         default:
2974                 return (1);
2975         }
2976
2977         kn->kn_hook = (caddr_t)vp;
2978
2979         lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
2980         SLIST_INSERT_HEAD(&vp->v_pollinfo.vpi_selinfo.si_note, kn, kn_selnext);
2981         lwkt_reltoken(&ilock);
2982
2983         return(0);
2984 }
2985
2986 static void
2987 filt_hammerdetach(struct knote *kn)
2988 {
2989         struct vnode *vp = (void *)kn->kn_hook;
2990         lwkt_tokref ilock;
2991
2992         lwkt_gettoken(&ilock, &vp->v_pollinfo.vpi_token);
2993         SLIST_REMOVE(&vp->v_pollinfo.vpi_selinfo.si_note,
2994                      kn, knote, kn_selnext);
2995         lwkt_reltoken(&ilock);
2996 }
2997
2998 static int
2999 filt_hammerread(struct knote *kn, long hint)
3000 {
3001         struct vnode *vp = (void *)kn->kn_hook;
3002         hammer_inode_t ip = VTOI(vp);
3003
3004         if (hint == NOTE_REVOKE) {
3005                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3006                 return(1);
3007         }
3008         kn->kn_data = ip->ino_data.size - kn->kn_fp->f_offset;
3009         return (kn->kn_data != 0);
3010 }
3011
3012 static int
3013 filt_hammerwrite(struct knote *kn, long hint)
3014 {
3015         if (hint == NOTE_REVOKE)
3016                 kn->kn_flags |= (EV_EOF | EV_ONESHOT);
3017         kn->kn_data = 0;
3018         return (1);
3019 }
3020
3021 static int
3022 filt_hammervnode(struct knote *kn, long hint)
3023 {
3024         if (kn->kn_sfflags & hint)
3025                 kn->kn_fflags |= hint;
3026         if (hint == NOTE_REVOKE) {
3027                 kn->kn_flags |= EV_EOF;
3028                 return (1);
3029         }
3030         return (kn->kn_fflags != 0);
3031 }
3032