[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.99 2008/09/23 22:28:56 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
51 /*
52  * USERFS VNOPS
53  */
54 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
55 static int hammer_vop_fsync(struct vop_fsync_args *);
56 static int hammer_vop_read(struct vop_read_args *);
57 static int hammer_vop_write(struct vop_write_args *);
58 static int hammer_vop_access(struct vop_access_args *);
59 static int hammer_vop_advlock(struct vop_advlock_args *);
60 static int hammer_vop_close(struct vop_close_args *);
61 static int hammer_vop_ncreate(struct vop_ncreate_args *);
62 static int hammer_vop_getattr(struct vop_getattr_args *);
63 static int hammer_vop_nresolve(struct vop_nresolve_args *);
64 static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *);
65 static int hammer_vop_nlink(struct vop_nlink_args *);
66 static int hammer_vop_nmkdir(struct vop_nmkdir_args *);
67 static int hammer_vop_nmknod(struct vop_nmknod_args *);
68 static int hammer_vop_open(struct vop_open_args *);
69 static int hammer_vop_pathconf(struct vop_pathconf_args *);
70 static int hammer_vop_print(struct vop_print_args *);
71 static int hammer_vop_readdir(struct vop_readdir_args *);
72 static int hammer_vop_readlink(struct vop_readlink_args *);
73 static int hammer_vop_nremove(struct vop_nremove_args *);
74 static int hammer_vop_nrename(struct vop_nrename_args *);
75 static int hammer_vop_nrmdir(struct vop_nrmdir_args *);
76 static int hammer_vop_setattr(struct vop_setattr_args *);
77 static int hammer_vop_strategy(struct vop_strategy_args *);
78 static int hammer_vop_bmap(struct vop_bmap_args *ap);
79 static int hammer_vop_nsymlink(struct vop_nsymlink_args *);
80 static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *);
81 static int hammer_vop_ioctl(struct vop_ioctl_args *);
82 static int hammer_vop_mountctl(struct vop_mountctl_args *);
83
84 static int hammer_vop_fifoclose (struct vop_close_args *);
85 static int hammer_vop_fiforead (struct vop_read_args *);
86 static int hammer_vop_fifowrite (struct vop_write_args *);
87
88 static int hammer_vop_specclose (struct vop_close_args *);
89 static int hammer_vop_specread (struct vop_read_args *);
90 static int hammer_vop_specwrite (struct vop_write_args *);
91
/*
 * VNOPS vector for regular HAMMER files and directories.  Any operation
 * not listed here falls through to vop_defaultop.
 */
92 struct vop_ops hammer_vnode_vops = {
93         .vop_default =          vop_defaultop,
94         .vop_fsync =            hammer_vop_fsync,
95         .vop_getpages =         vop_stdgetpages,
96         .vop_putpages =         vop_stdputpages,
97         .vop_read =             hammer_vop_read,
98         .vop_write =            hammer_vop_write,
99         .vop_access =           hammer_vop_access,
100         .vop_advlock =          hammer_vop_advlock,
101         .vop_close =            hammer_vop_close,
102         .vop_ncreate =          hammer_vop_ncreate,
103         .vop_getattr =          hammer_vop_getattr,
104         .vop_inactive =         hammer_vop_inactive,
105         .vop_reclaim =          hammer_vop_reclaim,
106         .vop_nresolve =         hammer_vop_nresolve,
107         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
108         .vop_nlink =            hammer_vop_nlink,
109         .vop_nmkdir =           hammer_vop_nmkdir,
110         .vop_nmknod =           hammer_vop_nmknod,
111         .vop_open =             hammer_vop_open,
112         .vop_pathconf =         hammer_vop_pathconf,
113         .vop_print =            hammer_vop_print,
114         .vop_readdir =          hammer_vop_readdir,
115         .vop_readlink =         hammer_vop_readlink,
116         .vop_nremove =          hammer_vop_nremove,
117         .vop_nrename =          hammer_vop_nrename,
118         .vop_nrmdir =           hammer_vop_nrmdir,
119         .vop_setattr =          hammer_vop_setattr,
120         .vop_bmap =             hammer_vop_bmap,
121         .vop_strategy =         hammer_vop_strategy,
122         .vop_nsymlink =         hammer_vop_nsymlink,
123         .vop_nwhiteout =        hammer_vop_nwhiteout,
124         .vop_ioctl =            hammer_vop_ioctl,
125         .vop_mountctl =         hammer_vop_mountctl
126 };
127
/*
 * VNOPS vector for device special files on HAMMER.  Device I/O goes
 * through the spec wrappers; metadata operations reuse the regular
 * HAMMER handlers.  Unlisted operations fall through to spec_vnoperate.
 */
128 struct vop_ops hammer_spec_vops = {
129         .vop_default =          spec_vnoperate,
130         .vop_fsync =            hammer_vop_fsync,
131         .vop_read =             hammer_vop_specread,
132         .vop_write =            hammer_vop_specwrite,
133         .vop_access =           hammer_vop_access,
134         .vop_close =            hammer_vop_specclose,
135         .vop_getattr =          hammer_vop_getattr,
136         .vop_inactive =         hammer_vop_inactive,
137         .vop_reclaim =          hammer_vop_reclaim,
138         .vop_setattr =          hammer_vop_setattr
139 };
140
/*
 * VNOPS vector for FIFOs (named pipes) on HAMMER.  Pipe I/O goes
 * through the fifo wrappers; metadata operations reuse the regular
 * HAMMER handlers.  Unlisted operations fall through to fifo_vnoperate.
 */
141 struct vop_ops hammer_fifo_vops = {
142         .vop_default =          fifo_vnoperate,
143         .vop_fsync =            hammer_vop_fsync,
144         .vop_read =             hammer_vop_fiforead,
145         .vop_write =            hammer_vop_fifowrite,
146         .vop_access =           hammer_vop_access,
147         .vop_close =            hammer_vop_fifoclose,
148         .vop_getattr =          hammer_vop_getattr,
149         .vop_inactive =         hammer_vop_inactive,
150         .vop_reclaim =          hammer_vop_reclaim,
151         .vop_setattr =          hammer_vop_setattr
152 };
153
154 #ifdef DEBUG_TRUNCATE
/* NOTE(review): debug-only global; presumably records an inode of
 * interest for truncation debugging — confirm against DEBUG_TRUNCATE users. */
155 struct hammer_inode *HammerTruncIp;
156 #endif
157
158 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
159                            struct vnode *dvp, struct ucred *cred,
160                            int flags, int isdir);
161 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
162 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
163
#if 0
/*
 * Pass any VOP through the default HAMMER vnode vector.
 *
 * Compiled out (#if 0); kept for reference.  The original declaration
 * omitted the parameter name, so the body's reference to "ap" would
 * not have compiled had this block ever been enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
172
173 /*
174  * hammer_vop_fsync { vp, waitfor }
175  *
176  * fsync() an inode to disk and wait for it to be completely committed
177  * such that the information would not be undone if a crash occured after
178  * return.
179  */
180 static
181 int
182 hammer_vop_fsync(struct vop_fsync_args *ap)
183 {
184         hammer_inode_t ip = VTOI(ap->a_vp);
185
186         ++hammer_count_fsyncs;
187         vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
188         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
189         if (ap->a_waitfor == MNT_WAIT) {
190                 vn_unlock(ap->a_vp);
191                 hammer_wait_inode(ip);
192                 vn_lock(ap->a_vp, LK_EXCLUSIVE | LK_RETRY);
193         }
194         return (ip->error);
195 }
196
197 /*
198  * hammer_vop_read { vp, uio, ioflag, cred }
 *
 * VOP_READ for regular HAMMER files.  Data is copied from the buffer
 * cache into the caller's uio, reading buffers via cluster_read() or
 * bread() as needed.  The block size may vary with the file offset.
 * atime is updated unless the inode is read-only (historical view) or
 * the mount is MNT_NOATIME.
199  */
200 static
201 int
202 hammer_vop_read(struct vop_read_args *ap)
203 {
204         struct hammer_transaction trans;
205         hammer_inode_t ip;
206         off_t offset;
207         struct buf *bp;
208         struct uio *uio;
209         int error;
210         int n;
211         int seqcount;
212         int ioseqcount;
213         int blksize;
214
215         if (ap->a_vp->v_type != VREG)
216                 return (EINVAL);
217         ip = VTOI(ap->a_vp);
218         error = 0;
219         uio = ap->a_uio;
220
221         /*
222          * Allow the UIO's size to override the sequential heuristic.
223          */
224         blksize = hammer_blocksize(uio->uio_offset);
225         seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
226         ioseqcount = ap->a_ioflag >> 16;
227         if (seqcount < ioseqcount)
228                 seqcount = ioseqcount;
229
230         hammer_start_transaction(&trans, ip->hmp);
231
232         /*
233          * Access the data typically in HAMMER_BUFSIZE blocks via the
234          * buffer cache, but HAMMER may use a variable block size based
235          * on the offset.
236          */
237         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
238                 int64_t base_offset;
239                 int64_t file_limit;
240
241                 blksize = hammer_blocksize(uio->uio_offset);
242                 offset = (int)uio->uio_offset & (blksize - 1);
243                 base_offset = uio->uio_offset - offset;
244
245                 if (hammer_cluster_enable) {
246                         /*
247                          * Use file_limit to prevent cluster_read() from
248                          * creating buffers of the wrong block size past
249                          * the demarc.
250                          */
251                         file_limit = ip->ino_data.size;
252                         if (base_offset < HAMMER_XDEMARC &&
253                             file_limit > HAMMER_XDEMARC) {
254                                 file_limit = HAMMER_XDEMARC;
255                         }
256                         error = cluster_read(ap->a_vp,
257                                              file_limit, base_offset,
258                                              blksize, MAXPHYS,
259                                              seqcount, &bp);
260                 } else {
261                         error = bread(ap->a_vp, base_offset, blksize, &bp);
262                 }
263                 if (error) {
                        /* XXX stray debugging printf left in the error path */
264                         kprintf("error %d\n", error);
265                         brelse(bp);
266                         break;
267                 }
268
269                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                /*
                 * Clip the copy length to the block remainder, the uio
                 * residual, and the file EOF.
                 */
270                 n = blksize - offset;
271                 if (n > uio->uio_resid)
272                         n = uio->uio_resid;
273                 if (n > ip->ino_data.size - uio->uio_offset)
274                         n = (int)(ip->ino_data.size - uio->uio_offset);
275                 error = uiomove((char *)bp->b_data + offset, n, uio);
276
277                 /* data has a lower priority than meta-data */
278                 bp->b_flags |= B_AGE;
279                 bqrelse(bp);
280                 if (error)
281                         break;
282                 hammer_stats_file_read += n;
283         }
        /*
         * Update the access time unless the inode is a read-only
         * historical view or the mount disables atime updates.
         */
284         if ((ip->flags & HAMMER_INODE_RO) == 0 &&
285             (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
286                 ip->ino_data.atime = trans.time;
287                 hammer_modify_inode(ip, HAMMER_INODE_ATIME);
288         }
289         hammer_done_transaction(&trans);
290         return (error);
291 }
292
293 /*
294  * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * VOP_WRITE for regular HAMMER files.  Data is staged through the
 * buffer cache: each loop iteration picks the block size for the
 * current offset, throttles against buffer-cache and record-queue
 * back-pressure, obtains a buffer (read, zero-fill, or instantiate,
 * depending on overlap with existing data), copies in the uio data,
 * then disposes of the buffer per IO_SYNC/IO_DIRECT.
295  */
296 static
297 int
298 hammer_vop_write(struct vop_write_args *ap)
299 {
300         struct hammer_transaction trans;
301         struct hammer_inode *ip;
302         hammer_mount_t hmp;
303         struct uio *uio;
304         int offset;
305         off_t base_offset;
306         struct buf *bp;
307         int error;
308         int n;
309         int flags;
310         int delta;
311         int seqcount;
312
313         if (ap->a_vp->v_type != VREG)
314                 return (EINVAL);
315         ip = VTOI(ap->a_vp);
316         hmp = ip->hmp;
317         error = 0;
318         seqcount = ap->a_ioflag >> 16;
        /* NOTE(review): seqcount is never read below in this function — confirm */
319
320         if (ip->flags & HAMMER_INODE_RO)
321                 return (EROFS);
322
323         /*
324          * Create a transaction to cover the operations we perform.
325          */
326         hammer_start_transaction(&trans, hmp);
327         uio = ap->a_uio;
328
329         /*
330          * Check append mode
331          */
332         if (ap->a_ioflag & IO_APPEND)
333                 uio->uio_offset = ip->ino_data.size;
334
335         /*
336          * Check for illegal write offsets.  Valid range is 0...2^63-1.
337          *
338          * NOTE: the base_off assignment is required to work around what
339          * I consider to be a GCC-4 optimization bug.
340          */
341         if (uio->uio_offset < 0) {
342                 hammer_done_transaction(&trans);
343                 return (EFBIG);
344         }
345         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
346         if (uio->uio_resid > 0 && base_offset <= 0) {
347                 hammer_done_transaction(&trans);
348                 return (EFBIG);
349         }
350
351         /*
352          * Access the data typically in HAMMER_BUFSIZE blocks via the
353          * buffer cache, but HAMMER may use a variable block size based
354          * on the offset.
355          */
356         while (uio->uio_resid > 0) {
357                 int fixsize = 0;
358                 int blksize;
359                 int blkmask;
360
361                 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
362                         break;
363
364                 blksize = hammer_blocksize(uio->uio_offset);
365
366                 /*
367                  * Do not allow HAMMER to blow out the buffer cache.  Very
368                  * large UIOs can lockout other processes due to bwillwrite()
369                  * mechanics.
370                  *
371                  * The hammer inode is not locked during these operations.
372                  * The vnode is locked which can interfere with the pageout
373                  * daemon for non-UIO_NOCOPY writes but should not interfere
374                  * with the buffer cache.  Even so, we cannot afford to
375                  * allow the pageout daemon to build up too many dirty buffer
376                  * cache buffers.
377                  */
378                 /*if (((int)uio->uio_offset & (blksize - 1)) == 0)*/
379                 bwillwrite(blksize);
380
381                 /*
382                  * Do not allow HAMMER to blow out system memory by
383                  * accumulating too many records.   Records are so well
384                  * decoupled from the buffer cache that it is possible
385                  * for userland to push data out to the media via
386                  * direct-write, but build up the records queued to the
387                  * backend faster than the backend can flush them out.
388                  * HAMMER has hit its write limit but the frontend has
389                  * no pushback to slow it down.
390                  */
391                 if (hmp->rsv_recs > hammer_limit_recs / 2) {
392                         /*
393                          * Get the inode on the flush list
394                          */
395                         if (ip->rsv_recs >= 64)
396                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
397                         else if (ip->rsv_recs >= 16)
398                                 hammer_flush_inode(ip, 0);
399
400                         /*
401                          * Keep the flusher going if the system keeps
402                          * queueing records.
403                          */
404                         delta = hmp->count_newrecords -
405                                 hmp->last_newrecords;
406                         if (delta < 0 || delta > hammer_limit_recs / 2) {
407                                 hmp->last_newrecords = hmp->count_newrecords;
408                                 hammer_sync_hmp(hmp, MNT_NOWAIT);
409                         }
410
411                         /*
412                          * If we have gotten behind start slowing
413                          * down the writers.
414                          */
415                         delta = (hmp->rsv_recs - hammer_limit_recs) *
416                                 hz / hammer_limit_recs;
417                         if (delta > 0)
418                                 tsleep(&trans, 0, "hmrslo", delta);
419                 }
420
421                 /*
422                  * Calculate the blocksize at the current offset and figure
423                  * out how much we can actually write.
424                  */
425                 blkmask = blksize - 1;
426                 offset = (int)uio->uio_offset & blkmask;
427                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
428                 n = blksize - offset;
429                 if (n > uio->uio_resid)
430                         n = uio->uio_resid;
431                 if (uio->uio_offset + n > ip->ino_data.size) {
432                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
433                         fixsize = 1;
434                 }
435
436                 if (uio->uio_segflg == UIO_NOCOPY) {
437                         /*
438                          * Issuing a write with the same data backing the
439                          * buffer.  Instantiate the buffer to collect the
440                          * backing vm pages, then read-in any missing bits.
441                          *
442                          * This case is used by vop_stdputpages().
443                          */
444                         bp = getblk(ap->a_vp, base_offset,
445                                     blksize, GETBLK_BHEAVY, 0);
446                         if ((bp->b_flags & B_CACHE) == 0) {
447                                 bqrelse(bp);
448                                 error = bread(ap->a_vp, base_offset,
449                                               blksize, &bp);
450                         }
451                 } else if (offset == 0 && uio->uio_resid >= blksize) {
452                         /*
453                          * Even though we are entirely overwriting the buffer
454                          * we may still have to zero it out to avoid a 
455                          * mmap/write visibility issue.
456                          */
457                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
458                         if ((bp->b_flags & B_CACHE) == 0)
459                                 vfs_bio_clrbuf(bp);
460                 } else if (base_offset >= ip->ino_data.size) {
461                         /*
462                          * If the base offset of the buffer is beyond the
463                          * file EOF, we don't have to issue a read.
464                          */
465                         bp = getblk(ap->a_vp, base_offset,
466                                     blksize, GETBLK_BHEAVY, 0);
467                         vfs_bio_clrbuf(bp);
468                 } else {
469                         /*
470                          * Partial overwrite, read in any missing bits then
471                          * replace the portion being written.
472                          */
473                         error = bread(ap->a_vp, base_offset, blksize, &bp);
474                         if (error == 0)
475                                 bheavy(bp);
476                 }
477                 if (error == 0) {
478                         error = uiomove((char *)bp->b_data + offset,
479                                         n, uio);
480                 }
481
482                 /*
483                  * If we screwed up we have to undo any VM size changes we
484                  * made.
485                  */
486                 if (error) {
487                         brelse(bp);
488                         if (fixsize) {
489                                 vtruncbuf(ap->a_vp, ip->ino_data.size,
490                                           hammer_blocksize(ip->ino_data.size));
491                         }
492                         break;
493                 }
494                 hammer_stats_file_write += n;
495                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
496                 if (ip->ino_data.size < uio->uio_offset) {
497                         ip->ino_data.size = uio->uio_offset;
498                         flags = HAMMER_INODE_DDIRTY;
499                         vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
500                 } else {
501                         flags = 0;
502                 }
503                 ip->ino_data.mtime = trans.time;
504                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
505                 hammer_modify_inode(ip, flags);
506
507                 /*
508                  * Once we dirty the buffer any cached zone-X offset
509                  * becomes invalid.  HAMMER NOTE: no-history mode cannot 
510                  * allow overwriting over the same data sector unless
511                  * we provide UNDOs for the old data, which we don't.
512                  */
513                 bp->b_bio2.bio_offset = NOOFFSET;
514
515                 /*
516                  * Final buffer disposition.  IO_SYNC forces a synchronous
                 * bwrite(), IO_DIRECT starts an asynchronous bawrite(),
                 * and the default is a delayed write.
517                  */
518                 bp->b_flags |= B_AGE;
519                 if (ap->a_ioflag & IO_SYNC) {
520                         bwrite(bp);
521                 } else if (ap->a_ioflag & IO_DIRECT) {
522                         bawrite(bp);
523                 } else {
524                         bdwrite(bp);
525                 }
526         }
527         hammer_done_transaction(&trans);
528         return (error);
529 }
530
531 /*
532  * hammer_vop_access { vp, mode, cred }
533  */
534 static
535 int
536 hammer_vop_access(struct vop_access_args *ap)
537 {
538         struct hammer_inode *ip = VTOI(ap->a_vp);
539         uid_t uid;
540         gid_t gid;
541         int error;
542
543         ++hammer_stats_file_iopsr;
544         uid = hammer_to_unix_xid(&ip->ino_data.uid);
545         gid = hammer_to_unix_xid(&ip->ino_data.gid);
546
547         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
548                                   ip->ino_data.uflags);
549         return (error);
550 }
551
552 /*
553  * hammer_vop_advlock { vp, id, op, fl, flags }
554  */
555 static
556 int
557 hammer_vop_advlock(struct vop_advlock_args *ap)
558 {
559         hammer_inode_t ip = VTOI(ap->a_vp);
560
561         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
562 }
563
/*
 * hammer_vop_close { vp, fflag }
 *
 * No HAMMER-specific work is needed on close; defer to the standard
 * close handler.
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
        return (vop_stdclose(ap));
}
574
575 /*
576  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
577  *
578  * The operating system has already ensured that the directory entry
579  * does not exist and done all appropriate namespace locking.
580  */
581 static
582 int
583 hammer_vop_ncreate(struct vop_ncreate_args *ap)
584 {
585         struct hammer_transaction trans;
586         struct hammer_inode *dip;
587         struct hammer_inode *nip;
588         struct nchandle *nch;
589         int error;
590
591         nch = ap->a_nch;
592         dip = VTOI(ap->a_dvp);
593
594         if (dip->flags & HAMMER_INODE_RO)
595                 return (EROFS);
596         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
597                 return (error);
598
599         /*
600          * Create a transaction to cover the operations we perform.
601          */
602         hammer_start_transaction(&trans, dip->hmp);
603         ++hammer_stats_file_iopsw;
604
605         /*
606          * Create a new filesystem object of the requested type.  The
607          * returned inode will be referenced and shared-locked to prevent
608          * it from being moved to the flusher.
609          */
610
611         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
612                                     dip, NULL, &nip);
613         if (error) {
614                 hkprintf("hammer_create_inode error %d\n", error);
615                 hammer_done_transaction(&trans);
616                 *ap->a_vpp = NULL;
617                 return (error);
618         }
619
620         /*
621          * Add the new filesystem object to the directory.  This will also
622          * bump the inode's link count.
623          */
624         error = hammer_ip_add_directory(&trans, dip,
625                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
626                                         nip);
627         if (error)
628                 hkprintf("hammer_ip_add_directory error %d\n", error);
629
630         /*
631          * Finish up.
632          */
633         if (error) {
634                 hammer_rel_inode(nip, 0);
635                 hammer_done_transaction(&trans);
636                 *ap->a_vpp = NULL;
637         } else {
638                 error = hammer_get_vnode(nip, ap->a_vpp);
639                 hammer_done_transaction(&trans);
640                 hammer_rel_inode(nip, 0);
641                 if (error == 0) {
642                         cache_setunresolved(ap->a_nch);
643                         cache_setvp(ap->a_nch, *ap->a_vpp);
644                 }
645         }
646         return (error);
647 }
648
649 /*
650  * hammer_vop_getattr { vp, vap }
651  *
652  * Retrieve an inode's attribute information.  When accessing inodes
653  * historically we fake the atime field to ensure consistent results.
654  * The atime field is stored in the B-Tree element and allowed to be
655  * updated without cycling the element.
656  */
657 static
658 int
659 hammer_vop_getattr(struct vop_getattr_args *ap)
660 {
661         struct hammer_inode *ip = VTOI(ap->a_vp);
662         struct vattr *vap = ap->a_vap;
663
664         /*
665          * We want the fsid to be different when accessing a filesystem
666          * with different as-of's so programs like diff don't think
667          * the files are the same.
668          *
669          * We also want the fsid to be the same when comparing snapshots,
670          * or when comparing mirrors (which might be backed by different
671          * physical devices).  HAMMER fsids are based on the PFS's
672          * shared_uuid field.
673          *
674          * XXX there is a chance of collision here.  The va_fsid reported
675          * by stat is different from the more involved fsid used in the
676          * mount structure.
677          */
678         ++hammer_stats_file_iopsr;
679         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
680                        (u_int32_t)(ip->obj_asof >> 32);
681
682         vap->va_fileid = ip->ino_leaf.base.obj_id;
683         vap->va_mode = ip->ino_data.mode;
684         vap->va_nlink = ip->ino_data.nlinks;
685         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
686         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
687         vap->va_rmajor = 0;
688         vap->va_rminor = 0;
689         vap->va_size = ip->ino_data.size;
690
691         /*
692          * Special case for @@PFS softlinks.  The actual size of the
693          * expanded softlink is "@@0x%016llx:%05d" == 26 bytes.
694          */
695         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_SOFTLINK &&
696             ip->ino_data.size == 10 &&
697             ip->obj_asof == HAMMER_MAX_TID &&
698             ip->obj_localization == 0 &&
699             strncmp(ip->ino_data.ext.symlink, "@@PFS", 5) == 0) {
700                     vap->va_size = 26;
701         }
702
703         /*
704          * We must provide a consistent atime and mtime for snapshots
705          * so people can do a 'tar cf - ... | md5' on them and get
706          * consistent results.
707          */
708         if (ip->flags & HAMMER_INODE_RO) {
709                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
710                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
711         } else {
712                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
713                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
714         }
715         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
716         vap->va_flags = ip->ino_data.uflags;
717         vap->va_gen = 1;        /* hammer inums are unique for all time */
718         vap->va_blocksize = HAMMER_BUFSIZE;
719         if (ip->ino_data.size >= HAMMER_XDEMARC) {
720                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
721                                 ~HAMMER_XBUFMASK64;
722         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
723                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
724                                 ~HAMMER_BUFMASK64;
725         } else {
726                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
727         }
728         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
729         vap->va_filerev = 0;    /* XXX */
730         /* mtime uniquely identifies any adjustments made to the file XXX */
731         vap->va_fsmid = ip->ino_data.mtime;
732         vap->va_uid_uuid = ip->ino_data.uid;
733         vap->va_gid_uuid = ip->ino_data.gid;
734         vap->va_fsid_uuid = ip->hmp->fsid;
735         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
736                           VA_FSID_UUID_VALID;
737
738         switch (ip->ino_data.obj_type) {
739         case HAMMER_OBJTYPE_CDEV:
740         case HAMMER_OBJTYPE_BDEV:
741                 vap->va_rmajor = ip->ino_data.rmajor;
742                 vap->va_rminor = ip->ino_data.rminor;
743                 break;
744         default:
745                 break;
746         }
747         return(0);
748 }
749
/*
 * hammer_vop_nresolve { nch, dvp, cred }
 *
 * Locate the requested directory entry.  On success the namecache entry
 * is resolved to the target vnode; an ENOENT from the B-Tree scan is
 * converted into a negative cache entry.
 */
static
int
hammer_vop_nresolve(struct vop_nresolve_args *ap)
{
	struct hammer_transaction trans;
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	hammer_tid_t asof;
	struct hammer_cursor cursor;
	struct vnode *vp;
	int64_t namekey;
	int error;
	int i;
	int nlen;
	int flags;
	int ispfs;
	int64_t obj_id;
	u_int32_t localization;

	/*
	 * Misc initialization, plus handle as-of name extensions.  Look for
	 * the '@@' extension.  Note that as-of files and directories cannot
	 * be modified.
	 */
	dip = VTOI(ap->a_dvp);
	ncp = ap->a_nch->ncp;
	asof = dip->obj_asof;		/* inherit directory's as-of by default */
	nlen = ncp->nc_nlen;
	flags = dip->flags & HAMMER_INODE_RO;
	ispfs = 0;

	hammer_simple_transaction(&trans, dip->hmp);
	++hammer_stats_file_iopsr;

	/*
	 * Scan for a '@@' extension.  hammer_str_to_tid() parses the text
	 * after it into either a transaction id or (for "@@PFS...") a PFS
	 * localization, setting ispfs accordingly.  Historical (non
	 * HAMMER_MAX_TID) accesses are forced read-only.
	 */
	for (i = 0; i < nlen; ++i) {
		if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
			asof = hammer_str_to_tid(ncp->nc_name + i + 2,
						 &ispfs, &localization);
			if (asof != HAMMER_MAX_TID)
				flags |= HAMMER_INODE_RO;
			break;
		}
	}
	nlen = i;	/* effective name length excludes the extension */

	/*
	 * If this is a PFS softlink we dive into the PFS
	 */
	if (ispfs && nlen == 0) {
		ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * If there is no path component the time extension is relative to
	 * dip.
	 */
	if (nlen == 0) {
		ip = hammer_get_inode(&trans, dip, dip->obj_id,
				      asof, dip->obj_localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
		goto done;
	}

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	namekey = hammer_directory_namekey(ncp->nc_name, nlen);

	error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;	/* whole hash chain */
	cursor.asof = asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	obj_id = 0;
	localization = HAMMER_DEF_LOCALIZATION;

	if (error == 0) {
		error = hammer_ip_first(&cursor);
		while (error == 0) {
			error = hammer_ip_resolve_data(&cursor);
			if (error)
				break;
			/* match on both length and bytes of the name */
			if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
			    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
				obj_id = cursor.data->entry.obj_id;
				localization = cursor.data->entry.localization;
				break;
			}
			error = hammer_ip_next(&cursor);
		}
	}
	hammer_done_cursor(&cursor);
	if (error == 0) {
		ip = hammer_get_inode(&trans, dip, obj_id,
				      asof, localization,
				      flags, &error);
		if (error == 0) {
			error = hammer_get_vnode(ip, &vp);
			hammer_rel_inode(ip, 0);
		} else {
			vp = NULL;
		}
		if (error == 0) {
			vn_unlock(vp);
			cache_setvp(ap->a_nch, vp);
			vrele(vp);
		}
	} else if (error == ENOENT) {
		/* record the miss as a negative cache entry */
		cache_setvp(ap->a_nch, NULL);
	}
done:
	hammer_done_transaction(&trans);
	return (error);
}
916
917 /*
918  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
919  *
920  * Locate the parent directory of a directory vnode.
921  *
922  * dvp is referenced but not locked.  *vpp must be returned referenced and
923  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
924  * at the root, instead it could indicate that the directory we were in was
925  * removed.
926  *
927  * NOTE: as-of sequences are not linked into the directory structure.  If
928  * we are at the root with a different asof then the mount point, reload
929  * the same directory with the mount point's asof.   I'm not sure what this
930  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
931  * get confused, but it hasn't been tested.
932  */
933 static
934 int
935 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
936 {
937         struct hammer_transaction trans;
938         struct hammer_inode *dip;
939         struct hammer_inode *ip;
940         int64_t parent_obj_id;
941         u_int32_t parent_obj_localization;
942         hammer_tid_t asof;
943         int error;
944
945         dip = VTOI(ap->a_dvp);
946         asof = dip->obj_asof;
947
948         /*
949          * Whos are parent?  This could be the root of a pseudo-filesystem
950          * whos parent is in another localization domain.
951          */
952         parent_obj_id = dip->ino_data.parent_obj_id;
953         if (dip->obj_id == HAMMER_OBJID_ROOT)
954                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
955         else
956                 parent_obj_localization = dip->obj_localization;
957
958         if (parent_obj_id == 0) {
959                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
960                    asof != dip->hmp->asof) {
961                         parent_obj_id = dip->obj_id;
962                         asof = dip->hmp->asof;
963                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
964                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
965                                    dip->obj_asof);
966                 } else {
967                         *ap->a_vpp = NULL;
968                         return ENOENT;
969                 }
970         }
971
972         hammer_simple_transaction(&trans, dip->hmp);
973         ++hammer_stats_file_iopsr;
974
975         ip = hammer_get_inode(&trans, dip, parent_obj_id,
976                               asof, parent_obj_localization,
977                               dip->flags, &error);
978         if (ip) {
979                 error = hammer_get_vnode(ip, ap->a_vpp);
980                 hammer_rel_inode(ip, 0);
981         } else {
982                 *ap->a_vpp = NULL;
983         }
984         hammer_done_transaction(&trans);
985         return (error);
986 }
987
988 /*
989  * hammer_vop_nlink { nch, dvp, vp, cred }
990  */
991 static
992 int
993 hammer_vop_nlink(struct vop_nlink_args *ap)
994 {
995         struct hammer_transaction trans;
996         struct hammer_inode *dip;
997         struct hammer_inode *ip;
998         struct nchandle *nch;
999         int error;
1000
1001         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)    
1002                 return(EXDEV);
1003
1004         nch = ap->a_nch;
1005         dip = VTOI(ap->a_dvp);
1006         ip = VTOI(ap->a_vp);
1007
1008         if (dip->obj_localization != ip->obj_localization)
1009                 return(EXDEV);
1010
1011         if (dip->flags & HAMMER_INODE_RO)
1012                 return (EROFS);
1013         if (ip->flags & HAMMER_INODE_RO)
1014                 return (EROFS);
1015         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1016                 return (error);
1017
1018         /*
1019          * Create a transaction to cover the operations we perform.
1020          */
1021         hammer_start_transaction(&trans, dip->hmp);
1022         ++hammer_stats_file_iopsw;
1023
1024         /*
1025          * Add the filesystem object to the directory.  Note that neither
1026          * dip nor ip are referenced or locked, but their vnodes are
1027          * referenced.  This function will bump the inode's link count.
1028          */
1029         error = hammer_ip_add_directory(&trans, dip,
1030                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1031                                         ip);
1032
1033         /*
1034          * Finish up.
1035          */
1036         if (error == 0) {
1037                 cache_setunresolved(nch);
1038                 cache_setvp(nch, ap->a_vp);
1039         }
1040         hammer_done_transaction(&trans);
1041         return (error);
1042 }
1043
1044 /*
1045  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1046  *
1047  * The operating system has already ensured that the directory entry
1048  * does not exist and done all appropriate namespace locking.
1049  */
1050 static
1051 int
1052 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1053 {
1054         struct hammer_transaction trans;
1055         struct hammer_inode *dip;
1056         struct hammer_inode *nip;
1057         struct nchandle *nch;
1058         int error;
1059
1060         nch = ap->a_nch;
1061         dip = VTOI(ap->a_dvp);
1062
1063         if (dip->flags & HAMMER_INODE_RO)
1064                 return (EROFS);
1065         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1066                 return (error);
1067
1068         /*
1069          * Create a transaction to cover the operations we perform.
1070          */
1071         hammer_start_transaction(&trans, dip->hmp);
1072         ++hammer_stats_file_iopsw;
1073
1074         /*
1075          * Create a new filesystem object of the requested type.  The
1076          * returned inode will be referenced but not locked.
1077          */
1078         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1079                                     dip, NULL, &nip);
1080         if (error) {
1081                 hkprintf("hammer_mkdir error %d\n", error);
1082                 hammer_done_transaction(&trans);
1083                 *ap->a_vpp = NULL;
1084                 return (error);
1085         }
1086         /*
1087          * Add the new filesystem object to the directory.  This will also
1088          * bump the inode's link count.
1089          */
1090         error = hammer_ip_add_directory(&trans, dip,
1091                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1092                                         nip);
1093         if (error)
1094                 hkprintf("hammer_mkdir (add) error %d\n", error);
1095
1096         /*
1097          * Finish up.
1098          */
1099         if (error) {
1100                 hammer_rel_inode(nip, 0);
1101                 *ap->a_vpp = NULL;
1102         } else {
1103                 error = hammer_get_vnode(nip, ap->a_vpp);
1104                 hammer_rel_inode(nip, 0);
1105                 if (error == 0) {
1106                         cache_setunresolved(ap->a_nch);
1107                         cache_setvp(ap->a_nch, *ap->a_vpp);
1108                 }
1109         }
1110         hammer_done_transaction(&trans);
1111         return (error);
1112 }
1113
1114 /*
1115  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1116  *
1117  * The operating system has already ensured that the directory entry
1118  * does not exist and done all appropriate namespace locking.
1119  */
1120 static
1121 int
1122 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1123 {
1124         struct hammer_transaction trans;
1125         struct hammer_inode *dip;
1126         struct hammer_inode *nip;
1127         struct nchandle *nch;
1128         int error;
1129
1130         nch = ap->a_nch;
1131         dip = VTOI(ap->a_dvp);
1132
1133         if (dip->flags & HAMMER_INODE_RO)
1134                 return (EROFS);
1135         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1136                 return (error);
1137
1138         /*
1139          * Create a transaction to cover the operations we perform.
1140          */
1141         hammer_start_transaction(&trans, dip->hmp);
1142         ++hammer_stats_file_iopsw;
1143
1144         /*
1145          * Create a new filesystem object of the requested type.  The
1146          * returned inode will be referenced but not locked.
1147          *
1148          * If mknod specifies a directory a pseudo-fs is created.
1149          */
1150         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1151                                     dip, NULL, &nip);
1152         if (error) {
1153                 hammer_done_transaction(&trans);
1154                 *ap->a_vpp = NULL;
1155                 return (error);
1156         }
1157
1158         /*
1159          * Add the new filesystem object to the directory.  This will also
1160          * bump the inode's link count.
1161          */
1162         error = hammer_ip_add_directory(&trans, dip,
1163                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1164                                         nip);
1165
1166         /*
1167          * Finish up.
1168          */
1169         if (error) {
1170                 hammer_rel_inode(nip, 0);
1171                 *ap->a_vpp = NULL;
1172         } else {
1173                 error = hammer_get_vnode(nip, ap->a_vpp);
1174                 hammer_rel_inode(nip, 0);
1175                 if (error == 0) {
1176                         cache_setunresolved(ap->a_nch);
1177                         cache_setvp(ap->a_nch, *ap->a_vpp);
1178                 }
1179         }
1180         hammer_done_transaction(&trans);
1181         return (error);
1182 }
1183
1184 /*
1185  * hammer_vop_open { vp, mode, cred, fp }
1186  */
1187 static
1188 int
1189 hammer_vop_open(struct vop_open_args *ap)
1190 {
1191         hammer_inode_t ip;
1192
1193         ++hammer_stats_file_iopsr;
1194         ip = VTOI(ap->a_vp);
1195
1196         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1197                 return (EROFS);
1198         return(vop_stdopen(ap));
1199 }
1200
1201 /*
1202  * hammer_vop_pathconf { vp, name, retval }
1203  */
1204 static
1205 int
1206 hammer_vop_pathconf(struct vop_pathconf_args *ap)
1207 {
1208         return EOPNOTSUPP;
1209 }
1210
1211 /*
1212  * hammer_vop_print { vp }
1213  */
1214 static
1215 int
1216 hammer_vop_print(struct vop_print_args *ap)
1217 {
1218         return EOPNOTSUPP;
1219 }
1220
/*
 * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
 *
 * Emits "." and ".." as artificial entries at offsets 0 and 1, then scans
 * the directory's DIRENTRY records.  Directory keys translate directly to
 * 64 bit seek positions, so uio_offset doubles as the resume key.
 */
static
int
hammer_vop_readdir(struct vop_readdir_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_cursor cursor;
	struct hammer_inode *ip;
	struct uio *uio;
	hammer_base_elm_t base;
	int error;
	int cookie_index;
	int ncookies;
	off_t *cookies;
	off_t saveoff;
	int r;
	int dtype;

	++hammer_stats_file_iopsr;
	ip = VTOI(ap->a_vp);
	uio = ap->a_uio;
	saveoff = uio->uio_offset;

	/*
	 * When the caller wants seek cookies, size the array from the
	 * uio residual (16 bytes per entry estimate), capped at 1024.
	 */
	if (ap->a_ncookies) {
		ncookies = uio->uio_resid / 16 + 1;
		if (ncookies > 1024)
			ncookies = 1024;
		cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
		cookie_index = 0;
	} else {
		ncookies = -1;
		cookies = NULL;
		cookie_index = 0;
	}

	hammer_simple_transaction(&trans, ip->hmp);

	/*
	 * Handle artificial entries
	 */
	error = 0;
	if (saveoff == 0) {
		r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}
	if (saveoff == 1) {
		/* a zero parent_obj_id means we point ".." back at ourselves */
		if (ip->ino_data.parent_obj_id) {
			r = vop_write_dirent(&error, uio,
					     ip->ino_data.parent_obj_id,
					     DT_DIR, 2, "..");
		} else {
			r = vop_write_dirent(&error, uio,
					     ip->obj_id, DT_DIR, 2, "..");
		}
		if (r)
			goto done;
		if (cookies)
			cookies[cookie_index] = saveoff;
		++saveoff;
		++cookie_index;
		if (cookie_index == ncookies)
			goto done;
	}

	/*
	 * Key range (begin and end inclusive) to scan.  Directory keys
	 * directly translate to a 64 bit 'seek' position.
	 */
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;
	cursor.key_beg.key = saveoff;	/* resume where we left off */

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key = HAMMER_MAX_KEY;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		base = &cursor.leaf->base;
		saveoff = base->key;
		KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);

		if (base->obj_id != ip->obj_id)
			panic("readdir: bad record at %p", cursor.node);

		/*
		 * Convert pseudo-filesystems into softlinks
		 */
		dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
		r = vop_write_dirent(
			     &error, uio, cursor.data->entry.obj_id,
			     dtype,
			     cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
			     (void *)cursor.data->entry.name);
		if (r)
			break;
		++saveoff;
		if (cookies)
			cookies[cookie_index] = base->key;
		++cookie_index;
		if (cookie_index == ncookies)
			break;
		error = hammer_ip_next(&cursor);
	}
	hammer_done_cursor(&cursor);

done:
	hammer_done_transaction(&trans);

	/*
	 * ENOENT from the scan means we ran off the end of the directory
	 * and is not an error from the caller's point of view.
	 */
	if (ap->a_eofflag)
		*ap->a_eofflag = (error == ENOENT);
	uio->uio_offset = saveoff;
	if (error && cookie_index == 0) {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			kfree(cookies, M_TEMP);
			*ap->a_ncookies = 0;
			*ap->a_cookies = NULL;
		}
	} else {
		if (error == ENOENT)
			error = 0;
		if (cookies) {
			*ap->a_ncookies = cookie_index;
			*ap->a_cookies = cookies;
		}
	}
	return(error);
}
1371
1372 /*
1373  * hammer_vop_readlink { vp, uio, cred }
1374  */
1375 static
1376 int
1377 hammer_vop_readlink(struct vop_readlink_args *ap)
1378 {
1379         struct hammer_transaction trans;
1380         struct hammer_cursor cursor;
1381         struct hammer_inode *ip;
1382         char buf[32];
1383         u_int32_t localization;
1384         hammer_pseudofs_inmem_t pfsm;
1385         int error;
1386
1387         ip = VTOI(ap->a_vp);
1388
1389         /*
1390          * Shortcut if the symlink data was stuffed into ino_data.
1391          *
1392          * Also expand special "@@PFS%05d" softlinks (expansion only
1393          * occurs for non-historical (current) accesses made from the
1394          * primary filesystem).
1395          */
1396         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1397                 char *ptr;
1398                 int bytes;
1399
1400                 ptr = ip->ino_data.ext.symlink;
1401                 bytes = (int)ip->ino_data.size;
1402                 if (bytes == 10 &&
1403                     ip->obj_asof == HAMMER_MAX_TID &&
1404                     ip->obj_localization == 0 &&
1405                     strncmp(ptr, "@@PFS", 5) == 0) {
1406                         hammer_simple_transaction(&trans, ip->hmp);
1407                         bcopy(ptr + 5, buf, 5);
1408                         buf[5] = 0;
1409                         localization = strtoul(buf, NULL, 10) << 16;
1410                         pfsm = hammer_load_pseudofs(&trans, localization,
1411                                                     &error);
1412                         if (error == 0) {
1413                                 if (pfsm->pfsd.mirror_flags &
1414                                     HAMMER_PFSD_SLAVE) {
1415                                         ksnprintf(buf, sizeof(buf),
1416                                                   "@@0x%016llx:%05d",
1417                                                   pfsm->pfsd.sync_end_tid,
1418                                                   localization >> 16);
1419                                 } else {
1420                                         ksnprintf(buf, sizeof(buf),
1421                                                   "@@0x%016llx:%05d",
1422                                                   HAMMER_MAX_TID,
1423                                                   localization >> 16);
1424                                 }
1425                                 ptr = buf;
1426                                 bytes = strlen(buf);
1427                         }
1428                         if (pfsm)
1429                                 hammer_rel_pseudofs(trans.hmp, pfsm);
1430                         hammer_done_transaction(&trans);
1431                 }
1432                 error = uiomove(ptr, bytes, ap->a_uio);
1433                 return(error);
1434         }
1435
1436         /*
1437          * Long version
1438          */
1439         hammer_simple_transaction(&trans, ip->hmp);
1440         ++hammer_stats_file_iopsr;
1441         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1442
1443         /*
1444          * Key range (begin and end inclusive) to scan.  Directory keys
1445          * directly translate to a 64 bit 'seek' position.
1446          */
1447         cursor.key_beg.localization = ip->obj_localization +
1448                                       HAMMER_LOCALIZE_MISC;
1449         cursor.key_beg.obj_id = ip->obj_id;
1450         cursor.key_beg.create_tid = 0;
1451         cursor.key_beg.delete_tid = 0;
1452         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1453         cursor.key_beg.obj_type = 0;
1454         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1455         cursor.asof = ip->obj_asof;
1456         cursor.flags |= HAMMER_CURSOR_ASOF;
1457
1458         error = hammer_ip_lookup(&cursor);
1459         if (error == 0) {
1460                 error = hammer_ip_resolve_data(&cursor);
1461                 if (error == 0) {
1462                         KKASSERT(cursor.leaf->data_len >=
1463                                  HAMMER_SYMLINK_NAME_OFF);
1464                         error = uiomove(cursor.data->symlink.name,
1465                                         cursor.leaf->data_len -
1466                                                 HAMMER_SYMLINK_NAME_OFF,
1467                                         ap->a_uio);
1468                 }
1469         }
1470         hammer_done_cursor(&cursor);
1471         hammer_done_transaction(&trans);
1472         return(error);
1473 }
1474
1475 /*
1476  * hammer_vop_nremove { nch, dvp, cred }
1477  */
1478 static
1479 int
1480 hammer_vop_nremove(struct vop_nremove_args *ap)
1481 {
1482         struct hammer_transaction trans;
1483         struct hammer_inode *dip;
1484         int error;
1485
1486         dip = VTOI(ap->a_dvp);
1487
1488         if (hammer_nohistory(dip) == 0 &&
1489             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1490                 return (error);
1491         }
1492
1493         hammer_start_transaction(&trans, dip->hmp);
1494         ++hammer_stats_file_iopsw;
1495         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 0);
1496         hammer_done_transaction(&trans);
1497
1498         return (error);
1499 }
1500
1501 /*
1502  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1503  */
1504 static
1505 int
1506 hammer_vop_nrename(struct vop_nrename_args *ap)
1507 {
1508         struct hammer_transaction trans;
1509         struct namecache *fncp;
1510         struct namecache *tncp;
1511         struct hammer_inode *fdip;
1512         struct hammer_inode *tdip;
1513         struct hammer_inode *ip;
1514         struct hammer_cursor cursor;
1515         int64_t namekey;
1516         int nlen, error;
1517
1518         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount) 
1519                 return(EXDEV);
1520         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1521                 return(EXDEV);
1522
1523         fdip = VTOI(ap->a_fdvp);
1524         tdip = VTOI(ap->a_tdvp);
1525         fncp = ap->a_fnch->ncp;
1526         tncp = ap->a_tnch->ncp;
1527         ip = VTOI(fncp->nc_vp);
1528         KKASSERT(ip != NULL);
1529
1530         if (fdip->obj_localization != tdip->obj_localization)
1531                 return(EXDEV);
1532         if (fdip->obj_localization != ip->obj_localization)
1533                 return(EXDEV);
1534
1535         if (fdip->flags & HAMMER_INODE_RO)
1536                 return (EROFS);
1537         if (tdip->flags & HAMMER_INODE_RO)
1538                 return (EROFS);
1539         if (ip->flags & HAMMER_INODE_RO)
1540                 return (EROFS);
1541         if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1542                 return (error);
1543
1544         hammer_start_transaction(&trans, fdip->hmp);
1545         ++hammer_stats_file_iopsw;
1546
1547         /*
1548          * Remove tncp from the target directory and then link ip as
1549          * tncp. XXX pass trans to dounlink
1550          *
1551          * Force the inode sync-time to match the transaction so it is
1552          * in-sync with the creation of the target directory entry.
1553          */
1554         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp,
1555                                 ap->a_cred, 0, -1);
1556         if (error == 0 || error == ENOENT) {
1557                 error = hammer_ip_add_directory(&trans, tdip,
1558                                                 tncp->nc_name, tncp->nc_nlen,
1559                                                 ip);
1560                 if (error == 0) {
1561                         ip->ino_data.parent_obj_id = tdip->obj_id;
1562                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1563                 }
1564         }
1565         if (error)
1566                 goto failed; /* XXX */
1567
1568         /*
1569          * Locate the record in the originating directory and remove it.
1570          *
1571          * Calculate the namekey and setup the key range for the scan.  This
1572          * works kinda like a chained hash table where the lower 32 bits
1573          * of the namekey synthesize the chain.
1574          *
1575          * The key range is inclusive of both key_beg and key_end.
1576          */
1577         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1578 retry:
1579         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1580         cursor.key_beg.localization = fdip->obj_localization +
1581                                       HAMMER_LOCALIZE_MISC;
1582         cursor.key_beg.obj_id = fdip->obj_id;
1583         cursor.key_beg.key = namekey;
1584         cursor.key_beg.create_tid = 0;
1585         cursor.key_beg.delete_tid = 0;
1586         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1587         cursor.key_beg.obj_type = 0;
1588
1589         cursor.key_end = cursor.key_beg;
1590         cursor.key_end.key |= 0xFFFFFFFFULL;
1591         cursor.asof = fdip->obj_asof;
1592         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1593
1594         /*
1595          * Scan all matching records (the chain), locate the one matching
1596          * the requested path component.
1597          *
1598          * The hammer_ip_*() functions merge in-memory records with on-disk
1599          * records for the purposes of the search.
1600          */
1601         error = hammer_ip_first(&cursor);
1602         while (error == 0) {
1603                 if (hammer_ip_resolve_data(&cursor) != 0)
1604                         break;
1605                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1606                 KKASSERT(nlen > 0);
1607                 if (fncp->nc_nlen == nlen &&
1608                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1609                         break;
1610                 }
1611                 error = hammer_ip_next(&cursor);
1612         }
1613
1614         /*
1615          * If all is ok we have to get the inode so we can adjust nlinks.
1616          *
1617          * WARNING: hammer_ip_del_directory() may have to terminate the
1618          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1619          * twice.
1620          */
1621         if (error == 0)
1622                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1623
1624         /*
1625          * XXX A deadlock here will break rename's atomicity for the purposes
1626          * of crash recovery.
1627          */
1628         if (error == EDEADLK) {
1629                 hammer_done_cursor(&cursor);
1630                 goto retry;
1631         }
1632
1633         /*
1634          * Cleanup and tell the kernel that the rename succeeded.
1635          */
1636         hammer_done_cursor(&cursor);
1637         if (error == 0)
1638                 cache_rename(ap->a_fnch, ap->a_tnch);
1639
1640 failed:
1641         hammer_done_transaction(&trans);
1642         return (error);
1643 }
1644
1645 /*
1646  * hammer_vop_nrmdir { nch, dvp, cred }
1647  */
1648 static
1649 int
1650 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1651 {
1652         struct hammer_transaction trans;
1653         struct hammer_inode *dip;
1654         int error;
1655
1656         dip = VTOI(ap->a_dvp);
1657
1658         if (hammer_nohistory(dip) == 0 &&
1659             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1660                 return (error);
1661         }
1662
1663         hammer_start_transaction(&trans, dip->hmp);
1664         ++hammer_stats_file_iopsw;
1665         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0, 1);
1666         hammer_done_transaction(&trans);
1667
1668         return (error);
1669 }
1670
/*
 * hammer_vop_setattr { vp, vap, cred }
 *
 * Change vnode attributes: file flags, ownership, size, access/modify
 * times, and mode.  Fields in *vap left at VNOVAL are not modified.
 * Attribute changes are accumulated in modflags and applied to the
 * in-memory inode with a single hammer_modify_inode() call on success.
 */
static
int
hammer_vop_setattr(struct vop_setattr_args *ap)
{
        struct hammer_transaction trans;
        struct vattr *vap;
        struct hammer_inode *ip;
        int modflags;           /* accumulated HAMMER_INODE_* dirty bits */
        int error;
        int truncating;         /* non-zero when the file is shrinking */
        int blksize;
        int64_t aligned_size;
        u_int32_t flags;

        vap = ap->a_vap;
        ip = ap->a_vp->v_data;
        modflags = 0;

        /*
         * Disallow modifications on read-only mounts and on read-only
         * inodes (e.g. as-of/snapshot access).
         */
        if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
                return(EROFS);
        if (ip->flags & HAMMER_INODE_RO)
                return (EROFS);
        /*
         * When history is retained a setattr can consume media space;
         * verify free space up-front (uses the REMOVE reservation level).
         */
        if (hammer_nohistory(ip) == 0 &&
            (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
                return (error);
        }

        hammer_start_transaction(&trans, ip->hmp);
        ++hammer_stats_file_iopsw;
        error = 0;

        /*
         * chflags-style uflags change.  NOTE: when va_flags is specified
         * no other attributes are processed -- both the success and
         * failure paths jump straight to 'done'.
         */
        if (vap->va_flags != VNOVAL) {
                flags = ip->ino_data.uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                         hammer_to_unix_xid(&ip->ino_data.uid),
                                         ap->a_cred);
                if (error == 0) {
                        if (ip->ino_data.uflags != flags) {
                                ip->ino_data.uflags = flags;
                                modflags |= HAMMER_INODE_DDIRTY;
                        }
                        if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }
        /*
         * All remaining attribute changes are refused on immutable or
         * append-only files.
         */
        if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }
        /*
         * chown/chgrp.  uid/gid are stored as uuids in the inode data,
         * so convert the validated ids and compare before dirtying.
         */
        if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
                mode_t cur_mode = ip->ino_data.mode;
                uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
                gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
                uuid_t uuid_uid;
                uuid_t uuid_gid;

                error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
                                         ap->a_cred,
                                         &cur_uid, &cur_gid, &cur_mode);
                if (error == 0) {
                        hammer_guid_to_uuid(&uuid_uid, cur_uid);
                        hammer_guid_to_uuid(&uuid_gid, cur_gid);
                        if (bcmp(&uuid_uid, &ip->ino_data.uid,
                                 sizeof(uuid_uid)) ||
                            bcmp(&uuid_gid, &ip->ino_data.gid,
                                 sizeof(uuid_gid)) ||
                            ip->ino_data.mode != cur_mode
                        ) {
                                ip->ino_data.uid = uuid_uid;
                                ip->ino_data.gid = uuid_gid;
                                ip->ino_data.mode = cur_mode;
                        }
                        modflags |= HAMMER_INODE_DDIRTY;
                }
        }
        /*
         * truncate/extend.  The while loop serves as a single-iteration
         * breakable block (note the unconditional break at the bottom).
         */
        while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
                switch(ap->a_vp->v_type) {
                case VREG:
                        /* redundant with the loop condition; kept as a guard */
                        if (vap->va_size == ip->ino_data.size)
                                break;
                        /*
                         * XXX break atomicity, we can deadlock the backend
                         * if we do not release the lock.  Probably not a
                         * big deal here.
                         */
                        blksize = hammer_blocksize(vap->va_size);
                        if (vap->va_size < ip->ino_data.size) {
                                vtruncbuf(ap->a_vp, vap->va_size, blksize);
                                truncating = 1;
                        } else {
                                vnode_pager_setsize(ap->a_vp, vap->va_size);
                                truncating = 0;
                        }
                        ip->ino_data.size = vap->va_size;
                        modflags |= HAMMER_INODE_DDIRTY;

                        /*
                         * on-media truncation is cached in the inode until
                         * the inode is synchronized.  Only the smallest
                         * truncation offset seen since the last sync is
                         * retained in trunc_off.
                         */
                        if (truncating) {
                                hammer_ip_frontend_trunc(ip, vap->va_size);
#ifdef DEBUG_TRUNCATE
                                if (HammerTruncIp == NULL)
                                        HammerTruncIp = ip;
#endif
                                if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
                                        ip->flags |= HAMMER_INODE_TRUNCATED;
                                        ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
                                        if (ip == HammerTruncIp)
                                        kprintf("truncate1 %016llx\n", ip->trunc_off);
#endif
                                } else if (ip->trunc_off > vap->va_size) {
                                        ip->trunc_off = vap->va_size;
#ifdef DEBUG_TRUNCATE
                                        if (ip == HammerTruncIp)
                                        kprintf("truncate2 %016llx\n", ip->trunc_off);
#endif
                                } else {
#ifdef DEBUG_TRUNCATE
                                        if (ip == HammerTruncIp)
                                        kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
#endif
                                }
                        }

                        /*
                         * If truncating we have to clean out a portion of
                         * the last block on-disk.  We do this in the
                         * front-end buffer cache.
                         */
                        aligned_size = (vap->va_size + (blksize - 1)) &
                                       ~(int64_t)(blksize - 1);
                        if (truncating && vap->va_size < aligned_size) {
                                struct buf *bp;
                                int offset;

                                aligned_size -= blksize;

                                offset = (int)vap->va_size & (blksize - 1);
                                error = bread(ap->a_vp, aligned_size,
                                              blksize, &bp);
                                hammer_ip_frontend_trunc(ip, aligned_size);
                                if (error == 0) {
                                        /* zero from the new EOF to block end */
                                        bzero(bp->b_data + offset,
                                              blksize - offset);
                                        /* must de-cache direct-io offset */
                                        bp->b_bio2.bio_offset = NOOFFSET;
                                        bdwrite(bp);
                                } else {
                                        kprintf("ERROR %d\n", error);
                                        brelse(bp);
                                }
                        }
                        break;
                case VDATABASE:
                        /* record-keyed files: no buffer cache cleanup needed */
                        if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
                                ip->flags |= HAMMER_INODE_TRUNCATED;
                                ip->trunc_off = vap->va_size;
                        } else if (ip->trunc_off > vap->va_size) {
                                ip->trunc_off = vap->va_size;
                        }
                        hammer_ip_frontend_trunc(ip, vap->va_size);
                        ip->ino_data.size = vap->va_size;
                        modflags |= HAMMER_INODE_DDIRTY;
                        break;
                default:
                        error = EINVAL;
                        goto done;
                }
                break;
        }
        if (vap->va_atime.tv_sec != VNOVAL) {
                ip->ino_data.atime =
                        hammer_timespec_to_time(&vap->va_atime);
                modflags |= HAMMER_INODE_ATIME;
        }
        if (vap->va_mtime.tv_sec != VNOVAL) {
                ip->ino_data.mtime =
                        hammer_timespec_to_time(&vap->va_mtime);
                modflags |= HAMMER_INODE_MTIME;
        }
        if (vap->va_mode != (mode_t)VNOVAL) {
                mode_t   cur_mode = ip->ino_data.mode;
                uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
                gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);

                error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ip->ino_data.mode != cur_mode) {
                        ip->ino_data.mode = cur_mode;
                        modflags |= HAMMER_INODE_DDIRTY;
                }
        }
done:
        /* flush all accumulated dirty bits in one shot */
        if (error == 0)
                hammer_modify_inode(ip, modflags);
        hammer_done_transaction(&trans);
        return (error);
}
1878
/*
 * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
 *
 * Create a symbolic link in directory dvp pointing at ap->a_target.
 * On success *ap->a_vpp holds the new vnode and the namecache entry
 * is resolved; on failure *ap->a_vpp is set to NULL.
 */
static
int
hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *dip;       /* parent directory inode */
        struct hammer_inode *nip;       /* newly created symlink inode */
        struct nchandle *nch;
        hammer_record_t record;
        int error;
        int bytes;                      /* length of the link target */

        ap->a_vap->va_type = VLNK;

        nch = ap->a_nch;
        dip = VTOI(ap->a_dvp);

        if (dip->flags & HAMMER_INODE_RO)
                return (EROFS);
        if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
                return (error);

        /*
         * Create a transaction to cover the operations we perform.
         */
        hammer_start_transaction(&trans, dip->hmp);
        ++hammer_stats_file_iopsw;

        /*
         * Create a new filesystem object of the requested type.  The
         * returned inode will be referenced but not locked.
         */

        error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
                                    dip, NULL, &nip);
        if (error) {
                hammer_done_transaction(&trans);
                *ap->a_vpp = NULL;
                return (error);
        }

        /*
         * Add a record representing the symlink.  symlink stores the link
         * as pure data, not a string, and is not \0-terminated.
         *
         * Short targets (<= HAMMER_INODE_BASESYMLEN bytes) are embedded
         * directly in the inode data; longer targets get a separate FIX
         * record keyed at HAMMER_FIXKEY_SYMLINK.
         */
        if (error == 0) {
                bytes = strlen(ap->a_target);

                if (bytes <= HAMMER_INODE_BASESYMLEN) {
                        bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
                } else {
                        record = hammer_alloc_mem_record(nip, bytes);
                        record->type = HAMMER_MEM_RECORD_GENERAL;

                        record->leaf.base.localization = nip->obj_localization +
                                                         HAMMER_LOCALIZE_MISC;
                        record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
                        record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
                        record->leaf.data_len = bytes;
                        KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
                        bcopy(ap->a_target, record->data->symlink.name, bytes);
                        error = hammer_ip_add_record(&trans, record);
                }

                /*
                 * Set the file size to the length of the link.
                 */
                if (error == 0) {
                        nip->ino_data.size = bytes;
                        hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
                }
        }
        /* enter the symlink into its parent directory */
        if (error == 0)
                error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
                                                nch->ncp->nc_nlen, nip);

        /*
         * Finish up.  On success obtain the vnode and resolve the
         * namecache entry; the inode reference is dropped either way.
         */
        if (error) {
                hammer_rel_inode(nip, 0);
                *ap->a_vpp = NULL;
        } else {
                error = hammer_get_vnode(nip, ap->a_vpp);
                hammer_rel_inode(nip, 0);
                if (error == 0) {
                        cache_setunresolved(ap->a_nch);
                        cache_setvp(ap->a_nch, *ap->a_vpp);
                }
        }
        hammer_done_transaction(&trans);
        return (error);
}
1975
1976 /*
1977  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1978  */
1979 static
1980 int
1981 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1982 {
1983         struct hammer_transaction trans;
1984         struct hammer_inode *dip;
1985         int error;
1986
1987         dip = VTOI(ap->a_dvp);
1988
1989         if (hammer_nohistory(dip) == 0 &&
1990             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
1991                 return (error);
1992         }
1993
1994         hammer_start_transaction(&trans, dip->hmp);
1995         ++hammer_stats_file_iopsw;
1996         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1997                                 ap->a_cred, ap->a_flags, -1);
1998         hammer_done_transaction(&trans);
1999
2000         return (error);
2001 }
2002
2003 /*
2004  * hammer_vop_ioctl { vp, command, data, fflag, cred }
2005  */
2006 static
2007 int
2008 hammer_vop_ioctl(struct vop_ioctl_args *ap)
2009 {
2010         struct hammer_inode *ip = ap->a_vp->v_data;
2011
2012         ++hammer_stats_file_iopsr;
2013         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
2014                             ap->a_fflag, ap->a_cred));
2015 }
2016
2017 static
2018 int
2019 hammer_vop_mountctl(struct vop_mountctl_args *ap)
2020 {
2021         struct mount *mp;
2022         int error;
2023
2024         mp = ap->a_head.a_ops->head.vv_mount;
2025
2026         switch(ap->a_op) {
2027         case MOUNTCTL_SET_EXPORT:
2028                 if (ap->a_ctllen != sizeof(struct export_args))
2029                         error = EINVAL;
2030                 else
2031                         error = hammer_vfs_export(mp, ap->a_op,
2032                                       (const struct export_args *)ap->a_ctl);
2033                 break;
2034         default:
2035                 error = journal_mountctl(ap);
2036                 break;
2037         }
2038         return(error);
2039 }
2040
2041 /*
2042  * hammer_vop_strategy { vp, bio }
2043  *
2044  * Strategy call, used for regular file read & write only.  Note that the
2045  * bp may represent a cluster.
2046  *
2047  * To simplify operation and allow better optimizations in the future,
2048  * this code does not make any assumptions with regards to buffer alignment
2049  * or size.
2050  */
2051 static
2052 int
2053 hammer_vop_strategy(struct vop_strategy_args *ap)
2054 {
2055         struct buf *bp;
2056         int error;
2057
2058         bp = ap->a_bio->bio_buf;
2059
2060         switch(bp->b_cmd) {
2061         case BUF_CMD_READ:
2062                 error = hammer_vop_strategy_read(ap);
2063                 break;
2064         case BUF_CMD_WRITE:
2065                 error = hammer_vop_strategy_write(ap);
2066                 break;
2067         default:
2068                 bp->b_error = error = EINVAL;
2069                 bp->b_flags |= B_ERROR;
2070                 biodone(ap->a_bio);
2071                 break;
2072         }
2073         return (error);
2074 }
2075
/*
 * Read from a regular file.  Iterate the related records and fill in the
 * BIO/BUF.  Gaps are zero-filled.
 *
 * The support code in hammer_object.c should be used to deal with mixed
 * in-memory and on-disk records.
 *
 * NOTE: Can be called from the cluster code with an oversized buf.
 *
 * XXX atime update
 */
static
int
hammer_vop_strategy_read(struct vop_strategy_args *ap)
{
        struct hammer_transaction trans;
        struct hammer_inode *ip;
        struct hammer_cursor cursor;
        hammer_base_elm_t base;
        hammer_off_t disk_offset;
        struct bio *bio;
        struct bio *nbio;               /* pushed (translated) bio */
        struct buf *bp;
        int64_t rec_offset;             /* file offset of the current record */
        int64_t ran_end;
        int64_t tmp64;
        int error;
        int boff;                       /* current fill offset in the buffer */
        int roff;                       /* data offset within the record */
        int n;                          /* byte count for the current copy */

        bio = ap->a_bio;
        bp = bio->bio_buf;
        ip = ap->a_vp->v_data;

        /*
         * The zone-2 disk offset may have been set by the cluster code via
         * a BMAP operation, or else should be NOOFFSET.
         *
         * Checking the high bits for a match against zone-2 should suffice.
         */
        nbio = push_bio(bio);
        if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
            HAMMER_ZONE_LARGE_DATA) {
                /* fast path: issue the direct read without a B-Tree scan */
                error = hammer_io_direct_read(ip->hmp, nbio, NULL);
                return (error);
        }

        /*
         * Well, that sucked.  Do it the hard way.  If all the stars are
         * aligned we may still be able to issue a direct-read.
         */
        hammer_simple_transaction(&trans, ip->hmp);
        hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

        /*
         * Key range (begin and end inclusive) to scan.  Note that the keys
         * stored in the actual records represent BASE+LEN, not BASE.  The
         * first record containing bio_offset will have a key > bio_offset.
         */
        cursor.key_beg.localization = ip->obj_localization +
                                      HAMMER_LOCALIZE_MISC;
        cursor.key_beg.obj_id = ip->obj_id;
        cursor.key_beg.create_tid = 0;
        cursor.key_beg.delete_tid = 0;
        cursor.key_beg.obj_type = 0;
        cursor.key_beg.key = bio->bio_offset + 1;
        cursor.asof = ip->obj_asof;
        cursor.flags |= HAMMER_CURSOR_ASOF;

        cursor.key_end = cursor.key_beg;
        KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
#if 0
        if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
                cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
                cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
                cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
        } else
#endif
        {
                /*
                 * End key covers the request plus up to MAXPHYS of
                 * slop; clamp to the maximum key on 64-bit overflow.
                 */
                ran_end = bio->bio_offset + bp->b_bufsize;
                cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
                cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
                tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
                if (tmp64 < ran_end)
                        cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
                else
                        cursor.key_end.key = ran_end + MAXPHYS + 1;
        }
        cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

        error = hammer_ip_first(&cursor);
        boff = 0;

        while (error == 0) {
                /*
                 * Get the base file offset of the record.  The key for
                 * data records is (base + bytes) rather then (base).
                 */
                base = &cursor.leaf->base;
                rec_offset = base->key - cursor.leaf->data_len;

                /*
                 * Calculate the gap, if any, and zero-fill it.
                 *
                 * n is the offset of the start of the record verses our
                 * current seek offset in the bio.
                 */
                n = (int)(rec_offset - (bio->bio_offset + boff));
                if (n > 0) {
                        if (n > bp->b_bufsize - boff)
                                n = bp->b_bufsize - boff;
                        bzero((char *)bp->b_data + boff, n);
                        boff += n;
                        n = 0;
                }

                /*
                 * Calculate the data offset in the record and the number
                 * of bytes we can copy.
                 *
                 * There are two degenerate cases.  First, boff may already
                 * be at bp->b_bufsize.  Secondly, the data offset within
                 * the record may exceed the record's size.
                 */
                roff = -n;
                rec_offset += roff;
                n = cursor.leaf->data_len - roff;
                if (n <= 0) {
                        /* degenerate record; should not normally occur */
                        kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
                        n = 0;
                } else if (n > bp->b_bufsize - boff) {
                        n = bp->b_bufsize - boff;
                }

                /*
                 * Deal with cached truncations.  This cool bit of code
                 * allows truncate()/ftruncate() to avoid having to sync
                 * the file.
                 *
                 * If the frontend is truncated then all backend records are
                 * subject to the frontend's truncation.
                 *
                 * If the backend is truncated then backend records on-disk
                 * (but not in-memory) are subject to the backend's
                 * truncation.  In-memory records owned by the backend
                 * represent data written after the truncation point on the
                 * backend and must not be truncated.
                 *
                 * Truncate operations deal with frontend buffer cache
                 * buffers and frontend-owned in-memory records synchronously.
                 */
                if (ip->flags & HAMMER_INODE_TRUNCATED) {
                        if (hammer_cursor_ondisk(&cursor) ||
                            cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
                                if (ip->trunc_off <= rec_offset)
                                        n = 0;
                                else if (ip->trunc_off < rec_offset + n)
                                        n = (int)(ip->trunc_off - rec_offset);
                        }
                }
                if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
                        if (hammer_cursor_ondisk(&cursor)) {
                                if (ip->sync_trunc_off <= rec_offset)
                                        n = 0;
                                else if (ip->sync_trunc_off < rec_offset + n)
                                        n = (int)(ip->sync_trunc_off - rec_offset);
                        }
                }

                /*
                 * Try to issue a direct read into our bio if possible,
                 * otherwise resolve the element data into a hammer_buffer
                 * and copy.
                 *
                 * The buffer on-disk should be zerod past any real
                 * truncation point, but may not be for any synthesized
                 * truncation point from above.
                 */
                disk_offset = cursor.leaf->data_offset + roff;
                if (boff == 0 && n == bp->b_bufsize &&
                    hammer_cursor_ondisk(&cursor) &&
                    (disk_offset & HAMMER_BUFMASK) == 0) {
                        /* whole-buffer, aligned, on-disk: go direct */
                        KKASSERT((disk_offset & HAMMER_OFF_ZONE_MASK) ==
                                 HAMMER_ZONE_LARGE_DATA);
                        nbio->bio_offset = disk_offset;
                        error = hammer_io_direct_read(trans.hmp, nbio,
                                                      cursor.leaf);
                        goto done;
                } else if (n) {
                        error = hammer_ip_resolve_data(&cursor);
                        if (error == 0) {
                                bcopy((char *)cursor.data + roff,
                                      (char *)bp->b_data + boff, n);
                        }
                }
                if (error)
                        break;

                /*
                 * Iterate until we have filled the request.
                 */
                boff += n;
                if (boff == bp->b_bufsize)
                        break;
                error = hammer_ip_next(&cursor);
        }

        /*
         * There may have been a gap after the last record
         */
        if (error == ENOENT)
                error = 0;
        if (error == 0 && boff != bp->b_bufsize) {
                KKASSERT(boff < bp->b_bufsize);
                bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
                /* boff = bp->b_bufsize; */
        }
        bp->b_resid = 0;
        bp->b_error = error;
        if (error)
                bp->b_flags |= B_ERROR;
        biodone(ap->a_bio);

done:
        /* cache the cursor position to speed up the next scan */
        if (cursor.node)
                hammer_cache_node(&ip->cache[1], cursor.node);
        hammer_done_cursor(&cursor);
        hammer_done_transaction(&trans);
        return(error);
}
2307
2308 /*
2309  * BMAP operation - used to support cluster_read() only.
2310  *
2311  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2312  *
2313  * This routine may return EOPNOTSUPP if the operation is not supported for
2314  * the specified offset.  The contents of the pointer arguments do not
2315  * need to be initialized in that case. 
2316  *
2317  * If a disk address is available and properly aligned return 0 with 
2318  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2319  * to the run-length relative to that offset.  Callers may assume that
2320  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2321  * large, so return EOPNOTSUPP if it is not sufficiently large.
2322  */
static
int
hammer_vop_bmap(struct vop_bmap_args *ap)
{
	struct hammer_transaction trans;
	struct hammer_inode *ip;
	struct hammer_cursor cursor;
	hammer_base_elm_t base;
	int64_t rec_offset;		/* logical file offset of current record */
	int64_t ran_end;
	int64_t tmp64;
	int64_t base_offset;		/* logical start of the contiguous run */
	int64_t base_disk_offset;	/* media start of the contiguous run */
	int64_t last_offset;		/* logical end (exclusive) of the run */
	hammer_off_t last_disk_offset;	/* media end (exclusive) of the run */
	hammer_off_t disk_offset;
	int	rec_len;
	int	error;
	int	blksize;

	++hammer_stats_file_iopsr;
	ip = ap->a_vp->v_data;

	/*
	 * We can only BMAP regular files.  We can't BMAP database files,
	 * directories, etc.
	 */
	if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
		return(EOPNOTSUPP);

	/*
	 * bmap is typically called with runp/runb both NULL when used
	 * for writing.  We do not support BMAP for writing atm.
	 */
	if (ap->a_cmd != BUF_CMD_READ)
		return(EOPNOTSUPP);

	/*
	 * Scan the B-Tree to acquire blockmap addresses, then translate
	 * to raw addresses.
	 */
	hammer_simple_transaction(&trans, ip->hmp);
#if 0
	kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
#endif
	hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);

	/*
	 * Key range (begin and end inclusive) to scan.  Note that the key's
	 * stored in the actual records represent BASE+LEN, not BASE.  The
	 * first record containing bio_offset will have a key > bio_offset.
	 *
	 * If the caller wants the backwards run (a_runb) we start the scan
	 * up to MAXPHYS before the requested offset so preceding contiguous
	 * records can be accumulated into the run.
	 */
	cursor.key_beg.localization = ip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = ip->obj_id;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.obj_type = 0;
	if (ap->a_runb)
		cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
	else
		cursor.key_beg.key = ap->a_loffset + 1;
	if (cursor.key_beg.key < 0)
		cursor.key_beg.key = 0;
	cursor.asof = ip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_ASOF;

	cursor.key_end = cursor.key_beg;
	KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);

	ran_end = ap->a_loffset + MAXPHYS;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
	cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
	tmp64 = ran_end + MAXPHYS + 1;	/* work-around GCC-4 bug */
	if (tmp64 < ran_end)
		cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;	/* clamp on signed overflow */
	else
		cursor.key_end.key = ran_end + MAXPHYS + 1;

	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;

	error = hammer_ip_first(&cursor);
	base_offset = last_offset = 0;
	base_disk_offset = last_disk_offset = 0;

	while (error == 0) {
		/*
		 * Get the base file offset of the record.  The key for
		 * data records is (base + bytes) rather than (base).
		 *
		 * NOTE: rec_offset + rec_len may exceed the end-of-file.
		 * The extra bytes should be zero on-disk and the BMAP op
		 * should still be ok.
		 */
		base = &cursor.leaf->base;
		rec_offset = base->key - cursor.leaf->data_len;
		rec_len    = cursor.leaf->data_len;

		/*
		 * Incorporate any cached truncation.
		 *
		 * NOTE: Modifications to rec_len based on synthesized
		 * truncation points remove the guarantee that any extended
		 * data on disk is zero (since the truncations may not have
		 * taken place on-media yet).
		 */
		if (ip->flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor) ||
			    cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
				if (ip->trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->trunc_off - rec_offset);
			}
		}
		if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
			if (hammer_cursor_ondisk(&cursor)) {
				if (ip->sync_trunc_off <= rec_offset)
					rec_len = 0;
				else if (ip->sync_trunc_off < rec_offset + rec_len)
					rec_len = (int)(ip->sync_trunc_off - rec_offset);
			}
		}

		/*
		 * Accumulate information.  If we have hit a discontiguous
		 * block reset base_offset unless we are already beyond the
		 * requested offset.  If we are, that's it, we stop.
		 */
		if (error)
			break;
		if (hammer_cursor_ondisk(&cursor)) {
			disk_offset = cursor.leaf->data_offset;
			if (rec_offset != last_offset ||
			    disk_offset != last_disk_offset) {
				if (rec_offset > ap->a_loffset)
					break;
				base_offset = rec_offset;
				base_disk_offset = disk_offset;
			}
			last_offset = rec_offset + rec_len;
			last_disk_offset = disk_offset + rec_len;
		}
		error = hammer_ip_next(&cursor);
	}

#if 0
	kprintf("BMAP %016llx:  %016llx - %016llx\n",
		ap->a_loffset, base_offset, last_offset);
	kprintf("BMAP %16s:  %016llx - %016llx\n",
		"", base_disk_offset, last_disk_offset);
#endif

	if (cursor.node) {
		/* cache the last node visited to speed up subsequent bmaps */
		hammer_cache_node(&ip->cache[1], cursor.node);
#if 0
		kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
#endif
	}
	hammer_done_cursor(&cursor);
	hammer_done_transaction(&trans);

	/*
	 * If we couldn't find any records or the records we did find were
	 * all behind the requested offset, return failure.  A forward
	 * truncation can leave a hole w/ no on-disk records.
	 */
	if (last_offset == 0 || last_offset < ap->a_loffset)
		return (EOPNOTSUPP);

	/*
	 * Figure out the block size at the requested offset and adjust
	 * our limits so the cluster_read() does not create inappropriately
	 * sized buffer cache buffers.
	 */
	blksize = hammer_blocksize(ap->a_loffset);
	if (hammer_blocksize(base_offset) != blksize) {
		base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
	}
	if (last_offset != ap->a_loffset &&
	    hammer_blocksize(last_offset - 1) != blksize) {
		last_offset = hammer_blockdemarc(ap->a_loffset,
						 last_offset - 1);
	}

	/*
	 * Returning EOPNOTSUPP simply prevents the direct-IO optimization
	 * from occurring.
	 */
	disk_offset = base_disk_offset + (ap->a_loffset - base_offset);

	if ((disk_offset & HAMMER_OFF_ZONE_MASK) != HAMMER_ZONE_LARGE_DATA) {
		/*
		 * Only large-data zones can be direct-IOd
		 */
		error = EOPNOTSUPP;
	} else if ((disk_offset & HAMMER_BUFMASK) ||
		   (last_offset - ap->a_loffset) < blksize) {
		/*
		 * doffsetp is not aligned or the forward run size does
		 * not cover a whole buffer, disallow the direct I/O.
		 */
		error = EOPNOTSUPP;
	} else {
		/*
		 * We're good.  Return the translated media offset plus the
		 * backward/forward run lengths the caller asked for.
		 */
		*ap->a_doffsetp = disk_offset;
		if (ap->a_runb) {
			*ap->a_runb = ap->a_loffset - base_offset;
			KKASSERT(*ap->a_runb >= 0);
		}
		if (ap->a_runp) {
			*ap->a_runp = last_offset - ap->a_loffset;
			KKASSERT(*ap->a_runp >= 0);
		}
		error = 0;
	}
	return(error);
}
2543
2544 /*
2545  * Write to a regular file.   Because this is a strategy call the OS is
2546  * trying to actually get data onto the media.
2547  */
static
int
hammer_vop_strategy_write(struct vop_strategy_args *ap)
{
	hammer_record_t record;
	hammer_mount_t hmp;
	hammer_inode_t ip;
	struct bio *bio;
	struct buf *bp;
	int blksize;
	int bytes;
	int error;

	bio = ap->a_bio;
	bp = bio->bio_buf;
	ip = ap->a_vp->v_data;
	hmp = ip->hmp;

	blksize = hammer_blocksize(bio->bio_offset);
	KKASSERT(bp->b_bufsize == blksize);

	/* Read-only inodes cannot accept writes; fail the bio immediately */
	if (ip->flags & HAMMER_INODE_RO) {
		bp->b_error = EROFS;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
		return(EROFS);
	}

	/*
	 * Interlock with inode destruction (no in-kernel or directory
	 * topology visibility).  If we queue new IO while trying to
	 * destroy the inode we can deadlock the vtrunc call in
	 * hammer_inode_unloadable_check().
	 *
	 * Besides, there's no point flushing a bp associated with an
	 * inode that is being destroyed on-media and has no kernel
	 * references.
	 */
	if ((ip->flags | ip->sync_flags) &
	    (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
		bp->b_resid = 0;
		biodone(ap->a_bio);
		return(0);
	}

	/*
	 * Reserve space and issue a direct-write from the front-end. 
	 * NOTE: The direct_io code will hammer_bread/bcopy smaller
	 * allocations.
	 *
	 * An in-memory record will be installed to reference the storage
	 * until the flusher can get to it.
	 *
	 * Since we own the high level bio the front-end will not try to
	 * do a direct-read until the write completes.
	 *
	 * NOTE: The only time we do not reserve a full-sized buffers
	 * worth of data is if the file is small.  We do not try to
	 * allocate a fragment (from the small-data zone) at the end of
	 * an otherwise large file as this can lead to wildly separated
	 * data.
	 */
	KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
	KKASSERT(bio->bio_offset < ip->ino_data.size);
	if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
		bytes = bp->b_bufsize;
	else
		bytes = ((int)ip->ino_data.size + 15) & ~15; /* 16-byte align the fragment */

	record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
				    bytes, &error);
	if (record) {
		hammer_io_direct_write(hmp, record, bio);
		/* kick the flusher if too many reserved records accumulate */
		if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
			hammer_flush_inode(ip, 0);
	} else {
		/* reservation failed; report the error through the bio */
		bp->b_bio2.bio_offset = NOOFFSET;
		bp->b_error = error;
		bp->b_flags |= B_ERROR;
		biodone(ap->a_bio);
	}
	return(error);
}
2631
2632 /*
2633  * dounlink - disconnect a directory entry
2634  *
2635  * XXX whiteout support not really in yet
2636  */
static int
hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
		struct vnode *dvp, struct ucred *cred, 
		int flags, int isdir)
{
	struct namecache *ncp;
	hammer_inode_t dip;
	hammer_inode_t ip;
	struct hammer_cursor cursor;
	int64_t namekey;
	int nlen, error;

	/*
	 * Calculate the namekey and setup the key range for the scan.  This
	 * works kinda like a chained hash table where the lower 32 bits
	 * of the namekey synthesize the chain.
	 *
	 * The key range is inclusive of both key_beg and key_end.
	 */
	dip = VTOI(dvp);
	ncp = nch->ncp;

	if (dip->flags & HAMMER_INODE_RO)
		return (EROFS);

	namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
retry:
	/* re-entered on EDEADLK after the cursor has been torn down */
	hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
	cursor.key_beg.localization = dip->obj_localization +
				      HAMMER_LOCALIZE_MISC;
	cursor.key_beg.obj_id = dip->obj_id;
	cursor.key_beg.key = namekey;
	cursor.key_beg.create_tid = 0;
	cursor.key_beg.delete_tid = 0;
	cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
	cursor.key_beg.obj_type = 0;

	cursor.key_end = cursor.key_beg;
	cursor.key_end.key |= 0xFFFFFFFFULL;	/* scan the whole hash chain */
	cursor.asof = dip->obj_asof;
	cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;

	/*
	 * Scan all matching records (the chain), locate the one matching
	 * the requested path component.  info->last_error contains the
	 * error code on search termination and could be 0, ENOENT, or
	 * something else.
	 *
	 * The hammer_ip_*() functions merge in-memory records with on-disk
	 * records for the purposes of the search.
	 */
	error = hammer_ip_first(&cursor);

	while (error == 0) {
		error = hammer_ip_resolve_data(&cursor);
		if (error)
			break;
		nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
		KKASSERT(nlen > 0);
		/* hash chains can collide; confirm with a full name compare */
		if (ncp->nc_nlen == nlen &&
		    bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
			break;
		}
		error = hammer_ip_next(&cursor);
	}

	/*
	 * If all is ok we have to get the inode so we can adjust nlinks.
	 * To avoid a deadlock with the flusher we must release the inode
	 * lock on the directory when acquiring the inode for the entry.
	 *
	 * If the target is a directory, it must be empty.
	 */
	if (error == 0) {
		hammer_unlock(&cursor.ip->lock);
		ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
				      dip->hmp->asof,
				      cursor.data->entry.localization,
				      0, &error);
		hammer_lock_sh(&cursor.ip->lock);
		if (error == ENOENT) {
			/* directory entry points at a non-existent inode */
			kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
			Debugger("ENOENT unlinking object that should exist");
		}

		/*
		 * If isdir >= 0 we validate that the entry is or is not a
		 * directory.  If isdir < 0 we don't care.
		 */
		if (error == 0 && isdir >= 0) {
			if (isdir &&
			    ip->ino_data.obj_type != HAMMER_OBJTYPE_DIRECTORY) {
				error = ENOTDIR;
			} else if (isdir == 0 &&
			    ip->ino_data.obj_type == HAMMER_OBJTYPE_DIRECTORY) {
				error = EISDIR;
			}
		}

		/*
		 * If we are trying to remove a directory the directory must
		 * be empty.
		 *
		 * WARNING: hammer_ip_check_directory_empty() may have to
		 * terminate the cursor to avoid a deadlock.  It is ok to
		 * call hammer_done_cursor() twice.
		 */
		if (error == 0 && ip->ino_data.obj_type ==
				  HAMMER_OBJTYPE_DIRECTORY) {
			error = hammer_ip_check_directory_empty(trans, ip);
		}

		/*
		 * Delete the directory entry.
		 *
		 * WARNING: hammer_ip_del_directory() may have to terminate
		 * the cursor to avoid a deadlock.  It is ok to call
		 * hammer_done_cursor() twice.
		 */
		if (error == 0) {
			error = hammer_ip_del_directory(trans, &cursor,
							dip, ip);
		}
		hammer_done_cursor(&cursor);
		if (error == 0) {
			/* invalidate namecache entries for the removed name */
			cache_setunresolved(nch);
			cache_setvp(nch, NULL);
			/* XXX locking */
			if (ip->vp)
				cache_inval_vp(ip->vp, CINV_DESTROY);
		}
		if (ip)
			hammer_rel_inode(ip, 0);
	} else {
		hammer_done_cursor(&cursor);
	}
	if (error == EDEADLK)
		goto retry;

	return (error);
}
2778
2779 /************************************************************************
2780  *                          FIFO AND SPECFS OPS                         *
2781  ************************************************************************
2782  *
2783  */
2784
2785 static int
2786 hammer_vop_fifoclose (struct vop_close_args *ap)
2787 {
2788         /* XXX update itimes */
2789         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2790 }
2791
2792 static int
2793 hammer_vop_fiforead (struct vop_read_args *ap)
2794 {
2795         int error;
2796
2797         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2798         /* XXX update access time */
2799         return (error);
2800 }
2801
2802 static int
2803 hammer_vop_fifowrite (struct vop_write_args *ap)
2804 {
2805         int error;
2806
2807         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2808         /* XXX update access time */
2809         return (error);
2810 }
2811
2812 static int
2813 hammer_vop_specclose (struct vop_close_args *ap)
2814 {
2815         /* XXX update itimes */
2816         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2817 }
2818
2819 static int
2820 hammer_vop_specread (struct vop_read_args *ap)
2821 {
2822         /* XXX update access time */
2823         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2824 }
2825
2826 static int
2827 hammer_vop_specwrite (struct vop_write_args *ap)
2828 {
2829         /* XXX update last change time */
2830         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2831 }
2832