HAMMER 60G/Many: Mirroring, bug fixes
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.83 2008/07/07 22:42:35 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
/*
 * HAMMER VNOPS
 *
 * Forward declarations for the file-local vnode operations that are
 * plugged into the vop_ops tables defined further down.
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_pathconf(struct vop_pathconf_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);

/* FIFO-specific overrides, used by hammer_fifo_vops */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);

/* Device-special overrides, used by hammer_spec_vops */
static int hammer_vop_specclose(struct vop_close_args *ap);
static int hammer_vop_specread(struct vop_read_args *ap);
static int hammer_vop_specwrite(struct vop_write_args *ap);
91
92 struct vop_ops hammer_vnode_vops = {
93         .vop_default =          vop_defaultop,
94         .vop_fsync =            hammer_vop_fsync,
95         .vop_getpages =         vop_stdgetpages,
96         .vop_putpages =         vop_stdputpages,
97         .vop_read =             hammer_vop_read,
98         .vop_write =            hammer_vop_write,
99         .vop_access =           hammer_vop_access,
100         .vop_advlock =          hammer_vop_advlock,
101         .vop_close =            hammer_vop_close,
102         .vop_ncreate =          hammer_vop_ncreate,
103         .vop_getattr =          hammer_vop_getattr,
104         .vop_inactive =         hammer_vop_inactive,
105         .vop_reclaim =          hammer_vop_reclaim,
106         .vop_nresolve =         hammer_vop_nresolve,
107         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
108         .vop_nlink =            hammer_vop_nlink,
109         .vop_nmkdir =           hammer_vop_nmkdir,
110         .vop_nmknod =           hammer_vop_nmknod,
111         .vop_open =             hammer_vop_open,
112         .vop_pathconf =         hammer_vop_pathconf,
113         .vop_print =            hammer_vop_print,
114         .vop_readdir =          hammer_vop_readdir,
115         .vop_readlink =         hammer_vop_readlink,
116         .vop_nremove =          hammer_vop_nremove,
117         .vop_nrename =          hammer_vop_nrename,
118         .vop_nrmdir =           hammer_vop_nrmdir,
119         .vop_setattr =          hammer_vop_setattr,
120         .vop_bmap =             hammer_vop_bmap,
121         .vop_strategy =         hammer_vop_strategy,
122         .vop_nsymlink =         hammer_vop_nsymlink,
123         .vop_nwhiteout =        hammer_vop_nwhiteout,
124         .vop_ioctl =            hammer_vop_ioctl,
125         .vop_mountctl =         hammer_vop_mountctl
126 };
127
128 struct vop_ops hammer_spec_vops = {
129         .vop_default =          spec_vnoperate,
130         .vop_fsync =            hammer_vop_fsync,
131         .vop_read =             hammer_vop_specread,
132         .vop_write =            hammer_vop_specwrite,
133         .vop_access =           hammer_vop_access,
134         .vop_close =            hammer_vop_specclose,
135         .vop_getattr =          hammer_vop_getattr,
136         .vop_inactive =         hammer_vop_inactive,
137         .vop_reclaim =          hammer_vop_reclaim,
138         .vop_setattr =          hammer_vop_setattr
139 };
140
141 struct vop_ops hammer_fifo_vops = {
142         .vop_default =          fifo_vnoperate,
143         .vop_fsync =            hammer_vop_fsync,
144         .vop_read =             hammer_vop_fiforead,
145         .vop_write =            hammer_vop_fifowrite,
146         .vop_access =           hammer_vop_access,
147         .vop_close =            hammer_vop_fifoclose,
148         .vop_getattr =          hammer_vop_getattr,
149         .vop_inactive =         hammer_vop_inactive,
150         .vop_reclaim =          hammer_vop_reclaim,
151         .vop_setattr =          hammer_vop_setattr
152 };
153
154 #ifdef DEBUG_TRUNCATE
155 struct hammer_inode *HammerTruncIp;
156 #endif
157
158 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
159                            struct vnode *dvp, struct ucred *cred, int flags);
160 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
161 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
162
#if 0
/*
 * hammer_vop_vnoperate { generic args }
 *
 * Disabled generic dispatcher: forwards the operation described by 'ap'
 * to hammer_vnode_vops via VOCALL().  Kept under #if 0; the parameter is
 * named so the body compiles if the block is ever re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
171
172 /*
173  * hammer_vop_fsync { vp, waitfor }
174  *
175  * fsync() an inode to disk and wait for it to be completely committed
176  * such that the information would not be undone if a crash occured after
177  * return.
178  */
179 static
180 int
181 hammer_vop_fsync(struct vop_fsync_args *ap)
182 {
183         hammer_inode_t ip = VTOI(ap->a_vp);
184
185         vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
186         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
187         if (ap->a_waitfor == MNT_WAIT)
188                 hammer_wait_inode(ip);
189         return (ip->error);
190 }
191
192 /*
193  * hammer_vop_read { vp, uio, ioflag, cred }
194  */
195 static
196 int
197 hammer_vop_read(struct vop_read_args *ap)
198 {
199         struct hammer_transaction trans;
200         hammer_inode_t ip;
201         off_t offset;
202         struct buf *bp;
203         struct uio *uio;
204         int error;
205         int n;
206         int seqcount;
207         int ioseqcount;
208         int blksize;
209
210         if (ap->a_vp->v_type != VREG)
211                 return (EINVAL);
212         ip = VTOI(ap->a_vp);
213         error = 0;
214         uio = ap->a_uio;
215
216         /*
217          * Allow the UIO's size to override the sequential heuristic.
218          */
219         blksize = hammer_blocksize(uio->uio_offset);
220         seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
221         ioseqcount = ap->a_ioflag >> 16;
222         if (seqcount < ioseqcount)
223                 seqcount = ioseqcount;
224
225         hammer_start_transaction(&trans, ip->hmp);
226
227         /*
228          * Access the data typically in HAMMER_BUFSIZE blocks via the
229          * buffer cache, but HAMMER may use a variable block size based
230          * on the offset.
231          */
232         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
233                 int64_t base_offset;
234                 int64_t file_limit;
235
236                 blksize = hammer_blocksize(uio->uio_offset);
237                 offset = (int)uio->uio_offset & (blksize - 1);
238                 base_offset = uio->uio_offset - offset;
239
240                 if (hammer_debug_cluster_enable) {
241                         /*
242                          * Use file_limit to prevent cluster_read() from
243                          * creating buffers of the wrong block size past
244                          * the demarc.
245                          */
246                         file_limit = ip->ino_data.size;
247                         if (base_offset < HAMMER_XDEMARC &&
248                             file_limit > HAMMER_XDEMARC) {
249                                 file_limit = HAMMER_XDEMARC;
250                         }
251                         error = cluster_read(ap->a_vp,
252                                              file_limit, base_offset,
253                                              blksize, MAXPHYS,
254                                              seqcount, &bp);
255                 } else {
256                         error = bread(ap->a_vp, base_offset, blksize, &bp);
257                 }
258                 if (error) {
259                         kprintf("error %d\n", error);
260                         brelse(bp);
261                         break;
262                 }
263
264                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
265                 n = blksize - offset;
266                 if (n > uio->uio_resid)
267                         n = uio->uio_resid;
268                 if (n > ip->ino_data.size - uio->uio_offset)
269                         n = (int)(ip->ino_data.size - uio->uio_offset);
270                 error = uiomove((char *)bp->b_data + offset, n, uio);
271
272                 /* data has a lower priority then meta-data */
273                 bp->b_flags |= B_AGE;
274                 bqrelse(bp);
275                 if (error)
276                         break;
277         }
278         if ((ip->flags & HAMMER_INODE_RO) == 0 &&
279             (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
280                 ip->ino_data.atime = trans.time;
281                 hammer_modify_inode(ip, HAMMER_INODE_ATIME);
282         }
283         hammer_done_transaction(&trans);
284         return (error);
285 }
286
287 /*
288  * hammer_vop_write { vp, uio, ioflag, cred }
289  */
290 static
291 int
292 hammer_vop_write(struct vop_write_args *ap)
293 {
294         struct hammer_transaction trans;
295         struct hammer_inode *ip;
296         hammer_mount_t hmp;
297         struct uio *uio;
298         int offset;
299         off_t base_offset;
300         struct buf *bp;
301         int error;
302         int n;
303         int flags;
304         int delta;
305         int seqcount;
306
307         if (ap->a_vp->v_type != VREG)
308                 return (EINVAL);
309         ip = VTOI(ap->a_vp);
310         hmp = ip->hmp;
311         error = 0;
312         seqcount = ap->a_ioflag >> 16;
313
314         if (ip->flags & HAMMER_INODE_RO)
315                 return (EROFS);
316
317         /*
318          * Create a transaction to cover the operations we perform.
319          */
320         hammer_start_transaction(&trans, hmp);
321         uio = ap->a_uio;
322
323         /*
324          * Check append mode
325          */
326         if (ap->a_ioflag & IO_APPEND)
327                 uio->uio_offset = ip->ino_data.size;
328
329         /*
330          * Check for illegal write offsets.  Valid range is 0...2^63-1.
331          *
332          * NOTE: the base_off assignment is required to work around what
333          * I consider to be a GCC-4 optimization bug.
334          */
335         if (uio->uio_offset < 0) {
336                 hammer_done_transaction(&trans);
337                 return (EFBIG);
338         }
339         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
340         if (uio->uio_resid > 0 && base_offset <= 0) {
341                 hammer_done_transaction(&trans);
342                 return (EFBIG);
343         }
344
345         /*
346          * Access the data typically in HAMMER_BUFSIZE blocks via the
347          * buffer cache, but HAMMER may use a variable block size based
348          * on the offset.
349          */
350         while (uio->uio_resid > 0) {
351                 int fixsize = 0;
352                 int blksize;
353                 int blkmask;
354
355                 if ((error = hammer_checkspace(hmp, HAMMER_CHECKSPACE_SLOP_WRITE)) != 0)
356                         break;
357
358                 blksize = hammer_blocksize(uio->uio_offset);
359
360                 /*
361                  * Do not allow HAMMER to blow out the buffer cache.  Very
362                  * large UIOs can lockout other processes due to bwillwrite()
363                  * mechanics.
364                  *
365                  * The hammer inode is not locked during these operations.
366                  * The vnode is locked which can interfere with the pageout
367                  * daemon for non-UIO_NOCOPY writes but should not interfere
368                  * with the buffer cache.  Even so, we cannot afford to
369                  * allow the pageout daemon to build up too many dirty buffer
370                  * cache buffers.
371                  */
372                 /*if (((int)uio->uio_offset & (blksize - 1)) == 0)*/
373                 bwillwrite(blksize);
374
375                 /*
376                  * Do not allow HAMMER to blow out system memory by
377                  * accumulating too many records.   Records are so well
378                  * decoupled from the buffer cache that it is possible
379                  * for userland to push data out to the media via
380                  * direct-write, but build up the records queued to the
381                  * backend faster then the backend can flush them out.
382                  * HAMMER has hit its write limit but the frontend has
383                  * no pushback to slow it down.
384                  */
385                 if (hmp->rsv_recs > hammer_limit_recs / 2) {
386                         /*
387                          * Get the inode on the flush list
388                          */
389                         if (ip->rsv_recs >= 64)
390                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
391                         else if (ip->rsv_recs >= 16)
392                                 hammer_flush_inode(ip, 0);
393
394                         /*
395                          * Keep the flusher going if the system keeps
396                          * queueing records.
397                          */
398                         delta = hmp->count_newrecords -
399                                 hmp->last_newrecords;
400                         if (delta < 0 || delta > hammer_limit_recs / 2) {
401                                 hmp->last_newrecords = hmp->count_newrecords;
402                                 hammer_sync_hmp(hmp, MNT_NOWAIT);
403                         }
404
405                         /*
406                          * If we have gotten behind start slowing
407                          * down the writers.
408                          */
409                         delta = (hmp->rsv_recs - hammer_limit_recs) *
410                                 hz / hammer_limit_recs;
411                         if (delta > 0)
412                                 tsleep(&trans, 0, "hmrslo", delta);
413                 }
414
415                 /*
416                  * Calculate the blocksize at the current offset and figure
417                  * out how much we can actually write.
418                  */
419                 blkmask = blksize - 1;
420                 offset = (int)uio->uio_offset & blkmask;
421                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
422                 n = blksize - offset;
423                 if (n > uio->uio_resid)
424                         n = uio->uio_resid;
425                 if (uio->uio_offset + n > ip->ino_data.size) {
426                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
427                         fixsize = 1;
428                 }
429
430                 if (uio->uio_segflg == UIO_NOCOPY) {
431                         /*
432                          * Issuing a write with the same data backing the
433                          * buffer.  Instantiate the buffer to collect the
434                          * backing vm pages, then read-in any missing bits.
435                          *
436                          * This case is used by vop_stdputpages().
437                          */
438                         bp = getblk(ap->a_vp, base_offset,
439                                     blksize, GETBLK_BHEAVY, 0);
440                         if ((bp->b_flags & B_CACHE) == 0) {
441                                 bqrelse(bp);
442                                 error = bread(ap->a_vp, base_offset,
443                                               blksize, &bp);
444                         }
445                 } else if (offset == 0 && uio->uio_resid >= blksize) {
446                         /*
447                          * Even though we are entirely overwriting the buffer
448                          * we may still have to zero it out to avoid a 
449                          * mmap/write visibility issue.
450                          */
451                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
452                         if ((bp->b_flags & B_CACHE) == 0)
453                                 vfs_bio_clrbuf(bp);
454                 } else if (base_offset >= ip->ino_data.size) {
455                         /*
456                          * If the base offset of the buffer is beyond the
457                          * file EOF, we don't have to issue a read.
458                          */
459                         bp = getblk(ap->a_vp, base_offset,
460                                     blksize, GETBLK_BHEAVY, 0);
461                         vfs_bio_clrbuf(bp);
462                 } else {
463                         /*
464                          * Partial overwrite, read in any missing bits then
465                          * replace the portion being written.
466                          */
467                         error = bread(ap->a_vp, base_offset, blksize, &bp);
468                         if (error == 0)
469                                 bheavy(bp);
470                 }
471                 if (error == 0) {
472                         error = uiomove((char *)bp->b_data + offset,
473                                         n, uio);
474                 }
475
476                 /*
477                  * If we screwed up we have to undo any VM size changes we
478                  * made.
479                  */
480                 if (error) {
481                         brelse(bp);
482                         if (fixsize) {
483                                 vtruncbuf(ap->a_vp, ip->ino_data.size,
484                                           hammer_blocksize(ip->ino_data.size));
485                         }
486                         break;
487                 }
488                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
489                 if (ip->ino_data.size < uio->uio_offset) {
490                         ip->ino_data.size = uio->uio_offset;
491                         flags = HAMMER_INODE_DDIRTY;
492                         vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
493                 } else {
494                         flags = 0;
495                 }
496                 ip->ino_data.mtime = trans.time;
497                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
498                 hammer_modify_inode(ip, flags);
499
500                 /*
501                  * Final buffer disposition.
502                  */
503                 bp->b_flags |= B_AGE;
504                 if (ap->a_ioflag & IO_SYNC) {
505                         bwrite(bp);
506                 } else if (ap->a_ioflag & IO_DIRECT) {
507                         bawrite(bp);
508                 } else {
509                         bdwrite(bp);
510                 }
511         }
512         hammer_done_transaction(&trans);
513         return (error);
514 }
515
516 /*
517  * hammer_vop_access { vp, mode, cred }
518  */
519 static
520 int
521 hammer_vop_access(struct vop_access_args *ap)
522 {
523         struct hammer_inode *ip = VTOI(ap->a_vp);
524         uid_t uid;
525         gid_t gid;
526         int error;
527
528         uid = hammer_to_unix_xid(&ip->ino_data.uid);
529         gid = hammer_to_unix_xid(&ip->ino_data.gid);
530
531         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
532                                   ip->ino_data.uflags);
533         return (error);
534 }
535
536 /*
537  * hammer_vop_advlock { vp, id, op, fl, flags }
538  */
539 static
540 int
541 hammer_vop_advlock(struct vop_advlock_args *ap)
542 {
543         hammer_inode_t ip = VTOI(ap->a_vp);
544
545         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
546 }
547
548 /*
549  * hammer_vop_close { vp, fflag }
550  */
551 static
552 int
553 hammer_vop_close(struct vop_close_args *ap)
554 {
555         hammer_inode_t ip = VTOI(ap->a_vp);
556
557         if ((ip->flags | ip->sync_flags) & HAMMER_INODE_MODMASK)
558                 hammer_inode_waitreclaims(ip->hmp);
559         return (vop_stdclose(ap));
560 }
561
562 /*
563  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
564  *
565  * The operating system has already ensured that the directory entry
566  * does not exist and done all appropriate namespace locking.
567  */
568 static
569 int
570 hammer_vop_ncreate(struct vop_ncreate_args *ap)
571 {
572         struct hammer_transaction trans;
573         struct hammer_inode *dip;
574         struct hammer_inode *nip;
575         struct nchandle *nch;
576         int error;
577
578         nch = ap->a_nch;
579         dip = VTOI(ap->a_dvp);
580
581         if (dip->flags & HAMMER_INODE_RO)
582                 return (EROFS);
583         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
584                 return (error);
585
586         /*
587          * Create a transaction to cover the operations we perform.
588          */
589         hammer_start_transaction(&trans, dip->hmp);
590
591         /*
592          * Create a new filesystem object of the requested type.  The
593          * returned inode will be referenced and shared-locked to prevent
594          * it from being moved to the flusher.
595          */
596
597         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
598                                     dip, 0, &nip);
599         if (error) {
600                 hkprintf("hammer_create_inode error %d\n", error);
601                 hammer_done_transaction(&trans);
602                 *ap->a_vpp = NULL;
603                 return (error);
604         }
605
606         /*
607          * Add the new filesystem object to the directory.  This will also
608          * bump the inode's link count.
609          */
610         error = hammer_ip_add_directory(&trans, dip,
611                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
612                                         nip);
613         if (error)
614                 hkprintf("hammer_ip_add_directory error %d\n", error);
615
616         /*
617          * Finish up.
618          */
619         if (error) {
620                 hammer_rel_inode(nip, 0);
621                 hammer_done_transaction(&trans);
622                 *ap->a_vpp = NULL;
623         } else {
624                 error = hammer_get_vnode(nip, ap->a_vpp);
625                 hammer_done_transaction(&trans);
626                 hammer_rel_inode(nip, 0);
627                 if (error == 0) {
628                         cache_setunresolved(ap->a_nch);
629                         cache_setvp(ap->a_nch, *ap->a_vpp);
630                 }
631         }
632         return (error);
633 }
634
635 /*
636  * hammer_vop_getattr { vp, vap }
637  *
638  * Retrieve an inode's attribute information.  When accessing inodes
639  * historically we fake the atime field to ensure consistent results.
640  * The atime field is stored in the B-Tree element and allowed to be
641  * updated without cycling the element.
642  */
643 static
644 int
645 hammer_vop_getattr(struct vop_getattr_args *ap)
646 {
647         struct hammer_inode *ip = VTOI(ap->a_vp);
648         struct vattr *vap = ap->a_vap;
649
650         /*
651          * We want the fsid to be different when accessing a filesystem
652          * with different as-of's so programs like diff don't think
653          * the files are the same.
654          *
655          * We also want the fsid to be the same when comparing snapshots,
656          * or when comparing mirrors (which might be backed by different
657          * physical devices).  HAMMER fsids are based on the PFS's
658          * shared_uuid field.
659          *
660          * XXX there is a chance of collision here.  The va_fsid reported
661          * by stat is different from the more involved fsid used in the
662          * mount structure.
663          */
664         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
665                        (u_int32_t)(ip->obj_asof >> 32);
666
667         vap->va_fileid = ip->ino_leaf.base.obj_id;
668         vap->va_mode = ip->ino_data.mode;
669         vap->va_nlink = ip->ino_data.nlinks;
670         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
671         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
672         vap->va_rmajor = 0;
673         vap->va_rminor = 0;
674         vap->va_size = ip->ino_data.size;
675
676         /*
677          * We must provide a consistent atime and mtime for snapshots
678          * so people can do a 'tar cf - ... | md5' on them and get
679          * consistent results.
680          */
681         if (ip->flags & HAMMER_INODE_RO) {
682                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
683                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
684         } else {
685                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
686                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
687         }
688         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
689         vap->va_flags = ip->ino_data.uflags;
690         vap->va_gen = 1;        /* hammer inums are unique for all time */
691         vap->va_blocksize = HAMMER_BUFSIZE;
692         if (ip->ino_data.size >= HAMMER_XDEMARC) {
693                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
694                                 ~HAMMER_XBUFMASK64;
695         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
696                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
697                                 ~HAMMER_BUFMASK64;
698         } else {
699                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
700         }
701         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
702         vap->va_filerev = 0;    /* XXX */
703         /* mtime uniquely identifies any adjustments made to the file XXX */
704         vap->va_fsmid = ip->ino_data.mtime;
705         vap->va_uid_uuid = ip->ino_data.uid;
706         vap->va_gid_uuid = ip->ino_data.gid;
707         vap->va_fsid_uuid = ip->hmp->fsid;
708         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
709                           VA_FSID_UUID_VALID;
710
711         switch (ip->ino_data.obj_type) {
712         case HAMMER_OBJTYPE_CDEV:
713         case HAMMER_OBJTYPE_BDEV:
714                 vap->va_rmajor = ip->ino_data.rmajor;
715                 vap->va_rminor = ip->ino_data.rminor;
716                 break;
717         default:
718                 break;
719         }
720
721         return(0);
722 }
723
724 /*
725  * hammer_vop_nresolve { nch, dvp, cred }
726  *
727  * Locate the requested directory entry.
728  */
729 static
730 int
731 hammer_vop_nresolve(struct vop_nresolve_args *ap)
732 {
733         struct hammer_transaction trans;
734         struct namecache *ncp;
735         hammer_inode_t dip;
736         hammer_inode_t ip;
737         hammer_tid_t asof;
738         struct hammer_cursor cursor;
739         struct vnode *vp;
740         int64_t namekey;
741         int error;
742         int i;
743         int nlen;
744         int flags;
745         int ispfs;
746         int64_t obj_id;
747         u_int32_t localization;
748
749         /*
750          * Misc initialization, plus handle as-of name extensions.  Look for
751          * the '@@' extension.  Note that as-of files and directories cannot
752          * be modified.
753          */
754         dip = VTOI(ap->a_dvp);
755         ncp = ap->a_nch->ncp;
756         asof = dip->obj_asof;
757         nlen = ncp->nc_nlen;
758         flags = dip->flags;
759         ispfs = 0;
760
761         hammer_simple_transaction(&trans, dip->hmp);
762
763         for (i = 0; i < nlen; ++i) {
764                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
765                         asof = hammer_str_to_tid(ncp->nc_name + i + 2);
766                         flags |= HAMMER_INODE_RO;
767                         break;
768                 }
769         }
770         nlen = i;
771
772         /*
773          * If there is no path component the time extension is relative to
774          * dip.
775          */
776         if (nlen == 0) {
777                 ip = hammer_get_inode(&trans, dip, dip->obj_id,
778                                       asof, dip->obj_localization,
779                                       flags, &error);
780                 if (error == 0) {
781                         error = hammer_get_vnode(ip, &vp);
782                         hammer_rel_inode(ip, 0);
783                 } else {
784                         vp = NULL;
785                 }
786                 if (error == 0) {
787                         vn_unlock(vp);
788                         cache_setvp(ap->a_nch, vp);
789                         vrele(vp);
790                 }
791                 goto done;
792         }
793
794         /*
795          * Calculate the namekey and setup the key range for the scan.  This
796          * works kinda like a chained hash table where the lower 32 bits
797          * of the namekey synthesize the chain.
798          *
799          * The key range is inclusive of both key_beg and key_end.
800          */
801         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
802
803         error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
804         cursor.key_beg.localization = dip->obj_localization +
805                                       HAMMER_LOCALIZE_MISC;
806         cursor.key_beg.obj_id = dip->obj_id;
807         cursor.key_beg.key = namekey;
808         cursor.key_beg.create_tid = 0;
809         cursor.key_beg.delete_tid = 0;
810         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
811         cursor.key_beg.obj_type = 0;
812
813         cursor.key_end = cursor.key_beg;
814         cursor.key_end.key |= 0xFFFFFFFFULL;
815         cursor.asof = asof;
816         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
817
818         /*
819          * Scan all matching records (the chain), locate the one matching
820          * the requested path component.
821          *
822          * The hammer_ip_*() functions merge in-memory records with on-disk
823          * records for the purposes of the search.
824          */
825         obj_id = 0;
826         localization = HAMMER_DEF_LOCALIZATION;
827
828         if (error == 0) {
829                 error = hammer_ip_first(&cursor);
830                 while (error == 0) {
831                         error = hammer_ip_resolve_data(&cursor);
832                         if (error)
833                                 break;
834                         if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
835                             bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
836                                 obj_id = cursor.data->entry.obj_id;
837
838                                 /*
839                                  * Force relookups whenever a PFS root is
840                                  * accessed.
841                                  */
842                                 if (obj_id == HAMMER_OBJID_ROOT)
843                                         ispfs = 1;
844                                 localization = cursor.data->entry.localization;
845                                 break;
846                         }
847                         error = hammer_ip_next(&cursor);
848                 }
849         }
850         hammer_done_cursor(&cursor);
851         if (error == 0) {
852                 ip = hammer_get_inode(&trans, dip, obj_id,
853                                       asof, localization,
854                                       flags, &error);
855                 if (ispfs && asof > ip->pfsm->pfsd.sync_end_tid) {
856                         asof = ip->pfsm->pfsd.sync_end_tid;
857                         hammer_rel_inode(ip, 0);
858                         ip = hammer_get_inode(&trans, dip, obj_id,
859                                               asof, localization,
860                                               flags, &error);
861                 }
862
863
864                 if (error == 0) {
865                         error = hammer_get_vnode(ip, &vp);
866                         hammer_rel_inode(ip, 0);
867                 } else {
868                         vp = NULL;
869                 }
870                 if (error == 0) {
871                         vn_unlock(vp);
872                         cache_setvp(ap->a_nch, vp);
873                         if (ispfs)
874                                 cache_settimeout(ap->a_nch, 0);
875                         vrele(vp);
876                 }
877         } else if (error == ENOENT) {
878                 cache_setvp(ap->a_nch, NULL);
879         }
880 done:
881         hammer_done_transaction(&trans);
882         return (error);
883 }
884
885 /*
886  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
887  *
888  * Locate the parent directory of a directory vnode.
889  *
890  * dvp is referenced but not locked.  *vpp must be returned referenced and
891  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
892  * at the root, instead it could indicate that the directory we were in was
893  * removed.
894  *
895  * NOTE: as-of sequences are not linked into the directory structure.  If
896  * we are at the root with a different asof then the mount point, reload
897  * the same directory with the mount point's asof.   I'm not sure what this
898  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
899  * get confused, but it hasn't been tested.
900  */
901 static
902 int
903 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
904 {
905         struct hammer_transaction trans;
906         struct hammer_inode *dip;
907         struct hammer_inode *ip;
908         int64_t parent_obj_id;
909         u_int32_t parent_obj_localization;
910         hammer_tid_t asof;
911         int error;
912
913         dip = VTOI(ap->a_dvp);
914         asof = dip->obj_asof;
915
916         /*
917          * Whos are parent?  This could be the root of a pseudo-filesystem
918          * whos parent is in another localization domain.
919          */
920         parent_obj_id = dip->ino_data.parent_obj_id;
921         if (dip->obj_id == HAMMER_OBJID_ROOT)
922                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
923         else
924                 parent_obj_localization = dip->obj_localization;
925
926         if (parent_obj_id == 0) {
927                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
928                    asof != dip->hmp->asof) {
929                         parent_obj_id = dip->obj_id;
930                         asof = dip->hmp->asof;
931                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
932                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
933                                    dip->obj_asof);
934                 } else {
935                         *ap->a_vpp = NULL;
936                         return ENOENT;
937                 }
938         }
939
940         hammer_simple_transaction(&trans, dip->hmp);
941
942         ip = hammer_get_inode(&trans, dip, parent_obj_id,
943                               asof, parent_obj_localization,
944                               dip->flags, &error);
945         if (ip) {
946                 error = hammer_get_vnode(ip, ap->a_vpp);
947                 hammer_rel_inode(ip, 0);
948         } else {
949                 *ap->a_vpp = NULL;
950         }
951         hammer_done_transaction(&trans);
952         return (error);
953 }
954
955 /*
956  * hammer_vop_nlink { nch, dvp, vp, cred }
957  */
958 static
959 int
960 hammer_vop_nlink(struct vop_nlink_args *ap)
961 {
962         struct hammer_transaction trans;
963         struct hammer_inode *dip;
964         struct hammer_inode *ip;
965         struct nchandle *nch;
966         int error;
967
968         nch = ap->a_nch;
969         dip = VTOI(ap->a_dvp);
970         ip = VTOI(ap->a_vp);
971
972         if (dip->flags & HAMMER_INODE_RO)
973                 return (EROFS);
974         if (ip->flags & HAMMER_INODE_RO)
975                 return (EROFS);
976         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
977                 return (error);
978
979         /*
980          * Create a transaction to cover the operations we perform.
981          */
982         hammer_start_transaction(&trans, dip->hmp);
983
984         /*
985          * Add the filesystem object to the directory.  Note that neither
986          * dip nor ip are referenced or locked, but their vnodes are
987          * referenced.  This function will bump the inode's link count.
988          */
989         error = hammer_ip_add_directory(&trans, dip,
990                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
991                                         ip);
992
993         /*
994          * Finish up.
995          */
996         if (error == 0) {
997                 cache_setunresolved(nch);
998                 cache_setvp(nch, ap->a_vp);
999         }
1000         hammer_done_transaction(&trans);
1001         return (error);
1002 }
1003
1004 /*
1005  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1006  *
1007  * The operating system has already ensured that the directory entry
1008  * does not exist and done all appropriate namespace locking.
1009  */
1010 static
1011 int
1012 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1013 {
1014         struct hammer_transaction trans;
1015         struct hammer_inode *dip;
1016         struct hammer_inode *nip;
1017         struct nchandle *nch;
1018         int error;
1019
1020         nch = ap->a_nch;
1021         dip = VTOI(ap->a_dvp);
1022
1023         if (dip->flags & HAMMER_INODE_RO)
1024                 return (EROFS);
1025         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
1026                 return (error);
1027
1028         /*
1029          * Create a transaction to cover the operations we perform.
1030          */
1031         hammer_start_transaction(&trans, dip->hmp);
1032
1033         /*
1034          * Create a new filesystem object of the requested type.  The
1035          * returned inode will be referenced but not locked.
1036          */
1037         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1038                                     dip, 0, &nip);
1039         if (error) {
1040                 hkprintf("hammer_mkdir error %d\n", error);
1041                 hammer_done_transaction(&trans);
1042                 *ap->a_vpp = NULL;
1043                 return (error);
1044         }
1045         /*
1046          * Add the new filesystem object to the directory.  This will also
1047          * bump the inode's link count.
1048          */
1049         error = hammer_ip_add_directory(&trans, dip,
1050                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1051                                         nip);
1052         if (error)
1053                 hkprintf("hammer_mkdir (add) error %d\n", error);
1054
1055         /*
1056          * Finish up.
1057          */
1058         if (error) {
1059                 hammer_rel_inode(nip, 0);
1060                 *ap->a_vpp = NULL;
1061         } else {
1062                 error = hammer_get_vnode(nip, ap->a_vpp);
1063                 hammer_rel_inode(nip, 0);
1064                 if (error == 0) {
1065                         cache_setunresolved(ap->a_nch);
1066                         cache_setvp(ap->a_nch, *ap->a_vpp);
1067                 }
1068         }
1069         hammer_done_transaction(&trans);
1070         return (error);
1071 }
1072
1073 /*
1074  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1075  *
1076  * The operating system has already ensured that the directory entry
1077  * does not exist and done all appropriate namespace locking.
1078  */
1079 static
1080 int
1081 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1082 {
1083         struct hammer_transaction trans;
1084         struct hammer_inode *dip;
1085         struct hammer_inode *nip;
1086         struct nchandle *nch;
1087         int error;
1088         int pseudofs;
1089
1090         nch = ap->a_nch;
1091         dip = VTOI(ap->a_dvp);
1092
1093         if (dip->flags & HAMMER_INODE_RO)
1094                 return (EROFS);
1095         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
1096                 return (error);
1097
1098         /*
1099          * Create a transaction to cover the operations we perform.
1100          */
1101         hammer_start_transaction(&trans, dip->hmp);
1102
1103         /*
1104          * Create a new filesystem object of the requested type.  The
1105          * returned inode will be referenced but not locked.
1106          *
1107          * If mknod specifies a directory a pseudo-fs is created.
1108          */
1109         pseudofs = (ap->a_vap->va_type == VDIR);
1110         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1111                                     dip, pseudofs, &nip);
1112         if (error) {
1113                 hammer_done_transaction(&trans);
1114                 *ap->a_vpp = NULL;
1115                 return (error);
1116         }
1117
1118         /*
1119          * Add the new filesystem object to the directory.  This will also
1120          * bump the inode's link count.
1121          */
1122         error = hammer_ip_add_directory(&trans, dip,
1123                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1124                                         nip);
1125
1126         /*
1127          * Finish up.
1128          */
1129         if (error) {
1130                 hammer_rel_inode(nip, 0);
1131                 *ap->a_vpp = NULL;
1132         } else {
1133                 error = hammer_get_vnode(nip, ap->a_vpp);
1134                 hammer_rel_inode(nip, 0);
1135                 if (error == 0) {
1136                         cache_setunresolved(ap->a_nch);
1137                         cache_setvp(ap->a_nch, *ap->a_vpp);
1138                 }
1139         }
1140         hammer_done_transaction(&trans);
1141         return (error);
1142 }
1143
1144 /*
1145  * hammer_vop_open { vp, mode, cred, fp }
1146  */
1147 static
1148 int
1149 hammer_vop_open(struct vop_open_args *ap)
1150 {
1151         hammer_inode_t ip;
1152
1153         ip = VTOI(ap->a_vp);
1154
1155         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1156                 return (EROFS);
1157         return(vop_stdopen(ap));
1158 }
1159
1160 /*
1161  * hammer_vop_pathconf { vp, name, retval }
1162  */
1163 static
1164 int
1165 hammer_vop_pathconf(struct vop_pathconf_args *ap)
1166 {
1167         return EOPNOTSUPP;
1168 }
1169
1170 /*
1171  * hammer_vop_print { vp }
1172  */
1173 static
1174 int
1175 hammer_vop_print(struct vop_print_args *ap)
1176 {
1177         return EOPNOTSUPP;
1178 }
1179
1180 /*
1181  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1182  */
1183 static
1184 int
1185 hammer_vop_readdir(struct vop_readdir_args *ap)
1186 {
1187         struct hammer_transaction trans;
1188         struct hammer_cursor cursor;
1189         struct hammer_inode *ip;
1190         struct uio *uio;
1191         hammer_base_elm_t base;
1192         int error;
1193         int cookie_index;
1194         int ncookies;
1195         off_t *cookies;
1196         off_t saveoff;
1197         int r;
1198
1199         ip = VTOI(ap->a_vp);
1200         uio = ap->a_uio;
1201         saveoff = uio->uio_offset;
1202
1203         if (ap->a_ncookies) {
1204                 ncookies = uio->uio_resid / 16 + 1;
1205                 if (ncookies > 1024)
1206                         ncookies = 1024;
1207                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1208                 cookie_index = 0;
1209         } else {
1210                 ncookies = -1;
1211                 cookies = NULL;
1212                 cookie_index = 0;
1213         }
1214
1215         hammer_simple_transaction(&trans, ip->hmp);
1216
1217         /*
1218          * Handle artificial entries
1219          */
1220         error = 0;
1221         if (saveoff == 0) {
1222                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1223                 if (r)
1224                         goto done;
1225                 if (cookies)
1226                         cookies[cookie_index] = saveoff;
1227                 ++saveoff;
1228                 ++cookie_index;
1229                 if (cookie_index == ncookies)
1230                         goto done;
1231         }
1232         if (saveoff == 1) {
1233                 if (ip->ino_data.parent_obj_id) {
1234                         r = vop_write_dirent(&error, uio,
1235                                              ip->ino_data.parent_obj_id,
1236                                              DT_DIR, 2, "..");
1237                 } else {
1238                         r = vop_write_dirent(&error, uio,
1239                                              ip->obj_id, DT_DIR, 2, "..");
1240                 }
1241                 if (r)
1242                         goto done;
1243                 if (cookies)
1244                         cookies[cookie_index] = saveoff;
1245                 ++saveoff;
1246                 ++cookie_index;
1247                 if (cookie_index == ncookies)
1248                         goto done;
1249         }
1250
1251         /*
1252          * Key range (begin and end inclusive) to scan.  Directory keys
1253          * directly translate to a 64 bit 'seek' position.
1254          */
1255         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1256         cursor.key_beg.localization = ip->obj_localization +
1257                                       HAMMER_LOCALIZE_MISC;
1258         cursor.key_beg.obj_id = ip->obj_id;
1259         cursor.key_beg.create_tid = 0;
1260         cursor.key_beg.delete_tid = 0;
1261         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1262         cursor.key_beg.obj_type = 0;
1263         cursor.key_beg.key = saveoff;
1264
1265         cursor.key_end = cursor.key_beg;
1266         cursor.key_end.key = HAMMER_MAX_KEY;
1267         cursor.asof = ip->obj_asof;
1268         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1269
1270         error = hammer_ip_first(&cursor);
1271
1272         while (error == 0) {
1273                 error = hammer_ip_resolve_data(&cursor);
1274                 if (error)
1275                         break;
1276                 base = &cursor.leaf->base;
1277                 saveoff = base->key;
1278                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1279
1280                 if (base->obj_id != ip->obj_id)
1281                         panic("readdir: bad record at %p", cursor.node);
1282
1283                 r = vop_write_dirent(
1284                              &error, uio, cursor.data->entry.obj_id,
1285                              hammer_get_dtype(cursor.leaf->base.obj_type),
1286                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1287                              (void *)cursor.data->entry.name);
1288                 if (r)
1289                         break;
1290                 ++saveoff;
1291                 if (cookies)
1292                         cookies[cookie_index] = base->key;
1293                 ++cookie_index;
1294                 if (cookie_index == ncookies)
1295                         break;
1296                 error = hammer_ip_next(&cursor);
1297         }
1298         hammer_done_cursor(&cursor);
1299
1300 done:
1301         hammer_done_transaction(&trans);
1302
1303         if (ap->a_eofflag)
1304                 *ap->a_eofflag = (error == ENOENT);
1305         uio->uio_offset = saveoff;
1306         if (error && cookie_index == 0) {
1307                 if (error == ENOENT)
1308                         error = 0;
1309                 if (cookies) {
1310                         kfree(cookies, M_TEMP);
1311                         *ap->a_ncookies = 0;
1312                         *ap->a_cookies = NULL;
1313                 }
1314         } else {
1315                 if (error == ENOENT)
1316                         error = 0;
1317                 if (cookies) {
1318                         *ap->a_ncookies = cookie_index;
1319                         *ap->a_cookies = cookies;
1320                 }
1321         }
1322         return(error);
1323 }
1324
1325 /*
1326  * hammer_vop_readlink { vp, uio, cred }
1327  */
1328 static
1329 int
1330 hammer_vop_readlink(struct vop_readlink_args *ap)
1331 {
1332         struct hammer_transaction trans;
1333         struct hammer_cursor cursor;
1334         struct hammer_inode *ip;
1335         int error;
1336
1337         ip = VTOI(ap->a_vp);
1338
1339         /*
1340          * Shortcut if the symlink data was stuffed into ino_data.
1341          */
1342         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1343                 error = uiomove(ip->ino_data.ext.symlink,
1344                                 ip->ino_data.size, ap->a_uio);
1345                 return(error);
1346         }
1347
1348         /*
1349          * Long version
1350          */
1351         hammer_simple_transaction(&trans, ip->hmp);
1352         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1353
1354         /*
1355          * Key range (begin and end inclusive) to scan.  Directory keys
1356          * directly translate to a 64 bit 'seek' position.
1357          */
1358         cursor.key_beg.localization = ip->obj_localization +
1359                                       HAMMER_LOCALIZE_MISC;
1360         cursor.key_beg.obj_id = ip->obj_id;
1361         cursor.key_beg.create_tid = 0;
1362         cursor.key_beg.delete_tid = 0;
1363         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1364         cursor.key_beg.obj_type = 0;
1365         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1366         cursor.asof = ip->obj_asof;
1367         cursor.flags |= HAMMER_CURSOR_ASOF;
1368
1369         error = hammer_ip_lookup(&cursor);
1370         if (error == 0) {
1371                 error = hammer_ip_resolve_data(&cursor);
1372                 if (error == 0) {
1373                         KKASSERT(cursor.leaf->data_len >=
1374                                  HAMMER_SYMLINK_NAME_OFF);
1375                         error = uiomove(cursor.data->symlink.name,
1376                                         cursor.leaf->data_len -
1377                                                 HAMMER_SYMLINK_NAME_OFF,
1378                                         ap->a_uio);
1379                 }
1380         }
1381         hammer_done_cursor(&cursor);
1382         hammer_done_transaction(&trans);
1383         return(error);
1384 }
1385
1386 /*
1387  * hammer_vop_nremove { nch, dvp, cred }
1388  */
1389 static
1390 int
1391 hammer_vop_nremove(struct vop_nremove_args *ap)
1392 {
1393         struct hammer_transaction trans;
1394         struct hammer_inode *dip;
1395         int error;
1396
1397         dip = VTOI(ap->a_dvp);
1398
1399         if (hammer_nohistory(dip) == 0 &&
1400             (error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_REMOVE)) != 0) {
1401                 return (error);
1402         }
1403
1404         hammer_start_transaction(&trans, dip->hmp);
1405         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1406         hammer_done_transaction(&trans);
1407
1408         return (error);
1409 }
1410
1411 /*
1412  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1413  */
1414 static
1415 int
1416 hammer_vop_nrename(struct vop_nrename_args *ap)
1417 {
1418         struct hammer_transaction trans;
1419         struct namecache *fncp;
1420         struct namecache *tncp;
1421         struct hammer_inode *fdip;
1422         struct hammer_inode *tdip;
1423         struct hammer_inode *ip;
1424         struct hammer_cursor cursor;
1425         int64_t namekey;
1426         int nlen, error;
1427
1428         fdip = VTOI(ap->a_fdvp);
1429         tdip = VTOI(ap->a_tdvp);
1430         fncp = ap->a_fnch->ncp;
1431         tncp = ap->a_tnch->ncp;
1432         ip = VTOI(fncp->nc_vp);
1433         KKASSERT(ip != NULL);
1434
1435         if (fdip->flags & HAMMER_INODE_RO)
1436                 return (EROFS);
1437         if (tdip->flags & HAMMER_INODE_RO)
1438                 return (EROFS);
1439         if (ip->flags & HAMMER_INODE_RO)
1440                 return (EROFS);
1441         if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
1442                 return (error);
1443
1444         hammer_start_transaction(&trans, fdip->hmp);
1445
1446         /*
1447          * Remove tncp from the target directory and then link ip as
1448          * tncp. XXX pass trans to dounlink
1449          *
1450          * Force the inode sync-time to match the transaction so it is
1451          * in-sync with the creation of the target directory entry.
1452          */
1453         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
1454         if (error == 0 || error == ENOENT) {
1455                 error = hammer_ip_add_directory(&trans, tdip,
1456                                                 tncp->nc_name, tncp->nc_nlen,
1457                                                 ip);
1458                 if (error == 0) {
1459                         ip->ino_data.parent_obj_id = tdip->obj_id;
1460                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1461                 }
1462         }
1463         if (error)
1464                 goto failed; /* XXX */
1465
1466         /*
1467          * Locate the record in the originating directory and remove it.
1468          *
1469          * Calculate the namekey and setup the key range for the scan.  This
1470          * works kinda like a chained hash table where the lower 32 bits
1471          * of the namekey synthesize the chain.
1472          *
1473          * The key range is inclusive of both key_beg and key_end.
1474          */
1475         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1476 retry:
1477         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1478         cursor.key_beg.localization = fdip->obj_localization +
1479                                       HAMMER_LOCALIZE_MISC;
1480         cursor.key_beg.obj_id = fdip->obj_id;
1481         cursor.key_beg.key = namekey;
1482         cursor.key_beg.create_tid = 0;
1483         cursor.key_beg.delete_tid = 0;
1484         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1485         cursor.key_beg.obj_type = 0;
1486
1487         cursor.key_end = cursor.key_beg;
1488         cursor.key_end.key |= 0xFFFFFFFFULL;
1489         cursor.asof = fdip->obj_asof;
1490         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1491
1492         /*
1493          * Scan all matching records (the chain), locate the one matching
1494          * the requested path component.
1495          *
1496          * The hammer_ip_*() functions merge in-memory records with on-disk
1497          * records for the purposes of the search.
1498          */
1499         error = hammer_ip_first(&cursor);
1500         while (error == 0) {
1501                 if (hammer_ip_resolve_data(&cursor) != 0)
1502                         break;
1503                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1504                 KKASSERT(nlen > 0);
1505                 if (fncp->nc_nlen == nlen &&
1506                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1507                         break;
1508                 }
1509                 error = hammer_ip_next(&cursor);
1510         }
1511
1512         /*
1513          * If all is ok we have to get the inode so we can adjust nlinks.
1514          *
1515          * WARNING: hammer_ip_del_directory() may have to terminate the
1516          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1517          * twice.
1518          */
1519         if (error == 0)
1520                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1521
1522         /*
1523          * XXX A deadlock here will break rename's atomicy for the purposes
1524          * of crash recovery.
1525          */
1526         if (error == EDEADLK) {
1527                 hammer_done_cursor(&cursor);
1528                 goto retry;
1529         }
1530
1531         /*
1532          * Cleanup and tell the kernel that the rename succeeded.
1533          */
1534         hammer_done_cursor(&cursor);
1535         if (error == 0)
1536                 cache_rename(ap->a_fnch, ap->a_tnch);
1537
1538 failed:
1539         hammer_done_transaction(&trans);
1540         return (error);
1541 }
1542
1543 /*
1544  * hammer_vop_nrmdir { nch, dvp, cred }
1545  */
1546 static
1547 int
1548 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1549 {
1550         struct hammer_transaction trans;
1551         struct hammer_inode *dip;
1552         int error;
1553
1554         dip = VTOI(ap->a_dvp);
1555
1556         if (hammer_nohistory(dip) == 0 &&
1557             (error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_REMOVE)) != 0) {
1558                 return (error);
1559         }
1560
1561         hammer_start_transaction(&trans, dip->hmp);
1562         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1563         hammer_done_transaction(&trans);
1564
1565         return (error);
1566 }
1567
1568 /*
1569  * hammer_vop_setattr { vp, vap, cred }
1570  */
1571 static
1572 int
1573 hammer_vop_setattr(struct vop_setattr_args *ap)
1574 {
1575         struct hammer_transaction trans;
1576         struct vattr *vap;
1577         struct hammer_inode *ip;
1578         int modflags;
1579         int error;
1580         int truncating;
1581         int blksize;
1582         int64_t aligned_size;
1583         u_int32_t flags;
1584
1585         vap = ap->a_vap;
1586         ip = ap->a_vp->v_data;
1587         modflags = 0;
1588
1589         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1590                 return(EROFS);
1591         if (ip->flags & HAMMER_INODE_RO)
1592                 return (EROFS);
1593         if (hammer_nohistory(ip) == 0 &&
1594             (error = hammer_checkspace(ip->hmp, HAMMER_CHECKSPACE_SLOP_REMOVE)) != 0) {
1595                 return (error);
1596         }
1597
1598         hammer_start_transaction(&trans, ip->hmp);
1599         error = 0;
1600
1601         if (vap->va_flags != VNOVAL) {
1602                 flags = ip->ino_data.uflags;
1603                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1604                                          hammer_to_unix_xid(&ip->ino_data.uid),
1605                                          ap->a_cred);
1606                 if (error == 0) {
1607                         if (ip->ino_data.uflags != flags) {
1608                                 ip->ino_data.uflags = flags;
1609                                 modflags |= HAMMER_INODE_DDIRTY;
1610                         }
1611                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1612                                 error = 0;
1613                                 goto done;
1614                         }
1615                 }
1616                 goto done;
1617         }
1618         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1619                 error = EPERM;
1620                 goto done;
1621         }
1622         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
1623                 mode_t cur_mode = ip->ino_data.mode;
1624                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1625                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1626                 uuid_t uuid_uid;
1627                 uuid_t uuid_gid;
1628
1629                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
1630                                          ap->a_cred,
1631                                          &cur_uid, &cur_gid, &cur_mode);
1632                 if (error == 0) {
1633                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
1634                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
1635                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
1636                                  sizeof(uuid_uid)) ||
1637                             bcmp(&uuid_gid, &ip->ino_data.gid,
1638                                  sizeof(uuid_gid)) ||
1639                             ip->ino_data.mode != cur_mode
1640                         ) {
1641                                 ip->ino_data.uid = uuid_uid;
1642                                 ip->ino_data.gid = uuid_gid;
1643                                 ip->ino_data.mode = cur_mode;
1644                         }
1645                         modflags |= HAMMER_INODE_DDIRTY;
1646                 }
1647         }
1648         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
1649                 switch(ap->a_vp->v_type) {
1650                 case VREG:
1651                         if (vap->va_size == ip->ino_data.size)
1652                                 break;
1653                         /*
1654                          * XXX break atomicy, we can deadlock the backend
1655                          * if we do not release the lock.  Probably not a
1656                          * big deal here.
1657                          */
1658                         blksize = hammer_blocksize(vap->va_size);
1659                         if (vap->va_size < ip->ino_data.size) {
1660                                 vtruncbuf(ap->a_vp, vap->va_size, blksize);
1661                                 truncating = 1;
1662                         } else {
1663                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1664                                 truncating = 0;
1665                         }
1666                         ip->ino_data.size = vap->va_size;
1667                         modflags |= HAMMER_INODE_DDIRTY;
1668
1669                         /*
1670                          * on-media truncation is cached in the inode until
1671                          * the inode is synchronized.
1672                          */
1673                         if (truncating) {
1674                                 hammer_ip_frontend_trunc(ip, vap->va_size);
1675 #ifdef DEBUG_TRUNCATE
1676                                 if (HammerTruncIp == NULL)
1677                                         HammerTruncIp = ip;
1678 #endif
1679                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1680                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1681                                         ip->trunc_off = vap->va_size;
1682 #ifdef DEBUG_TRUNCATE
1683                                         if (ip == HammerTruncIp)
1684                                         kprintf("truncate1 %016llx\n", ip->trunc_off);
1685 #endif
1686                                 } else if (ip->trunc_off > vap->va_size) {
1687                                         ip->trunc_off = vap->va_size;
1688 #ifdef DEBUG_TRUNCATE
1689                                         if (ip == HammerTruncIp)
1690                                         kprintf("truncate2 %016llx\n", ip->trunc_off);
1691 #endif
1692                                 } else {
1693 #ifdef DEBUG_TRUNCATE
1694                                         if (ip == HammerTruncIp)
1695                                         kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
1696 #endif
1697                                 }
1698                         }
1699
1700                         /*
1701                          * If truncating we have to clean out a portion of
1702                          * the last block on-disk.  We do this in the
1703                          * front-end buffer cache.
1704                          */
1705                         aligned_size = (vap->va_size + (blksize - 1)) &
1706                                        ~(int64_t)(blksize - 1);
1707                         if (truncating && vap->va_size < aligned_size) {
1708                                 struct buf *bp;
1709                                 int offset;
1710
1711                                 aligned_size -= blksize;
1712
1713                                 offset = (int)vap->va_size & (blksize - 1);
1714                                 error = bread(ap->a_vp, aligned_size,
1715                                               blksize, &bp);
1716                                 hammer_ip_frontend_trunc(ip, aligned_size);
1717                                 if (error == 0) {
1718                                         bzero(bp->b_data + offset,
1719                                               blksize - offset);
1720                                         bdwrite(bp);
1721                                 } else {
1722                                         kprintf("ERROR %d\n", error);
1723                                         brelse(bp);
1724                                 }
1725                         }
1726                         break;
1727                 case VDATABASE:
1728                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1729                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1730                                 ip->trunc_off = vap->va_size;
1731                         } else if (ip->trunc_off > vap->va_size) {
1732                                 ip->trunc_off = vap->va_size;
1733                         }
1734                         hammer_ip_frontend_trunc(ip, vap->va_size);
1735                         ip->ino_data.size = vap->va_size;
1736                         modflags |= HAMMER_INODE_DDIRTY;
1737                         break;
1738                 default:
1739                         error = EINVAL;
1740                         goto done;
1741                 }
1742                 break;
1743         }
1744         if (vap->va_atime.tv_sec != VNOVAL) {
1745                 ip->ino_data.atime =
1746                         hammer_timespec_to_time(&vap->va_atime);
1747                 modflags |= HAMMER_INODE_ATIME;
1748         }
1749         if (vap->va_mtime.tv_sec != VNOVAL) {
1750                 ip->ino_data.mtime =
1751                         hammer_timespec_to_time(&vap->va_mtime);
1752                 modflags |= HAMMER_INODE_MTIME;
1753         }
1754         if (vap->va_mode != (mode_t)VNOVAL) {
1755                 mode_t   cur_mode = ip->ino_data.mode;
1756                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1757                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1758
1759                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1760                                          cur_uid, cur_gid, &cur_mode);
1761                 if (error == 0 && ip->ino_data.mode != cur_mode) {
1762                         ip->ino_data.mode = cur_mode;
1763                         modflags |= HAMMER_INODE_DDIRTY;
1764                 }
1765         }
1766 done:
1767         if (error == 0)
1768                 hammer_modify_inode(ip, modflags);
1769         hammer_done_transaction(&trans);
1770         return (error);
1771 }
1772
1773 /*
1774  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1775  */
1776 static
1777 int
1778 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1779 {
1780         struct hammer_transaction trans;
1781         struct hammer_inode *dip;
1782         struct hammer_inode *nip;
1783         struct nchandle *nch;
1784         hammer_record_t record;
1785         int error;
1786         int bytes;
1787
1788         ap->a_vap->va_type = VLNK;
1789
1790         nch = ap->a_nch;
1791         dip = VTOI(ap->a_dvp);
1792
1793         if (dip->flags & HAMMER_INODE_RO)
1794                 return (EROFS);
1795         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
1796                 return (error);
1797
1798         /*
1799          * Create a transaction to cover the operations we perform.
1800          */
1801         hammer_start_transaction(&trans, dip->hmp);
1802
1803         /*
1804          * Create a new filesystem object of the requested type.  The
1805          * returned inode will be referenced but not locked.
1806          */
1807
1808         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1809                                     dip, 0, &nip);
1810         if (error) {
1811                 hammer_done_transaction(&trans);
1812                 *ap->a_vpp = NULL;
1813                 return (error);
1814         }
1815
1816         /*
1817          * Add a record representing the symlink.  symlink stores the link
1818          * as pure data, not a string, and is no \0 terminated.
1819          */
1820         if (error == 0) {
1821                 bytes = strlen(ap->a_target);
1822
1823                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1824                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1825                 } else {
1826                         record = hammer_alloc_mem_record(nip, bytes);
1827                         record->type = HAMMER_MEM_RECORD_GENERAL;
1828
1829                         record->leaf.base.localization = nip->obj_localization +
1830                                                          HAMMER_LOCALIZE_MISC;
1831                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1832                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1833                         record->leaf.data_len = bytes;
1834                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1835                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1836                         error = hammer_ip_add_record(&trans, record);
1837                 }
1838
1839                 /*
1840                  * Set the file size to the length of the link.
1841                  */
1842                 if (error == 0) {
1843                         nip->ino_data.size = bytes;
1844                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1845                 }
1846         }
1847         if (error == 0)
1848                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
1849                                                 nch->ncp->nc_nlen, nip);
1850
1851         /*
1852          * Finish up.
1853          */
1854         if (error) {
1855                 hammer_rel_inode(nip, 0);
1856                 *ap->a_vpp = NULL;
1857         } else {
1858                 error = hammer_get_vnode(nip, ap->a_vpp);
1859                 hammer_rel_inode(nip, 0);
1860                 if (error == 0) {
1861                         cache_setunresolved(ap->a_nch);
1862                         cache_setvp(ap->a_nch, *ap->a_vpp);
1863                 }
1864         }
1865         hammer_done_transaction(&trans);
1866         return (error);
1867 }
1868
1869 /*
1870  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1871  */
1872 static
1873 int
1874 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1875 {
1876         struct hammer_transaction trans;
1877         struct hammer_inode *dip;
1878         int error;
1879
1880         dip = VTOI(ap->a_dvp);
1881
1882         if (hammer_nohistory(dip) == 0 &&
1883             (error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0) {
1884                 return (error);
1885         }
1886
1887         hammer_start_transaction(&trans, dip->hmp);
1888         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1889                                 ap->a_cred, ap->a_flags);
1890         hammer_done_transaction(&trans);
1891
1892         return (error);
1893 }
1894
1895 /*
1896  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1897  */
1898 static
1899 int
1900 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1901 {
1902         struct hammer_inode *ip = ap->a_vp->v_data;
1903
1904         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1905                             ap->a_fflag, ap->a_cred));
1906 }
1907
1908 static
1909 int
1910 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1911 {
1912         struct mount *mp;
1913         int error;
1914
1915         mp = ap->a_head.a_ops->head.vv_mount;
1916
1917         switch(ap->a_op) {
1918         case MOUNTCTL_SET_EXPORT:
1919                 if (ap->a_ctllen != sizeof(struct export_args))
1920                         error = EINVAL;
1921                 error = hammer_vfs_export(mp, ap->a_op,
1922                                       (const struct export_args *)ap->a_ctl);
1923                 break;
1924         default:
1925                 error = journal_mountctl(ap);
1926                 break;
1927         }
1928         return(error);
1929 }
1930
1931 /*
1932  * hammer_vop_strategy { vp, bio }
1933  *
1934  * Strategy call, used for regular file read & write only.  Note that the
1935  * bp may represent a cluster.
1936  *
1937  * To simplify operation and allow better optimizations in the future,
1938  * this code does not make any assumptions with regards to buffer alignment
1939  * or size.
1940  */
1941 static
1942 int
1943 hammer_vop_strategy(struct vop_strategy_args *ap)
1944 {
1945         struct buf *bp;
1946         int error;
1947
1948         bp = ap->a_bio->bio_buf;
1949
1950         switch(bp->b_cmd) {
1951         case BUF_CMD_READ:
1952                 error = hammer_vop_strategy_read(ap);
1953                 break;
1954         case BUF_CMD_WRITE:
1955                 error = hammer_vop_strategy_write(ap);
1956                 break;
1957         default:
1958                 bp->b_error = error = EINVAL;
1959                 bp->b_flags |= B_ERROR;
1960                 biodone(ap->a_bio);
1961                 break;
1962         }
1963         return (error);
1964 }
1965
1966 /*
1967  * Read from a regular file.  Iterate the related records and fill in the
1968  * BIO/BUF.  Gaps are zero-filled.
1969  *
1970  * The support code in hammer_object.c should be used to deal with mixed
1971  * in-memory and on-disk records.
1972  *
1973  * NOTE: Can be called from the cluster code with an oversized buf.
1974  *
      * Three paths are possible:
      *
      * (1) The pushed bio already carries a zone-2 (raw buffer) offset.  The
      *     request is handed straight to hammer_io_direct_read() and that
      *     call's result is returned.
      *
      * (2) While scanning records, a record covers the entire buffer starting
      *     at buffer offset 0 and its data offset is HAMMER_BUFMASK aligned.
      *     The pushed bio is pointed at the looked-up disk offset and handed
      *     to hammer_io_direct_read().
      *
      * (3) Otherwise record data is copied into bp->b_data piece by piece,
      *     gaps and the tail are zero-filled, the error is stored in
      *     bp->b_error (with B_ERROR set if non-zero) and the bio is
      *     completed here with biodone().
      *
      * biodone() is only called by this function on path (3).
      * NOTE(review): presumably hammer_io_direct_read() takes over completion
      * of the bio on paths (1) and (2) -- confirm against hammer_io.c.
      *
1975  * XXX atime update
1976  */
1977 static
1978 int
1979 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1980 {
1981         struct hammer_transaction trans;
1982         struct hammer_inode *ip;
1983         struct hammer_cursor cursor;
1984         hammer_base_elm_t base;
1985         hammer_off_t disk_offset;
1986         struct bio *bio;
1987         struct bio *nbio;
1988         struct buf *bp;
1989         int64_t rec_offset;
1990         int64_t ran_end;
1991         int64_t tmp64;
1992         int error;
1993         int boff;       /* bytes of bp->b_data filled so far */
1994         int roff;       /* offset into the current record's data */
1995         int n;          /* byte count for the current gap or copy */
1996
1997         bio = ap->a_bio;
1998         bp = bio->bio_buf;
1999         ip = ap->a_vp->v_data;
2000
2001         /*
2002          * The zone-2 disk offset may have been set by the cluster code via
2003          * a BMAP operation, or else should be NOOFFSET.
2004          *
2005          * Checking the high bits for a match against zone-2 should suffice.
2006          */
2007         nbio = push_bio(bio);
2008         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2009             HAMMER_ZONE_RAW_BUFFER) {
2010                 error = hammer_io_direct_read(ip->hmp, nbio);
2011                 return (error);
2012         }
2013
2014         /*
2015          * Well, that sucked.  Do it the hard way.  If all the stars are
2016          * aligned we may still be able to issue a direct-read.
2017          */
2018         hammer_simple_transaction(&trans, ip->hmp);
2019         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2020
2021         /*
2022          * Key range (begin and end inclusive) to scan.  Note that the keys
2023          * stored in the actual records represent BASE+LEN, not BASE.  The
2024          * first record containing bio_offset will have a key > bio_offset.
2025          */
2026         cursor.key_beg.localization = ip->obj_localization +
2027                                       HAMMER_LOCALIZE_MISC;
2028         cursor.key_beg.obj_id = ip->obj_id;
2029         cursor.key_beg.create_tid = 0;
2030         cursor.key_beg.delete_tid = 0;
2031         cursor.key_beg.obj_type = 0;
2032         cursor.key_beg.key = bio->bio_offset + 1;
2033         cursor.asof = ip->obj_asof;
2034         cursor.flags |= HAMMER_CURSOR_ASOF;
2035
2036         cursor.key_end = cursor.key_beg;
2037         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2038 #if 0
2039         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2040                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2041                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2042                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2043         } else
2044 #endif
2045         {
                     /*
                      * The end key is ran_end + MAXPHYS + 1.  If that sum
                      * wrapped below ran_end, clamp the end key to the
                      * largest positive 64-bit value instead.
                      */
2046                 ran_end = bio->bio_offset + bp->b_bufsize;
2047                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2048                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2049                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2050                 if (tmp64 < ran_end)
2051                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2052                 else
2053                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2054         }
2055         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2056
2057         error = hammer_ip_first(&cursor);
2058         boff = 0;
2059
2060         while (error == 0) {
2061                 /*
2062                  * Get the base file offset of the record.  The key for
2063                  * data records is (base + bytes) rather than (base).
2064                  */
2065                 base = &cursor.leaf->base;
2066                 rec_offset = base->key - cursor.leaf->data_len;
2067
2068                 /*
2069                  * Calculate the gap, if any, and zero-fill it.
2070                  *
2071                  * n is the offset of the start of the record versus our
2072                  * current seek offset in the bio.  A positive n is a gap
2073                  * in front of the record; it is zero-filled (limited to
                      * the space left in the buffer) and n is reset to 0.  A
                      * negative n means the record starts before the current
                      * seek offset and is left as-is for the roff calculation
                      * below.
                      */
2074                 n = (int)(rec_offset - (bio->bio_offset + boff));
2075                 if (n > 0) {
2076                         if (n > bp->b_bufsize - boff)
2077                                 n = bp->b_bufsize - boff;
2078                         bzero((char *)bp->b_data + boff, n);
2079                         boff += n;
2080                         n = 0;
2081                 }
2082
2083                 /*
2084                  * Calculate the data offset in the record and the number
2085                  * of bytes we can copy.
2086                  *
2087                  * There are two degenerate cases.  First, boff may already
2088                  * be at bp->b_bufsize.  Secondly, the data offset within
2089                  * the record may exceed the record's size.
2090                  */
2091                 roff = -n;
2092                 rec_offset += roff;
2093                 n = cursor.leaf->data_len - roff;
2094                 if (n <= 0) {
2095                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2096                         n = 0;
2097                 } else if (n > bp->b_bufsize - boff) {
2098                         n = bp->b_bufsize - boff;
2099                 }
2100
2101                 /*
2102                  * Deal with cached truncations.  This cool bit of code
2103                  * allows truncate()/ftruncate() to avoid having to sync
2104                  * the file.
2105                  *
2106                  * If the frontend is truncated then all backend records are
2107                  * subject to the frontend's truncation.
2108                  *
2109                  * If the backend is truncated then backend records on-disk
2110                  * (but not in-memory) are subject to the backend's
2111                  * truncation.  In-memory records owned by the backend
2112                  * represent data written after the truncation point on the
2113                  * backend and must not be truncated.
2114                  *
2115                  * Truncate operations deal with frontend buffer cache
2116                  * buffers and frontend-owned in-memory records synchronously.
2117                  */
2118                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2119                         if (hammer_cursor_ondisk(&cursor) ||
2120                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2121                                 if (ip->trunc_off <= rec_offset)
2122                                         n = 0;
2123                                 else if (ip->trunc_off < rec_offset + n)
2124                                         n = (int)(ip->trunc_off - rec_offset);
2125                         }
2126                 }
2127                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2128                         if (hammer_cursor_ondisk(&cursor)) {
2129                                 if (ip->sync_trunc_off <= rec_offset)
2130                                         n = 0;
2131                                 else if (ip->sync_trunc_off < rec_offset + n)
2132                                         n = (int)(ip->sync_trunc_off - rec_offset);
2133                         }
2134                 }
2135
2136                 /*
2137                  * Try to issue a direct read into our bio if possible,
2138                  * otherwise resolve the element data into a hammer_buffer
2139                  * and copy.
2140                  *
2141                  * The buffer on-disk should be zeroed past any real
2142                  * truncation point, but may not be for any synthesized
2143                  * truncation point from above.
2144                  *
                      * The direct read is only attempted when nothing has
                      * been placed in the buffer yet (boff == 0), this record
                      * supplies the whole buffer (n == bp->b_bufsize) and the
                      * record data offset is HAMMER_BUFMASK aligned.
                      */
2145                 if (boff == 0 && n == bp->b_bufsize &&
2146                     ((cursor.leaf->data_offset + roff) & HAMMER_BUFMASK) == 0) {
2147                         disk_offset = hammer_blockmap_lookup(
2148                                                 trans.hmp,
2149                                                 cursor.leaf->data_offset + roff,
2150                                                 &error);
2151                         if (error)
2152                                 break;
2153                         nbio->bio_offset = disk_offset;
2154                         error = hammer_io_direct_read(trans.hmp, nbio);
2155                         goto done;      /* skips the biodone() below */
2156                 } else if (n) {
2157                         error = hammer_ip_resolve_data(&cursor);
2158                         if (error == 0) {
2159                                 bcopy((char *)cursor.data + roff,
2160                                       (char *)bp->b_data + boff, n);
2161                         }
2162                 }
2163                 if (error)
2164                         break;
2165
2166                 /*
2167                  * Iterate until we have filled the request.
2168                  */
2169                 boff += n;
2170                 if (boff == bp->b_bufsize)
2171                         break;
2172                 error = hammer_ip_next(&cursor);
2173         }
2174
2175         /*
2176          * There may have been a gap after the last record.  ENOENT from
              * the cursor iteration is not treated as an error here; the
              * remainder of the buffer is zero-filled.
2177          */
2178         if (error == ENOENT)
2179                 error = 0;
2180         if (error == 0 && boff != bp->b_bufsize) {
2181                 KKASSERT(boff < bp->b_bufsize);
2182                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2183                 /* boff = bp->b_bufsize; */
2184         }
2185         bp->b_resid = 0;
2186         bp->b_error = error;
2187         if (error)
2188                 bp->b_flags |= B_ERROR;
2189         biodone(ap->a_bio);
2190
             /*
              * Common cleanup for the copy path and the in-loop direct-read
              * path: remember the cursor's node in ip->cache[1] (the same
              * cache slot the cursor was initialized from) and tear down the
              * cursor and transaction.
              */
2191 done:
2192         if (cursor.node)
2193                 hammer_cache_node(&ip->cache[1], cursor.node);
2194         hammer_done_cursor(&cursor);
2195         hammer_done_transaction(&trans);
2196         return(error);
2197 }
2198
2199 /*
2200  * BMAP operation - used to support cluster_read() only.
2201  *
2202  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2203  *
2204  * This routine may return EOPNOTSUPP if the operation is not supported for
2205  * the specified offset.  The contents of the pointer arguments do not
2206  * need to be initialized in that case. 
2207  *
2208  * If a disk address is available and properly aligned return 0 with 
2209  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2210  * to the run-length relative to that offset.  Callers may assume that
2211  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2212  * large, so return EOPNOTSUPP if it is not sufficiently large.
2213  */
2214 static
2215 int
2216 hammer_vop_bmap(struct vop_bmap_args *ap)
2217 {
2218         struct hammer_transaction trans;
2219         struct hammer_inode *ip;
2220         struct hammer_cursor cursor;
2221         hammer_base_elm_t base;
2222         int64_t rec_offset;
2223         int64_t ran_end;
2224         int64_t tmp64;
2225         int64_t base_offset;
2226         int64_t base_disk_offset;
2227         int64_t last_offset;
2228         hammer_off_t last_disk_offset;
2229         hammer_off_t disk_offset;
2230         int     rec_len;
2231         int     error;
2232         int     blksize;
2233
2234         ip = ap->a_vp->v_data;
2235
2236         /*
2237          * We can only BMAP regular files.  We can't BMAP database files,
2238          * directories, etc.
2239          */
2240         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2241                 return(EOPNOTSUPP);
2242
2243         /*
2244          * bmap is typically called with runp/runb both NULL when used
2245          * for writing.  We do not support BMAP for writing atm.
2246          */
2247         if (ap->a_cmd != BUF_CMD_READ)
2248                 return(EOPNOTSUPP);
2249
2250         /*
2251          * Scan the B-Tree to acquire blockmap addresses, then translate
2252          * to raw addresses.
2253          */
2254         hammer_simple_transaction(&trans, ip->hmp);
2255 #if 0
2256         kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2257 #endif
2258         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2259
2260         /*
2261          * Key range (begin and end inclusive) to scan.  Note that the key's
2262          * stored in the actual records represent BASE+LEN, not BASE.  The
2263          * first record containing bio_offset will have a key > bio_offset.
2264          */
2265         cursor.key_beg.localization = ip->obj_localization +
2266                                       HAMMER_LOCALIZE_MISC;
2267         cursor.key_beg.obj_id = ip->obj_id;
2268         cursor.key_beg.create_tid = 0;
2269         cursor.key_beg.delete_tid = 0;
2270         cursor.key_beg.obj_type = 0;
2271         if (ap->a_runb)
2272                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2273         else
2274                 cursor.key_beg.key = ap->a_loffset + 1;
2275         if (cursor.key_beg.key < 0)
2276                 cursor.key_beg.key = 0;
2277         cursor.asof = ip->obj_asof;
2278         cursor.flags |= HAMMER_CURSOR_ASOF;
2279
2280         cursor.key_end = cursor.key_beg;
2281         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2282
2283         ran_end = ap->a_loffset + MAXPHYS;
2284         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2285         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2286         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2287         if (tmp64 < ran_end)
2288                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2289         else
2290                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2291
2292         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2293
2294         error = hammer_ip_first(&cursor);
2295         base_offset = last_offset = 0;
2296         base_disk_offset = last_disk_offset = 0;
2297
2298         while (error == 0) {
2299                 /*
2300                  * Get the base file offset of the record.  The key for
2301                  * data records is (base + bytes) rather then (base).
2302                  *
2303                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
2304                  * The extra bytes should be zero on-disk and the BMAP op
2305                  * should still be ok.
2306                  */
2307                 base = &cursor.leaf->base;
2308                 rec_offset = base->key - cursor.leaf->data_len;
2309                 rec_len    = cursor.leaf->data_len;
2310
2311                 /*
2312                  * Incorporate any cached truncation.
2313                  *
2314                  * NOTE: Modifications to rec_len based on synthesized
2315                  * truncation points remove the guarantee that any extended
2316                  * data on disk is zero (since the truncations may not have
2317                  * taken place on-media yet).
2318                  */
2319                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2320                         if (hammer_cursor_ondisk(&cursor) ||
2321                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2322                                 if (ip->trunc_off <= rec_offset)
2323                                         rec_len = 0;
2324                                 else if (ip->trunc_off < rec_offset + rec_len)
2325                                         rec_len = (int)(ip->trunc_off - rec_offset);
2326                         }
2327                 }
2328                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2329                         if (hammer_cursor_ondisk(&cursor)) {
2330                                 if (ip->sync_trunc_off <= rec_offset)
2331                                         rec_len = 0;
2332                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2333                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2334                         }
2335                 }
2336
2337                 /*
2338                  * Accumulate information.  If we have hit a discontiguous
2339                  * block reset base_offset unless we are already beyond the
2340                  * requested offset.  If we are, that's it, we stop.
2341                  */
2342                 disk_offset = hammer_blockmap_lookup(trans.hmp,
2343                                                      cursor.leaf->data_offset,
2344                                                      &error);
2345                 if (error)
2346                         break;
2347                 if (rec_offset != last_offset ||
2348                     disk_offset != last_disk_offset) {
2349                         if (rec_offset > ap->a_loffset)
2350                                 break;
2351                         base_offset = rec_offset;
2352                         base_disk_offset = disk_offset;
2353                 }
2354                 last_offset = rec_offset + rec_len;
2355                 last_disk_offset = disk_offset + rec_len;
2356
2357                 error = hammer_ip_next(&cursor);
2358         }
2359
2360 #if 0
2361         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2362                 ap->a_loffset, base_offset, last_offset);
2363         kprintf("BMAP %16s:  %016llx - %016llx\n",
2364                 "", base_disk_offset, last_disk_offset);
2365 #endif
2366
2367         if (cursor.node) {
2368                 hammer_cache_node(&ip->cache[1], cursor.node);
2369 #if 0
2370                 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2371 #endif
2372         }
2373         hammer_done_cursor(&cursor);
2374         hammer_done_transaction(&trans);
2375
2376         /*
2377          * If we couldn't find any records or the records we did find were
2378          * all behind the requested offset, return failure.  A forward
2379          * truncation can leave a hole w/ no on-disk records.
2380          */
2381         if (last_offset == 0 || last_offset < ap->a_loffset)
2382                 return (EOPNOTSUPP);
2383
2384         /*
2385          * Figure out the block size at the requested offset and adjust
2386          * our limits so the cluster_read() does not create inappropriately
2387          * sized buffer cache buffers.
2388          */
2389         blksize = hammer_blocksize(ap->a_loffset);
2390         if (hammer_blocksize(base_offset) != blksize) {
2391                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2392         }
2393         if (last_offset != ap->a_loffset &&
2394             hammer_blocksize(last_offset - 1) != blksize) {
2395                 last_offset = hammer_blockdemarc(ap->a_loffset,
2396                                                  last_offset - 1);
2397         }
2398
2399         /*
2400          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2401          * from occuring.
2402          */
2403         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2404
2405         /*
2406          * If doffsetp is not aligned or the forward run size does
2407          * not cover a whole buffer, disallow the direct I/O.
2408          */
2409         if ((disk_offset & HAMMER_BUFMASK) ||
2410             (last_offset - ap->a_loffset) < blksize) {
2411                 error = EOPNOTSUPP;
2412         } else {
2413                 *ap->a_doffsetp = disk_offset;
2414                 if (ap->a_runb) {
2415                         *ap->a_runb = ap->a_loffset - base_offset;
2416                         KKASSERT(*ap->a_runb >= 0);
2417                 }
2418                 if (ap->a_runp) {
2419                         *ap->a_runp = last_offset - ap->a_loffset;
2420                         KKASSERT(*ap->a_runp >= 0);
2421                 }
2422                 error = 0;
2423         }
2424         return(error);
2425 }
2426
2427 /*
2428  * Write to a regular file.   Because this is a strategy call the OS is
2429  * trying to actually get data onto the media.
2430  */
2431 static
2432 int
2433 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2434 {
2435         hammer_record_t record;
2436         hammer_mount_t hmp;
2437         hammer_inode_t ip;
2438         struct bio *bio;
2439         struct buf *bp;
2440         int blksize;
2441         int bytes;
2442         int error;
2443
2444         bio = ap->a_bio;
2445         bp = bio->bio_buf;
2446         ip = ap->a_vp->v_data;
2447         hmp = ip->hmp;
2448
2449         blksize = hammer_blocksize(bio->bio_offset);
2450         KKASSERT(bp->b_bufsize == blksize);
2451
2452         if (ip->flags & HAMMER_INODE_RO) {
2453                 bp->b_error = EROFS;
2454                 bp->b_flags |= B_ERROR;
2455                 biodone(ap->a_bio);
2456                 return(EROFS);
2457         }
2458
2459         /*
2460          * Interlock with inode destruction (no in-kernel or directory
2461          * topology visibility).  If we queue new IO while trying to
2462          * destroy the inode we can deadlock the vtrunc call in
2463          * hammer_inode_unloadable_check().
2464          */
2465         if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2466                 bp->b_resid = 0;
2467                 biodone(ap->a_bio);
2468                 return(0);
2469         }
2470
2471         /*
2472          * Reserve space and issue a direct-write from the front-end. 
2473          * NOTE: The direct_io code will hammer_bread/bcopy smaller
2474          * allocations.
2475          *
2476          * An in-memory record will be installed to reference the storage
2477          * until the flusher can get to it.
2478          *
2479          * Since we own the high level bio the front-end will not try to
2480          * do a direct-read until the write completes.
2481          *
2482          * NOTE: The only time we do not reserve a full-sized buffers
2483          * worth of data is if the file is small.  We do not try to
2484          * allocate a fragment (from the small-data zone) at the end of
2485          * an otherwise large file as this can lead to wildly separated
2486          * data.
2487          */
2488         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2489         KKASSERT(bio->bio_offset < ip->ino_data.size);
2490         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2491                 bytes = bp->b_bufsize;
2492         else
2493                 bytes = ((int)ip->ino_data.size + 15) & ~15;
2494
2495         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2496                                     bytes, &error);
2497         if (record) {
2498                 hammer_io_direct_write(hmp, &record->leaf, bio);
2499                 hammer_rel_mem_record(record);
2500                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2501                         hammer_flush_inode(ip, 0);
2502         } else {
2503                 bp->b_bio2.bio_offset = NOOFFSET;
2504                 bp->b_error = error;
2505                 bp->b_flags |= B_ERROR;
2506                 biodone(ap->a_bio);
2507         }
2508         return(error);
2509 }
2510
2511 /*
2512  * dounlink - disconnect a directory entry
2513  *
2514  * XXX whiteout support not really in yet
2515  */
2516 static int
2517 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2518                 struct vnode *dvp, struct ucred *cred, int flags)
2519 {
2520         struct namecache *ncp;
2521         hammer_inode_t dip;
2522         hammer_inode_t ip;
2523         struct hammer_cursor cursor;
2524         int64_t namekey;
2525         int nlen, error;
2526
2527         /*
2528          * Calculate the namekey and setup the key range for the scan.  This
2529          * works kinda like a chained hash table where the lower 32 bits
2530          * of the namekey synthesize the chain.
2531          *
2532          * The key range is inclusive of both key_beg and key_end.
2533          */
2534         dip = VTOI(dvp);
2535         ncp = nch->ncp;
2536
2537         if (dip->flags & HAMMER_INODE_RO)
2538                 return (EROFS);
2539
2540         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2541 retry:
2542         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2543         cursor.key_beg.localization = dip->obj_localization +
2544                                       HAMMER_LOCALIZE_MISC;
2545         cursor.key_beg.obj_id = dip->obj_id;
2546         cursor.key_beg.key = namekey;
2547         cursor.key_beg.create_tid = 0;
2548         cursor.key_beg.delete_tid = 0;
2549         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2550         cursor.key_beg.obj_type = 0;
2551
2552         cursor.key_end = cursor.key_beg;
2553         cursor.key_end.key |= 0xFFFFFFFFULL;
2554         cursor.asof = dip->obj_asof;
2555         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2556
2557         /*
2558          * Scan all matching records (the chain), locate the one matching
2559          * the requested path component.  info->last_error contains the
2560          * error code on search termination and could be 0, ENOENT, or
2561          * something else.
2562          *
2563          * The hammer_ip_*() functions merge in-memory records with on-disk
2564          * records for the purposes of the search.
2565          */
2566         error = hammer_ip_first(&cursor);
2567
2568         while (error == 0) {
2569                 error = hammer_ip_resolve_data(&cursor);
2570                 if (error)
2571                         break;
2572                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2573                 KKASSERT(nlen > 0);
2574                 if (ncp->nc_nlen == nlen &&
2575                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2576                         break;
2577                 }
2578                 error = hammer_ip_next(&cursor);
2579         }
2580
2581         /*
2582          * If all is ok we have to get the inode so we can adjust nlinks.
2583          * To avoid a deadlock with the flusher we must release the inode
2584          * lock on the directory when acquiring the inode for the entry.
2585          *
2586          * If the target is a directory, it must be empty.
2587          */
2588         if (error == 0) {
2589                 hammer_unlock(&cursor.ip->lock);
2590                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2591                                       dip->hmp->asof,
2592                                       cursor.data->entry.localization,
2593                                       0, &error);
2594                 hammer_lock_sh(&cursor.ip->lock);
2595                 if (error == ENOENT) {
2596                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2597                         Debugger("ENOENT unlinking object that should exist");
2598                 }
2599
2600                 /*
2601                  * If we are trying to remove a directory the directory must
2602                  * be empty.
2603                  *
2604                  * WARNING: hammer_ip_check_directory_empty() may have to
2605                  * terminate the cursor to avoid a deadlock.  It is ok to
2606                  * call hammer_done_cursor() twice.
2607                  */
2608                 if (error == 0 && ip->ino_data.obj_type ==
2609                                   HAMMER_OBJTYPE_DIRECTORY) {
2610                         error = hammer_ip_check_directory_empty(trans, ip);
2611                 }
2612
2613                 /*
2614                  * Delete the directory entry.
2615                  *
2616                  * WARNING: hammer_ip_del_directory() may have to terminate
2617                  * the cursor to avoid a deadlock.  It is ok to call
2618                  * hammer_done_cursor() twice.
2619                  */
2620                 if (error == 0) {
2621                         error = hammer_ip_del_directory(trans, &cursor,
2622                                                         dip, ip);
2623                 }
2624                 hammer_done_cursor(&cursor);
2625                 if (error == 0) {
2626                         cache_setunresolved(nch);
2627                         cache_setvp(nch, NULL);
2628                         /* XXX locking */
2629                         if (ip->vp)
2630                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2631                 }
2632                 if (ip)
2633                         hammer_rel_inode(ip, 0);
2634         } else {
2635                 hammer_done_cursor(&cursor);
2636         }
2637         if (error == EDEADLK)
2638                 goto retry;
2639
2640         return (error);
2641 }
2642
2643 /************************************************************************
2644  *                          FIFO AND SPECFS OPS                         *
2645  ************************************************************************
2646  *
2647  */
2648
2649 static int
2650 hammer_vop_fifoclose (struct vop_close_args *ap)
2651 {
2652         /* XXX update itimes */
2653         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2654 }
2655
2656 static int
2657 hammer_vop_fiforead (struct vop_read_args *ap)
2658 {
2659         int error;
2660
2661         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2662         /* XXX update access time */
2663         return (error);
2664 }
2665
2666 static int
2667 hammer_vop_fifowrite (struct vop_write_args *ap)
2668 {
2669         int error;
2670
2671         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2672         /* XXX update access time */
2673         return (error);
2674 }
2675
2676 static int
2677 hammer_vop_specclose (struct vop_close_args *ap)
2678 {
2679         /* XXX update itimes */
2680         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2681 }
2682
2683 static int
2684 hammer_vop_specread (struct vop_read_args *ap)
2685 {
2686         /* XXX update access time */
2687         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2688 }
2689
2690 static int
2691 hammer_vop_specwrite (struct vop_write_args *ap)
2692 {
2693         /* XXX update last change time */
2694         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2695 }
2696