When creating a new HAMMER filesystem also create a PFS record for it,
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.81 2008/07/07 00:24:31 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
/*
 * HAMMER VNOPS
 *
 * Forward declarations for the file-local vnode operations referenced by
 * the operations vectors below.
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *ap);*/

/* File data, attributes and descriptor-level operations. */
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_pathconf(struct vop_pathconf_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);

/* Namespace (namecache based) operations. */
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);

/* Overrides used by the fifo operations vector. */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);

/* Overrides used by the special-device operations vector. */
static int hammer_vop_specclose(struct vop_close_args *ap);
static int hammer_vop_specread(struct vop_read_args *ap);
static int hammer_vop_specwrite(struct vop_write_args *ap);
91
92 struct vop_ops hammer_vnode_vops = {
93         .vop_default =          vop_defaultop,
94         .vop_fsync =            hammer_vop_fsync,
95         .vop_getpages =         vop_stdgetpages,
96         .vop_putpages =         vop_stdputpages,
97         .vop_read =             hammer_vop_read,
98         .vop_write =            hammer_vop_write,
99         .vop_access =           hammer_vop_access,
100         .vop_advlock =          hammer_vop_advlock,
101         .vop_close =            hammer_vop_close,
102         .vop_ncreate =          hammer_vop_ncreate,
103         .vop_getattr =          hammer_vop_getattr,
104         .vop_inactive =         hammer_vop_inactive,
105         .vop_reclaim =          hammer_vop_reclaim,
106         .vop_nresolve =         hammer_vop_nresolve,
107         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
108         .vop_nlink =            hammer_vop_nlink,
109         .vop_nmkdir =           hammer_vop_nmkdir,
110         .vop_nmknod =           hammer_vop_nmknod,
111         .vop_open =             hammer_vop_open,
112         .vop_pathconf =         hammer_vop_pathconf,
113         .vop_print =            hammer_vop_print,
114         .vop_readdir =          hammer_vop_readdir,
115         .vop_readlink =         hammer_vop_readlink,
116         .vop_nremove =          hammer_vop_nremove,
117         .vop_nrename =          hammer_vop_nrename,
118         .vop_nrmdir =           hammer_vop_nrmdir,
119         .vop_setattr =          hammer_vop_setattr,
120         .vop_bmap =             hammer_vop_bmap,
121         .vop_strategy =         hammer_vop_strategy,
122         .vop_nsymlink =         hammer_vop_nsymlink,
123         .vop_nwhiteout =        hammer_vop_nwhiteout,
124         .vop_ioctl =            hammer_vop_ioctl,
125         .vop_mountctl =         hammer_vop_mountctl
126 };
127
128 struct vop_ops hammer_spec_vops = {
129         .vop_default =          spec_vnoperate,
130         .vop_fsync =            hammer_vop_fsync,
131         .vop_read =             hammer_vop_specread,
132         .vop_write =            hammer_vop_specwrite,
133         .vop_access =           hammer_vop_access,
134         .vop_close =            hammer_vop_specclose,
135         .vop_getattr =          hammer_vop_getattr,
136         .vop_inactive =         hammer_vop_inactive,
137         .vop_reclaim =          hammer_vop_reclaim,
138         .vop_setattr =          hammer_vop_setattr
139 };
140
141 struct vop_ops hammer_fifo_vops = {
142         .vop_default =          fifo_vnoperate,
143         .vop_fsync =            hammer_vop_fsync,
144         .vop_read =             hammer_vop_fiforead,
145         .vop_write =            hammer_vop_fifowrite,
146         .vop_access =           hammer_vop_access,
147         .vop_close =            hammer_vop_fifoclose,
148         .vop_getattr =          hammer_vop_getattr,
149         .vop_inactive =         hammer_vop_inactive,
150         .vop_reclaim =          hammer_vop_reclaim,
151         .vop_setattr =          hammer_vop_setattr
152 };
153
154 #ifdef DEBUG_TRUNCATE
155 struct hammer_inode *HammerTruncIp;
156 #endif
157
158 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
159                            struct vnode *dvp, struct ucred *cred, int flags);
160 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
161 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
162
#if 0
/*
 * hammer_vop_vnoperate { generic args }
 *
 * Dispatch a generic vnode operation through hammer_vnode_vops.  This
 * function is compiled out (its forward declaration is commented out as
 * well).  The parameter is named so the body compiles if the block is
 * ever re-enabled.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
	return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
171
172 /*
173  * hammer_vop_fsync { vp, waitfor }
174  *
175  * fsync() an inode to disk and wait for it to be completely committed
176  * such that the information would not be undone if a crash occured after
177  * return.
178  */
179 static
180 int
181 hammer_vop_fsync(struct vop_fsync_args *ap)
182 {
183         hammer_inode_t ip = VTOI(ap->a_vp);
184
185         vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
186         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
187         if (ap->a_waitfor == MNT_WAIT)
188                 hammer_wait_inode(ip);
189         return (ip->error);
190 }
191
192 /*
193  * hammer_vop_read { vp, uio, ioflag, cred }
194  */
195 static
196 int
197 hammer_vop_read(struct vop_read_args *ap)
198 {
199         struct hammer_transaction trans;
200         hammer_inode_t ip;
201         off_t offset;
202         struct buf *bp;
203         struct uio *uio;
204         int error;
205         int n;
206         int seqcount;
207         int ioseqcount;
208         int blksize;
209
210         if (ap->a_vp->v_type != VREG)
211                 return (EINVAL);
212         ip = VTOI(ap->a_vp);
213         error = 0;
214         uio = ap->a_uio;
215
216         /*
217          * Allow the UIO's size to override the sequential heuristic.
218          */
219         blksize = hammer_blocksize(uio->uio_offset);
220         seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
221         ioseqcount = ap->a_ioflag >> 16;
222         if (seqcount < ioseqcount)
223                 seqcount = ioseqcount;
224
225         hammer_start_transaction(&trans, ip->hmp);
226
227         /*
228          * Access the data typically in HAMMER_BUFSIZE blocks via the
229          * buffer cache, but HAMMER may use a variable block size based
230          * on the offset.
231          */
232         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
233                 int64_t base_offset;
234                 int64_t file_limit;
235
236                 blksize = hammer_blocksize(uio->uio_offset);
237                 offset = (int)uio->uio_offset & (blksize - 1);
238                 base_offset = uio->uio_offset - offset;
239
240                 if (hammer_debug_cluster_enable) {
241                         /*
242                          * Use file_limit to prevent cluster_read() from
243                          * creating buffers of the wrong block size past
244                          * the demarc.
245                          */
246                         file_limit = ip->ino_data.size;
247                         if (base_offset < HAMMER_XDEMARC &&
248                             file_limit > HAMMER_XDEMARC) {
249                                 file_limit = HAMMER_XDEMARC;
250                         }
251                         error = cluster_read(ap->a_vp,
252                                              file_limit, base_offset,
253                                              blksize, MAXPHYS,
254                                              seqcount, &bp);
255                 } else {
256                         error = bread(ap->a_vp, base_offset, blksize, &bp);
257                 }
258                 if (error) {
259                         kprintf("error %d\n", error);
260                         brelse(bp);
261                         break;
262                 }
263
264                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
265                 n = blksize - offset;
266                 if (n > uio->uio_resid)
267                         n = uio->uio_resid;
268                 if (n > ip->ino_data.size - uio->uio_offset)
269                         n = (int)(ip->ino_data.size - uio->uio_offset);
270                 error = uiomove((char *)bp->b_data + offset, n, uio);
271
272                 /* data has a lower priority then meta-data */
273                 bp->b_flags |= B_AGE;
274                 bqrelse(bp);
275                 if (error)
276                         break;
277         }
278         if ((ip->flags & HAMMER_INODE_RO) == 0 &&
279             (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
280                 ip->ino_data.atime = trans.time;
281                 hammer_modify_inode(ip, HAMMER_INODE_ATIME);
282         }
283         hammer_done_transaction(&trans);
284         return (error);
285 }
286
287 /*
288  * hammer_vop_write { vp, uio, ioflag, cred }
289  */
290 static
291 int
292 hammer_vop_write(struct vop_write_args *ap)
293 {
294         struct hammer_transaction trans;
295         struct hammer_inode *ip;
296         hammer_mount_t hmp;
297         struct uio *uio;
298         int offset;
299         off_t base_offset;
300         struct buf *bp;
301         int error;
302         int n;
303         int flags;
304         int delta;
305         int seqcount;
306
307         if (ap->a_vp->v_type != VREG)
308                 return (EINVAL);
309         ip = VTOI(ap->a_vp);
310         hmp = ip->hmp;
311         error = 0;
312         seqcount = ap->a_ioflag >> 16;
313
314         if (ip->flags & HAMMER_INODE_RO)
315                 return (EROFS);
316
317         /*
318          * Create a transaction to cover the operations we perform.
319          */
320         hammer_start_transaction(&trans, hmp);
321         uio = ap->a_uio;
322
323         /*
324          * Check append mode
325          */
326         if (ap->a_ioflag & IO_APPEND)
327                 uio->uio_offset = ip->ino_data.size;
328
329         /*
330          * Check for illegal write offsets.  Valid range is 0...2^63-1.
331          *
332          * NOTE: the base_off assignment is required to work around what
333          * I consider to be a GCC-4 optimization bug.
334          */
335         if (uio->uio_offset < 0) {
336                 hammer_done_transaction(&trans);
337                 return (EFBIG);
338         }
339         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
340         if (uio->uio_resid > 0 && base_offset <= 0) {
341                 hammer_done_transaction(&trans);
342                 return (EFBIG);
343         }
344
345         /*
346          * Access the data typically in HAMMER_BUFSIZE blocks via the
347          * buffer cache, but HAMMER may use a variable block size based
348          * on the offset.
349          */
350         while (uio->uio_resid > 0) {
351                 int fixsize = 0;
352                 int blksize;
353                 int blkmask;
354
355                 if ((error = hammer_checkspace(hmp, HAMMER_CHECKSPACE_SLOP_WRITE)) != 0)
356                         break;
357
358                 blksize = hammer_blocksize(uio->uio_offset);
359
360                 /*
361                  * Do not allow HAMMER to blow out the buffer cache.  Very
362                  * large UIOs can lockout other processes due to bwillwrite()
363                  * mechanics.
364                  *
365                  * Do not allow HAMMER to blow out system memory by
366                  * accumulating too many records.   Records are so well
367                  * decoupled from the buffer cache that it is possible
368                  * for userland to push data out to the media via
369                  * direct-write, but build up the records queued to the
370                  * backend faster then the backend can flush them out.
371                  * HAMMER has hit its write limit but the frontend has
372                  * no pushback to slow it down.
373                  *
374                  * The hammer inode is not locked during these operations.
375                  * The vnode is locked which can interfere with the pageout
376                  * daemon for non-UIO_NOCOPY writes but should not interfere
377                  * with the buffer cache.  Even so, we cannot afford to
378                  * allow the pageout daemon to build up too many dirty buffer
379                  * cache buffers.
380                  */
381                 bwillwrite(blksize);
382
383                 /*
384                  * Pending record flush check.
385                  */
386                 if (hmp->rsv_recs > hammer_limit_recs / 2) {
387                         /*
388                          * Get the inode on the flush list
389                          */
390                         if (ip->rsv_recs >= 64)
391                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
392                         else if (ip->rsv_recs >= 16)
393                                 hammer_flush_inode(ip, 0);
394
395                         /*
396                          * Keep the flusher going if the system keeps
397                          * queueing records.
398                          */
399                         delta = hmp->count_newrecords -
400                                 hmp->last_newrecords;
401                         if (delta < 0 || delta > hammer_limit_recs / 2) {
402                                 hmp->last_newrecords = hmp->count_newrecords;
403                                 hammer_sync_hmp(hmp, MNT_NOWAIT);
404                         }
405
406                         /*
407                          * If we have gotten behind start slowing
408                          * down the writers.
409                          */
410                         delta = (hmp->rsv_recs - hammer_limit_recs) *
411                                 hz / hammer_limit_recs;
412                         if (delta > 0)
413                                 tsleep(&trans, 0, "hmrslo", delta);
414                 }
415
416                 /*
417                  * Calculate the blocksize at the current offset and figure
418                  * out how much we can actually write.
419                  */
420                 blkmask = blksize - 1;
421                 offset = (int)uio->uio_offset & blkmask;
422                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
423                 n = blksize - offset;
424                 if (n > uio->uio_resid)
425                         n = uio->uio_resid;
426                 if (uio->uio_offset + n > ip->ino_data.size) {
427                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
428                         fixsize = 1;
429                 }
430
431                 if (uio->uio_segflg == UIO_NOCOPY) {
432                         /*
433                          * Issuing a write with the same data backing the
434                          * buffer.  Instantiate the buffer to collect the
435                          * backing vm pages, then read-in any missing bits.
436                          *
437                          * This case is used by vop_stdputpages().
438                          */
439                         bp = getblk(ap->a_vp, base_offset,
440                                     blksize, GETBLK_BHEAVY, 0);
441                         if ((bp->b_flags & B_CACHE) == 0) {
442                                 bqrelse(bp);
443                                 error = bread(ap->a_vp, base_offset,
444                                               blksize, &bp);
445                         }
446                 } else if (offset == 0 && uio->uio_resid >= blksize) {
447                         /*
448                          * Even though we are entirely overwriting the buffer
449                          * we may still have to zero it out to avoid a 
450                          * mmap/write visibility issue.
451                          */
452                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
453                         if ((bp->b_flags & B_CACHE) == 0)
454                                 vfs_bio_clrbuf(bp);
455                 } else if (base_offset >= ip->ino_data.size) {
456                         /*
457                          * If the base offset of the buffer is beyond the
458                          * file EOF, we don't have to issue a read.
459                          */
460                         bp = getblk(ap->a_vp, base_offset,
461                                     blksize, GETBLK_BHEAVY, 0);
462                         vfs_bio_clrbuf(bp);
463                 } else {
464                         /*
465                          * Partial overwrite, read in any missing bits then
466                          * replace the portion being written.
467                          */
468                         error = bread(ap->a_vp, base_offset, blksize, &bp);
469                         if (error == 0)
470                                 bheavy(bp);
471                 }
472                 if (error == 0) {
473                         error = uiomove((char *)bp->b_data + offset,
474                                         n, uio);
475                 }
476
477                 /*
478                  * If we screwed up we have to undo any VM size changes we
479                  * made.
480                  */
481                 if (error) {
482                         brelse(bp);
483                         if (fixsize) {
484                                 vtruncbuf(ap->a_vp, ip->ino_data.size,
485                                           hammer_blocksize(ip->ino_data.size));
486                         }
487                         break;
488                 }
489                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
490                 if (ip->ino_data.size < uio->uio_offset) {
491                         ip->ino_data.size = uio->uio_offset;
492                         flags = HAMMER_INODE_DDIRTY;
493                         vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
494                 } else {
495                         flags = 0;
496                 }
497                 ip->ino_data.mtime = trans.time;
498                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
499                 hammer_modify_inode(ip, flags);
500
501                 /*
502                  * Final buffer disposition.
503                  */
504                 bp->b_flags |= B_AGE;
505                 if (ap->a_ioflag & IO_SYNC) {
506                         bwrite(bp);
507                 } else if (ap->a_ioflag & IO_DIRECT) {
508                         bawrite(bp);
509                 } else {
510                         bdwrite(bp);
511                 }
512         }
513         hammer_done_transaction(&trans);
514         return (error);
515 }
516
517 /*
518  * hammer_vop_access { vp, mode, cred }
519  */
520 static
521 int
522 hammer_vop_access(struct vop_access_args *ap)
523 {
524         struct hammer_inode *ip = VTOI(ap->a_vp);
525         uid_t uid;
526         gid_t gid;
527         int error;
528
529         uid = hammer_to_unix_xid(&ip->ino_data.uid);
530         gid = hammer_to_unix_xid(&ip->ino_data.gid);
531
532         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
533                                   ip->ino_data.uflags);
534         return (error);
535 }
536
537 /*
538  * hammer_vop_advlock { vp, id, op, fl, flags }
539  */
540 static
541 int
542 hammer_vop_advlock(struct vop_advlock_args *ap)
543 {
544         hammer_inode_t ip = VTOI(ap->a_vp);
545
546         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
547 }
548
549 /*
550  * hammer_vop_close { vp, fflag }
551  */
552 static
553 int
554 hammer_vop_close(struct vop_close_args *ap)
555 {
556         hammer_inode_t ip = VTOI(ap->a_vp);
557
558         if ((ip->flags | ip->sync_flags) & HAMMER_INODE_MODMASK)
559                 hammer_inode_waitreclaims(ip->hmp);
560         return (vop_stdclose(ap));
561 }
562
563 /*
564  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
565  *
566  * The operating system has already ensured that the directory entry
567  * does not exist and done all appropriate namespace locking.
568  */
569 static
570 int
571 hammer_vop_ncreate(struct vop_ncreate_args *ap)
572 {
573         struct hammer_transaction trans;
574         struct hammer_inode *dip;
575         struct hammer_inode *nip;
576         struct nchandle *nch;
577         int error;
578
579         nch = ap->a_nch;
580         dip = VTOI(ap->a_dvp);
581
582         if (dip->flags & HAMMER_INODE_RO)
583                 return (EROFS);
584         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
585                 return (error);
586
587         /*
588          * Create a transaction to cover the operations we perform.
589          */
590         hammer_start_transaction(&trans, dip->hmp);
591
592         /*
593          * Create a new filesystem object of the requested type.  The
594          * returned inode will be referenced and shared-locked to prevent
595          * it from being moved to the flusher.
596          */
597
598         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
599                                     dip, 0, &nip);
600         if (error) {
601                 hkprintf("hammer_create_inode error %d\n", error);
602                 hammer_done_transaction(&trans);
603                 *ap->a_vpp = NULL;
604                 return (error);
605         }
606
607         /*
608          * Add the new filesystem object to the directory.  This will also
609          * bump the inode's link count.
610          */
611         error = hammer_ip_add_directory(&trans, dip,
612                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
613                                         nip);
614         if (error)
615                 hkprintf("hammer_ip_add_directory error %d\n", error);
616
617         /*
618          * Finish up.
619          */
620         if (error) {
621                 hammer_rel_inode(nip, 0);
622                 hammer_done_transaction(&trans);
623                 *ap->a_vpp = NULL;
624         } else {
625                 error = hammer_get_vnode(nip, ap->a_vpp);
626                 hammer_done_transaction(&trans);
627                 hammer_rel_inode(nip, 0);
628                 if (error == 0) {
629                         cache_setunresolved(ap->a_nch);
630                         cache_setvp(ap->a_nch, *ap->a_vpp);
631                 }
632         }
633         return (error);
634 }
635
636 /*
637  * hammer_vop_getattr { vp, vap }
638  *
639  * Retrieve an inode's attribute information.  When accessing inodes
640  * historically we fake the atime field to ensure consistent results.
641  * The atime field is stored in the B-Tree element and allowed to be
642  * updated without cycling the element.
643  */
644 static
645 int
646 hammer_vop_getattr(struct vop_getattr_args *ap)
647 {
648         struct hammer_inode *ip = VTOI(ap->a_vp);
649         struct vattr *vap = ap->a_vap;
650
651         vap->va_fsid = ip->hmp->fsid_udev;
652         /* 
653          * XXX munge the device if we are in a pseudo-fs, so user utilities
654          * do not think its the same 'filesystem'.
655          */
656         if (ip->obj_localization)
657                 vap->va_fsid += ip->obj_localization;
658         vap->va_fileid = ip->ino_leaf.base.obj_id;
659         vap->va_mode = ip->ino_data.mode;
660         vap->va_nlink = ip->ino_data.nlinks;
661         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
662         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
663         vap->va_rmajor = 0;
664         vap->va_rminor = 0;
665         vap->va_size = ip->ino_data.size;
666
667         /*
668          * We must provide a consistent atime and mtime for snapshots
669          * so people can do a 'tar cf - ... | md5' on them and get
670          * consistent results.
671          */
672         if (ip->flags & HAMMER_INODE_RO) {
673                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
674                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
675         } else {
676                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
677                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
678         }
679         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
680         vap->va_flags = ip->ino_data.uflags;
681         vap->va_gen = 1;        /* hammer inums are unique for all time */
682         vap->va_blocksize = HAMMER_BUFSIZE;
683         if (ip->ino_data.size >= HAMMER_XDEMARC) {
684                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
685                                 ~HAMMER_XBUFMASK64;
686         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
687                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
688                                 ~HAMMER_BUFMASK64;
689         } else {
690                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
691         }
692         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
693         vap->va_filerev = 0;    /* XXX */
694         /* mtime uniquely identifies any adjustments made to the file XXX */
695         vap->va_fsmid = ip->ino_data.mtime;
696         vap->va_uid_uuid = ip->ino_data.uid;
697         vap->va_gid_uuid = ip->ino_data.gid;
698         vap->va_fsid_uuid = ip->hmp->fsid;
699         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
700                           VA_FSID_UUID_VALID;
701
702         switch (ip->ino_data.obj_type) {
703         case HAMMER_OBJTYPE_CDEV:
704         case HAMMER_OBJTYPE_BDEV:
705                 vap->va_rmajor = ip->ino_data.rmajor;
706                 vap->va_rminor = ip->ino_data.rminor;
707                 break;
708         default:
709                 break;
710         }
711
712         return(0);
713 }
714
715 /*
716  * hammer_vop_nresolve { nch, dvp, cred }
717  *
718  * Locate the requested directory entry.
719  */
720 static
721 int
722 hammer_vop_nresolve(struct vop_nresolve_args *ap)
723 {
724         struct hammer_transaction trans;
725         struct namecache *ncp;
726         hammer_inode_t dip;
727         hammer_inode_t ip;
728         hammer_tid_t asof;
729         struct hammer_cursor cursor;
730         struct vnode *vp;
731         int64_t namekey;
732         int error;
733         int i;
734         int nlen;
735         int flags;
736         int64_t obj_id;
737         u_int32_t localization;
738
739         /*
740          * Misc initialization, plus handle as-of name extensions.  Look for
741          * the '@@' extension.  Note that as-of files and directories cannot
742          * be modified.
743          */
744         dip = VTOI(ap->a_dvp);
745         ncp = ap->a_nch->ncp;
746         asof = dip->obj_asof;
747         nlen = ncp->nc_nlen;
748         flags = dip->flags;
749
750         hammer_simple_transaction(&trans, dip->hmp);
751
752         for (i = 0; i < nlen; ++i) {
753                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
754                         asof = hammer_str_to_tid(ncp->nc_name + i + 2);
755                         flags |= HAMMER_INODE_RO;
756                         break;
757                 }
758         }
759         nlen = i;
760
761         /*
762          * If there is no path component the time extension is relative to
763          * dip.
764          */
765         if (nlen == 0) {
766                 ip = hammer_get_inode(&trans, dip, dip->obj_id,
767                                       asof, dip->obj_localization,
768                                       flags, &error);
769                 if (error == 0) {
770                         error = hammer_get_vnode(ip, &vp);
771                         hammer_rel_inode(ip, 0);
772                 } else {
773                         vp = NULL;
774                 }
775                 if (error == 0) {
776                         vn_unlock(vp);
777                         cache_setvp(ap->a_nch, vp);
778                         vrele(vp);
779                 }
780                 goto done;
781         }
782
783         /*
784          * Calculate the namekey and setup the key range for the scan.  This
785          * works kinda like a chained hash table where the lower 32 bits
786          * of the namekey synthesize the chain.
787          *
788          * The key range is inclusive of both key_beg and key_end.
789          */
790         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
791
792         error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
793         cursor.key_beg.localization = dip->obj_localization +
794                                       HAMMER_LOCALIZE_MISC;
795         cursor.key_beg.obj_id = dip->obj_id;
796         cursor.key_beg.key = namekey;
797         cursor.key_beg.create_tid = 0;
798         cursor.key_beg.delete_tid = 0;
799         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
800         cursor.key_beg.obj_type = 0;
801
802         cursor.key_end = cursor.key_beg;
803         cursor.key_end.key |= 0xFFFFFFFFULL;
804         cursor.asof = asof;
805         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
806
807         /*
808          * Scan all matching records (the chain), locate the one matching
809          * the requested path component.
810          *
811          * The hammer_ip_*() functions merge in-memory records with on-disk
812          * records for the purposes of the search.
813          */
814         obj_id = 0;
815         localization = HAMMER_DEF_LOCALIZATION;
816
817         if (error == 0) {
818                 error = hammer_ip_first(&cursor);
819                 while (error == 0) {
820                         error = hammer_ip_resolve_data(&cursor);
821                         if (error)
822                                 break;
823                         if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
824                             bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
825                                 obj_id = cursor.data->entry.obj_id;
826                                 localization = cursor.data->entry.localization;
827                                 break;
828                         }
829                         error = hammer_ip_next(&cursor);
830                 }
831         }
832         hammer_done_cursor(&cursor);
833         if (error == 0) {
834                 ip = hammer_get_inode(&trans, dip, obj_id,
835                                       asof, localization,
836                                       flags, &error);
837                 if (error == 0) {
838                         error = hammer_get_vnode(ip, &vp);
839                         hammer_rel_inode(ip, 0);
840                 } else {
841                         vp = NULL;
842                 }
843                 if (error == 0) {
844                         vn_unlock(vp);
845                         cache_setvp(ap->a_nch, vp);
846                         vrele(vp);
847                 }
848         } else if (error == ENOENT) {
849                 cache_setvp(ap->a_nch, NULL);
850         }
851 done:
852         hammer_done_transaction(&trans);
853         return (error);
854 }
855
856 /*
857  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
858  *
859  * Locate the parent directory of a directory vnode.
860  *
861  * dvp is referenced but not locked.  *vpp must be returned referenced and
862  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
863  * at the root, instead it could indicate that the directory we were in was
864  * removed.
865  *
866  * NOTE: as-of sequences are not linked into the directory structure.  If
867  * we are at the root with a different asof then the mount point, reload
868  * the same directory with the mount point's asof.   I'm not sure what this
869  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
870  * get confused, but it hasn't been tested.
871  */
872 static
873 int
874 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
875 {
876         struct hammer_transaction trans;
877         struct hammer_inode *dip;
878         struct hammer_inode *ip;
879         int64_t parent_obj_id;
880         u_int32_t parent_obj_localization;
881         hammer_tid_t asof;
882         int error;
883
884         dip = VTOI(ap->a_dvp);
885         asof = dip->obj_asof;
886
887         /*
888          * Whos are parent?  This could be the root of a pseudo-filesystem
889          * whos parent is in another localization domain.
890          */
891         parent_obj_id = dip->ino_data.parent_obj_id;
892         if (dip->obj_id == HAMMER_OBJID_ROOT)
893                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
894         else
895                 parent_obj_localization = dip->obj_localization;
896
897         if (parent_obj_id == 0) {
898                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
899                    asof != dip->hmp->asof) {
900                         parent_obj_id = dip->obj_id;
901                         asof = dip->hmp->asof;
902                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
903                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
904                                    dip->obj_asof);
905                 } else {
906                         *ap->a_vpp = NULL;
907                         return ENOENT;
908                 }
909         }
910
911         hammer_simple_transaction(&trans, dip->hmp);
912
913         ip = hammer_get_inode(&trans, dip, parent_obj_id,
914                               asof, parent_obj_localization,
915                               dip->flags, &error);
916         if (ip) {
917                 error = hammer_get_vnode(ip, ap->a_vpp);
918                 hammer_rel_inode(ip, 0);
919         } else {
920                 *ap->a_vpp = NULL;
921         }
922         hammer_done_transaction(&trans);
923         return (error);
924 }
925
926 /*
927  * hammer_vop_nlink { nch, dvp, vp, cred }
928  */
929 static
930 int
931 hammer_vop_nlink(struct vop_nlink_args *ap)
932 {
933         struct hammer_transaction trans;
934         struct hammer_inode *dip;
935         struct hammer_inode *ip;
936         struct nchandle *nch;
937         int error;
938
939         nch = ap->a_nch;
940         dip = VTOI(ap->a_dvp);
941         ip = VTOI(ap->a_vp);
942
943         if (dip->flags & HAMMER_INODE_RO)
944                 return (EROFS);
945         if (ip->flags & HAMMER_INODE_RO)
946                 return (EROFS);
947         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
948                 return (error);
949
950         /*
951          * Create a transaction to cover the operations we perform.
952          */
953         hammer_start_transaction(&trans, dip->hmp);
954
955         /*
956          * Add the filesystem object to the directory.  Note that neither
957          * dip nor ip are referenced or locked, but their vnodes are
958          * referenced.  This function will bump the inode's link count.
959          */
960         error = hammer_ip_add_directory(&trans, dip,
961                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
962                                         ip);
963
964         /*
965          * Finish up.
966          */
967         if (error == 0) {
968                 cache_setunresolved(nch);
969                 cache_setvp(nch, ap->a_vp);
970         }
971         hammer_done_transaction(&trans);
972         return (error);
973 }
974
975 /*
976  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
977  *
978  * The operating system has already ensured that the directory entry
979  * does not exist and done all appropriate namespace locking.
980  */
981 static
982 int
983 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
984 {
985         struct hammer_transaction trans;
986         struct hammer_inode *dip;
987         struct hammer_inode *nip;
988         struct nchandle *nch;
989         int error;
990
991         nch = ap->a_nch;
992         dip = VTOI(ap->a_dvp);
993
994         if (dip->flags & HAMMER_INODE_RO)
995                 return (EROFS);
996         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
997                 return (error);
998
999         /*
1000          * Create a transaction to cover the operations we perform.
1001          */
1002         hammer_start_transaction(&trans, dip->hmp);
1003
1004         /*
1005          * Create a new filesystem object of the requested type.  The
1006          * returned inode will be referenced but not locked.
1007          */
1008         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1009                                     dip, 0, &nip);
1010         if (error) {
1011                 hkprintf("hammer_mkdir error %d\n", error);
1012                 hammer_done_transaction(&trans);
1013                 *ap->a_vpp = NULL;
1014                 return (error);
1015         }
1016         /*
1017          * Add the new filesystem object to the directory.  This will also
1018          * bump the inode's link count.
1019          */
1020         error = hammer_ip_add_directory(&trans, dip,
1021                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1022                                         nip);
1023         if (error)
1024                 hkprintf("hammer_mkdir (add) error %d\n", error);
1025
1026         /*
1027          * Finish up.
1028          */
1029         if (error) {
1030                 hammer_rel_inode(nip, 0);
1031                 *ap->a_vpp = NULL;
1032         } else {
1033                 error = hammer_get_vnode(nip, ap->a_vpp);
1034                 hammer_rel_inode(nip, 0);
1035                 if (error == 0) {
1036                         cache_setunresolved(ap->a_nch);
1037                         cache_setvp(ap->a_nch, *ap->a_vpp);
1038                 }
1039         }
1040         hammer_done_transaction(&trans);
1041         return (error);
1042 }
1043
1044 /*
1045  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1046  *
1047  * The operating system has already ensured that the directory entry
1048  * does not exist and done all appropriate namespace locking.
1049  */
1050 static
1051 int
1052 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1053 {
1054         struct hammer_transaction trans;
1055         struct hammer_inode *dip;
1056         struct hammer_inode *nip;
1057         struct nchandle *nch;
1058         int error;
1059         int pseudofs;
1060
1061         nch = ap->a_nch;
1062         dip = VTOI(ap->a_dvp);
1063
1064         if (dip->flags & HAMMER_INODE_RO)
1065                 return (EROFS);
1066         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
1067                 return (error);
1068
1069         /*
1070          * Create a transaction to cover the operations we perform.
1071          */
1072         hammer_start_transaction(&trans, dip->hmp);
1073
1074         /*
1075          * Create a new filesystem object of the requested type.  The
1076          * returned inode will be referenced but not locked.
1077          *
1078          * If mknod specifies a directory a pseudo-fs is created.
1079          */
1080         pseudofs = (ap->a_vap->va_type == VDIR);
1081         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1082                                     dip, pseudofs, &nip);
1083         if (error) {
1084                 hammer_done_transaction(&trans);
1085                 *ap->a_vpp = NULL;
1086                 return (error);
1087         }
1088
1089         /*
1090          * Add the new filesystem object to the directory.  This will also
1091          * bump the inode's link count.
1092          */
1093         error = hammer_ip_add_directory(&trans, dip,
1094                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1095                                         nip);
1096
1097         /*
1098          * Finish up.
1099          */
1100         if (error) {
1101                 hammer_rel_inode(nip, 0);
1102                 *ap->a_vpp = NULL;
1103         } else {
1104                 error = hammer_get_vnode(nip, ap->a_vpp);
1105                 hammer_rel_inode(nip, 0);
1106                 if (error == 0) {
1107                         cache_setunresolved(ap->a_nch);
1108                         cache_setvp(ap->a_nch, *ap->a_vpp);
1109                 }
1110         }
1111         hammer_done_transaction(&trans);
1112         return (error);
1113 }
1114
1115 /*
1116  * hammer_vop_open { vp, mode, cred, fp }
1117  */
1118 static
1119 int
1120 hammer_vop_open(struct vop_open_args *ap)
1121 {
1122         hammer_inode_t ip;
1123
1124         ip = VTOI(ap->a_vp);
1125
1126         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1127                 return (EROFS);
1128         return(vop_stdopen(ap));
1129 }
1130
/*
 * hammer_vop_pathconf { vp, name, retval }
 *
 * Not implemented: always fails with EOPNOTSUPP and never looks at ap.
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
	(void)ap;
	return (EOPNOTSUPP);
}
1140
/*
 * hammer_vop_print { vp }
 *
 * Not implemented: always fails with EOPNOTSUPP and never looks at ap.
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	(void)ap;
	return (EOPNOTSUPP);
}
1150
1151 /*
1152  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1153  */
1154 static
1155 int
1156 hammer_vop_readdir(struct vop_readdir_args *ap)
1157 {
1158         struct hammer_transaction trans;
1159         struct hammer_cursor cursor;
1160         struct hammer_inode *ip;
1161         struct uio *uio;
1162         hammer_base_elm_t base;
1163         int error;
1164         int cookie_index;
1165         int ncookies;
1166         off_t *cookies;
1167         off_t saveoff;
1168         int r;
1169
1170         ip = VTOI(ap->a_vp);
1171         uio = ap->a_uio;
1172         saveoff = uio->uio_offset;
1173
1174         if (ap->a_ncookies) {
1175                 ncookies = uio->uio_resid / 16 + 1;
1176                 if (ncookies > 1024)
1177                         ncookies = 1024;
1178                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1179                 cookie_index = 0;
1180         } else {
1181                 ncookies = -1;
1182                 cookies = NULL;
1183                 cookie_index = 0;
1184         }
1185
1186         hammer_simple_transaction(&trans, ip->hmp);
1187
1188         /*
1189          * Handle artificial entries
1190          */
1191         error = 0;
1192         if (saveoff == 0) {
1193                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1194                 if (r)
1195                         goto done;
1196                 if (cookies)
1197                         cookies[cookie_index] = saveoff;
1198                 ++saveoff;
1199                 ++cookie_index;
1200                 if (cookie_index == ncookies)
1201                         goto done;
1202         }
1203         if (saveoff == 1) {
1204                 if (ip->ino_data.parent_obj_id) {
1205                         r = vop_write_dirent(&error, uio,
1206                                              ip->ino_data.parent_obj_id,
1207                                              DT_DIR, 2, "..");
1208                 } else {
1209                         r = vop_write_dirent(&error, uio,
1210                                              ip->obj_id, DT_DIR, 2, "..");
1211                 }
1212                 if (r)
1213                         goto done;
1214                 if (cookies)
1215                         cookies[cookie_index] = saveoff;
1216                 ++saveoff;
1217                 ++cookie_index;
1218                 if (cookie_index == ncookies)
1219                         goto done;
1220         }
1221
1222         /*
1223          * Key range (begin and end inclusive) to scan.  Directory keys
1224          * directly translate to a 64 bit 'seek' position.
1225          */
1226         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1227         cursor.key_beg.localization = ip->obj_localization +
1228                                       HAMMER_LOCALIZE_MISC;
1229         cursor.key_beg.obj_id = ip->obj_id;
1230         cursor.key_beg.create_tid = 0;
1231         cursor.key_beg.delete_tid = 0;
1232         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1233         cursor.key_beg.obj_type = 0;
1234         cursor.key_beg.key = saveoff;
1235
1236         cursor.key_end = cursor.key_beg;
1237         cursor.key_end.key = HAMMER_MAX_KEY;
1238         cursor.asof = ip->obj_asof;
1239         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1240
1241         error = hammer_ip_first(&cursor);
1242
1243         while (error == 0) {
1244                 error = hammer_ip_resolve_data(&cursor);
1245                 if (error)
1246                         break;
1247                 base = &cursor.leaf->base;
1248                 saveoff = base->key;
1249                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1250
1251                 if (base->obj_id != ip->obj_id)
1252                         panic("readdir: bad record at %p", cursor.node);
1253
1254                 r = vop_write_dirent(
1255                              &error, uio, cursor.data->entry.obj_id,
1256                              hammer_get_dtype(cursor.leaf->base.obj_type),
1257                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1258                              (void *)cursor.data->entry.name);
1259                 if (r)
1260                         break;
1261                 ++saveoff;
1262                 if (cookies)
1263                         cookies[cookie_index] = base->key;
1264                 ++cookie_index;
1265                 if (cookie_index == ncookies)
1266                         break;
1267                 error = hammer_ip_next(&cursor);
1268         }
1269         hammer_done_cursor(&cursor);
1270
1271 done:
1272         hammer_done_transaction(&trans);
1273
1274         if (ap->a_eofflag)
1275                 *ap->a_eofflag = (error == ENOENT);
1276         uio->uio_offset = saveoff;
1277         if (error && cookie_index == 0) {
1278                 if (error == ENOENT)
1279                         error = 0;
1280                 if (cookies) {
1281                         kfree(cookies, M_TEMP);
1282                         *ap->a_ncookies = 0;
1283                         *ap->a_cookies = NULL;
1284                 }
1285         } else {
1286                 if (error == ENOENT)
1287                         error = 0;
1288                 if (cookies) {
1289                         *ap->a_ncookies = cookie_index;
1290                         *ap->a_cookies = cookies;
1291                 }
1292         }
1293         return(error);
1294 }
1295
1296 /*
1297  * hammer_vop_readlink { vp, uio, cred }
1298  */
1299 static
1300 int
1301 hammer_vop_readlink(struct vop_readlink_args *ap)
1302 {
1303         struct hammer_transaction trans;
1304         struct hammer_cursor cursor;
1305         struct hammer_inode *ip;
1306         int error;
1307
1308         ip = VTOI(ap->a_vp);
1309
1310         /*
1311          * Shortcut if the symlink data was stuffed into ino_data.
1312          */
1313         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1314                 error = uiomove(ip->ino_data.ext.symlink,
1315                                 ip->ino_data.size, ap->a_uio);
1316                 return(error);
1317         }
1318
1319         /*
1320          * Long version
1321          */
1322         hammer_simple_transaction(&trans, ip->hmp);
1323         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1324
1325         /*
1326          * Key range (begin and end inclusive) to scan.  Directory keys
1327          * directly translate to a 64 bit 'seek' position.
1328          */
1329         cursor.key_beg.localization = ip->obj_localization +
1330                                       HAMMER_LOCALIZE_MISC;
1331         cursor.key_beg.obj_id = ip->obj_id;
1332         cursor.key_beg.create_tid = 0;
1333         cursor.key_beg.delete_tid = 0;
1334         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1335         cursor.key_beg.obj_type = 0;
1336         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1337         cursor.asof = ip->obj_asof;
1338         cursor.flags |= HAMMER_CURSOR_ASOF;
1339
1340         error = hammer_ip_lookup(&cursor);
1341         if (error == 0) {
1342                 error = hammer_ip_resolve_data(&cursor);
1343                 if (error == 0) {
1344                         KKASSERT(cursor.leaf->data_len >=
1345                                  HAMMER_SYMLINK_NAME_OFF);
1346                         error = uiomove(cursor.data->symlink.name,
1347                                         cursor.leaf->data_len -
1348                                                 HAMMER_SYMLINK_NAME_OFF,
1349                                         ap->a_uio);
1350                 }
1351         }
1352         hammer_done_cursor(&cursor);
1353         hammer_done_transaction(&trans);
1354         return(error);
1355 }
1356
1357 /*
1358  * hammer_vop_nremove { nch, dvp, cred }
1359  */
1360 static
1361 int
1362 hammer_vop_nremove(struct vop_nremove_args *ap)
1363 {
1364         struct hammer_transaction trans;
1365         struct hammer_inode *dip;
1366         int error;
1367
1368         dip = VTOI(ap->a_dvp);
1369
1370         if (hammer_nohistory(dip) == 0 &&
1371             (error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_REMOVE)) != 0) {
1372                 return (error);
1373         }
1374
1375         hammer_start_transaction(&trans, dip->hmp);
1376         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1377         hammer_done_transaction(&trans);
1378
1379         return (error);
1380 }
1381
1382 /*
1383  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1384  */
1385 static
1386 int
1387 hammer_vop_nrename(struct vop_nrename_args *ap)
1388 {
1389         struct hammer_transaction trans;
1390         struct namecache *fncp;
1391         struct namecache *tncp;
1392         struct hammer_inode *fdip;
1393         struct hammer_inode *tdip;
1394         struct hammer_inode *ip;
1395         struct hammer_cursor cursor;
1396         int64_t namekey;
1397         int nlen, error;
1398
1399         fdip = VTOI(ap->a_fdvp);
1400         tdip = VTOI(ap->a_tdvp);
1401         fncp = ap->a_fnch->ncp;
1402         tncp = ap->a_tnch->ncp;
1403         ip = VTOI(fncp->nc_vp);
1404         KKASSERT(ip != NULL);
1405
1406         if (fdip->flags & HAMMER_INODE_RO)
1407                 return (EROFS);
1408         if (tdip->flags & HAMMER_INODE_RO)
1409                 return (EROFS);
1410         if (ip->flags & HAMMER_INODE_RO)
1411                 return (EROFS);
1412         if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
1413                 return (error);
1414
1415         hammer_start_transaction(&trans, fdip->hmp);
1416
1417         /*
1418          * Remove tncp from the target directory and then link ip as
1419          * tncp. XXX pass trans to dounlink
1420          *
1421          * Force the inode sync-time to match the transaction so it is
1422          * in-sync with the creation of the target directory entry.
1423          */
1424         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
1425         if (error == 0 || error == ENOENT) {
1426                 error = hammer_ip_add_directory(&trans, tdip,
1427                                                 tncp->nc_name, tncp->nc_nlen,
1428                                                 ip);
1429                 if (error == 0) {
1430                         ip->ino_data.parent_obj_id = tdip->obj_id;
1431                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1432                 }
1433         }
1434         if (error)
1435                 goto failed; /* XXX */
1436
1437         /*
1438          * Locate the record in the originating directory and remove it.
1439          *
1440          * Calculate the namekey and setup the key range for the scan.  This
1441          * works kinda like a chained hash table where the lower 32 bits
1442          * of the namekey synthesize the chain.
1443          *
1444          * The key range is inclusive of both key_beg and key_end.
1445          */
1446         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1447 retry:
1448         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1449         cursor.key_beg.localization = fdip->obj_localization +
1450                                       HAMMER_LOCALIZE_MISC;
1451         cursor.key_beg.obj_id = fdip->obj_id;
1452         cursor.key_beg.key = namekey;
1453         cursor.key_beg.create_tid = 0;
1454         cursor.key_beg.delete_tid = 0;
1455         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1456         cursor.key_beg.obj_type = 0;
1457
1458         cursor.key_end = cursor.key_beg;
1459         cursor.key_end.key |= 0xFFFFFFFFULL;
1460         cursor.asof = fdip->obj_asof;
1461         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1462
1463         /*
1464          * Scan all matching records (the chain), locate the one matching
1465          * the requested path component.
1466          *
1467          * The hammer_ip_*() functions merge in-memory records with on-disk
1468          * records for the purposes of the search.
1469          */
1470         error = hammer_ip_first(&cursor);
1471         while (error == 0) {
1472                 if (hammer_ip_resolve_data(&cursor) != 0)
1473                         break;
1474                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1475                 KKASSERT(nlen > 0);
1476                 if (fncp->nc_nlen == nlen &&
1477                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1478                         break;
1479                 }
1480                 error = hammer_ip_next(&cursor);
1481         }
1482
1483         /*
1484          * If all is ok we have to get the inode so we can adjust nlinks.
1485          *
1486          * WARNING: hammer_ip_del_directory() may have to terminate the
1487          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1488          * twice.
1489          */
1490         if (error == 0)
1491                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1492
1493         /*
1494          * XXX A deadlock here will break rename's atomicy for the purposes
1495          * of crash recovery.
1496          */
1497         if (error == EDEADLK) {
1498                 hammer_done_cursor(&cursor);
1499                 goto retry;
1500         }
1501
1502         /*
1503          * Cleanup and tell the kernel that the rename succeeded.
1504          */
1505         hammer_done_cursor(&cursor);
1506         if (error == 0)
1507                 cache_rename(ap->a_fnch, ap->a_tnch);
1508
1509 failed:
1510         hammer_done_transaction(&trans);
1511         return (error);
1512 }
1513
1514 /*
1515  * hammer_vop_nrmdir { nch, dvp, cred }
1516  */
1517 static
1518 int
1519 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1520 {
1521         struct hammer_transaction trans;
1522         struct hammer_inode *dip;
1523         int error;
1524
1525         dip = VTOI(ap->a_dvp);
1526
1527         if (hammer_nohistory(dip) == 0 &&
1528             (error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_REMOVE)) != 0) {
1529                 return (error);
1530         }
1531
1532         hammer_start_transaction(&trans, dip->hmp);
1533         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1534         hammer_done_transaction(&trans);
1535
1536         return (error);
1537 }
1538
1539 /*
1540  * hammer_vop_setattr { vp, vap, cred }
1541  */
1542 static
1543 int
1544 hammer_vop_setattr(struct vop_setattr_args *ap)
1545 {
1546         struct hammer_transaction trans;
1547         struct vattr *vap;
1548         struct hammer_inode *ip;
1549         int modflags;
1550         int error;
1551         int truncating;
1552         int blksize;
1553         int64_t aligned_size;
1554         u_int32_t flags;
1555
1556         vap = ap->a_vap;
1557         ip = ap->a_vp->v_data;
1558         modflags = 0;
1559
1560         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1561                 return(EROFS);
1562         if (ip->flags & HAMMER_INODE_RO)
1563                 return (EROFS);
1564         if (hammer_nohistory(ip) == 0 &&
1565             (error = hammer_checkspace(ip->hmp, HAMMER_CHECKSPACE_SLOP_REMOVE)) != 0) {
1566                 return (error);
1567         }
1568
1569         hammer_start_transaction(&trans, ip->hmp);
1570         error = 0;
1571
1572         if (vap->va_flags != VNOVAL) {
1573                 flags = ip->ino_data.uflags;
1574                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1575                                          hammer_to_unix_xid(&ip->ino_data.uid),
1576                                          ap->a_cred);
1577                 if (error == 0) {
1578                         if (ip->ino_data.uflags != flags) {
1579                                 ip->ino_data.uflags = flags;
1580                                 modflags |= HAMMER_INODE_DDIRTY;
1581                         }
1582                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1583                                 error = 0;
1584                                 goto done;
1585                         }
1586                 }
1587                 goto done;
1588         }
1589         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1590                 error = EPERM;
1591                 goto done;
1592         }
1593         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
1594                 mode_t cur_mode = ip->ino_data.mode;
1595                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1596                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1597                 uuid_t uuid_uid;
1598                 uuid_t uuid_gid;
1599
1600                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
1601                                          ap->a_cred,
1602                                          &cur_uid, &cur_gid, &cur_mode);
1603                 if (error == 0) {
1604                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
1605                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
1606                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
1607                                  sizeof(uuid_uid)) ||
1608                             bcmp(&uuid_gid, &ip->ino_data.gid,
1609                                  sizeof(uuid_gid)) ||
1610                             ip->ino_data.mode != cur_mode
1611                         ) {
1612                                 ip->ino_data.uid = uuid_uid;
1613                                 ip->ino_data.gid = uuid_gid;
1614                                 ip->ino_data.mode = cur_mode;
1615                         }
1616                         modflags |= HAMMER_INODE_DDIRTY;
1617                 }
1618         }
1619         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
1620                 switch(ap->a_vp->v_type) {
1621                 case VREG:
1622                         if (vap->va_size == ip->ino_data.size)
1623                                 break;
1624                         /*
1625                          * XXX break atomicy, we can deadlock the backend
1626                          * if we do not release the lock.  Probably not a
1627                          * big deal here.
1628                          */
1629                         blksize = hammer_blocksize(vap->va_size);
1630                         if (vap->va_size < ip->ino_data.size) {
1631                                 vtruncbuf(ap->a_vp, vap->va_size, blksize);
1632                                 truncating = 1;
1633                         } else {
1634                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1635                                 truncating = 0;
1636                         }
1637                         ip->ino_data.size = vap->va_size;
1638                         modflags |= HAMMER_INODE_DDIRTY;
1639
1640                         /*
1641                          * on-media truncation is cached in the inode until
1642                          * the inode is synchronized.
1643                          */
1644                         if (truncating) {
1645                                 hammer_ip_frontend_trunc(ip, vap->va_size);
1646 #ifdef DEBUG_TRUNCATE
1647                                 if (HammerTruncIp == NULL)
1648                                         HammerTruncIp = ip;
1649 #endif
1650                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1651                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1652                                         ip->trunc_off = vap->va_size;
1653 #ifdef DEBUG_TRUNCATE
1654                                         if (ip == HammerTruncIp)
1655                                         kprintf("truncate1 %016llx\n", ip->trunc_off);
1656 #endif
1657                                 } else if (ip->trunc_off > vap->va_size) {
1658                                         ip->trunc_off = vap->va_size;
1659 #ifdef DEBUG_TRUNCATE
1660                                         if (ip == HammerTruncIp)
1661                                         kprintf("truncate2 %016llx\n", ip->trunc_off);
1662 #endif
1663                                 } else {
1664 #ifdef DEBUG_TRUNCATE
1665                                         if (ip == HammerTruncIp)
1666                                         kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
1667 #endif
1668                                 }
1669                         }
1670
1671                         /*
1672                          * If truncating we have to clean out a portion of
1673                          * the last block on-disk.  We do this in the
1674                          * front-end buffer cache.
1675                          */
1676                         aligned_size = (vap->va_size + (blksize - 1)) &
1677                                        ~(int64_t)(blksize - 1);
1678                         if (truncating && vap->va_size < aligned_size) {
1679                                 struct buf *bp;
1680                                 int offset;
1681
1682                                 aligned_size -= blksize;
1683
1684                                 offset = (int)vap->va_size & (blksize - 1);
1685                                 error = bread(ap->a_vp, aligned_size,
1686                                               blksize, &bp);
1687                                 hammer_ip_frontend_trunc(ip, aligned_size);
1688                                 if (error == 0) {
1689                                         bzero(bp->b_data + offset,
1690                                               blksize - offset);
1691                                         bdwrite(bp);
1692                                 } else {
1693                                         kprintf("ERROR %d\n", error);
1694                                         brelse(bp);
1695                                 }
1696                         }
1697                         break;
1698                 case VDATABASE:
1699                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1700                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1701                                 ip->trunc_off = vap->va_size;
1702                         } else if (ip->trunc_off > vap->va_size) {
1703                                 ip->trunc_off = vap->va_size;
1704                         }
1705                         hammer_ip_frontend_trunc(ip, vap->va_size);
1706                         ip->ino_data.size = vap->va_size;
1707                         modflags |= HAMMER_INODE_DDIRTY;
1708                         break;
1709                 default:
1710                         error = EINVAL;
1711                         goto done;
1712                 }
1713                 break;
1714         }
1715         if (vap->va_atime.tv_sec != VNOVAL) {
1716                 ip->ino_data.atime =
1717                         hammer_timespec_to_time(&vap->va_atime);
1718                 modflags |= HAMMER_INODE_ATIME;
1719         }
1720         if (vap->va_mtime.tv_sec != VNOVAL) {
1721                 ip->ino_data.mtime =
1722                         hammer_timespec_to_time(&vap->va_mtime);
1723                 modflags |= HAMMER_INODE_MTIME;
1724         }
1725         if (vap->va_mode != (mode_t)VNOVAL) {
1726                 mode_t   cur_mode = ip->ino_data.mode;
1727                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1728                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1729
1730                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1731                                          cur_uid, cur_gid, &cur_mode);
1732                 if (error == 0 && ip->ino_data.mode != cur_mode) {
1733                         ip->ino_data.mode = cur_mode;
1734                         modflags |= HAMMER_INODE_DDIRTY;
1735                 }
1736         }
1737 done:
1738         if (error == 0)
1739                 hammer_modify_inode(ip, modflags);
1740         hammer_done_transaction(&trans);
1741         return (error);
1742 }
1743
1744 /*
1745  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1746  */
1747 static
1748 int
1749 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1750 {
1751         struct hammer_transaction trans;
1752         struct hammer_inode *dip;
1753         struct hammer_inode *nip;
1754         struct nchandle *nch;
1755         hammer_record_t record;
1756         int error;
1757         int bytes;
1758
1759         ap->a_vap->va_type = VLNK;
1760
1761         nch = ap->a_nch;
1762         dip = VTOI(ap->a_dvp);
1763
1764         if (dip->flags & HAMMER_INODE_RO)
1765                 return (EROFS);
1766         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0)
1767                 return (error);
1768
1769         /*
1770          * Create a transaction to cover the operations we perform.
1771          */
1772         hammer_start_transaction(&trans, dip->hmp);
1773
1774         /*
1775          * Create a new filesystem object of the requested type.  The
1776          * returned inode will be referenced but not locked.
1777          */
1778
1779         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1780                                     dip, 0, &nip);
1781         if (error) {
1782                 hammer_done_transaction(&trans);
1783                 *ap->a_vpp = NULL;
1784                 return (error);
1785         }
1786
1787         /*
1788          * Add a record representing the symlink.  symlink stores the link
1789          * as pure data, not a string, and is no \0 terminated.
1790          */
1791         if (error == 0) {
1792                 bytes = strlen(ap->a_target);
1793
1794                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1795                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1796                 } else {
1797                         record = hammer_alloc_mem_record(nip, bytes);
1798                         record->type = HAMMER_MEM_RECORD_GENERAL;
1799
1800                         record->leaf.base.localization = nip->obj_localization +
1801                                                          HAMMER_LOCALIZE_MISC;
1802                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1803                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1804                         record->leaf.data_len = bytes;
1805                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1806                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1807                         error = hammer_ip_add_record(&trans, record);
1808                 }
1809
1810                 /*
1811                  * Set the file size to the length of the link.
1812                  */
1813                 if (error == 0) {
1814                         nip->ino_data.size = bytes;
1815                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1816                 }
1817         }
1818         if (error == 0)
1819                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
1820                                                 nch->ncp->nc_nlen, nip);
1821
1822         /*
1823          * Finish up.
1824          */
1825         if (error) {
1826                 hammer_rel_inode(nip, 0);
1827                 *ap->a_vpp = NULL;
1828         } else {
1829                 error = hammer_get_vnode(nip, ap->a_vpp);
1830                 hammer_rel_inode(nip, 0);
1831                 if (error == 0) {
1832                         cache_setunresolved(ap->a_nch);
1833                         cache_setvp(ap->a_nch, *ap->a_vpp);
1834                 }
1835         }
1836         hammer_done_transaction(&trans);
1837         return (error);
1838 }
1839
1840 /*
1841  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1842  */
1843 static
1844 int
1845 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1846 {
1847         struct hammer_transaction trans;
1848         struct hammer_inode *dip;
1849         int error;
1850
1851         dip = VTOI(ap->a_dvp);
1852
1853         if (hammer_nohistory(dip) == 0 &&
1854             (error = hammer_checkspace(dip->hmp, HAMMER_CHECKSPACE_SLOP_CREATE)) != 0) {
1855                 return (error);
1856         }
1857
1858         hammer_start_transaction(&trans, dip->hmp);
1859         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1860                                 ap->a_cred, ap->a_flags);
1861         hammer_done_transaction(&trans);
1862
1863         return (error);
1864 }
1865
1866 /*
1867  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1868  */
1869 static
1870 int
1871 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1872 {
1873         struct hammer_inode *ip = ap->a_vp->v_data;
1874
1875         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1876                             ap->a_fflag, ap->a_cred));
1877 }
1878
1879 static
1880 int
1881 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1882 {
1883         struct mount *mp;
1884         int error;
1885
1886         mp = ap->a_head.a_ops->head.vv_mount;
1887
1888         switch(ap->a_op) {
1889         case MOUNTCTL_SET_EXPORT:
1890                 if (ap->a_ctllen != sizeof(struct export_args))
1891                         error = EINVAL;
1892                 error = hammer_vfs_export(mp, ap->a_op,
1893                                       (const struct export_args *)ap->a_ctl);
1894                 break;
1895         default:
1896                 error = journal_mountctl(ap);
1897                 break;
1898         }
1899         return(error);
1900 }
1901
1902 /*
1903  * hammer_vop_strategy { vp, bio }
1904  *
1905  * Strategy call, used for regular file read & write only.  Note that the
1906  * bp may represent a cluster.
1907  *
1908  * To simplify operation and allow better optimizations in the future,
1909  * this code does not make any assumptions with regards to buffer alignment
1910  * or size.
1911  */
1912 static
1913 int
1914 hammer_vop_strategy(struct vop_strategy_args *ap)
1915 {
1916         struct buf *bp;
1917         int error;
1918
1919         bp = ap->a_bio->bio_buf;
1920
1921         switch(bp->b_cmd) {
1922         case BUF_CMD_READ:
1923                 error = hammer_vop_strategy_read(ap);
1924                 break;
1925         case BUF_CMD_WRITE:
1926                 error = hammer_vop_strategy_write(ap);
1927                 break;
1928         default:
1929                 bp->b_error = error = EINVAL;
1930                 bp->b_flags |= B_ERROR;
1931                 biodone(ap->a_bio);
1932                 break;
1933         }
1934         return (error);
1935 }
1936
1937 /*
1938  * Read from a regular file.  Iterate the related records and fill in the
1939  * BIO/BUF.  Gaps are zero-filled.
1940  *
1941  * The support code in hammer_object.c should be used to deal with mixed
1942  * in-memory and on-disk records.
1943  *
1944  * NOTE: Can be called from the cluster code with an oversized buf.
1945  *
1946  * XXX atime update
      *
      * Completion: on the copy path the bio is finished here with biodone(),
      * with b_resid cleared and b_error/B_ERROR set when an error occurred.
      * On the two direct-read paths the pushed bio is handed to
      * hammer_io_direct_read() and biodone() is not called by this function.
      *
      * Returns the error code of the path taken.  ENOENT from the record
      * iteration only means there are no more records and is converted to 0.
1947  */
1948 static
1949 int
1950 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1951 {
1952         struct hammer_transaction trans;
1953         struct hammer_inode *ip;
1954         struct hammer_cursor cursor;
1955         hammer_base_elm_t base;
1956         hammer_off_t disk_offset;
1957         struct bio *bio;
1958         struct bio *nbio;
1959         struct buf *bp;
1960         int64_t rec_offset;
1961         int64_t ran_end;
1962         int64_t tmp64;
1963         int error;
1964         int boff;
1965         int roff;
1966         int n;
1967
1968         bio = ap->a_bio;
1969         bp = bio->bio_buf;
1970         ip = ap->a_vp->v_data;
1971
1972         /*
1973          * The zone-2 disk offset may have been set by the cluster code via
1974          * a BMAP operation, or else should be NOOFFSET.
1975          *
1976          * Checking the high bits for a match against zone-2 should suffice.
1977          */
1978         nbio = push_bio(bio);
1979         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
1980             HAMMER_ZONE_RAW_BUFFER) {
                     /*
                      * No transaction or cursor has been set up yet, so the
                      * result of the direct read is returned without any
                      * further cleanup.
                      */
1981                 error = hammer_io_direct_read(ip->hmp, nbio);
1982                 return (error);
1983         }
1984
1985         /*
1986          * Well, that sucked.  Do it the hard way.  If all the stars are
1987          * aligned we may still be able to issue a direct-read.
1988          */
1989         hammer_simple_transaction(&trans, ip->hmp);
1990         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1991
1992         /*
1993          * Key range (begin and end inclusive) to scan.  Note that the key's
1994          * stored in the actual records represent BASE+LEN, not BASE.  The
1995          * first record containing bio_offset will have a key > bio_offset.
1996          */
1997         cursor.key_beg.localization = ip->obj_localization +
1998                                       HAMMER_LOCALIZE_MISC;
1999         cursor.key_beg.obj_id = ip->obj_id;
2000         cursor.key_beg.create_tid = 0;
2001         cursor.key_beg.delete_tid = 0;
2002         cursor.key_beg.obj_type = 0;
2003         cursor.key_beg.key = bio->bio_offset + 1;
2004         cursor.asof = ip->obj_asof;
2005         cursor.flags |= HAMMER_CURSOR_ASOF;
2006
2007         cursor.key_end = cursor.key_beg;
2008         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2009 #if 0
2010         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2011                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2012                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2013                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2014         } else
2015 #endif
2016         {
2017                 ran_end = bio->bio_offset + bp->b_bufsize;
2018                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2019                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
                     /*
                      * If the sum comes out smaller than ran_end the addition
                      * wrapped; the end key is then clamped to the largest
                      * positive 64-bit key.
                      */
2020                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2021                 if (tmp64 < ran_end)
2022                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2023                 else
2024                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2025         }
2026         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2027
             /*
              * boff counts how many bytes of bp->b_data have been filled
              * (copied or zeroed) so far.
              */
2028         error = hammer_ip_first(&cursor);
2029         boff = 0;
2030
2031         while (error == 0) {
2032                 /*
2033                  * Get the base file offset of the record.  The key for
2034                  * data records is (base + bytes) rather then (base).
2035                  */
2036                 base = &cursor.leaf->base;
2037                 rec_offset = base->key - cursor.leaf->data_len;
2038
2039                 /*
2040                  * Calculate the gap, if any, and zero-fill it.
2041                  *
2042                  * n is the offset of the start of the record verses our
2043                  * current seek offset in the bio.
2044                  */
2045                 n = (int)(rec_offset - (bio->bio_offset + boff));
2046                 if (n > 0) {
2047                         if (n > bp->b_bufsize - boff)
2048                                 n = bp->b_bufsize - boff;
2049                         bzero((char *)bp->b_data + boff, n);
2050                         boff += n;
2051                         n = 0;
2052                 }
2053
2054                 /*
2055                  * Calculate the data offset in the record and the number
2056                  * of bytes we can copy.
2057                  *
2058                  * There are two degenerate cases.  First, boff may already
2059                  * be at bp->b_bufsize.  Secondly, the data offset within
2060                  * the record may exceed the record's size.
2061                  */
                     /*
                      * n is <= 0 at this point (a positive gap was consumed and
                      * reset to 0 above), so roff is the non-negative offset into
                      * the record's data at which copying starts.
                      */
2062                 roff = -n;
2063                 rec_offset += roff;
2064                 n = cursor.leaf->data_len - roff;
2065                 if (n <= 0) {
2066                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2067                         n = 0;
2068                 } else if (n > bp->b_bufsize - boff) {
2069                         n = bp->b_bufsize - boff;
2070                 }
2071
2072                 /*
2073                  * Deal with cached truncations.  This cool bit of code
2074                  * allows truncate()/ftruncate() to avoid having to sync
2075                  * the file.
2076                  *
2077                  * If the frontend is truncated then all backend records are
2078                  * subject to the frontend's truncation.
2079                  *
2080                  * If the backend is truncated then backend records on-disk
2081                  * (but not in-memory) are subject to the backend's
2082                  * truncation.  In-memory records owned by the backend
2083                  * represent data written after the truncation point on the
2084                  * backend and must not be truncated.
2085                  *
2086                  * Truncate operations deal with frontend buffer cache
2087                  * buffers and frontend-owned in-memory records synchronously.
2088                  */
2089                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2090                         if (hammer_cursor_ondisk(&cursor) ||
2091                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2092                                 if (ip->trunc_off <= rec_offset)
2093                                         n = 0;
2094                                 else if (ip->trunc_off < rec_offset + n)
2095                                         n = (int)(ip->trunc_off - rec_offset);
2096                         }
2097                 }
2098                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2099                         if (hammer_cursor_ondisk(&cursor)) {
2100                                 if (ip->sync_trunc_off <= rec_offset)
2101                                         n = 0;
2102                                 else if (ip->sync_trunc_off < rec_offset + n)
2103                                         n = (int)(ip->sync_trunc_off - rec_offset);
2104                         }
2105                 }
2106
2107                 /*
2108                  * Try to issue a direct read into our bio if possible,
2109                  * otherwise resolve the element data into a hammer_buffer
2110                  * and copy.
2111                  *
2112                  * The buffer on-disk should be zerod past any real
2113                  * truncation point, but may not be for any synthesized
2114                  * truncation point from above.
2115                  */
2116                 if (boff == 0 && n == bp->b_bufsize &&
2117                     ((cursor.leaf->data_offset + roff) & HAMMER_BUFMASK) == 0) {
2118                         disk_offset = hammer_blockmap_lookup(
2119                                                 trans.hmp,
2120                                                 cursor.leaf->data_offset + roff,
2121                                                 &error);
2122                         if (error)
2123                                 break;
2124                         nbio->bio_offset = disk_offset;
2125                         error = hammer_io_direct_read(trans.hmp, nbio);
                             /*
                              * The bio was handed to hammer_io_direct_read();
                              * skip the copy-path completion (biodone) below
                              * and go straight to the cursor cleanup.
                              */
2126                         goto done;
2127                 } else if (n) {
2128                         error = hammer_ip_resolve_data(&cursor);
2129                         if (error == 0) {
2130                                 bcopy((char *)cursor.data + roff,
2131                                       (char *)bp->b_data + boff, n);
2132                         }
2133                 }
2134                 if (error)
2135                         break;
2136
2137                 /*
2138                  * Iterate until we have filled the request.
2139                  */
2140                 boff += n;
2141                 if (boff == bp->b_bufsize)
2142                         break;
2143                 error = hammer_ip_next(&cursor);
2144         }
2145
2146         /*
2147          * There may have been a gap after the last record
2148          */
2149         if (error == ENOENT)
2150                 error = 0;
2151         if (error == 0 && boff != bp->b_bufsize) {
2152                 KKASSERT(boff < bp->b_bufsize);
2153                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2154                 /* boff = bp->b_bufsize; */
2155         }
2156         bp->b_resid = 0;
2157         bp->b_error = error;
2158         if (error)
2159                 bp->b_flags |= B_ERROR;
2160         biodone(ap->a_bio);
2161
             /*
              * Common cleanup: remember the cursor's current node (if any) in
              * ip->cache[1], then release the cursor and the transaction.
              */
2162 done:
2163         if (cursor.node)
2164                 hammer_cache_node(&ip->cache[1], cursor.node);
2165         hammer_done_cursor(&cursor);
2166         hammer_done_transaction(&trans);
2167         return(error);
2168 }
2169
2170 /*
2171  * BMAP operation - used to support cluster_read() only.
2172  *
2173  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2174  *
2175  * This routine may return EOPNOTSUPP if the opration is not supported for
2176  * the specified offset.  The contents of the pointer arguments do not
2177  * need to be initialized in that case. 
2178  *
2179  * If a disk address is available and properly aligned return 0 with 
2180  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2181  * to the run-length relative to that offset.  Callers may assume that
2182  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2183  * large, so return EOPNOTSUPP if it is not sufficiently large.
2184  */
2185 static
2186 int
2187 hammer_vop_bmap(struct vop_bmap_args *ap)
2188 {
2189         struct hammer_transaction trans;
2190         struct hammer_inode *ip;
2191         struct hammer_cursor cursor;
2192         hammer_base_elm_t base;
2193         int64_t rec_offset;
2194         int64_t ran_end;
2195         int64_t tmp64;
2196         int64_t base_offset;
2197         int64_t base_disk_offset;
2198         int64_t last_offset;
2199         hammer_off_t last_disk_offset;
2200         hammer_off_t disk_offset;
2201         int     rec_len;
2202         int     error;
2203         int     blksize;
2204
2205         ip = ap->a_vp->v_data;
2206
2207         /*
2208          * We can only BMAP regular files.  We can't BMAP database files,
2209          * directories, etc.
2210          */
2211         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2212                 return(EOPNOTSUPP);
2213
2214         /*
2215          * bmap is typically called with runp/runb both NULL when used
2216          * for writing.  We do not support BMAP for writing atm.
2217          */
2218         if (ap->a_cmd != BUF_CMD_READ)
2219                 return(EOPNOTSUPP);
2220
2221         /*
2222          * Scan the B-Tree to acquire blockmap addresses, then translate
2223          * to raw addresses.
2224          */
2225         hammer_simple_transaction(&trans, ip->hmp);
2226 #if 0
2227         kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2228 #endif
2229         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2230
2231         /*
2232          * Key range (begin and end inclusive) to scan.  Note that the key's
2233          * stored in the actual records represent BASE+LEN, not BASE.  The
2234          * first record containing bio_offset will have a key > bio_offset.
2235          */
2236         cursor.key_beg.localization = ip->obj_localization +
2237                                       HAMMER_LOCALIZE_MISC;
2238         cursor.key_beg.obj_id = ip->obj_id;
2239         cursor.key_beg.create_tid = 0;
2240         cursor.key_beg.delete_tid = 0;
2241         cursor.key_beg.obj_type = 0;
2242         if (ap->a_runb)
2243                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2244         else
2245                 cursor.key_beg.key = ap->a_loffset + 1;
2246         if (cursor.key_beg.key < 0)
2247                 cursor.key_beg.key = 0;
2248         cursor.asof = ip->obj_asof;
2249         cursor.flags |= HAMMER_CURSOR_ASOF;
2250
2251         cursor.key_end = cursor.key_beg;
2252         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2253
2254         ran_end = ap->a_loffset + MAXPHYS;
2255         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2256         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2257         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2258         if (tmp64 < ran_end)
2259                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2260         else
2261                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2262
2263         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2264
2265         error = hammer_ip_first(&cursor);
2266         base_offset = last_offset = 0;
2267         base_disk_offset = last_disk_offset = 0;
2268
2269         while (error == 0) {
2270                 /*
2271                  * Get the base file offset of the record.  The key for
2272                  * data records is (base + bytes) rather then (base).
2273                  *
2274                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
2275                  * The extra bytes should be zero on-disk and the BMAP op
2276                  * should still be ok.
2277                  */
2278                 base = &cursor.leaf->base;
2279                 rec_offset = base->key - cursor.leaf->data_len;
2280                 rec_len    = cursor.leaf->data_len;
2281
2282                 /*
2283                  * Incorporate any cached truncation.
2284                  *
2285                  * NOTE: Modifications to rec_len based on synthesized
2286                  * truncation points remove the guarantee that any extended
2287                  * data on disk is zero (since the truncations may not have
2288                  * taken place on-media yet).
2289                  */
2290                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2291                         if (hammer_cursor_ondisk(&cursor) ||
2292                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2293                                 if (ip->trunc_off <= rec_offset)
2294                                         rec_len = 0;
2295                                 else if (ip->trunc_off < rec_offset + rec_len)
2296                                         rec_len = (int)(ip->trunc_off - rec_offset);
2297                         }
2298                 }
2299                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2300                         if (hammer_cursor_ondisk(&cursor)) {
2301                                 if (ip->sync_trunc_off <= rec_offset)
2302                                         rec_len = 0;
2303                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2304                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2305                         }
2306                 }
2307
2308                 /*
2309                  * Accumulate information.  If we have hit a discontiguous
2310                  * block reset base_offset unless we are already beyond the
2311                  * requested offset.  If we are, that's it, we stop.
2312                  */
2313                 disk_offset = hammer_blockmap_lookup(trans.hmp,
2314                                                      cursor.leaf->data_offset,
2315                                                      &error);
2316                 if (error)
2317                         break;
2318                 if (rec_offset != last_offset ||
2319                     disk_offset != last_disk_offset) {
2320                         if (rec_offset > ap->a_loffset)
2321                                 break;
2322                         base_offset = rec_offset;
2323                         base_disk_offset = disk_offset;
2324                 }
2325                 last_offset = rec_offset + rec_len;
2326                 last_disk_offset = disk_offset + rec_len;
2327
2328                 error = hammer_ip_next(&cursor);
2329         }
2330
2331 #if 0
2332         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2333                 ap->a_loffset, base_offset, last_offset);
2334         kprintf("BMAP %16s:  %016llx - %016llx\n",
2335                 "", base_disk_offset, last_disk_offset);
2336 #endif
2337
2338         if (cursor.node) {
2339                 hammer_cache_node(&ip->cache[1], cursor.node);
2340 #if 0
2341                 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2342 #endif
2343         }
2344         hammer_done_cursor(&cursor);
2345         hammer_done_transaction(&trans);
2346
2347         /*
2348          * If we couldn't find any records or the records we did find were
2349          * all behind the requested offset, return failure.  A forward
2350          * truncation can leave a hole w/ no on-disk records.
2351          */
2352         if (last_offset == 0 || last_offset < ap->a_loffset)
2353                 return (EOPNOTSUPP);
2354
2355         /*
2356          * Figure out the block size at the requested offset and adjust
2357          * our limits so the cluster_read() does not create inappropriately
2358          * sized buffer cache buffers.
2359          */
2360         blksize = hammer_blocksize(ap->a_loffset);
2361         if (hammer_blocksize(base_offset) != blksize) {
2362                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2363         }
2364         if (last_offset != ap->a_loffset &&
2365             hammer_blocksize(last_offset - 1) != blksize) {
2366                 last_offset = hammer_blockdemarc(ap->a_loffset,
2367                                                  last_offset - 1);
2368         }
2369
2370         /*
2371          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2372          * from occurring.
2373          */
2374         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2375
2376         /*
2377          * If doffsetp is not aligned or the forward run size does
2378          * not cover a whole buffer, disallow the direct I/O.
2379          */
2380         if ((disk_offset & HAMMER_BUFMASK) ||
2381             (last_offset - ap->a_loffset) < blksize) {
2382                 error = EOPNOTSUPP;
2383         } else {
2384                 *ap->a_doffsetp = disk_offset;
2385                 if (ap->a_runb) {
2386                         *ap->a_runb = ap->a_loffset - base_offset;
2387                         KKASSERT(*ap->a_runb >= 0);
2388                 }
2389                 if (ap->a_runp) {
2390                         *ap->a_runp = last_offset - ap->a_loffset;
2391                         KKASSERT(*ap->a_runp >= 0);
2392                 }
2393                 error = 0;
2394         }
2395         return(error);
2396 }
2397
2398 /*
2399  * Write to a regular file.   Because this is a strategy call the OS is
2400  * trying to actually get data onto the media.
2401  */
2402 static
2403 int
2404 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2405 {
2406         hammer_record_t record;
2407         hammer_mount_t hmp;
2408         hammer_inode_t ip;
2409         struct bio *bio;
2410         struct buf *bp;
2411         int blksize;
2412         int bytes;
2413         int error;
2414
2415         bio = ap->a_bio;
2416         bp = bio->bio_buf;
2417         ip = ap->a_vp->v_data;
2418         hmp = ip->hmp;
2419
2420         blksize = hammer_blocksize(bio->bio_offset);
2421         KKASSERT(bp->b_bufsize == blksize);
2422
2423         if (ip->flags & HAMMER_INODE_RO) {
2424                 bp->b_error = EROFS;
2425                 bp->b_flags |= B_ERROR;
2426                 biodone(ap->a_bio);
2427                 return(EROFS);
2428         }
2429
2430         /*
2431          * Interlock with inode destruction (no in-kernel or directory
2432          * topology visibility).  If we queue new IO while trying to
2433          * destroy the inode we can deadlock the vtrunc call in
2434          * hammer_inode_unloadable_check().
2435          */
2436         if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2437                 bp->b_resid = 0;
2438                 biodone(ap->a_bio);
2439                 return(0);
2440         }
2441
2442         /*
2443          * Reserve space and issue a direct-write from the front-end. 
2444          * NOTE: The direct_io code will hammer_bread/bcopy smaller
2445          * allocations.
2446          *
2447          * An in-memory record will be installed to reference the storage
2448          * until the flusher can get to it.
2449          *
2450          * Since we own the high level bio the front-end will not try to
2451          * do a direct-read until the write completes.
2452          *
2453          * NOTE: The only time we do not reserve a full-sized buffers
2454          * worth of data is if the file is small.  We do not try to
2455          * allocate a fragment (from the small-data zone) at the end of
2456          * an otherwise large file as this can lead to wildly separated
2457          * data.
2458          */
2459         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2460         KKASSERT(bio->bio_offset < ip->ino_data.size);
2461         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2462                 bytes = bp->b_bufsize;
2463         else
2464                 bytes = ((int)ip->ino_data.size + 15) & ~15;
2465
2466         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2467                                     bytes, &error);
2468         if (record) {
2469                 hammer_io_direct_write(hmp, &record->leaf, bio);
2470                 hammer_rel_mem_record(record);
2471                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2472                         hammer_flush_inode(ip, 0);
2473         } else {
2474                 bp->b_bio2.bio_offset = NOOFFSET;
2475                 bp->b_error = error;
2476                 bp->b_flags |= B_ERROR;
2477                 biodone(ap->a_bio);
2478         }
2479         return(error);
2480 }
2481
2482 /*
2483  * dounlink - disconnect a directory entry
2484  *
2485  * XXX whiteout support not really in yet
2486  */
2487 static int
2488 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2489                 struct vnode *dvp, struct ucred *cred, int flags)
2490 {
2491         struct namecache *ncp;
2492         hammer_inode_t dip;
2493         hammer_inode_t ip;
2494         struct hammer_cursor cursor;
2495         int64_t namekey;
2496         int nlen, error;
2497
2498         /*
2499          * Calculate the namekey and setup the key range for the scan.  This
2500          * works kinda like a chained hash table where the lower 32 bits
2501          * of the namekey synthesize the chain.
2502          *
2503          * The key range is inclusive of both key_beg and key_end.
2504          */
2505         dip = VTOI(dvp);
2506         ncp = nch->ncp;
2507
2508         if (dip->flags & HAMMER_INODE_RO)
2509                 return (EROFS);
2510
2511         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2512 retry:
2513         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2514         cursor.key_beg.localization = dip->obj_localization +
2515                                       HAMMER_LOCALIZE_MISC;
2516         cursor.key_beg.obj_id = dip->obj_id;
2517         cursor.key_beg.key = namekey;
2518         cursor.key_beg.create_tid = 0;
2519         cursor.key_beg.delete_tid = 0;
2520         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2521         cursor.key_beg.obj_type = 0;
2522
2523         cursor.key_end = cursor.key_beg;
2524         cursor.key_end.key |= 0xFFFFFFFFULL;
2525         cursor.asof = dip->obj_asof;
2526         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2527
2528         /*
2529          * Scan all matching records (the chain), locate the one matching
2530          * the requested path component.  info->last_error contains the
2531          * error code on search termination and could be 0, ENOENT, or
2532          * something else.
2533          *
2534          * The hammer_ip_*() functions merge in-memory records with on-disk
2535          * records for the purposes of the search.
2536          */
2537         error = hammer_ip_first(&cursor);
2538
2539         while (error == 0) {
2540                 error = hammer_ip_resolve_data(&cursor);
2541                 if (error)
2542                         break;
2543                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2544                 KKASSERT(nlen > 0);
2545                 if (ncp->nc_nlen == nlen &&
2546                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2547                         break;
2548                 }
2549                 error = hammer_ip_next(&cursor);
2550         }
2551
2552         /*
2553          * If all is ok we have to get the inode so we can adjust nlinks.
2554          * To avoid a deadlock with the flusher we must release the inode
2555          * lock on the directory when acquiring the inode for the entry.
2556          *
2557          * If the target is a directory, it must be empty.
2558          */
2559         if (error == 0) {
2560                 hammer_unlock(&cursor.ip->lock);
2561                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2562                                       dip->hmp->asof,
2563                                       cursor.data->entry.localization,
2564                                       0, &error);
2565                 hammer_lock_sh(&cursor.ip->lock);
2566                 if (error == ENOENT) {
2567                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2568                         Debugger("ENOENT unlinking object that should exist");
2569                 }
2570
2571                 /*
2572                  * If we are trying to remove a directory the directory must
2573                  * be empty.
2574                  *
2575                  * WARNING: hammer_ip_check_directory_empty() may have to
2576                  * terminate the cursor to avoid a deadlock.  It is ok to
2577                  * call hammer_done_cursor() twice.
2578                  */
2579                 if (error == 0 && ip->ino_data.obj_type ==
2580                                   HAMMER_OBJTYPE_DIRECTORY) {
2581                         error = hammer_ip_check_directory_empty(trans, ip);
2582                 }
2583
2584                 /*
2585                  * Delete the directory entry.
2586                  *
2587                  * WARNING: hammer_ip_del_directory() may have to terminate
2588                  * the cursor to avoid a deadlock.  It is ok to call
2589                  * hammer_done_cursor() twice.
2590                  */
2591                 if (error == 0) {
2592                         error = hammer_ip_del_directory(trans, &cursor,
2593                                                         dip, ip);
2594                 }
2595                 hammer_done_cursor(&cursor);
2596                 if (error == 0) {
2597                         cache_setunresolved(nch);
2598                         cache_setvp(nch, NULL);
2599                         /* XXX locking */
2600                         if (ip->vp)
2601                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2602                 }
2603                 if (ip)
2604                         hammer_rel_inode(ip, 0);
2605         } else {
2606                 hammer_done_cursor(&cursor);
2607         }
2608         if (error == EDEADLK)
2609                 goto retry;
2610
2611         return (error);
2612 }
2613
2614 /************************************************************************
2615  *                          FIFO AND SPECFS OPS                         *
2616  ************************************************************************
2617  *
2618  */
2619
2620 static int
2621 hammer_vop_fifoclose (struct vop_close_args *ap)
2622 {
2623         /* XXX update itimes */
2624         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2625 }
2626
2627 static int
2628 hammer_vop_fiforead (struct vop_read_args *ap)
2629 {
2630         int error;
2631
2632         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2633         /* XXX update access time */
2634         return (error);
2635 }
2636
2637 static int
2638 hammer_vop_fifowrite (struct vop_write_args *ap)
2639 {
2640         int error;
2641
2642         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2643         /* XXX update access time */
2644         return (error);
2645 }
2646
2647 static int
2648 hammer_vop_specclose (struct vop_close_args *ap)
2649 {
2650         /* XXX update itimes */
2651         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2652 }
2653
2654 static int
2655 hammer_vop_specread (struct vop_read_args *ap)
2656 {
2657         /* XXX update access time */
2658         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2659 }
2660
2661 static int
2662 hammer_vop_specwrite (struct vop_write_args *ap)
2663 {
2664         /* XXX update last change time */
2665         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2666 }
2667