98bdcc1242c75636b3581e01024355449e801043
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.86 2008/07/11 05:44:23 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
51 /*
52  * USERFS VNOPS
53  */
54 /*static int hammer_vop_vnoperate(struct vop_generic_args *);*/
/*
 * Forward declarations for the vnode operations implemented in this file.
 * Each one takes the argument structure of the VOP it services.
 */

/* Operations used by the regular-file/directory vop vector. */
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_pathconf(struct vop_pathconf_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_bmap(struct vop_bmap_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);

/* Overrides installed in the fifo vop vector. */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);

/* Overrides installed in the special-device vop vector. */
static int hammer_vop_specclose(struct vop_close_args *ap);
static int hammer_vop_specread(struct vop_read_args *ap);
static int hammer_vop_specwrite(struct vop_write_args *ap);
91
92 struct vop_ops hammer_vnode_vops = {
93         .vop_default =          vop_defaultop,
94         .vop_fsync =            hammer_vop_fsync,
95         .vop_getpages =         vop_stdgetpages,
96         .vop_putpages =         vop_stdputpages,
97         .vop_read =             hammer_vop_read,
98         .vop_write =            hammer_vop_write,
99         .vop_access =           hammer_vop_access,
100         .vop_advlock =          hammer_vop_advlock,
101         .vop_close =            hammer_vop_close,
102         .vop_ncreate =          hammer_vop_ncreate,
103         .vop_getattr =          hammer_vop_getattr,
104         .vop_inactive =         hammer_vop_inactive,
105         .vop_reclaim =          hammer_vop_reclaim,
106         .vop_nresolve =         hammer_vop_nresolve,
107         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
108         .vop_nlink =            hammer_vop_nlink,
109         .vop_nmkdir =           hammer_vop_nmkdir,
110         .vop_nmknod =           hammer_vop_nmknod,
111         .vop_open =             hammer_vop_open,
112         .vop_pathconf =         hammer_vop_pathconf,
113         .vop_print =            hammer_vop_print,
114         .vop_readdir =          hammer_vop_readdir,
115         .vop_readlink =         hammer_vop_readlink,
116         .vop_nremove =          hammer_vop_nremove,
117         .vop_nrename =          hammer_vop_nrename,
118         .vop_nrmdir =           hammer_vop_nrmdir,
119         .vop_setattr =          hammer_vop_setattr,
120         .vop_bmap =             hammer_vop_bmap,
121         .vop_strategy =         hammer_vop_strategy,
122         .vop_nsymlink =         hammer_vop_nsymlink,
123         .vop_nwhiteout =        hammer_vop_nwhiteout,
124         .vop_ioctl =            hammer_vop_ioctl,
125         .vop_mountctl =         hammer_vop_mountctl
126 };
127
128 struct vop_ops hammer_spec_vops = {
129         .vop_default =          spec_vnoperate,
130         .vop_fsync =            hammer_vop_fsync,
131         .vop_read =             hammer_vop_specread,
132         .vop_write =            hammer_vop_specwrite,
133         .vop_access =           hammer_vop_access,
134         .vop_close =            hammer_vop_specclose,
135         .vop_getattr =          hammer_vop_getattr,
136         .vop_inactive =         hammer_vop_inactive,
137         .vop_reclaim =          hammer_vop_reclaim,
138         .vop_setattr =          hammer_vop_setattr
139 };
140
141 struct vop_ops hammer_fifo_vops = {
142         .vop_default =          fifo_vnoperate,
143         .vop_fsync =            hammer_vop_fsync,
144         .vop_read =             hammer_vop_fiforead,
145         .vop_write =            hammer_vop_fifowrite,
146         .vop_access =           hammer_vop_access,
147         .vop_close =            hammer_vop_fifoclose,
148         .vop_getattr =          hammer_vop_getattr,
149         .vop_inactive =         hammer_vop_inactive,
150         .vop_reclaim =          hammer_vop_reclaim,
151         .vop_setattr =          hammer_vop_setattr
152 };
153
154 #ifdef DEBUG_TRUNCATE
155 struct hammer_inode *HammerTruncIp;
156 #endif
157
158 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
159                            struct vnode *dvp, struct ucred *cred, int flags);
160 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
161 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
162
#if 0
/*
 * Disabled generic dispatcher: forwards an arbitrary vop call through
 * hammer_vnode_vops.  The parameter must be named because the body
 * references it; a C function definition cannot omit the identifier.
 */
static
int
hammer_vop_vnoperate(struct vop_generic_args *ap)
{
        return (VOCALL(&hammer_vnode_vops, ap));
}
#endif
171
172 /*
173  * hammer_vop_fsync { vp, waitfor }
174  *
175  * fsync() an inode to disk and wait for it to be completely committed
176  * such that the information would not be undone if a crash occured after
177  * return.
178  */
179 static
180 int
181 hammer_vop_fsync(struct vop_fsync_args *ap)
182 {
183         hammer_inode_t ip = VTOI(ap->a_vp);
184
185         vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
186         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
187         if (ap->a_waitfor == MNT_WAIT)
188                 hammer_wait_inode(ip);
189         return (ip->error);
190 }
191
192 /*
193  * hammer_vop_read { vp, uio, ioflag, cred }
194  */
195 static
196 int
197 hammer_vop_read(struct vop_read_args *ap)
198 {
199         struct hammer_transaction trans;
200         hammer_inode_t ip;
201         off_t offset;
202         struct buf *bp;
203         struct uio *uio;
204         int error;
205         int n;
206         int seqcount;
207         int ioseqcount;
208         int blksize;
209
210         if (ap->a_vp->v_type != VREG)
211                 return (EINVAL);
212         ip = VTOI(ap->a_vp);
213         error = 0;
214         uio = ap->a_uio;
215
216         /*
217          * Allow the UIO's size to override the sequential heuristic.
218          */
219         blksize = hammer_blocksize(uio->uio_offset);
220         seqcount = (uio->uio_resid + (blksize - 1)) / blksize;
221         ioseqcount = ap->a_ioflag >> 16;
222         if (seqcount < ioseqcount)
223                 seqcount = ioseqcount;
224
225         hammer_start_transaction(&trans, ip->hmp);
226
227         /*
228          * Access the data typically in HAMMER_BUFSIZE blocks via the
229          * buffer cache, but HAMMER may use a variable block size based
230          * on the offset.
231          */
232         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
233                 int64_t base_offset;
234                 int64_t file_limit;
235
236                 blksize = hammer_blocksize(uio->uio_offset);
237                 offset = (int)uio->uio_offset & (blksize - 1);
238                 base_offset = uio->uio_offset - offset;
239
240                 if (hammer_debug_cluster_enable) {
241                         /*
242                          * Use file_limit to prevent cluster_read() from
243                          * creating buffers of the wrong block size past
244                          * the demarc.
245                          */
246                         file_limit = ip->ino_data.size;
247                         if (base_offset < HAMMER_XDEMARC &&
248                             file_limit > HAMMER_XDEMARC) {
249                                 file_limit = HAMMER_XDEMARC;
250                         }
251                         error = cluster_read(ap->a_vp,
252                                              file_limit, base_offset,
253                                              blksize, MAXPHYS,
254                                              seqcount, &bp);
255                 } else {
256                         error = bread(ap->a_vp, base_offset, blksize, &bp);
257                 }
258                 if (error) {
259                         kprintf("error %d\n", error);
260                         brelse(bp);
261                         break;
262                 }
263
264                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
265                 n = blksize - offset;
266                 if (n > uio->uio_resid)
267                         n = uio->uio_resid;
268                 if (n > ip->ino_data.size - uio->uio_offset)
269                         n = (int)(ip->ino_data.size - uio->uio_offset);
270                 error = uiomove((char *)bp->b_data + offset, n, uio);
271
272                 /* data has a lower priority then meta-data */
273                 bp->b_flags |= B_AGE;
274                 bqrelse(bp);
275                 if (error)
276                         break;
277         }
278         if ((ip->flags & HAMMER_INODE_RO) == 0 &&
279             (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
280                 ip->ino_data.atime = trans.time;
281                 hammer_modify_inode(ip, HAMMER_INODE_ATIME);
282         }
283         hammer_done_transaction(&trans);
284         return (error);
285 }
286
287 /*
288  * hammer_vop_write { vp, uio, ioflag, cred }
 *
 * Copy the caller's uio into a regular file through the buffer cache,
 * one block at a time.
 *
 * Returns EINVAL if the vnode is not VREG, EROFS if the inode carries
 * HAMMER_INODE_RO, and EFBIG if the starting offset is negative or the
 * end offset (offset + resid) is not positive.  Otherwise returns 0 or
 * the first error from hammer_checkspace(), bread() or uiomove().
 *
 * Each loop pass checks free space, applies write throttling, obtains
 * the buffer for the current block, copies the data in, updates the
 * inode size/mtime and finally issues the buffer with bwrite(),
 * bawrite() or bdwrite() depending on IO_SYNC / IO_DIRECT.
289  */
290 static
291 int
292 hammer_vop_write(struct vop_write_args *ap)
293 {
294         struct hammer_transaction trans;
295         struct hammer_inode *ip;
296         hammer_mount_t hmp;
297         struct uio *uio;
298         int offset;
299         off_t base_offset;
300         struct buf *bp;
301         int error;
302         int n;
303         int flags;
304         int delta;
305         int seqcount;
306
307         if (ap->a_vp->v_type != VREG)
308                 return (EINVAL);
309         ip = VTOI(ap->a_vp);
310         hmp = ip->hmp;
311         error = 0;
        /* NOTE(review): seqcount is assigned here but never read again in this function. */
312         seqcount = ap->a_ioflag >> 16;
313
314         if (ip->flags & HAMMER_INODE_RO)
315                 return (EROFS);
316
317         /*
318          * Create a transaction to cover the operations we perform.
          * Every return path below this point must call
          * hammer_done_transaction().
319          */
320         hammer_start_transaction(&trans, hmp);
321         uio = ap->a_uio;
322
323         /*
324          * Check append mode: IO_APPEND forces the write to start at the
          * inode's current size.
325          */
326         if (ap->a_ioflag & IO_APPEND)
327                 uio->uio_offset = ip->ino_data.size;
328
329         /*
330          * Check for illegal write offsets.  Valid range is 0...2^63-1.
331          *
332          * NOTE: the base_offset assignment is required to work around what
333          * I consider to be a GCC-4 optimization bug.
334          */
335         if (uio->uio_offset < 0) {
336                 hammer_done_transaction(&trans);
337                 return (EFBIG);
338         }
339         base_offset = uio->uio_offset + uio->uio_resid; /* work around gcc-4 */
340         if (uio->uio_resid > 0 && base_offset <= 0) {
341                 hammer_done_transaction(&trans);
342                 return (EFBIG);
343         }
344
345         /*
346          * Access the data typically in HAMMER_BUFSIZE blocks via the
347          * buffer cache, but HAMMER may use a variable block size based
348          * on the offset.
349          */
350         while (uio->uio_resid > 0) {
                /* fixsize: set when this pass grew the VM object past ino_data.size */
351                 int fixsize = 0;
352                 int blksize;
353                 int blkmask;
354
355                 if ((error = hammer_checkspace(hmp, HAMMER_CHKSPC_WRITE)) != 0)
356                         break;
357
358                 blksize = hammer_blocksize(uio->uio_offset);
359
360                 /*
361                  * Do not allow HAMMER to blow out the buffer cache.  Very
362                  * large UIOs can lockout other processes due to bwillwrite()
363                  * mechanics.
364                  *
365                  * The hammer inode is not locked during these operations.
366                  * The vnode is locked which can interfere with the pageout
367                  * daemon for non-UIO_NOCOPY writes but should not interfere
368                  * with the buffer cache.  Even so, we cannot afford to
369                  * allow the pageout daemon to build up too many dirty buffer
370                  * cache buffers.
371                  */
372                 /*if (((int)uio->uio_offset & (blksize - 1)) == 0)*/
373                 bwillwrite(blksize);
374
375                 /*
376                  * Do not allow HAMMER to blow out system memory by
377                  * accumulating too many records.   Records are so well
378                  * decoupled from the buffer cache that it is possible
379                  * for userland to push data out to the media via
380                  * direct-write, but build up the records queued to the
381                  * backend faster than the backend can flush them out.
382                  * HAMMER has hit its write limit but the frontend has
383                  * no pushback to slow it down.
384                  */
385                 if (hmp->rsv_recs > hammer_limit_recs / 2) {
386                         /*
387                          * Get the inode on the flush list
388                          */
389                         if (ip->rsv_recs >= 64)
390                                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
391                         else if (ip->rsv_recs >= 16)
392                                 hammer_flush_inode(ip, 0);
393
394                         /*
395                          * Keep the flusher going if the system keeps
396                          * queueing records.
397                          */
398                         delta = hmp->count_newrecords -
399                                 hmp->last_newrecords;
400                         if (delta < 0 || delta > hammer_limit_recs / 2) {
401                                 hmp->last_newrecords = hmp->count_newrecords;
402                                 hammer_sync_hmp(hmp, MNT_NOWAIT);
403                         }
404
405                         /*
406                          * If we have gotten behind start slowing
407                          * down the writers.  The delay is only positive
                          * once rsv_recs exceeds hammer_limit_recs, and it
                          * grows with the overage (scaled by hz).
408                          */
409                         delta = (hmp->rsv_recs - hammer_limit_recs) *
410                                 hz / hammer_limit_recs;
411                         if (delta > 0)
412                                 tsleep(&trans, 0, "hmrslo", delta);
413                 }
414
415                 /*
416                  * Calculate the blocksize at the current offset and figure
417                  * out how much we can actually write.
                  *
                  * offset is the position within the block, base_offset the
                  * block-aligned file offset, n the byte count for this pass.
418                  */
419                 blkmask = blksize - 1;
420                 offset = (int)uio->uio_offset & blkmask;
421                 base_offset = uio->uio_offset & ~(int64_t)blkmask;
422                 n = blksize - offset;
423                 if (n > uio->uio_resid)
424                         n = uio->uio_resid;
                /*
                 * Extending write: grow the VM object before touching the
                 * buffer and remember that so the error path can undo it.
                 */
425                 if (uio->uio_offset + n > ip->ino_data.size) {
426                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
427                         fixsize = 1;
428                 }
429
430                 if (uio->uio_segflg == UIO_NOCOPY) {
431                         /*
432                          * Issuing a write with the same data backing the
433                          * buffer.  Instantiate the buffer to collect the
434                          * backing vm pages, then read-in any missing bits.
435                          *
436                          * This case is used by vop_stdputpages().
437                          */
438                         bp = getblk(ap->a_vp, base_offset,
439                                     blksize, GETBLK_BHEAVY, 0);
440                         if ((bp->b_flags & B_CACHE) == 0) {
441                                 bqrelse(bp);
442                                 error = bread(ap->a_vp, base_offset,
443                                               blksize, &bp);
444                         }
445                 } else if (offset == 0 && uio->uio_resid >= blksize) {
446                         /*
447                          * Even though we are entirely overwriting the buffer
448                          * we may still have to zero it out to avoid a
449                          * mmap/write visibility issue.
450                          */
451                         bp = getblk(ap->a_vp, base_offset, blksize, GETBLK_BHEAVY, 0);
452                         if ((bp->b_flags & B_CACHE) == 0)
453                                 vfs_bio_clrbuf(bp);
454                 } else if (base_offset >= ip->ino_data.size) {
455                         /*
456                          * If the base offset of the buffer is beyond the
457                          * file EOF, we don't have to issue a read.
458                          */
459                         bp = getblk(ap->a_vp, base_offset,
460                                     blksize, GETBLK_BHEAVY, 0);
461                         vfs_bio_clrbuf(bp);
462                 } else {
463                         /*
464                          * Partial overwrite, read in any missing bits then
465                          * replace the portion being written.
466                          */
467                         error = bread(ap->a_vp, base_offset, blksize, &bp);
468                         if (error == 0)
469                                 bheavy(bp);
470                 }
                /* Copy the caller's data in only if the buffer was obtained cleanly. */
471                 if (error == 0) {
472                         error = uiomove((char *)bp->b_data + offset,
473                                         n, uio);
474                 }
475
476                 /*
477                  * If we screwed up we have to undo any VM size changes we
478                  * made.  The buffer is released and, if this pass grew the
                  * VM object, it is truncated back to the inode's size.
479                  */
480                 if (error) {
481                         brelse(bp);
482                         if (fixsize) {
483                                 vtruncbuf(ap->a_vp, ip->ino_data.size,
484                                           hammer_blocksize(ip->ino_data.size));
485                         }
486                         break;
487                 }
488                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
                /*
                 * Grow the recorded file size if the write went past it.
                 * NOTE(review): relies on uiomove() having advanced
                 * uio_offset past the bytes just copied -- confirm.
                 */
489                 if (ip->ino_data.size < uio->uio_offset) {
490                         ip->ino_data.size = uio->uio_offset;
491                         flags = HAMMER_INODE_DDIRTY;
492                         vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
493                 } else {
494                         flags = 0;
495                 }
496                 ip->ino_data.mtime = trans.time;
497                 flags |= HAMMER_INODE_MTIME | HAMMER_INODE_BUFS;
498                 hammer_modify_inode(ip, flags);
499
500                 /*
501                  * Final buffer disposition: IO_SYNC uses bwrite(),
                  * IO_DIRECT uses bawrite(), everything else bdwrite().
502                  */
503                 bp->b_flags |= B_AGE;
504                 if (ap->a_ioflag & IO_SYNC) {
505                         bwrite(bp);
506                 } else if (ap->a_ioflag & IO_DIRECT) {
507                         bawrite(bp);
508                 } else {
509                         bdwrite(bp);
510                 }
511         }
512         hammer_done_transaction(&trans);
513         return (error);
514 }
515
516 /*
517  * hammer_vop_access { vp, mode, cred }
518  */
519 static
520 int
521 hammer_vop_access(struct vop_access_args *ap)
522 {
523         struct hammer_inode *ip = VTOI(ap->a_vp);
524         uid_t uid;
525         gid_t gid;
526         int error;
527
528         uid = hammer_to_unix_xid(&ip->ino_data.uid);
529         gid = hammer_to_unix_xid(&ip->ino_data.gid);
530
531         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
532                                   ip->ino_data.uflags);
533         return (error);
534 }
535
536 /*
537  * hammer_vop_advlock { vp, id, op, fl, flags }
538  */
539 static
540 int
541 hammer_vop_advlock(struct vop_advlock_args *ap)
542 {
543         hammer_inode_t ip = VTOI(ap->a_vp);
544
545         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
546 }
547
548 /*
549  * hammer_vop_close { vp, fflag }
550  */
551 static
552 int
553 hammer_vop_close(struct vop_close_args *ap)
554 {
555         hammer_inode_t ip = VTOI(ap->a_vp);
556
557         if ((ip->flags | ip->sync_flags) & HAMMER_INODE_MODMASK)
558                 hammer_inode_waitreclaims(ip->hmp);
559         return (vop_stdclose(ap));
560 }
561
562 /*
563  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
564  *
565  * The operating system has already ensured that the directory entry
566  * does not exist and done all appropriate namespace locking.
567  */
568 static
569 int
570 hammer_vop_ncreate(struct vop_ncreate_args *ap)
571 {
572         struct hammer_transaction trans;
573         struct hammer_inode *dip;
574         struct hammer_inode *nip;
575         struct nchandle *nch;
576         int error;
577
578         nch = ap->a_nch;
579         dip = VTOI(ap->a_dvp);
580
581         if (dip->flags & HAMMER_INODE_RO)
582                 return (EROFS);
583         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
584                 return (error);
585
586         /*
587          * Create a transaction to cover the operations we perform.
588          */
589         hammer_start_transaction(&trans, dip->hmp);
590
591         /*
592          * Create a new filesystem object of the requested type.  The
593          * returned inode will be referenced and shared-locked to prevent
594          * it from being moved to the flusher.
595          */
596
597         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
598                                     dip, NULL, &nip);
599         if (error) {
600                 hkprintf("hammer_create_inode error %d\n", error);
601                 hammer_done_transaction(&trans);
602                 *ap->a_vpp = NULL;
603                 return (error);
604         }
605
606         /*
607          * Add the new filesystem object to the directory.  This will also
608          * bump the inode's link count.
609          */
610         error = hammer_ip_add_directory(&trans, dip,
611                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
612                                         nip);
613         if (error)
614                 hkprintf("hammer_ip_add_directory error %d\n", error);
615
616         /*
617          * Finish up.
618          */
619         if (error) {
620                 hammer_rel_inode(nip, 0);
621                 hammer_done_transaction(&trans);
622                 *ap->a_vpp = NULL;
623         } else {
624                 error = hammer_get_vnode(nip, ap->a_vpp);
625                 hammer_done_transaction(&trans);
626                 hammer_rel_inode(nip, 0);
627                 if (error == 0) {
628                         cache_setunresolved(ap->a_nch);
629                         cache_setvp(ap->a_nch, *ap->a_vpp);
630                 }
631         }
632         return (error);
633 }
634
635 /*
636  * hammer_vop_getattr { vp, vap }
637  *
638  * Retrieve an inode's attribute information.  When accessing inodes
639  * historically we fake the atime field to ensure consistent results.
640  * The atime field is stored in the B-Tree element and allowed to be
641  * updated without cycling the element.
642  */
643 static
644 int
645 hammer_vop_getattr(struct vop_getattr_args *ap)
646 {
647         struct hammer_inode *ip = VTOI(ap->a_vp);
648         struct vattr *vap = ap->a_vap;
649
650         /*
651          * We want the fsid to be different when accessing a filesystem
652          * with different as-of's so programs like diff don't think
653          * the files are the same.
654          *
655          * We also want the fsid to be the same when comparing snapshots,
656          * or when comparing mirrors (which might be backed by different
657          * physical devices).  HAMMER fsids are based on the PFS's
658          * shared_uuid field.
659          *
660          * XXX there is a chance of collision here.  The va_fsid reported
661          * by stat is different from the more involved fsid used in the
662          * mount structure.
663          */
664         vap->va_fsid = ip->pfsm->fsid_udev ^ (u_int32_t)ip->obj_asof ^
665                        (u_int32_t)(ip->obj_asof >> 32);
666
667         vap->va_fileid = ip->ino_leaf.base.obj_id;
668         vap->va_mode = ip->ino_data.mode;
669         vap->va_nlink = ip->ino_data.nlinks;
670         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
671         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
672         vap->va_rmajor = 0;
673         vap->va_rminor = 0;
674         vap->va_size = ip->ino_data.size;
675
676         /*
677          * We must provide a consistent atime and mtime for snapshots
678          * so people can do a 'tar cf - ... | md5' on them and get
679          * consistent results.
680          */
681         if (ip->flags & HAMMER_INODE_RO) {
682                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_atime);
683                 hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_mtime);
684         } else {
685                 hammer_time_to_timespec(ip->ino_data.atime, &vap->va_atime);
686                 hammer_time_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
687         }
688         hammer_time_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
689         vap->va_flags = ip->ino_data.uflags;
690         vap->va_gen = 1;        /* hammer inums are unique for all time */
691         vap->va_blocksize = HAMMER_BUFSIZE;
692         if (ip->ino_data.size >= HAMMER_XDEMARC) {
693                 vap->va_bytes = (ip->ino_data.size + HAMMER_XBUFMASK64) &
694                                 ~HAMMER_XBUFMASK64;
695         } else if (ip->ino_data.size > HAMMER_BUFSIZE / 2) {
696                 vap->va_bytes = (ip->ino_data.size + HAMMER_BUFMASK64) &
697                                 ~HAMMER_BUFMASK64;
698         } else {
699                 vap->va_bytes = (ip->ino_data.size + 15) & ~15;
700         }
701         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
702         vap->va_filerev = 0;    /* XXX */
703         /* mtime uniquely identifies any adjustments made to the file XXX */
704         vap->va_fsmid = ip->ino_data.mtime;
705         vap->va_uid_uuid = ip->ino_data.uid;
706         vap->va_gid_uuid = ip->ino_data.gid;
707         vap->va_fsid_uuid = ip->hmp->fsid;
708         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
709                           VA_FSID_UUID_VALID;
710
711         switch (ip->ino_data.obj_type) {
712         case HAMMER_OBJTYPE_CDEV:
713         case HAMMER_OBJTYPE_BDEV:
714                 vap->va_rmajor = ip->ino_data.rmajor;
715                 vap->va_rminor = ip->ino_data.rminor;
716                 break;
717         default:
718                 break;
719         }
720         return(0);
721 }
722
723 /*
724  * hammer_vop_nresolve { nch, dvp, cred }
725  *
726  * Locate the requested directory entry.
727  */
728 static
729 int
730 hammer_vop_nresolve(struct vop_nresolve_args *ap)
731 {
732         struct hammer_transaction trans;
733         struct namecache *ncp;
734         hammer_inode_t dip;
735         hammer_inode_t ip;
736         hammer_tid_t asof;
737         struct hammer_cursor cursor;
738         struct vnode *vp;
739         int64_t namekey;
740         int error;
741         int i;
742         int nlen;
743         int flags;
744         int ispfs;
745         int64_t obj_id;
746         u_int32_t localization;
747
748         /*
749          * Misc initialization, plus handle as-of name extensions.  Look for
750          * the '@@' extension.  Note that as-of files and directories cannot
751          * be modified.
752          */
753         dip = VTOI(ap->a_dvp);
754         ncp = ap->a_nch->ncp;
755         asof = dip->obj_asof;
756         nlen = ncp->nc_nlen;
757         flags = dip->flags & HAMMER_INODE_RO;
758         ispfs = 0;
759
760         hammer_simple_transaction(&trans, dip->hmp);
761
762         for (i = 0; i < nlen; ++i) {
763                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
764                         asof = hammer_str_to_tid(ncp->nc_name + i + 2,
765                                                  &ispfs, &localization);
766                         if (asof != HAMMER_MAX_TID)
767                                 flags |= HAMMER_INODE_RO;
768                         break;
769                 }
770         }
771         nlen = i;
772
773         /*
774          * If this is a PFS softlink we dive into the PFS
775          */
776         if (ispfs && nlen == 0) {
777                 ip = hammer_get_inode(&trans, dip, HAMMER_OBJID_ROOT,
778                                       asof, localization,
779                                       flags, &error);
780                 if (error == 0) {
781                         error = hammer_get_vnode(ip, &vp);
782                         hammer_rel_inode(ip, 0);
783                 } else {
784                         vp = NULL;
785                 }
786                 if (error == 0) {
787                         vn_unlock(vp);
788                         cache_setvp(ap->a_nch, vp);
789                         vrele(vp);
790                 }
791                 goto done;
792         }
793
794         /*
795          * If there is no path component the time extension is relative to
796          * dip.
797          */
798         if (nlen == 0) {
799                 ip = hammer_get_inode(&trans, dip, dip->obj_id,
800                                       asof, dip->obj_localization,
801                                       flags, &error);
802                 if (error == 0) {
803                         error = hammer_get_vnode(ip, &vp);
804                         hammer_rel_inode(ip, 0);
805                 } else {
806                         vp = NULL;
807                 }
808                 if (error == 0) {
809                         vn_unlock(vp);
810                         cache_setvp(ap->a_nch, vp);
811                         vrele(vp);
812                 }
813                 goto done;
814         }
815
816         /*
817          * Calculate the namekey and setup the key range for the scan.  This
818          * works kinda like a chained hash table where the lower 32 bits
819          * of the namekey synthesize the chain.
820          *
821          * The key range is inclusive of both key_beg and key_end.
822          */
823         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
824
825         error = hammer_init_cursor(&trans, &cursor, &dip->cache[1], dip);
826         cursor.key_beg.localization = dip->obj_localization +
827                                       HAMMER_LOCALIZE_MISC;
828         cursor.key_beg.obj_id = dip->obj_id;
829         cursor.key_beg.key = namekey;
830         cursor.key_beg.create_tid = 0;
831         cursor.key_beg.delete_tid = 0;
832         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
833         cursor.key_beg.obj_type = 0;
834
835         cursor.key_end = cursor.key_beg;
836         cursor.key_end.key |= 0xFFFFFFFFULL;
837         cursor.asof = asof;
838         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
839
840         /*
841          * Scan all matching records (the chain), locate the one matching
842          * the requested path component.
843          *
844          * The hammer_ip_*() functions merge in-memory records with on-disk
845          * records for the purposes of the search.
846          */
847         obj_id = 0;
848         localization = HAMMER_DEF_LOCALIZATION;
849
850         if (error == 0) {
851                 error = hammer_ip_first(&cursor);
852                 while (error == 0) {
853                         error = hammer_ip_resolve_data(&cursor);
854                         if (error)
855                                 break;
856                         if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
857                             bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
858                                 obj_id = cursor.data->entry.obj_id;
859                                 localization = cursor.data->entry.localization;
860                                 break;
861                         }
862                         error = hammer_ip_next(&cursor);
863                 }
864         }
865         hammer_done_cursor(&cursor);
866         if (error == 0) {
867                 ip = hammer_get_inode(&trans, dip, obj_id,
868                                       asof, localization,
869                                       flags, &error);
870                 if (error == 0) {
871                         error = hammer_get_vnode(ip, &vp);
872                         hammer_rel_inode(ip, 0);
873                 } else {
874                         vp = NULL;
875                 }
876                 if (error == 0) {
877                         vn_unlock(vp);
878                         cache_setvp(ap->a_nch, vp);
879                         vrele(vp);
880                 }
881         } else if (error == ENOENT) {
882                 cache_setvp(ap->a_nch, NULL);
883         }
884 done:
885         hammer_done_transaction(&trans);
886         return (error);
887 }
888
889 /*
890  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
891  *
892  * Locate the parent directory of a directory vnode.
893  *
894  * dvp is referenced but not locked.  *vpp must be returned referenced and
895  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
896  * at the root, instead it could indicate that the directory we were in was
897  * removed.
898  *
899  * NOTE: as-of sequences are not linked into the directory structure.  If
900  * we are at the root with a different asof then the mount point, reload
901  * the same directory with the mount point's asof.   I'm not sure what this
902  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
903  * get confused, but it hasn't been tested.
904  */
905 static
906 int
907 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
908 {
909         struct hammer_transaction trans;
910         struct hammer_inode *dip;
911         struct hammer_inode *ip;
912         int64_t parent_obj_id;
913         u_int32_t parent_obj_localization;
914         hammer_tid_t asof;
915         int error;
916
917         dip = VTOI(ap->a_dvp);
918         asof = dip->obj_asof;
919
920         /*
921          * Whos are parent?  This could be the root of a pseudo-filesystem
922          * whos parent is in another localization domain.
923          */
924         parent_obj_id = dip->ino_data.parent_obj_id;
925         if (dip->obj_id == HAMMER_OBJID_ROOT)
926                 parent_obj_localization = dip->ino_data.ext.obj.parent_obj_localization;
927         else
928                 parent_obj_localization = dip->obj_localization;
929
930         if (parent_obj_id == 0) {
931                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
932                    asof != dip->hmp->asof) {
933                         parent_obj_id = dip->obj_id;
934                         asof = dip->hmp->asof;
935                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
936                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
937                                    dip->obj_asof);
938                 } else {
939                         *ap->a_vpp = NULL;
940                         return ENOENT;
941                 }
942         }
943
944         hammer_simple_transaction(&trans, dip->hmp);
945
946         ip = hammer_get_inode(&trans, dip, parent_obj_id,
947                               asof, parent_obj_localization,
948                               dip->flags, &error);
949         if (ip) {
950                 error = hammer_get_vnode(ip, ap->a_vpp);
951                 hammer_rel_inode(ip, 0);
952         } else {
953                 *ap->a_vpp = NULL;
954         }
955         hammer_done_transaction(&trans);
956         return (error);
957 }
958
959 /*
960  * hammer_vop_nlink { nch, dvp, vp, cred }
961  */
962 static
963 int
964 hammer_vop_nlink(struct vop_nlink_args *ap)
965 {
966         struct hammer_transaction trans;
967         struct hammer_inode *dip;
968         struct hammer_inode *ip;
969         struct nchandle *nch;
970         int error;
971
972         nch = ap->a_nch;
973         dip = VTOI(ap->a_dvp);
974         ip = VTOI(ap->a_vp);
975
976         if (dip->flags & HAMMER_INODE_RO)
977                 return (EROFS);
978         if (ip->flags & HAMMER_INODE_RO)
979                 return (EROFS);
980         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
981                 return (error);
982
983         /*
984          * Create a transaction to cover the operations we perform.
985          */
986         hammer_start_transaction(&trans, dip->hmp);
987
988         /*
989          * Add the filesystem object to the directory.  Note that neither
990          * dip nor ip are referenced or locked, but their vnodes are
991          * referenced.  This function will bump the inode's link count.
992          */
993         error = hammer_ip_add_directory(&trans, dip,
994                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
995                                         ip);
996
997         /*
998          * Finish up.
999          */
1000         if (error == 0) {
1001                 cache_setunresolved(nch);
1002                 cache_setvp(nch, ap->a_vp);
1003         }
1004         hammer_done_transaction(&trans);
1005         return (error);
1006 }
1007
1008 /*
1009  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
1010  *
1011  * The operating system has already ensured that the directory entry
1012  * does not exist and done all appropriate namespace locking.
1013  */
1014 static
1015 int
1016 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
1017 {
1018         struct hammer_transaction trans;
1019         struct hammer_inode *dip;
1020         struct hammer_inode *nip;
1021         struct nchandle *nch;
1022         int error;
1023
1024         nch = ap->a_nch;
1025         dip = VTOI(ap->a_dvp);
1026
1027         if (dip->flags & HAMMER_INODE_RO)
1028                 return (EROFS);
1029         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1030                 return (error);
1031
1032         /*
1033          * Create a transaction to cover the operations we perform.
1034          */
1035         hammer_start_transaction(&trans, dip->hmp);
1036
1037         /*
1038          * Create a new filesystem object of the requested type.  The
1039          * returned inode will be referenced but not locked.
1040          */
1041         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1042                                     dip, NULL, &nip);
1043         if (error) {
1044                 hkprintf("hammer_mkdir error %d\n", error);
1045                 hammer_done_transaction(&trans);
1046                 *ap->a_vpp = NULL;
1047                 return (error);
1048         }
1049         /*
1050          * Add the new filesystem object to the directory.  This will also
1051          * bump the inode's link count.
1052          */
1053         error = hammer_ip_add_directory(&trans, dip,
1054                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1055                                         nip);
1056         if (error)
1057                 hkprintf("hammer_mkdir (add) error %d\n", error);
1058
1059         /*
1060          * Finish up.
1061          */
1062         if (error) {
1063                 hammer_rel_inode(nip, 0);
1064                 *ap->a_vpp = NULL;
1065         } else {
1066                 error = hammer_get_vnode(nip, ap->a_vpp);
1067                 hammer_rel_inode(nip, 0);
1068                 if (error == 0) {
1069                         cache_setunresolved(ap->a_nch);
1070                         cache_setvp(ap->a_nch, *ap->a_vpp);
1071                 }
1072         }
1073         hammer_done_transaction(&trans);
1074         return (error);
1075 }
1076
1077 /*
1078  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
1079  *
1080  * The operating system has already ensured that the directory entry
1081  * does not exist and done all appropriate namespace locking.
1082  */
1083 static
1084 int
1085 hammer_vop_nmknod(struct vop_nmknod_args *ap)
1086 {
1087         struct hammer_transaction trans;
1088         struct hammer_inode *dip;
1089         struct hammer_inode *nip;
1090         struct nchandle *nch;
1091         int error;
1092
1093         nch = ap->a_nch;
1094         dip = VTOI(ap->a_dvp);
1095
1096         if (dip->flags & HAMMER_INODE_RO)
1097                 return (EROFS);
1098         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1099                 return (error);
1100
1101         /*
1102          * Create a transaction to cover the operations we perform.
1103          */
1104         hammer_start_transaction(&trans, dip->hmp);
1105
1106         /*
1107          * Create a new filesystem object of the requested type.  The
1108          * returned inode will be referenced but not locked.
1109          *
1110          * If mknod specifies a directory a pseudo-fs is created.
1111          */
1112         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1113                                     dip, NULL, &nip);
1114         if (error) {
1115                 hammer_done_transaction(&trans);
1116                 *ap->a_vpp = NULL;
1117                 return (error);
1118         }
1119
1120         /*
1121          * Add the new filesystem object to the directory.  This will also
1122          * bump the inode's link count.
1123          */
1124         error = hammer_ip_add_directory(&trans, dip,
1125                                         nch->ncp->nc_name, nch->ncp->nc_nlen,
1126                                         nip);
1127
1128         /*
1129          * Finish up.
1130          */
1131         if (error) {
1132                 hammer_rel_inode(nip, 0);
1133                 *ap->a_vpp = NULL;
1134         } else {
1135                 error = hammer_get_vnode(nip, ap->a_vpp);
1136                 hammer_rel_inode(nip, 0);
1137                 if (error == 0) {
1138                         cache_setunresolved(ap->a_nch);
1139                         cache_setvp(ap->a_nch, *ap->a_vpp);
1140                 }
1141         }
1142         hammer_done_transaction(&trans);
1143         return (error);
1144 }
1145
1146 /*
1147  * hammer_vop_open { vp, mode, cred, fp }
1148  */
1149 static
1150 int
1151 hammer_vop_open(struct vop_open_args *ap)
1152 {
1153         hammer_inode_t ip;
1154
1155         ip = VTOI(ap->a_vp);
1156
1157         if ((ap->a_mode & FWRITE) && (ip->flags & HAMMER_INODE_RO))
1158                 return (EROFS);
1159         return(vop_stdopen(ap));
1160 }
1161
/*
 * hammer_vop_pathconf { vp, name, retval }
 *
 * Not implemented: every query is answered with EOPNOTSUPP.
 */
static
int
hammer_vop_pathconf(struct vop_pathconf_args *ap)
{
	return (EOPNOTSUPP);
}
1171
/*
 * hammer_vop_print { vp }
 *
 * Not implemented: always answers EOPNOTSUPP.
 */
static
int
hammer_vop_print(struct vop_print_args *ap)
{
	return (EOPNOTSUPP);
}
1181
1182 /*
1183  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
1184  */
1185 static
1186 int
1187 hammer_vop_readdir(struct vop_readdir_args *ap)
1188 {
1189         struct hammer_transaction trans;
1190         struct hammer_cursor cursor;
1191         struct hammer_inode *ip;
1192         struct uio *uio;
1193         hammer_base_elm_t base;
1194         int error;
1195         int cookie_index;
1196         int ncookies;
1197         off_t *cookies;
1198         off_t saveoff;
1199         int r;
1200         int dtype;
1201
1202         ip = VTOI(ap->a_vp);
1203         uio = ap->a_uio;
1204         saveoff = uio->uio_offset;
1205
1206         if (ap->a_ncookies) {
1207                 ncookies = uio->uio_resid / 16 + 1;
1208                 if (ncookies > 1024)
1209                         ncookies = 1024;
1210                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1211                 cookie_index = 0;
1212         } else {
1213                 ncookies = -1;
1214                 cookies = NULL;
1215                 cookie_index = 0;
1216         }
1217
1218         hammer_simple_transaction(&trans, ip->hmp);
1219
1220         /*
1221          * Handle artificial entries
1222          */
1223         error = 0;
1224         if (saveoff == 0) {
1225                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1226                 if (r)
1227                         goto done;
1228                 if (cookies)
1229                         cookies[cookie_index] = saveoff;
1230                 ++saveoff;
1231                 ++cookie_index;
1232                 if (cookie_index == ncookies)
1233                         goto done;
1234         }
1235         if (saveoff == 1) {
1236                 if (ip->ino_data.parent_obj_id) {
1237                         r = vop_write_dirent(&error, uio,
1238                                              ip->ino_data.parent_obj_id,
1239                                              DT_DIR, 2, "..");
1240                 } else {
1241                         r = vop_write_dirent(&error, uio,
1242                                              ip->obj_id, DT_DIR, 2, "..");
1243                 }
1244                 if (r)
1245                         goto done;
1246                 if (cookies)
1247                         cookies[cookie_index] = saveoff;
1248                 ++saveoff;
1249                 ++cookie_index;
1250                 if (cookie_index == ncookies)
1251                         goto done;
1252         }
1253
1254         /*
1255          * Key range (begin and end inclusive) to scan.  Directory keys
1256          * directly translate to a 64 bit 'seek' position.
1257          */
1258         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1259         cursor.key_beg.localization = ip->obj_localization +
1260                                       HAMMER_LOCALIZE_MISC;
1261         cursor.key_beg.obj_id = ip->obj_id;
1262         cursor.key_beg.create_tid = 0;
1263         cursor.key_beg.delete_tid = 0;
1264         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1265         cursor.key_beg.obj_type = 0;
1266         cursor.key_beg.key = saveoff;
1267
1268         cursor.key_end = cursor.key_beg;
1269         cursor.key_end.key = HAMMER_MAX_KEY;
1270         cursor.asof = ip->obj_asof;
1271         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1272
1273         error = hammer_ip_first(&cursor);
1274
1275         while (error == 0) {
1276                 error = hammer_ip_resolve_data(&cursor);
1277                 if (error)
1278                         break;
1279                 base = &cursor.leaf->base;
1280                 saveoff = base->key;
1281                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1282
1283                 if (base->obj_id != ip->obj_id)
1284                         panic("readdir: bad record at %p", cursor.node);
1285
1286                 /*
1287                  * Convert pseudo-filesystems into softlinks
1288                  */
1289                 dtype = hammer_get_dtype(cursor.leaf->base.obj_type);
1290                 r = vop_write_dirent(
1291                              &error, uio, cursor.data->entry.obj_id,
1292                              dtype,
1293                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1294                              (void *)cursor.data->entry.name);
1295                 if (r)
1296                         break;
1297                 ++saveoff;
1298                 if (cookies)
1299                         cookies[cookie_index] = base->key;
1300                 ++cookie_index;
1301                 if (cookie_index == ncookies)
1302                         break;
1303                 error = hammer_ip_next(&cursor);
1304         }
1305         hammer_done_cursor(&cursor);
1306
1307 done:
1308         hammer_done_transaction(&trans);
1309
1310         if (ap->a_eofflag)
1311                 *ap->a_eofflag = (error == ENOENT);
1312         uio->uio_offset = saveoff;
1313         if (error && cookie_index == 0) {
1314                 if (error == ENOENT)
1315                         error = 0;
1316                 if (cookies) {
1317                         kfree(cookies, M_TEMP);
1318                         *ap->a_ncookies = 0;
1319                         *ap->a_cookies = NULL;
1320                 }
1321         } else {
1322                 if (error == ENOENT)
1323                         error = 0;
1324                 if (cookies) {
1325                         *ap->a_ncookies = cookie_index;
1326                         *ap->a_cookies = cookies;
1327                 }
1328         }
1329         return(error);
1330 }
1331
1332 /*
1333  * hammer_vop_readlink { vp, uio, cred }
1334  */
1335 static
1336 int
1337 hammer_vop_readlink(struct vop_readlink_args *ap)
1338 {
1339         struct hammer_transaction trans;
1340         struct hammer_cursor cursor;
1341         struct hammer_inode *ip;
1342         char buf[32];
1343         u_int32_t localization;
1344         hammer_pseudofs_inmem_t pfsm;
1345         int error;
1346
1347         ip = VTOI(ap->a_vp);
1348
1349         /*
1350          * Shortcut if the symlink data was stuffed into ino_data.
1351          *
1352          * Also expand special "@@PFS%05d" softlinks.
1353          */
1354         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1355                 char *ptr;
1356                 int bytes;
1357
1358                 ptr = ip->ino_data.ext.symlink;
1359                 bytes = (int)ip->ino_data.size;
1360                 if (bytes == 10 && strncmp(ptr, "@@PFS", 5) == 0) {
1361                         hammer_simple_transaction(&trans, ip->hmp);
1362                         bcopy(ptr + 5, buf, 5);
1363                         buf[5] = 0;
1364                         localization = strtoul(buf, NULL, 10) << 16;
1365                         pfsm = hammer_load_pseudofs(&trans, localization,
1366                                                     &error);
1367                         if (error == 0) {
1368                                 if (pfsm->pfsd.mirror_flags &
1369                                     HAMMER_PFSD_SLAVE) {
1370                                         ksnprintf(buf, sizeof(buf),
1371                                                   "@@0x%016llx:%05d",
1372                                                   pfsm->pfsd.sync_end_tid,
1373                                                   localization >> 16);
1374                                 } else {
1375                                         ksnprintf(buf, sizeof(buf),
1376                                                   "@@0x%016llx:%05d",
1377                                                   HAMMER_MAX_TID,
1378                                                   localization >> 16);
1379                                 }
1380                                 ptr = buf;
1381                                 bytes = strlen(buf);
1382                         }
1383                         if (pfsm)
1384                                 hammer_rel_pseudofs(trans.hmp, pfsm);
1385                         hammer_done_transaction(&trans);
1386                 }
1387                 error = uiomove(ptr, bytes, ap->a_uio);
1388                 return(error);
1389         }
1390
1391         /*
1392          * Long version
1393          */
1394         hammer_simple_transaction(&trans, ip->hmp);
1395         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
1396
1397         /*
1398          * Key range (begin and end inclusive) to scan.  Directory keys
1399          * directly translate to a 64 bit 'seek' position.
1400          */
1401         cursor.key_beg.localization = ip->obj_localization +
1402                                       HAMMER_LOCALIZE_MISC;
1403         cursor.key_beg.obj_id = ip->obj_id;
1404         cursor.key_beg.create_tid = 0;
1405         cursor.key_beg.delete_tid = 0;
1406         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1407         cursor.key_beg.obj_type = 0;
1408         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1409         cursor.asof = ip->obj_asof;
1410         cursor.flags |= HAMMER_CURSOR_ASOF;
1411
1412         error = hammer_ip_lookup(&cursor);
1413         if (error == 0) {
1414                 error = hammer_ip_resolve_data(&cursor);
1415                 if (error == 0) {
1416                         KKASSERT(cursor.leaf->data_len >=
1417                                  HAMMER_SYMLINK_NAME_OFF);
1418                         error = uiomove(cursor.data->symlink.name,
1419                                         cursor.leaf->data_len -
1420                                                 HAMMER_SYMLINK_NAME_OFF,
1421                                         ap->a_uio);
1422                 }
1423         }
1424         hammer_done_cursor(&cursor);
1425         hammer_done_transaction(&trans);
1426         return(error);
1427 }
1428
1429 /*
1430  * hammer_vop_nremove { nch, dvp, cred }
1431  */
1432 static
1433 int
1434 hammer_vop_nremove(struct vop_nremove_args *ap)
1435 {
1436         struct hammer_transaction trans;
1437         struct hammer_inode *dip;
1438         int error;
1439
1440         dip = VTOI(ap->a_dvp);
1441
1442         if (hammer_nohistory(dip) == 0 &&
1443             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1444                 return (error);
1445         }
1446
1447         hammer_start_transaction(&trans, dip->hmp);
1448         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1449         hammer_done_transaction(&trans);
1450
1451         return (error);
1452 }
1453
1454 /*
1455  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1456  */
1457 static
1458 int
1459 hammer_vop_nrename(struct vop_nrename_args *ap)
1460 {
1461         struct hammer_transaction trans;
1462         struct namecache *fncp;
1463         struct namecache *tncp;
1464         struct hammer_inode *fdip;
1465         struct hammer_inode *tdip;
1466         struct hammer_inode *ip;
1467         struct hammer_cursor cursor;
1468         int64_t namekey;
1469         int nlen, error;
1470
1471         fdip = VTOI(ap->a_fdvp);
1472         tdip = VTOI(ap->a_tdvp);
1473         fncp = ap->a_fnch->ncp;
1474         tncp = ap->a_tnch->ncp;
1475         ip = VTOI(fncp->nc_vp);
1476         KKASSERT(ip != NULL);
1477
1478         if (fdip->flags & HAMMER_INODE_RO)
1479                 return (EROFS);
1480         if (tdip->flags & HAMMER_INODE_RO)
1481                 return (EROFS);
1482         if (ip->flags & HAMMER_INODE_RO)
1483                 return (EROFS);
1484         if ((error = hammer_checkspace(fdip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1485                 return (error);
1486
1487         hammer_start_transaction(&trans, fdip->hmp);
1488
1489         /*
1490          * Remove tncp from the target directory and then link ip as
1491          * tncp. XXX pass trans to dounlink
1492          *
1493          * Force the inode sync-time to match the transaction so it is
1494          * in-sync with the creation of the target directory entry.
1495          */
1496         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
1497         if (error == 0 || error == ENOENT) {
1498                 error = hammer_ip_add_directory(&trans, tdip,
1499                                                 tncp->nc_name, tncp->nc_nlen,
1500                                                 ip);
1501                 if (error == 0) {
1502                         ip->ino_data.parent_obj_id = tdip->obj_id;
1503                         hammer_modify_inode(ip, HAMMER_INODE_DDIRTY);
1504                 }
1505         }
1506         if (error)
1507                 goto failed; /* XXX */
1508
1509         /*
1510          * Locate the record in the originating directory and remove it.
1511          *
1512          * Calculate the namekey and setup the key range for the scan.  This
1513          * works kinda like a chained hash table where the lower 32 bits
1514          * of the namekey synthesize the chain.
1515          *
1516          * The key range is inclusive of both key_beg and key_end.
1517          */
1518         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1519 retry:
1520         hammer_init_cursor(&trans, &cursor, &fdip->cache[1], fdip);
1521         cursor.key_beg.localization = fdip->obj_localization +
1522                                       HAMMER_LOCALIZE_MISC;
1523         cursor.key_beg.obj_id = fdip->obj_id;
1524         cursor.key_beg.key = namekey;
1525         cursor.key_beg.create_tid = 0;
1526         cursor.key_beg.delete_tid = 0;
1527         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1528         cursor.key_beg.obj_type = 0;
1529
1530         cursor.key_end = cursor.key_beg;
1531         cursor.key_end.key |= 0xFFFFFFFFULL;
1532         cursor.asof = fdip->obj_asof;
1533         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1534
1535         /*
1536          * Scan all matching records (the chain), locate the one matching
1537          * the requested path component.
1538          *
1539          * The hammer_ip_*() functions merge in-memory records with on-disk
1540          * records for the purposes of the search.
1541          */
1542         error = hammer_ip_first(&cursor);
1543         while (error == 0) {
1544                 if (hammer_ip_resolve_data(&cursor) != 0)
1545                         break;
1546                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1547                 KKASSERT(nlen > 0);
1548                 if (fncp->nc_nlen == nlen &&
1549                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1550                         break;
1551                 }
1552                 error = hammer_ip_next(&cursor);
1553         }
1554
1555         /*
1556          * If all is ok we have to get the inode so we can adjust nlinks.
1557          *
1558          * WARNING: hammer_ip_del_directory() may have to terminate the
1559          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1560          * twice.
1561          */
1562         if (error == 0)
1563                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1564
1565         /*
1566          * XXX A deadlock here will break rename's atomicy for the purposes
1567          * of crash recovery.
1568          */
1569         if (error == EDEADLK) {
1570                 hammer_done_cursor(&cursor);
1571                 goto retry;
1572         }
1573
1574         /*
1575          * Cleanup and tell the kernel that the rename succeeded.
1576          */
1577         hammer_done_cursor(&cursor);
1578         if (error == 0)
1579                 cache_rename(ap->a_fnch, ap->a_tnch);
1580
1581 failed:
1582         hammer_done_transaction(&trans);
1583         return (error);
1584 }
1585
1586 /*
1587  * hammer_vop_nrmdir { nch, dvp, cred }
1588  */
1589 static
1590 int
1591 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1592 {
1593         struct hammer_transaction trans;
1594         struct hammer_inode *dip;
1595         int error;
1596
1597         dip = VTOI(ap->a_dvp);
1598
1599         if (hammer_nohistory(dip) == 0 &&
1600             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1601                 return (error);
1602         }
1603
1604         hammer_start_transaction(&trans, dip->hmp);
1605         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1606         hammer_done_transaction(&trans);
1607
1608         return (error);
1609 }
1610
1611 /*
1612  * hammer_vop_setattr { vp, vap, cred }
1613  */
1614 static
1615 int
1616 hammer_vop_setattr(struct vop_setattr_args *ap)
1617 {
1618         struct hammer_transaction trans;
1619         struct vattr *vap;
1620         struct hammer_inode *ip;
1621         int modflags;
1622         int error;
1623         int truncating;
1624         int blksize;
1625         int64_t aligned_size;
1626         u_int32_t flags;
1627
1628         vap = ap->a_vap;
1629         ip = ap->a_vp->v_data;
1630         modflags = 0;
1631
1632         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1633                 return(EROFS);
1634         if (ip->flags & HAMMER_INODE_RO)
1635                 return (EROFS);
1636         if (hammer_nohistory(ip) == 0 &&
1637             (error = hammer_checkspace(ip->hmp, HAMMER_CHKSPC_REMOVE)) != 0) {
1638                 return (error);
1639         }
1640
1641         hammer_start_transaction(&trans, ip->hmp);
1642         error = 0;
1643
1644         if (vap->va_flags != VNOVAL) {
1645                 flags = ip->ino_data.uflags;
1646                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1647                                          hammer_to_unix_xid(&ip->ino_data.uid),
1648                                          ap->a_cred);
1649                 if (error == 0) {
1650                         if (ip->ino_data.uflags != flags) {
1651                                 ip->ino_data.uflags = flags;
1652                                 modflags |= HAMMER_INODE_DDIRTY;
1653                         }
1654                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1655                                 error = 0;
1656                                 goto done;
1657                         }
1658                 }
1659                 goto done;
1660         }
1661         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1662                 error = EPERM;
1663                 goto done;
1664         }
1665         if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
1666                 mode_t cur_mode = ip->ino_data.mode;
1667                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1668                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1669                 uuid_t uuid_uid;
1670                 uuid_t uuid_gid;
1671
1672                 error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
1673                                          ap->a_cred,
1674                                          &cur_uid, &cur_gid, &cur_mode);
1675                 if (error == 0) {
1676                         hammer_guid_to_uuid(&uuid_uid, cur_uid);
1677                         hammer_guid_to_uuid(&uuid_gid, cur_gid);
1678                         if (bcmp(&uuid_uid, &ip->ino_data.uid,
1679                                  sizeof(uuid_uid)) ||
1680                             bcmp(&uuid_gid, &ip->ino_data.gid,
1681                                  sizeof(uuid_gid)) ||
1682                             ip->ino_data.mode != cur_mode
1683                         ) {
1684                                 ip->ino_data.uid = uuid_uid;
1685                                 ip->ino_data.gid = uuid_gid;
1686                                 ip->ino_data.mode = cur_mode;
1687                         }
1688                         modflags |= HAMMER_INODE_DDIRTY;
1689                 }
1690         }
1691         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
1692                 switch(ap->a_vp->v_type) {
1693                 case VREG:
1694                         if (vap->va_size == ip->ino_data.size)
1695                                 break;
1696                         /*
1697                          * XXX break atomicy, we can deadlock the backend
1698                          * if we do not release the lock.  Probably not a
1699                          * big deal here.
1700                          */
1701                         blksize = hammer_blocksize(vap->va_size);
1702                         if (vap->va_size < ip->ino_data.size) {
1703                                 vtruncbuf(ap->a_vp, vap->va_size, blksize);
1704                                 truncating = 1;
1705                         } else {
1706                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1707                                 truncating = 0;
1708                         }
1709                         ip->ino_data.size = vap->va_size;
1710                         modflags |= HAMMER_INODE_DDIRTY;
1711
1712                         /*
1713                          * on-media truncation is cached in the inode until
1714                          * the inode is synchronized.
1715                          */
1716                         if (truncating) {
1717                                 hammer_ip_frontend_trunc(ip, vap->va_size);
1718 #ifdef DEBUG_TRUNCATE
1719                                 if (HammerTruncIp == NULL)
1720                                         HammerTruncIp = ip;
1721 #endif
1722                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1723                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1724                                         ip->trunc_off = vap->va_size;
1725 #ifdef DEBUG_TRUNCATE
1726                                         if (ip == HammerTruncIp)
1727                                         kprintf("truncate1 %016llx\n", ip->trunc_off);
1728 #endif
1729                                 } else if (ip->trunc_off > vap->va_size) {
1730                                         ip->trunc_off = vap->va_size;
1731 #ifdef DEBUG_TRUNCATE
1732                                         if (ip == HammerTruncIp)
1733                                         kprintf("truncate2 %016llx\n", ip->trunc_off);
1734 #endif
1735                                 } else {
1736 #ifdef DEBUG_TRUNCATE
1737                                         if (ip == HammerTruncIp)
1738                                         kprintf("truncate3 %016llx (ignored)\n", vap->va_size);
1739 #endif
1740                                 }
1741                         }
1742
1743                         /*
1744                          * If truncating we have to clean out a portion of
1745                          * the last block on-disk.  We do this in the
1746                          * front-end buffer cache.
1747                          */
1748                         aligned_size = (vap->va_size + (blksize - 1)) &
1749                                        ~(int64_t)(blksize - 1);
1750                         if (truncating && vap->va_size < aligned_size) {
1751                                 struct buf *bp;
1752                                 int offset;
1753
1754                                 aligned_size -= blksize;
1755
1756                                 offset = (int)vap->va_size & (blksize - 1);
1757                                 error = bread(ap->a_vp, aligned_size,
1758                                               blksize, &bp);
1759                                 hammer_ip_frontend_trunc(ip, aligned_size);
1760                                 if (error == 0) {
1761                                         bzero(bp->b_data + offset,
1762                                               blksize - offset);
1763                                         bdwrite(bp);
1764                                 } else {
1765                                         kprintf("ERROR %d\n", error);
1766                                         brelse(bp);
1767                                 }
1768                         }
1769                         break;
1770                 case VDATABASE:
1771                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1772                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1773                                 ip->trunc_off = vap->va_size;
1774                         } else if (ip->trunc_off > vap->va_size) {
1775                                 ip->trunc_off = vap->va_size;
1776                         }
1777                         hammer_ip_frontend_trunc(ip, vap->va_size);
1778                         ip->ino_data.size = vap->va_size;
1779                         modflags |= HAMMER_INODE_DDIRTY;
1780                         break;
1781                 default:
1782                         error = EINVAL;
1783                         goto done;
1784                 }
1785                 break;
1786         }
1787         if (vap->va_atime.tv_sec != VNOVAL) {
1788                 ip->ino_data.atime =
1789                         hammer_timespec_to_time(&vap->va_atime);
1790                 modflags |= HAMMER_INODE_ATIME;
1791         }
1792         if (vap->va_mtime.tv_sec != VNOVAL) {
1793                 ip->ino_data.mtime =
1794                         hammer_timespec_to_time(&vap->va_mtime);
1795                 modflags |= HAMMER_INODE_MTIME;
1796         }
1797         if (vap->va_mode != (mode_t)VNOVAL) {
1798                 mode_t   cur_mode = ip->ino_data.mode;
1799                 uid_t cur_uid = hammer_to_unix_xid(&ip->ino_data.uid);
1800                 gid_t cur_gid = hammer_to_unix_xid(&ip->ino_data.gid);
1801
1802                 error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
1803                                          cur_uid, cur_gid, &cur_mode);
1804                 if (error == 0 && ip->ino_data.mode != cur_mode) {
1805                         ip->ino_data.mode = cur_mode;
1806                         modflags |= HAMMER_INODE_DDIRTY;
1807                 }
1808         }
1809 done:
1810         if (error == 0)
1811                 hammer_modify_inode(ip, modflags);
1812         hammer_done_transaction(&trans);
1813         return (error);
1814 }
1815
1816 /*
1817  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1818  */
1819 static
1820 int
1821 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1822 {
1823         struct hammer_transaction trans;
1824         struct hammer_inode *dip;
1825         struct hammer_inode *nip;
1826         struct nchandle *nch;
1827         hammer_record_t record;
1828         int error;
1829         int bytes;
1830
1831         ap->a_vap->va_type = VLNK;
1832
1833         nch = ap->a_nch;
1834         dip = VTOI(ap->a_dvp);
1835
1836         if (dip->flags & HAMMER_INODE_RO)
1837                 return (EROFS);
1838         if ((error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0)
1839                 return (error);
1840
1841         /*
1842          * Create a transaction to cover the operations we perform.
1843          */
1844         hammer_start_transaction(&trans, dip->hmp);
1845
1846         /*
1847          * Create a new filesystem object of the requested type.  The
1848          * returned inode will be referenced but not locked.
1849          */
1850
1851         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred,
1852                                     dip, NULL, &nip);
1853         if (error) {
1854                 hammer_done_transaction(&trans);
1855                 *ap->a_vpp = NULL;
1856                 return (error);
1857         }
1858
1859         /*
1860          * Add a record representing the symlink.  symlink stores the link
1861          * as pure data, not a string, and is no \0 terminated.
1862          */
1863         if (error == 0) {
1864                 bytes = strlen(ap->a_target);
1865
1866                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1867                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1868                 } else {
1869                         record = hammer_alloc_mem_record(nip, bytes);
1870                         record->type = HAMMER_MEM_RECORD_GENERAL;
1871
1872                         record->leaf.base.localization = nip->obj_localization +
1873                                                          HAMMER_LOCALIZE_MISC;
1874                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1875                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1876                         record->leaf.data_len = bytes;
1877                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1878                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1879                         error = hammer_ip_add_record(&trans, record);
1880                 }
1881
1882                 /*
1883                  * Set the file size to the length of the link.
1884                  */
1885                 if (error == 0) {
1886                         nip->ino_data.size = bytes;
1887                         hammer_modify_inode(nip, HAMMER_INODE_DDIRTY);
1888                 }
1889         }
1890         if (error == 0)
1891                 error = hammer_ip_add_directory(&trans, dip, nch->ncp->nc_name,
1892                                                 nch->ncp->nc_nlen, nip);
1893
1894         /*
1895          * Finish up.
1896          */
1897         if (error) {
1898                 hammer_rel_inode(nip, 0);
1899                 *ap->a_vpp = NULL;
1900         } else {
1901                 error = hammer_get_vnode(nip, ap->a_vpp);
1902                 hammer_rel_inode(nip, 0);
1903                 if (error == 0) {
1904                         cache_setunresolved(ap->a_nch);
1905                         cache_setvp(ap->a_nch, *ap->a_vpp);
1906                 }
1907         }
1908         hammer_done_transaction(&trans);
1909         return (error);
1910 }
1911
1912 /*
1913  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1914  */
1915 static
1916 int
1917 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1918 {
1919         struct hammer_transaction trans;
1920         struct hammer_inode *dip;
1921         int error;
1922
1923         dip = VTOI(ap->a_dvp);
1924
1925         if (hammer_nohistory(dip) == 0 &&
1926             (error = hammer_checkspace(dip->hmp, HAMMER_CHKSPC_CREATE)) != 0) {
1927                 return (error);
1928         }
1929
1930         hammer_start_transaction(&trans, dip->hmp);
1931         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1932                                 ap->a_cred, ap->a_flags);
1933         hammer_done_transaction(&trans);
1934
1935         return (error);
1936 }
1937
1938 /*
1939  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1940  */
1941 static
1942 int
1943 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1944 {
1945         struct hammer_inode *ip = ap->a_vp->v_data;
1946
1947         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1948                             ap->a_fflag, ap->a_cred));
1949 }
1950
1951 static
1952 int
1953 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1954 {
1955         struct mount *mp;
1956         int error;
1957
1958         mp = ap->a_head.a_ops->head.vv_mount;
1959
1960         switch(ap->a_op) {
1961         case MOUNTCTL_SET_EXPORT:
1962                 if (ap->a_ctllen != sizeof(struct export_args))
1963                         error = EINVAL;
1964                 error = hammer_vfs_export(mp, ap->a_op,
1965                                       (const struct export_args *)ap->a_ctl);
1966                 break;
1967         default:
1968                 error = journal_mountctl(ap);
1969                 break;
1970         }
1971         return(error);
1972 }
1973
1974 /*
1975  * hammer_vop_strategy { vp, bio }
1976  *
1977  * Strategy call, used for regular file read & write only.  Note that the
1978  * bp may represent a cluster.
1979  *
1980  * To simplify operation and allow better optimizations in the future,
1981  * this code does not make any assumptions with regards to buffer alignment
1982  * or size.
1983  */
1984 static
1985 int
1986 hammer_vop_strategy(struct vop_strategy_args *ap)
1987 {
1988         struct buf *bp;
1989         int error;
1990
1991         bp = ap->a_bio->bio_buf;
1992
1993         switch(bp->b_cmd) {
1994         case BUF_CMD_READ:
1995                 error = hammer_vop_strategy_read(ap);
1996                 break;
1997         case BUF_CMD_WRITE:
1998                 error = hammer_vop_strategy_write(ap);
1999                 break;
2000         default:
2001                 bp->b_error = error = EINVAL;
2002                 bp->b_flags |= B_ERROR;
2003                 biodone(ap->a_bio);
2004                 break;
2005         }
2006         return (error);
2007 }
2008
2009 /*
2010  * Read from a regular file.  Iterate the related records and fill in the
2011  * BIO/BUF.  Gaps are zero-filled.
2012  *
2013  * The support code in hammer_object.c should be used to deal with mixed
2014  * in-memory and on-disk records.
2015  *
2016  * NOTE: Can be called from the cluster code with an oversized buf.
2017  *
2018  * XXX atime update
2019  */
2020 static
2021 int
2022 hammer_vop_strategy_read(struct vop_strategy_args *ap)
2023 {
2024         struct hammer_transaction trans;
2025         struct hammer_inode *ip;
2026         struct hammer_cursor cursor;
2027         hammer_base_elm_t base;
2028         hammer_off_t disk_offset;
2029         struct bio *bio;
2030         struct bio *nbio;
2031         struct buf *bp;
2032         int64_t rec_offset;
2033         int64_t ran_end;
2034         int64_t tmp64;
2035         int error;
2036         int boff;
2037         int roff;
2038         int n;
2039
2040         bio = ap->a_bio;
2041         bp = bio->bio_buf;
2042         ip = ap->a_vp->v_data;
2043
2044         /*
2045          * The zone-2 disk offset may have been set by the cluster code via
2046          * a BMAP operation, or else should be NOOFFSET.
2047          *
2048          * Checking the high bits for a match against zone-2 should suffice.
2049          */
2050         nbio = push_bio(bio);
2051         if ((nbio->bio_offset & HAMMER_OFF_ZONE_MASK) ==
2052             HAMMER_ZONE_RAW_BUFFER) {
2053                 error = hammer_io_direct_read(ip->hmp, nbio);
2054                 return (error);
2055         }
2056
2057         /*
2058          * Well, that sucked.  Do it the hard way.  If all the stars are
2059          * aligned we may still be able to issue a direct-read.
2060          */
2061         hammer_simple_transaction(&trans, ip->hmp);
2062         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2063
2064         /*
2065          * Key range (begin and end inclusive) to scan.  Note that the key's
2066          * stored in the actual records represent BASE+LEN, not BASE.  The
2067          * first record containing bio_offset will have a key > bio_offset.
2068          */
2069         cursor.key_beg.localization = ip->obj_localization +
2070                                       HAMMER_LOCALIZE_MISC;
2071         cursor.key_beg.obj_id = ip->obj_id;
2072         cursor.key_beg.create_tid = 0;
2073         cursor.key_beg.delete_tid = 0;
2074         cursor.key_beg.obj_type = 0;
2075         cursor.key_beg.key = bio->bio_offset + 1;
2076         cursor.asof = ip->obj_asof;
2077         cursor.flags |= HAMMER_CURSOR_ASOF;
2078
2079         cursor.key_end = cursor.key_beg;
2080         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2081 #if 0
2082         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
2083                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
2084                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
2085                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2086         } else
2087 #endif
2088         {
2089                 ran_end = bio->bio_offset + bp->b_bufsize;
2090                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2091                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2092                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2093                 if (tmp64 < ran_end)
2094                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2095                 else
2096                         cursor.key_end.key = ran_end + MAXPHYS + 1;
2097         }
2098         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2099
2100         error = hammer_ip_first(&cursor);
2101         boff = 0;
2102
2103         while (error == 0) {
2104                 /*
2105                  * Get the base file offset of the record.  The key for
2106                  * data records is (base + bytes) rather then (base).
2107                  */
2108                 base = &cursor.leaf->base;
2109                 rec_offset = base->key - cursor.leaf->data_len;
2110
2111                 /*
2112                  * Calculate the gap, if any, and zero-fill it.
2113                  *
2114                  * n is the offset of the start of the record verses our
2115                  * current seek offset in the bio.
2116                  */
2117                 n = (int)(rec_offset - (bio->bio_offset + boff));
2118                 if (n > 0) {
2119                         if (n > bp->b_bufsize - boff)
2120                                 n = bp->b_bufsize - boff;
2121                         bzero((char *)bp->b_data + boff, n);
2122                         boff += n;
2123                         n = 0;
2124                 }
2125
2126                 /*
2127                  * Calculate the data offset in the record and the number
2128                  * of bytes we can copy.
2129                  *
2130                  * There are two degenerate cases.  First, boff may already
2131                  * be at bp->b_bufsize.  Secondly, the data offset within
2132                  * the record may exceed the record's size.
2133                  */
2134                 roff = -n;
2135                 rec_offset += roff;
2136                 n = cursor.leaf->data_len - roff;
2137                 if (n <= 0) {
2138                         kprintf("strategy_read: bad n=%d roff=%d\n", n, roff);
2139                         n = 0;
2140                 } else if (n > bp->b_bufsize - boff) {
2141                         n = bp->b_bufsize - boff;
2142                 }
2143
2144                 /*
2145                  * Deal with cached truncations.  This cool bit of code
2146                  * allows truncate()/ftruncate() to avoid having to sync
2147                  * the file.
2148                  *
2149                  * If the frontend is truncated then all backend records are
2150                  * subject to the frontend's truncation.
2151                  *
2152                  * If the backend is truncated then backend records on-disk
2153                  * (but not in-memory) are subject to the backend's
2154                  * truncation.  In-memory records owned by the backend
2155                  * represent data written after the truncation point on the
2156                  * backend and must not be truncated.
2157                  *
2158                  * Truncate operations deal with frontend buffer cache
2159                  * buffers and frontend-owned in-memory records synchronously.
2160                  */
2161                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2162                         if (hammer_cursor_ondisk(&cursor) ||
2163                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2164                                 if (ip->trunc_off <= rec_offset)
2165                                         n = 0;
2166                                 else if (ip->trunc_off < rec_offset + n)
2167                                         n = (int)(ip->trunc_off - rec_offset);
2168                         }
2169                 }
2170                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2171                         if (hammer_cursor_ondisk(&cursor)) {
2172                                 if (ip->sync_trunc_off <= rec_offset)
2173                                         n = 0;
2174                                 else if (ip->sync_trunc_off < rec_offset + n)
2175                                         n = (int)(ip->sync_trunc_off - rec_offset);
2176                         }
2177                 }
2178
2179                 /*
2180                  * Try to issue a direct read into our bio if possible,
2181                  * otherwise resolve the element data into a hammer_buffer
2182                  * and copy.
2183                  *
2184                  * The buffer on-disk should be zerod past any real
2185                  * truncation point, but may not be for any synthesized
2186                  * truncation point from above.
2187                  */
2188                 if (boff == 0 && n == bp->b_bufsize &&
2189                     ((cursor.leaf->data_offset + roff) & HAMMER_BUFMASK) == 0) {
2190                         disk_offset = hammer_blockmap_lookup(
2191                                                 trans.hmp,
2192                                                 cursor.leaf->data_offset + roff,
2193                                                 &error);
2194                         if (error)
2195                                 break;
2196                         nbio->bio_offset = disk_offset;
2197                         error = hammer_io_direct_read(trans.hmp, nbio);
2198                         goto done;
2199                 } else if (n) {
2200                         error = hammer_ip_resolve_data(&cursor);
2201                         if (error == 0) {
2202                                 bcopy((char *)cursor.data + roff,
2203                                       (char *)bp->b_data + boff, n);
2204                         }
2205                 }
2206                 if (error)
2207                         break;
2208
2209                 /*
2210                  * Iterate until we have filled the request.
2211                  */
2212                 boff += n;
2213                 if (boff == bp->b_bufsize)
2214                         break;
2215                 error = hammer_ip_next(&cursor);
2216         }
2217
2218         /*
2219          * There may have been a gap after the last record
2220          */
2221         if (error == ENOENT)
2222                 error = 0;
2223         if (error == 0 && boff != bp->b_bufsize) {
2224                 KKASSERT(boff < bp->b_bufsize);
2225                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
2226                 /* boff = bp->b_bufsize; */
2227         }
2228         bp->b_resid = 0;
2229         bp->b_error = error;
2230         if (error)
2231                 bp->b_flags |= B_ERROR;
2232         biodone(ap->a_bio);
2233
2234 done:
2235         if (cursor.node)
2236                 hammer_cache_node(&ip->cache[1], cursor.node);
2237         hammer_done_cursor(&cursor);
2238         hammer_done_transaction(&trans);
2239         return(error);
2240 }
2241
2242 /*
2243  * BMAP operation - used to support cluster_read() only.
2244  *
2245  * (struct vnode *vp, off_t loffset, off_t *doffsetp, int *runp, int *runb)
2246  *
2247  * This routine may return EOPNOTSUPP if the operation is not supported for
2248  * the specified offset.  The contents of the pointer arguments do not
2249  * need to be initialized in that case. 
2250  *
2251  * If a disk address is available and properly aligned return 0 with 
2252  * *doffsetp set to the zone-2 address, and *runp / *runb set appropriately
2253  * to the run-length relative to that offset.  Callers may assume that
2254  * *doffsetp is valid if 0 is returned, even if *runp is not sufficiently
2255  * large, so return EOPNOTSUPP if it is not sufficiently large.
2256  */
2257 static
2258 int
2259 hammer_vop_bmap(struct vop_bmap_args *ap)
2260 {
2261         struct hammer_transaction trans;
2262         struct hammer_inode *ip;
2263         struct hammer_cursor cursor;
2264         hammer_base_elm_t base;
2265         int64_t rec_offset;
2266         int64_t ran_end;
2267         int64_t tmp64;
2268         int64_t base_offset;
2269         int64_t base_disk_offset;
2270         int64_t last_offset;
2271         hammer_off_t last_disk_offset;
2272         hammer_off_t disk_offset;
2273         int     rec_len;
2274         int     error;
2275         int     blksize;
2276
2277         ip = ap->a_vp->v_data;
2278
2279         /*
2280          * We can only BMAP regular files.  We can't BMAP database files,
2281          * directories, etc.
2282          */
2283         if (ip->ino_data.obj_type != HAMMER_OBJTYPE_REGFILE)
2284                 return(EOPNOTSUPP);
2285
2286         /*
2287          * bmap is typically called with runp/runb both NULL when used
2288          * for writing.  We do not support BMAP for writing atm.
2289          */
2290         if (ap->a_cmd != BUF_CMD_READ)
2291                 return(EOPNOTSUPP);
2292
2293         /*
2294          * Scan the B-Tree to acquire blockmap addresses, then translate
2295          * to raw addresses.
2296          */
2297         hammer_simple_transaction(&trans, ip->hmp);
2298 #if 0
2299         kprintf("bmap_beg %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2300 #endif
2301         hammer_init_cursor(&trans, &cursor, &ip->cache[1], ip);
2302
2303         /*
2304          * Key range (begin and end inclusive) to scan.  Note that the key's
2305          * stored in the actual records represent BASE+LEN, not BASE.  The
2306          * first record containing bio_offset will have a key > bio_offset.
2307          */
2308         cursor.key_beg.localization = ip->obj_localization +
2309                                       HAMMER_LOCALIZE_MISC;
2310         cursor.key_beg.obj_id = ip->obj_id;
2311         cursor.key_beg.create_tid = 0;
2312         cursor.key_beg.delete_tid = 0;
2313         cursor.key_beg.obj_type = 0;
2314         if (ap->a_runb)
2315                 cursor.key_beg.key = ap->a_loffset - MAXPHYS + 1;
2316         else
2317                 cursor.key_beg.key = ap->a_loffset + 1;
2318         if (cursor.key_beg.key < 0)
2319                 cursor.key_beg.key = 0;
2320         cursor.asof = ip->obj_asof;
2321         cursor.flags |= HAMMER_CURSOR_ASOF;
2322
2323         cursor.key_end = cursor.key_beg;
2324         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
2325
2326         ran_end = ap->a_loffset + MAXPHYS;
2327         cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
2328         cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
2329         tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
2330         if (tmp64 < ran_end)
2331                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
2332         else
2333                 cursor.key_end.key = ran_end + MAXPHYS + 1;
2334
2335         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
2336
2337         error = hammer_ip_first(&cursor);
2338         base_offset = last_offset = 0;
2339         base_disk_offset = last_disk_offset = 0;
2340
2341         while (error == 0) {
2342                 /*
2343                  * Get the base file offset of the record.  The key for
2344                  * data records is (base + bytes) rather then (base).
2345                  *
2346                  * NOTE: rec_offset + rec_len may exceed the end-of-file.
2347                  * The extra bytes should be zero on-disk and the BMAP op
2348                  * should still be ok.
2349                  */
2350                 base = &cursor.leaf->base;
2351                 rec_offset = base->key - cursor.leaf->data_len;
2352                 rec_len    = cursor.leaf->data_len;
2353
2354                 /*
2355                  * Incorporate any cached truncation.
2356                  *
2357                  * NOTE: Modifications to rec_len based on synthesized
2358                  * truncation points remove the guarantee that any extended
2359                  * data on disk is zero (since the truncations may not have
2360                  * taken place on-media yet).
2361                  */
2362                 if (ip->flags & HAMMER_INODE_TRUNCATED) {
2363                         if (hammer_cursor_ondisk(&cursor) ||
2364                             cursor.iprec->flush_state == HAMMER_FST_FLUSH) {
2365                                 if (ip->trunc_off <= rec_offset)
2366                                         rec_len = 0;
2367                                 else if (ip->trunc_off < rec_offset + rec_len)
2368                                         rec_len = (int)(ip->trunc_off - rec_offset);
2369                         }
2370                 }
2371                 if (ip->sync_flags & HAMMER_INODE_TRUNCATED) {
2372                         if (hammer_cursor_ondisk(&cursor)) {
2373                                 if (ip->sync_trunc_off <= rec_offset)
2374                                         rec_len = 0;
2375                                 else if (ip->sync_trunc_off < rec_offset + rec_len)
2376                                         rec_len = (int)(ip->sync_trunc_off - rec_offset);
2377                         }
2378                 }
2379
2380                 /*
2381                  * Accumulate information.  If we have hit a discontiguous
2382                  * block reset base_offset unless we are already beyond the
2383                  * requested offset.  If we are, that's it, we stop.
2384                  */
2385                 disk_offset = hammer_blockmap_lookup(trans.hmp,
2386                                                      cursor.leaf->data_offset,
2387                                                      &error);
2388                 if (error)
2389                         break;
2390                 if (rec_offset != last_offset ||
2391                     disk_offset != last_disk_offset) {
2392                         if (rec_offset > ap->a_loffset)
2393                                 break;
2394                         base_offset = rec_offset;
2395                         base_disk_offset = disk_offset;
2396                 }
2397                 last_offset = rec_offset + rec_len;
2398                 last_disk_offset = disk_offset + rec_len;
2399
2400                 error = hammer_ip_next(&cursor);
2401         }
2402
2403 #if 0
2404         kprintf("BMAP %016llx:  %016llx - %016llx\n",
2405                 ap->a_loffset, base_offset, last_offset);
2406         kprintf("BMAP %16s:  %016llx - %016llx\n",
2407                 "", base_disk_offset, last_disk_offset);
2408 #endif
2409
2410         if (cursor.node) {
2411                 hammer_cache_node(&ip->cache[1], cursor.node);
2412 #if 0
2413                 kprintf("bmap_end2 %016llx ip->cache %p\n", ap->a_loffset, ip->cache[1]);
2414 #endif
2415         }
2416         hammer_done_cursor(&cursor);
2417         hammer_done_transaction(&trans);
2418
2419         /*
2420          * If we couldn't find any records or the records we did find were
2421          * all behind the requested offset, return failure.  A forward
2422          * truncation can leave a hole w/ no on-disk records.
2423          */
2424         if (last_offset == 0 || last_offset < ap->a_loffset)
2425                 return (EOPNOTSUPP);
2426
2427         /*
2428          * Figure out the block size at the requested offset and adjust
2429          * our limits so the cluster_read() does not create inappropriately
2430          * sized buffer cache buffers.
2431          */
2432         blksize = hammer_blocksize(ap->a_loffset);
2433         if (hammer_blocksize(base_offset) != blksize) {
2434                 base_offset = hammer_blockdemarc(base_offset, ap->a_loffset);
2435         }
2436         if (last_offset != ap->a_loffset &&
2437             hammer_blocksize(last_offset - 1) != blksize) {
2438                 last_offset = hammer_blockdemarc(ap->a_loffset,
2439                                                  last_offset - 1);
2440         }
2441
2442         /*
2443          * Returning EOPNOTSUPP simply prevents the direct-IO optimization
2444          * from occurring.
2445          */
2446         disk_offset = base_disk_offset + (ap->a_loffset - base_offset);
2447
2448         /*
2449          * If doffsetp is not aligned or the forward run size does
2450          * not cover a whole buffer, disallow the direct I/O.
2451          */
2452         if ((disk_offset & HAMMER_BUFMASK) ||
2453             (last_offset - ap->a_loffset) < blksize) {
2454                 error = EOPNOTSUPP;
2455         } else {
2456                 *ap->a_doffsetp = disk_offset;
2457                 if (ap->a_runb) {
2458                         *ap->a_runb = ap->a_loffset - base_offset;
2459                         KKASSERT(*ap->a_runb >= 0);
2460                 }
2461                 if (ap->a_runp) {
2462                         *ap->a_runp = last_offset - ap->a_loffset;
2463                         KKASSERT(*ap->a_runp >= 0);
2464                 }
2465                 error = 0;
2466         }
2467         return(error);
2468 }
2469
2470 /*
2471  * Write to a regular file.   Because this is a strategy call the OS is
2472  * trying to actually get data onto the media.
2473  */
2474 static
2475 int
2476 hammer_vop_strategy_write(struct vop_strategy_args *ap)
2477 {
2478         hammer_record_t record;
2479         hammer_mount_t hmp;
2480         hammer_inode_t ip;
2481         struct bio *bio;
2482         struct buf *bp;
2483         int blksize;
2484         int bytes;
2485         int error;
2486
2487         bio = ap->a_bio;
2488         bp = bio->bio_buf;
2489         ip = ap->a_vp->v_data;
2490         hmp = ip->hmp;
2491
2492         blksize = hammer_blocksize(bio->bio_offset);
2493         KKASSERT(bp->b_bufsize == blksize);
2494
2495         if (ip->flags & HAMMER_INODE_RO) {
2496                 bp->b_error = EROFS;
2497                 bp->b_flags |= B_ERROR;
2498                 biodone(ap->a_bio);
2499                 return(EROFS);
2500         }
2501
2502         /*
2503          * Interlock with inode destruction (no in-kernel or directory
2504          * topology visibility).  If we queue new IO while trying to
2505          * destroy the inode we can deadlock the vtrunc call in
2506          * hammer_inode_unloadable_check().
2507          */
2508         if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
2509                 bp->b_resid = 0;
2510                 biodone(ap->a_bio);
2511                 return(0);
2512         }
2513
2514         /*
2515          * Reserve space and issue a direct-write from the front-end. 
2516          * NOTE: The direct_io code will hammer_bread/bcopy smaller
2517          * allocations.
2518          *
2519          * An in-memory record will be installed to reference the storage
2520          * until the flusher can get to it.
2521          *
2522          * Since we own the high level bio the front-end will not try to
2523          * do a direct-read until the write completes.
2524          *
2525          * NOTE: The only time we do not reserve a full-sized buffers
2526          * worth of data is if the file is small.  We do not try to
2527          * allocate a fragment (from the small-data zone) at the end of
2528          * an otherwise large file as this can lead to wildly separated
2529          * data.
2530          */
2531         KKASSERT((bio->bio_offset & HAMMER_BUFMASK) == 0);
2532         KKASSERT(bio->bio_offset < ip->ino_data.size);
2533         if (bio->bio_offset || ip->ino_data.size > HAMMER_BUFSIZE / 2)
2534                 bytes = bp->b_bufsize;
2535         else
2536                 bytes = ((int)ip->ino_data.size + 15) & ~15;
2537
2538         record = hammer_ip_add_bulk(ip, bio->bio_offset, bp->b_data,
2539                                     bytes, &error);
2540         if (record) {
2541                 hammer_io_direct_write(hmp, &record->leaf, bio);
2542                 hammer_rel_mem_record(record);
2543                 if (ip->rsv_recs > 1 && hmp->rsv_recs > hammer_limit_recs)
2544                         hammer_flush_inode(ip, 0);
2545         } else {
2546                 bp->b_bio2.bio_offset = NOOFFSET;
2547                 bp->b_error = error;
2548                 bp->b_flags |= B_ERROR;
2549                 biodone(ap->a_bio);
2550         }
2551         return(error);
2552 }
2553
2554 /*
2555  * dounlink - disconnect a directory entry
2556  *
2557  * XXX whiteout support not really in yet
2558  */
2559 static int
2560 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2561                 struct vnode *dvp, struct ucred *cred, int flags)
2562 {
2563         struct namecache *ncp;
2564         hammer_inode_t dip;
2565         hammer_inode_t ip;
2566         struct hammer_cursor cursor;
2567         int64_t namekey;
2568         int nlen, error;
2569
2570         /*
2571          * Calculate the namekey and setup the key range for the scan.  This
2572          * works kinda like a chained hash table where the lower 32 bits
2573          * of the namekey synthesize the chain.
2574          *
2575          * The key range is inclusive of both key_beg and key_end.
2576          */
2577         dip = VTOI(dvp);
2578         ncp = nch->ncp;
2579
2580         if (dip->flags & HAMMER_INODE_RO)
2581                 return (EROFS);
2582
2583         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2584 retry:
2585         hammer_init_cursor(trans, &cursor, &dip->cache[1], dip);
2586         cursor.key_beg.localization = dip->obj_localization +
2587                                       HAMMER_LOCALIZE_MISC;
2588         cursor.key_beg.obj_id = dip->obj_id;
2589         cursor.key_beg.key = namekey;
2590         cursor.key_beg.create_tid = 0;
2591         cursor.key_beg.delete_tid = 0;
2592         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2593         cursor.key_beg.obj_type = 0;
2594
2595         cursor.key_end = cursor.key_beg;
2596         cursor.key_end.key |= 0xFFFFFFFFULL;
2597         cursor.asof = dip->obj_asof;
2598         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2599
2600         /*
2601          * Scan all matching records (the chain), locate the one matching
2602          * the requested path component.  info->last_error contains the
2603          * error code on search termination and could be 0, ENOENT, or
2604          * something else.
2605          *
2606          * The hammer_ip_*() functions merge in-memory records with on-disk
2607          * records for the purposes of the search.
2608          */
2609         error = hammer_ip_first(&cursor);
2610
2611         while (error == 0) {
2612                 error = hammer_ip_resolve_data(&cursor);
2613                 if (error)
2614                         break;
2615                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2616                 KKASSERT(nlen > 0);
2617                 if (ncp->nc_nlen == nlen &&
2618                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2619                         break;
2620                 }
2621                 error = hammer_ip_next(&cursor);
2622         }
2623
2624         /*
2625          * If all is ok we have to get the inode so we can adjust nlinks.
2626          * To avoid a deadlock with the flusher we must release the inode
2627          * lock on the directory when acquiring the inode for the entry.
2628          *
2629          * If the target is a directory, it must be empty.
2630          */
2631         if (error == 0) {
2632                 hammer_unlock(&cursor.ip->lock);
2633                 ip = hammer_get_inode(trans, dip, cursor.data->entry.obj_id,
2634                                       dip->hmp->asof,
2635                                       cursor.data->entry.localization,
2636                                       0, &error);
2637                 hammer_lock_sh(&cursor.ip->lock);
2638                 if (error == ENOENT) {
2639                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2640                         Debugger("ENOENT unlinking object that should exist");
2641                 }
2642
2643                 /*
2644                  * If we are trying to remove a directory the directory must
2645                  * be empty.
2646                  *
2647                  * WARNING: hammer_ip_check_directory_empty() may have to
2648                  * terminate the cursor to avoid a deadlock.  It is ok to
2649                  * call hammer_done_cursor() twice.
2650                  */
2651                 if (error == 0 && ip->ino_data.obj_type ==
2652                                   HAMMER_OBJTYPE_DIRECTORY) {
2653                         error = hammer_ip_check_directory_empty(trans, ip);
2654                 }
2655
2656                 /*
2657                  * Delete the directory entry.
2658                  *
2659                  * WARNING: hammer_ip_del_directory() may have to terminate
2660                  * the cursor to avoid a deadlock.  It is ok to call
2661                  * hammer_done_cursor() twice.
2662                  */
2663                 if (error == 0) {
2664                         error = hammer_ip_del_directory(trans, &cursor,
2665                                                         dip, ip);
2666                 }
2667                 hammer_done_cursor(&cursor);
2668                 if (error == 0) {
2669                         cache_setunresolved(nch);
2670                         cache_setvp(nch, NULL);
2671                         /* XXX locking */
2672                         if (ip->vp)
2673                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2674                 }
2675                 if (ip)
2676                         hammer_rel_inode(ip, 0);
2677         } else {
2678                 hammer_done_cursor(&cursor);
2679         }
2680         if (error == EDEADLK)
2681                 goto retry;
2682
2683         return (error);
2684 }
2685
2686 /************************************************************************
2687  *                          FIFO AND SPECFS OPS                         *
2688  ************************************************************************
2689  *
2690  */
2691
2692 static int
2693 hammer_vop_fifoclose (struct vop_close_args *ap)
2694 {
2695         /* XXX update itimes */
2696         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2697 }
2698
2699 static int
2700 hammer_vop_fiforead (struct vop_read_args *ap)
2701 {
2702         int error;
2703
2704         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2705         /* XXX update access time */
2706         return (error);
2707 }
2708
2709 static int
2710 hammer_vop_fifowrite (struct vop_write_args *ap)
2711 {
2712         int error;
2713
2714         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2715         /* XXX update access time */
2716         return (error);
2717 }
2718
2719 static int
2720 hammer_vop_specclose (struct vop_close_args *ap)
2721 {
2722         /* XXX update itimes */
2723         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2724 }
2725
2726 static int
2727 hammer_vop_specread (struct vop_read_args *ap)
2728 {
2729         /* XXX update access time */
2730         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2731 }
2732
2733 static int
2734 hammer_vop_specwrite (struct vop_write_args *ap)
2735 {
2736         /* XXX update last change time */
2737         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2738 }
2739