HAMMER 47/Many: Stabilization pass
[dragonfly.git] / sys / vfs / hammer / hammer_vnops.c
1 /*
2  * Copyright (c) 2007-2008 The DragonFly Project.  All rights reserved.
3  * 
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * 
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  * 
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  * 
34  * $DragonFly: src/sys/vfs/hammer/hammer_vnops.c,v 1.55 2008/05/22 04:14:01 dillon Exp $
35  */
36
37 #include <sys/param.h>
38 #include <sys/systm.h>
39 #include <sys/kernel.h>
40 #include <sys/fcntl.h>
41 #include <sys/namecache.h>
42 #include <sys/vnode.h>
43 #include <sys/lockf.h>
44 #include <sys/event.h>
45 #include <sys/stat.h>
46 #include <sys/dirent.h>
47 #include <vm/vm_extern.h>
48 #include <vfs/fifofs/fifo.h>
49 #include "hammer.h"
50
/*
 * HAMMER VNOPS
 *
 * Forward declarations for the vnode operation entry points that are
 * wired into the vop_ops tables below.
 */
/*static int hammer_vop_vnoperate(struct vop_generic_args *ap);*/
static int hammer_vop_fsync(struct vop_fsync_args *ap);
static int hammer_vop_read(struct vop_read_args *ap);
static int hammer_vop_write(struct vop_write_args *ap);
static int hammer_vop_access(struct vop_access_args *ap);
static int hammer_vop_advlock(struct vop_advlock_args *ap);
static int hammer_vop_close(struct vop_close_args *ap);
static int hammer_vop_ncreate(struct vop_ncreate_args *ap);
static int hammer_vop_getattr(struct vop_getattr_args *ap);
static int hammer_vop_nresolve(struct vop_nresolve_args *ap);
static int hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap);
static int hammer_vop_nlink(struct vop_nlink_args *ap);
static int hammer_vop_nmkdir(struct vop_nmkdir_args *ap);
static int hammer_vop_nmknod(struct vop_nmknod_args *ap);
static int hammer_vop_open(struct vop_open_args *ap);
static int hammer_vop_pathconf(struct vop_pathconf_args *ap);
static int hammer_vop_print(struct vop_print_args *ap);
static int hammer_vop_readdir(struct vop_readdir_args *ap);
static int hammer_vop_readlink(struct vop_readlink_args *ap);
static int hammer_vop_nremove(struct vop_nremove_args *ap);
static int hammer_vop_nrename(struct vop_nrename_args *ap);
static int hammer_vop_nrmdir(struct vop_nrmdir_args *ap);
static int hammer_vop_setattr(struct vop_setattr_args *ap);
static int hammer_vop_strategy(struct vop_strategy_args *ap);
static int hammer_vop_nsymlink(struct vop_nsymlink_args *ap);
static int hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap);
static int hammer_vop_ioctl(struct vop_ioctl_args *ap);
static int hammer_vop_mountctl(struct vop_mountctl_args *ap);

/* Entry points used only by the fifo operations table. */
static int hammer_vop_fifoclose(struct vop_close_args *ap);
static int hammer_vop_fiforead(struct vop_read_args *ap);
static int hammer_vop_fifowrite(struct vop_write_args *ap);

/* Entry points used only by the spec (device) operations table. */
static int hammer_vop_specclose(struct vop_close_args *ap);
static int hammer_vop_specread(struct vop_read_args *ap);
static int hammer_vop_specwrite(struct vop_write_args *ap);
90
91 struct vop_ops hammer_vnode_vops = {
92         .vop_default =          vop_defaultop,
93         .vop_fsync =            hammer_vop_fsync,
94         .vop_getpages =         vop_stdgetpages,
95         .vop_putpages =         vop_stdputpages,
96         .vop_read =             hammer_vop_read,
97         .vop_write =            hammer_vop_write,
98         .vop_access =           hammer_vop_access,
99         .vop_advlock =          hammer_vop_advlock,
100         .vop_close =            hammer_vop_close,
101         .vop_ncreate =          hammer_vop_ncreate,
102         .vop_getattr =          hammer_vop_getattr,
103         .vop_inactive =         hammer_vop_inactive,
104         .vop_reclaim =          hammer_vop_reclaim,
105         .vop_nresolve =         hammer_vop_nresolve,
106         .vop_nlookupdotdot =    hammer_vop_nlookupdotdot,
107         .vop_nlink =            hammer_vop_nlink,
108         .vop_nmkdir =           hammer_vop_nmkdir,
109         .vop_nmknod =           hammer_vop_nmknod,
110         .vop_open =             hammer_vop_open,
111         .vop_pathconf =         hammer_vop_pathconf,
112         .vop_print =            hammer_vop_print,
113         .vop_readdir =          hammer_vop_readdir,
114         .vop_readlink =         hammer_vop_readlink,
115         .vop_nremove =          hammer_vop_nremove,
116         .vop_nrename =          hammer_vop_nrename,
117         .vop_nrmdir =           hammer_vop_nrmdir,
118         .vop_setattr =          hammer_vop_setattr,
119         .vop_strategy =         hammer_vop_strategy,
120         .vop_nsymlink =         hammer_vop_nsymlink,
121         .vop_nwhiteout =        hammer_vop_nwhiteout,
122         .vop_ioctl =            hammer_vop_ioctl,
123         .vop_mountctl =         hammer_vop_mountctl
124 };
125
126 struct vop_ops hammer_spec_vops = {
127         .vop_default =          spec_vnoperate,
128         .vop_fsync =            hammer_vop_fsync,
129         .vop_read =             hammer_vop_specread,
130         .vop_write =            hammer_vop_specwrite,
131         .vop_access =           hammer_vop_access,
132         .vop_close =            hammer_vop_specclose,
133         .vop_getattr =          hammer_vop_getattr,
134         .vop_inactive =         hammer_vop_inactive,
135         .vop_reclaim =          hammer_vop_reclaim,
136         .vop_setattr =          hammer_vop_setattr
137 };
138
139 struct vop_ops hammer_fifo_vops = {
140         .vop_default =          fifo_vnoperate,
141         .vop_fsync =            hammer_vop_fsync,
142         .vop_read =             hammer_vop_fiforead,
143         .vop_write =            hammer_vop_fifowrite,
144         .vop_access =           hammer_vop_access,
145         .vop_close =            hammer_vop_fifoclose,
146         .vop_getattr =          hammer_vop_getattr,
147         .vop_inactive =         hammer_vop_inactive,
148         .vop_reclaim =          hammer_vop_reclaim,
149         .vop_setattr =          hammer_vop_setattr
150 };
151
152 static int hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
153                            struct vnode *dvp, struct ucred *cred, int flags);
154 static int hammer_vop_strategy_read(struct vop_strategy_args *ap);
155 static int hammer_vop_strategy_write(struct vop_strategy_args *ap);
156
157 #if 0
158 static
159 int
160 hammer_vop_vnoperate(struct vop_generic_args *)
161 {
162         return (VOCALL(&hammer_vnode_vops, ap));
163 }
164 #endif
165
166 /*
167  * hammer_vop_fsync { vp, waitfor }
168  */
169 static
170 int
171 hammer_vop_fsync(struct vop_fsync_args *ap)
172 {
173         hammer_inode_t ip = VTOI(ap->a_vp);
174
175         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
176         vfsync(ap->a_vp, ap->a_waitfor, 1, NULL, NULL);
177         if (ap->a_waitfor == MNT_WAIT)
178                 hammer_wait_inode(ip);
179         return (ip->error);
180 }
181
182 /*
183  * hammer_vop_read { vp, uio, ioflag, cred }
184  */
185 static
186 int
187 hammer_vop_read(struct vop_read_args *ap)
188 {
189         struct hammer_transaction trans;
190         hammer_inode_t ip;
191         off_t offset;
192         struct buf *bp;
193         struct uio *uio;
194         int error;
195         int n;
196         int seqcount;
197
198         if (ap->a_vp->v_type != VREG)
199                 return (EINVAL);
200         ip = VTOI(ap->a_vp);
201         error = 0;
202         seqcount = ap->a_ioflag >> 16;
203
204         hammer_start_transaction(&trans, ip->hmp);
205
206         /*
207          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
208          */
209         uio = ap->a_uio;
210         while (uio->uio_resid > 0 && uio->uio_offset < ip->ino_data.size) {
211                 offset = uio->uio_offset & HAMMER_BUFMASK;
212 #if 0
213                 error = cluster_read(ap->a_vp, ip->ino_data.size,
214                                      uio->uio_offset - offset, HAMMER_BUFSIZE,
215                                      MAXBSIZE, seqcount, &bp);
216 #endif
217                 error = bread(ap->a_vp, uio->uio_offset - offset,
218                               HAMMER_BUFSIZE, &bp);
219                 if (error) {
220                         brelse(bp);
221                         break;
222                 }
223                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
224                 n = HAMMER_BUFSIZE - offset;
225                 if (n > uio->uio_resid)
226                         n = uio->uio_resid;
227                 if (n > ip->ino_data.size - uio->uio_offset)
228                         n = (int)(ip->ino_data.size - uio->uio_offset);
229                 error = uiomove((char *)bp->b_data + offset, n, uio);
230                 if (error) {
231                         bqrelse(bp);
232                         break;
233                 }
234                 bqrelse(bp);
235         }
236         if ((ip->flags & HAMMER_INODE_RO) == 0 &&
237             (ip->hmp->mp->mnt_flag & MNT_NOATIME) == 0) {
238                 ip->ino_leaf.atime = trans.time;
239                 hammer_modify_inode(&trans, ip, HAMMER_INODE_ITIMES);
240         }
241         hammer_done_transaction(&trans);
242         return (error);
243 }
244
245 /*
246  * hammer_vop_write { vp, uio, ioflag, cred }
247  */
248 static
249 int
250 hammer_vop_write(struct vop_write_args *ap)
251 {
252         struct hammer_transaction trans;
253         struct hammer_inode *ip;
254         struct uio *uio;
255         off_t offset;
256         struct buf *bp;
257         int error;
258         int n;
259         int flags;
260         int count;
261
262         if (ap->a_vp->v_type != VREG)
263                 return (EINVAL);
264         ip = VTOI(ap->a_vp);
265         error = 0;
266
267         if (ip->flags & HAMMER_INODE_RO)
268                 return (EROFS);
269
270         /*
271          * Create a transaction to cover the operations we perform.
272          */
273         hammer_start_transaction(&trans, ip->hmp);
274         uio = ap->a_uio;
275
276         /*
277          * Check append mode
278          */
279         if (ap->a_ioflag & IO_APPEND)
280                 uio->uio_offset = ip->ino_data.size;
281
282         /*
283          * Check for illegal write offsets.  Valid range is 0...2^63-1
284          */
285         if (uio->uio_offset < 0 || uio->uio_offset + uio->uio_resid <= 0) {
286                 hammer_done_transaction(&trans);
287                 return (EFBIG);
288         }
289
290         /*
291          * Access the data in HAMMER_BUFSIZE blocks via the buffer cache.
292          */
293         count = 0;
294         while (uio->uio_resid > 0) {
295                 int fixsize = 0;
296
297                 /*
298                  * Do not allow huge writes to deadlock the buffer cache
299                  */
300                 if ((++count & 15) == 0) {
301                         vn_unlock(ap->a_vp);
302                         if ((ap->a_ioflag & IO_NOBWILL) == 0)
303                                 bwillwrite();
304                         vn_lock(ap->a_vp, LK_EXCLUSIVE|LK_RETRY);
305                 }
306
307                 offset = uio->uio_offset & HAMMER_BUFMASK;
308                 n = HAMMER_BUFSIZE - offset;
309                 if (n > uio->uio_resid)
310                         n = uio->uio_resid;
311                 if (uio->uio_offset + n > ip->ino_data.size) {
312                         vnode_pager_setsize(ap->a_vp, uio->uio_offset + n);
313                         fixsize = 1;
314                 }
315
316                 if (uio->uio_segflg == UIO_NOCOPY) {
317                         /*
318                          * Issuing a write with the same data backing the
319                          * buffer.  Instantiate the buffer to collect the
320                          * backing vm pages, then read-in any missing bits.
321                          *
322                          * This case is used by vop_stdputpages().
323                          */
324                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
325                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
326                         if ((bp->b_flags & B_CACHE) == 0) {
327                                 bqrelse(bp);
328                                 error = bread(ap->a_vp,
329                                               uio->uio_offset - offset,
330                                               HAMMER_BUFSIZE, &bp);
331                         }
332                 } else if (offset == 0 && uio->uio_resid >= HAMMER_BUFSIZE) {
333                         /*
334                          * Even though we are entirely overwriting the buffer
335                          * we may still have to zero it out to avoid a 
336                          * mmap/write visibility issue.
337                          */
338                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
339                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
340                         if ((bp->b_flags & B_CACHE) == 0)
341                                 vfs_bio_clrbuf(bp);
342                 } else if (uio->uio_offset - offset >= ip->ino_data.size) {
343                         /*
344                          * If the base offset of the buffer is beyond the
345                          * file EOF, we don't have to issue a read.
346                          */
347                         bp = getblk(ap->a_vp, uio->uio_offset - offset,
348                                     HAMMER_BUFSIZE, GETBLK_BHEAVY, 0);
349                         vfs_bio_clrbuf(bp);
350                 } else {
351                         /*
352                          * Partial overwrite, read in any missing bits then
353                          * replace the portion being written.
354                          */
355                         error = bread(ap->a_vp, uio->uio_offset - offset,
356                                       HAMMER_BUFSIZE, &bp);
357                         if (error == 0)
358                                 bheavy(bp);
359                 }
360                 if (error == 0)
361                         error = uiomove((char *)bp->b_data + offset, n, uio);
362
363                 /*
364                  * If we screwed up we have to undo any VM size changes we
365                  * made.
366                  */
367                 if (error) {
368                         brelse(bp);
369                         if (fixsize) {
370                                 vtruncbuf(ap->a_vp, ip->ino_data.size,
371                                           HAMMER_BUFSIZE);
372                         }
373                         break;
374                 }
375                 /* bp->b_flags |= B_CLUSTEROK; temporarily disabled */
376                 if (ip->ino_data.size < uio->uio_offset) {
377                         ip->ino_data.size = uio->uio_offset;
378                         flags = HAMMER_INODE_DDIRTY;
379                         vnode_pager_setsize(ap->a_vp, ip->ino_data.size);
380                 } else {
381                         flags = 0;
382                 }
383                 ip->ino_data.mtime = trans.time;
384                 flags |= HAMMER_INODE_ITIMES | HAMMER_INODE_BUFS;
385                 flags |= HAMMER_INODE_DDIRTY;   /* XXX mtime */
386                 hammer_modify_inode(&trans, ip, flags);
387
388                 if (ap->a_ioflag & IO_SYNC) {
389                         bwrite(bp);
390                 } else if (ap->a_ioflag & IO_DIRECT) {
391                         bawrite(bp);
392 #if 0
393                 } else if ((ap->a_ioflag >> 16) == IO_SEQMAX &&
394                            (uio->uio_offset & HAMMER_BUFMASK) == 0) {
395                         /*
396                          * XXX HAMMER can only fsync the whole inode,
397                          * doing it on every buffer would be a bad idea.
398                          */
399                         /*
400                          * If seqcount indicates sequential operation and
401                          * we just finished filling a buffer, push it out
402                          * now to prevent the buffer cache from becoming
403                          * too full, which would trigger non-optimal
404                          * flushes.
405                          */
406                         bdwrite(bp);
407 #endif
408                 } else {
409                         bdwrite(bp);
410                 }
411         }
412         hammer_done_transaction(&trans);
413         return (error);
414 }
415
416 /*
417  * hammer_vop_access { vp, mode, cred }
418  */
419 static
420 int
421 hammer_vop_access(struct vop_access_args *ap)
422 {
423         struct hammer_inode *ip = VTOI(ap->a_vp);
424         uid_t uid;
425         gid_t gid;
426         int error;
427
428         uid = hammer_to_unix_xid(&ip->ino_data.uid);
429         gid = hammer_to_unix_xid(&ip->ino_data.gid);
430
431         error = vop_helper_access(ap, uid, gid, ip->ino_data.mode,
432                                   ip->ino_data.uflags);
433         return (error);
434 }
435
436 /*
437  * hammer_vop_advlock { vp, id, op, fl, flags }
438  */
439 static
440 int
441 hammer_vop_advlock(struct vop_advlock_args *ap)
442 {
443         struct hammer_inode *ip = VTOI(ap->a_vp);
444
445         return (lf_advlock(ap, &ip->advlock, ip->ino_data.size));
446 }
447
/*
 * hammer_vop_close { vp, fflag }
 *
 * No HAMMER-specific work is done here; the request is passed straight
 * to vop_stdclose().
 */
static
int
hammer_vop_close(struct vop_close_args *ap)
{
	int error;

	error = vop_stdclose(ap);
	return (error);
}
457
458 /*
459  * hammer_vop_ncreate { nch, dvp, vpp, cred, vap }
460  *
461  * The operating system has already ensured that the directory entry
462  * does not exist and done all appropriate namespace locking.
463  */
464 static
465 int
466 hammer_vop_ncreate(struct vop_ncreate_args *ap)
467 {
468         struct hammer_transaction trans;
469         struct hammer_inode *dip;
470         struct hammer_inode *nip;
471         struct nchandle *nch;
472         int error;
473
474         nch = ap->a_nch;
475         dip = VTOI(ap->a_dvp);
476
477         if (dip->flags & HAMMER_INODE_RO)
478                 return (EROFS);
479
480         /*
481          * Create a transaction to cover the operations we perform.
482          */
483         hammer_start_transaction(&trans, dip->hmp);
484
485         /*
486          * Create a new filesystem object of the requested type.  The
487          * returned inode will be referenced and shared-locked to prevent
488          * it from being moved to the flusher.
489          */
490
491         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
492         if (error) {
493                 hkprintf("hammer_create_inode error %d\n", error);
494                 hammer_done_transaction(&trans);
495                 *ap->a_vpp = NULL;
496                 return (error);
497         }
498
499         /*
500          * Add the new filesystem object to the directory.  This will also
501          * bump the inode's link count.
502          */
503         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
504         if (error)
505                 hkprintf("hammer_ip_add_directory error %d\n", error);
506
507         /*
508          * Finish up.
509          */
510         if (error) {
511                 hammer_rel_inode(nip, 0);
512                 hammer_done_transaction(&trans);
513                 *ap->a_vpp = NULL;
514         } else {
515                 error = hammer_get_vnode(nip, ap->a_vpp);
516                 hammer_done_transaction(&trans);
517                 hammer_rel_inode(nip, 0);
518                 if (error == 0) {
519                         cache_setunresolved(ap->a_nch);
520                         cache_setvp(ap->a_nch, *ap->a_vpp);
521                 }
522         }
523         return (error);
524 }
525
526 /*
527  * hammer_vop_getattr { vp, vap }
528  *
529  * Retrieve an inode's attribute information.  When accessing inodes
530  * historically we fake the atime field to ensure consistent results.
531  * The atime field is stored in the B-Tree element and allowed to be
532  * updated without cycling the element.
533  */
534 static
535 int
536 hammer_vop_getattr(struct vop_getattr_args *ap)
537 {
538         struct hammer_inode *ip = VTOI(ap->a_vp);
539         struct vattr *vap = ap->a_vap;
540
541 #if 0
542         if (cache_check_fsmid_vp(ap->a_vp, &ip->fsmid) &&
543             (vp->v_mount->mnt_flag & MNT_RDONLY) == 0 &&
544             ip->obj_asof == XXX
545         ) {
546                 /* LAZYMOD XXX */
547         }
548         hammer_itimes(ap->a_vp);
549 #endif
550
551         vap->va_fsid = ip->hmp->fsid_udev;
552         vap->va_fileid = ip->ino_leaf.base.obj_id;
553         vap->va_mode = ip->ino_data.mode;
554         vap->va_nlink = ip->ino_data.nlinks;
555         vap->va_uid = hammer_to_unix_xid(&ip->ino_data.uid);
556         vap->va_gid = hammer_to_unix_xid(&ip->ino_data.gid);
557         vap->va_rmajor = 0;
558         vap->va_rminor = 0;
559         vap->va_size = ip->ino_data.size;
560         if (ip->flags & HAMMER_INODE_RO)
561                 hammer_to_timespec(ip->ino_data.mtime, &vap->va_atime);
562         else
563                 hammer_to_timespec(ip->ino_leaf.atime, &vap->va_atime);
564         hammer_to_timespec(ip->ino_data.mtime, &vap->va_mtime);
565         hammer_to_timespec(ip->ino_data.ctime, &vap->va_ctime);
566         vap->va_flags = ip->ino_data.uflags;
567         vap->va_gen = 1;        /* hammer inums are unique for all time */
568         vap->va_blocksize = HAMMER_BUFSIZE;
569         vap->va_bytes = (ip->ino_data.size + 63) & ~63;
570         vap->va_type = hammer_get_vnode_type(ip->ino_data.obj_type);
571         vap->va_filerev = 0;    /* XXX */
572         /* mtime uniquely identifies any adjustments made to the file */
573         vap->va_fsmid = ip->ino_data.mtime;
574         vap->va_uid_uuid = ip->ino_data.uid;
575         vap->va_gid_uuid = ip->ino_data.gid;
576         vap->va_fsid_uuid = ip->hmp->fsid;
577         vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
578                           VA_FSID_UUID_VALID;
579
580         switch (ip->ino_data.obj_type) {
581         case HAMMER_OBJTYPE_CDEV:
582         case HAMMER_OBJTYPE_BDEV:
583                 vap->va_rmajor = ip->ino_data.rmajor;
584                 vap->va_rminor = ip->ino_data.rminor;
585                 break;
586         default:
587                 break;
588         }
589
590         return(0);
591 }
592
593 /*
594  * hammer_vop_nresolve { nch, dvp, cred }
595  *
596  * Locate the requested directory entry.
597  */
598 static
599 int
600 hammer_vop_nresolve(struct vop_nresolve_args *ap)
601 {
602         struct hammer_transaction trans;
603         struct namecache *ncp;
604         hammer_inode_t dip;
605         hammer_inode_t ip;
606         hammer_tid_t asof;
607         struct hammer_cursor cursor;
608         struct vnode *vp;
609         int64_t namekey;
610         int error;
611         int i;
612         int nlen;
613         int flags;
614         u_int64_t obj_id;
615
616         /*
617          * Misc initialization, plus handle as-of name extensions.  Look for
618          * the '@@' extension.  Note that as-of files and directories cannot
619          * be modified.
620          */
621         dip = VTOI(ap->a_dvp);
622         ncp = ap->a_nch->ncp;
623         asof = dip->obj_asof;
624         nlen = ncp->nc_nlen;
625         flags = dip->flags;
626
627         hammer_simple_transaction(&trans, dip->hmp);
628
629         for (i = 0; i < nlen; ++i) {
630                 if (ncp->nc_name[i] == '@' && ncp->nc_name[i+1] == '@') {
631                         asof = hammer_str_to_tid(ncp->nc_name + i + 2);
632                         flags |= HAMMER_INODE_RO;
633                         break;
634                 }
635         }
636         nlen = i;
637
638         /*
639          * If there is no path component the time extension is relative to
640          * dip.
641          */
642         if (nlen == 0) {
643                 ip = hammer_get_inode(&trans, &dip->cache[1], dip->obj_id,
644                                       asof, flags, &error);
645                 if (error == 0) {
646                         error = hammer_get_vnode(ip, &vp);
647                         hammer_rel_inode(ip, 0);
648                 } else {
649                         vp = NULL;
650                 }
651                 if (error == 0) {
652                         vn_unlock(vp);
653                         cache_setvp(ap->a_nch, vp);
654                         vrele(vp);
655                 }
656                 goto done;
657         }
658
659         /*
660          * Calculate the namekey and setup the key range for the scan.  This
661          * works kinda like a chained hash table where the lower 32 bits
662          * of the namekey synthesize the chain.
663          *
664          * The key range is inclusive of both key_beg and key_end.
665          */
666         namekey = hammer_directory_namekey(ncp->nc_name, nlen);
667
668         error = hammer_init_cursor(&trans, &cursor, &dip->cache[0], dip);
669         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
670         cursor.key_beg.obj_id = dip->obj_id;
671         cursor.key_beg.key = namekey;
672         cursor.key_beg.create_tid = 0;
673         cursor.key_beg.delete_tid = 0;
674         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
675         cursor.key_beg.obj_type = 0;
676
677         cursor.key_end = cursor.key_beg;
678         cursor.key_end.key |= 0xFFFFFFFFULL;
679         cursor.asof = asof;
680         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
681
682         /*
683          * Scan all matching records (the chain), locate the one matching
684          * the requested path component.
685          *
686          * The hammer_ip_*() functions merge in-memory records with on-disk
687          * records for the purposes of the search.
688          */
689         obj_id = 0;
690
691         if (error == 0) {
692                 error = hammer_ip_first(&cursor);
693                 while (error == 0) {
694                         error = hammer_ip_resolve_data(&cursor);
695                         if (error)
696                                 break;
697                         if (nlen == cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF &&
698                             bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
699                                 obj_id = cursor.data->entry.obj_id;
700                                 break;
701                         }
702                         error = hammer_ip_next(&cursor);
703                 }
704         }
705         hammer_done_cursor(&cursor);
706         if (error == 0) {
707                 ip = hammer_get_inode(&trans, &dip->cache[1],
708                                       obj_id, asof, flags, &error);
709                 if (error == 0) {
710                         error = hammer_get_vnode(ip, &vp);
711                         hammer_rel_inode(ip, 0);
712                 } else {
713                         vp = NULL;
714                 }
715                 if (error == 0) {
716                         vn_unlock(vp);
717                         cache_setvp(ap->a_nch, vp);
718                         vrele(vp);
719                 }
720         } else if (error == ENOENT) {
721                 cache_setvp(ap->a_nch, NULL);
722         }
723 done:
724         hammer_done_transaction(&trans);
725         return (error);
726 }
727
728 /*
729  * hammer_vop_nlookupdotdot { dvp, vpp, cred }
730  *
731  * Locate the parent directory of a directory vnode.
732  *
733  * dvp is referenced but not locked.  *vpp must be returned referenced and
734  * locked.  A parent_obj_id of 0 does not necessarily indicate that we are
735  * at the root, instead it could indicate that the directory we were in was
736  * removed.
737  *
738  * NOTE: as-of sequences are not linked into the directory structure.  If
739  * we are at the root with a different asof then the mount point, reload
740  * the same directory with the mount point's asof.   I'm not sure what this
741  * will do to NFS.  We encode ASOF stamps in NFS file handles so it might not
742  * get confused, but it hasn't been tested.
743  */
744 static
745 int
746 hammer_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
747 {
748         struct hammer_transaction trans;
749         struct hammer_inode *dip;
750         struct hammer_inode *ip;
751         int64_t parent_obj_id;
752         hammer_tid_t asof;
753         int error;
754
755         dip = VTOI(ap->a_dvp);
756         asof = dip->obj_asof;
757         parent_obj_id = dip->ino_data.parent_obj_id;
758
759         if (parent_obj_id == 0) {
760                 if (dip->obj_id == HAMMER_OBJID_ROOT &&
761                    asof != dip->hmp->asof) {
762                         parent_obj_id = dip->obj_id;
763                         asof = dip->hmp->asof;
764                         *ap->a_fakename = kmalloc(19, M_TEMP, M_WAITOK);
765                         ksnprintf(*ap->a_fakename, 19, "0x%016llx",
766                                    dip->obj_asof);
767                 } else {
768                         *ap->a_vpp = NULL;
769                         return ENOENT;
770                 }
771         }
772
773         hammer_simple_transaction(&trans, dip->hmp);
774
775         ip = hammer_get_inode(&trans, &dip->cache[1], parent_obj_id,
776                               asof, dip->flags, &error);
777         if (ip) {
778                 error = hammer_get_vnode(ip, ap->a_vpp);
779                 hammer_rel_inode(ip, 0);
780         } else {
781                 *ap->a_vpp = NULL;
782         }
783         hammer_done_transaction(&trans);
784         return (error);
785 }
786
787 /*
788  * hammer_vop_nlink { nch, dvp, vp, cred }
789  */
790 static
791 int
792 hammer_vop_nlink(struct vop_nlink_args *ap)
793 {
794         struct hammer_transaction trans;
795         struct hammer_inode *dip;
796         struct hammer_inode *ip;
797         struct nchandle *nch;
798         int error;
799
800         nch = ap->a_nch;
801         dip = VTOI(ap->a_dvp);
802         ip = VTOI(ap->a_vp);
803
804         if (dip->flags & HAMMER_INODE_RO)
805                 return (EROFS);
806         if (ip->flags & HAMMER_INODE_RO)
807                 return (EROFS);
808
809         /*
810          * Create a transaction to cover the operations we perform.
811          */
812         hammer_start_transaction(&trans, dip->hmp);
813
814         /*
815          * Add the filesystem object to the directory.  Note that neither
816          * dip nor ip are referenced or locked, but their vnodes are
817          * referenced.  This function will bump the inode's link count.
818          */
819         error = hammer_ip_add_directory(&trans, dip, nch->ncp, ip);
820
821         /*
822          * Finish up.
823          */
824         if (error == 0) {
825                 cache_setunresolved(nch);
826                 cache_setvp(nch, ap->a_vp);
827         }
828         hammer_done_transaction(&trans);
829         return (error);
830 }
831
832 /*
833  * hammer_vop_nmkdir { nch, dvp, vpp, cred, vap }
834  *
835  * The operating system has already ensured that the directory entry
836  * does not exist and done all appropriate namespace locking.
837  */
838 static
839 int
840 hammer_vop_nmkdir(struct vop_nmkdir_args *ap)
841 {
842         struct hammer_transaction trans;
843         struct hammer_inode *dip;
844         struct hammer_inode *nip;
845         struct nchandle *nch;
846         int error;
847
848         nch = ap->a_nch;
849         dip = VTOI(ap->a_dvp);
850
851         if (dip->flags & HAMMER_INODE_RO)
852                 return (EROFS);
853
854         /*
855          * Create a transaction to cover the operations we perform.
856          */
857         hammer_start_transaction(&trans, dip->hmp);
858
859         /*
860          * Create a new filesystem object of the requested type.  The
861          * returned inode will be referenced but not locked.
862          */
863         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
864         if (error) {
865                 hkprintf("hammer_mkdir error %d\n", error);
866                 hammer_done_transaction(&trans);
867                 *ap->a_vpp = NULL;
868                 return (error);
869         }
870         /*
871          * Add the new filesystem object to the directory.  This will also
872          * bump the inode's link count.
873          */
874         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
875         if (error)
876                 hkprintf("hammer_mkdir (add) error %d\n", error);
877
878         /*
879          * Finish up.
880          */
881         if (error) {
882                 hammer_rel_inode(nip, 0);
883                 *ap->a_vpp = NULL;
884         } else {
885                 error = hammer_get_vnode(nip, ap->a_vpp);
886                 hammer_rel_inode(nip, 0);
887                 if (error == 0) {
888                         cache_setunresolved(ap->a_nch);
889                         cache_setvp(ap->a_nch, *ap->a_vpp);
890                 }
891         }
892         hammer_done_transaction(&trans);
893         return (error);
894 }
895
896 /*
897  * hammer_vop_nmknod { nch, dvp, vpp, cred, vap }
898  *
899  * The operating system has already ensured that the directory entry
900  * does not exist and done all appropriate namespace locking.
901  */
902 static
903 int
904 hammer_vop_nmknod(struct vop_nmknod_args *ap)
905 {
906         struct hammer_transaction trans;
907         struct hammer_inode *dip;
908         struct hammer_inode *nip;
909         struct nchandle *nch;
910         int error;
911
912         nch = ap->a_nch;
913         dip = VTOI(ap->a_dvp);
914
915         if (dip->flags & HAMMER_INODE_RO)
916                 return (EROFS);
917
918         /*
919          * Create a transaction to cover the operations we perform.
920          */
921         hammer_start_transaction(&trans, dip->hmp);
922
923         /*
924          * Create a new filesystem object of the requested type.  The
925          * returned inode will be referenced but not locked.
926          */
927         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
928         if (error) {
929                 hammer_done_transaction(&trans);
930                 *ap->a_vpp = NULL;
931                 return (error);
932         }
933
934         /*
935          * Add the new filesystem object to the directory.  This will also
936          * bump the inode's link count.
937          */
938         error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
939
940         /*
941          * Finish up.
942          */
943         if (error) {
944                 hammer_rel_inode(nip, 0);
945                 *ap->a_vpp = NULL;
946         } else {
947                 error = hammer_get_vnode(nip, ap->a_vpp);
948                 hammer_rel_inode(nip, 0);
949                 if (error == 0) {
950                         cache_setunresolved(ap->a_nch);
951                         cache_setvp(ap->a_nch, *ap->a_vpp);
952                 }
953         }
954         hammer_done_transaction(&trans);
955         return (error);
956 }
957
958 /*
959  * hammer_vop_open { vp, mode, cred, fp }
960  */
961 static
962 int
963 hammer_vop_open(struct vop_open_args *ap)
964 {
965         if ((ap->a_mode & FWRITE) && (VTOI(ap->a_vp)->flags & HAMMER_INODE_RO))
966                 return (EROFS);
967
968         return(vop_stdopen(ap));
969 }
970
971 /*
972  * hammer_vop_pathconf { vp, name, retval }
973  */
974 static
975 int
976 hammer_vop_pathconf(struct vop_pathconf_args *ap)
977 {
978         return EOPNOTSUPP;
979 }
980
981 /*
982  * hammer_vop_print { vp }
983  */
984 static
985 int
986 hammer_vop_print(struct vop_print_args *ap)
987 {
988         return EOPNOTSUPP;
989 }
990
991 /*
992  * hammer_vop_readdir { vp, uio, cred, *eofflag, *ncookies, off_t **cookies }
993  */
994 static
995 int
996 hammer_vop_readdir(struct vop_readdir_args *ap)
997 {
998         struct hammer_transaction trans;
999         struct hammer_cursor cursor;
1000         struct hammer_inode *ip;
1001         struct uio *uio;
1002         hammer_base_elm_t base;
1003         int error;
1004         int cookie_index;
1005         int ncookies;
1006         off_t *cookies;
1007         off_t saveoff;
1008         int r;
1009
1010         ip = VTOI(ap->a_vp);
1011         uio = ap->a_uio;
1012         saveoff = uio->uio_offset;
1013
1014         if (ap->a_ncookies) {
1015                 ncookies = uio->uio_resid / 16 + 1;
1016                 if (ncookies > 1024)
1017                         ncookies = 1024;
1018                 cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
1019                 cookie_index = 0;
1020         } else {
1021                 ncookies = -1;
1022                 cookies = NULL;
1023                 cookie_index = 0;
1024         }
1025
1026         hammer_simple_transaction(&trans, ip->hmp);
1027
1028         /*
1029          * Handle artificial entries
1030          */
1031         error = 0;
1032         if (saveoff == 0) {
1033                 r = vop_write_dirent(&error, uio, ip->obj_id, DT_DIR, 1, ".");
1034                 if (r)
1035                         goto done;
1036                 if (cookies)
1037                         cookies[cookie_index] = saveoff;
1038                 ++saveoff;
1039                 ++cookie_index;
1040                 if (cookie_index == ncookies)
1041                         goto done;
1042         }
1043         if (saveoff == 1) {
1044                 if (ip->ino_data.parent_obj_id) {
1045                         r = vop_write_dirent(&error, uio,
1046                                              ip->ino_data.parent_obj_id,
1047                                              DT_DIR, 2, "..");
1048                 } else {
1049                         r = vop_write_dirent(&error, uio,
1050                                              ip->obj_id, DT_DIR, 2, "..");
1051                 }
1052                 if (r)
1053                         goto done;
1054                 if (cookies)
1055                         cookies[cookie_index] = saveoff;
1056                 ++saveoff;
1057                 ++cookie_index;
1058                 if (cookie_index == ncookies)
1059                         goto done;
1060         }
1061
1062         /*
1063          * Key range (begin and end inclusive) to scan.  Directory keys
1064          * directly translate to a 64 bit 'seek' position.
1065          */
1066         hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
1067         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
1068         cursor.key_beg.obj_id = ip->obj_id;
1069         cursor.key_beg.create_tid = 0;
1070         cursor.key_beg.delete_tid = 0;
1071         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1072         cursor.key_beg.obj_type = 0;
1073         cursor.key_beg.key = saveoff;
1074
1075         cursor.key_end = cursor.key_beg;
1076         cursor.key_end.key = HAMMER_MAX_KEY;
1077         cursor.asof = ip->obj_asof;
1078         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1079
1080         error = hammer_ip_first(&cursor);
1081
1082         while (error == 0) {
1083                 error = hammer_ip_resolve_data(&cursor);
1084                 if (error)
1085                         break;
1086                 base = &cursor.leaf->base;
1087                 saveoff = base->key;
1088                 KKASSERT(cursor.leaf->data_len > HAMMER_ENTRY_NAME_OFF);
1089
1090                 if (base->obj_id != ip->obj_id)
1091                         panic("readdir: bad record at %p", cursor.node);
1092
1093                 r = vop_write_dirent(
1094                              &error, uio, cursor.data->entry.obj_id,
1095                              hammer_get_dtype(cursor.leaf->base.obj_type),
1096                              cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF ,
1097                              (void *)cursor.data->entry.name);
1098                 if (r)
1099                         break;
1100                 ++saveoff;
1101                 if (cookies)
1102                         cookies[cookie_index] = base->key;
1103                 ++cookie_index;
1104                 if (cookie_index == ncookies)
1105                         break;
1106                 error = hammer_ip_next(&cursor);
1107         }
1108         hammer_done_cursor(&cursor);
1109
1110 done:
1111         hammer_done_transaction(&trans);
1112
1113         if (ap->a_eofflag)
1114                 *ap->a_eofflag = (error == ENOENT);
1115         uio->uio_offset = saveoff;
1116         if (error && cookie_index == 0) {
1117                 if (error == ENOENT)
1118                         error = 0;
1119                 if (cookies) {
1120                         kfree(cookies, M_TEMP);
1121                         *ap->a_ncookies = 0;
1122                         *ap->a_cookies = NULL;
1123                 }
1124         } else {
1125                 if (error == ENOENT)
1126                         error = 0;
1127                 if (cookies) {
1128                         *ap->a_ncookies = cookie_index;
1129                         *ap->a_cookies = cookies;
1130                 }
1131         }
1132         return(error);
1133 }
1134
1135 /*
1136  * hammer_vop_readlink { vp, uio, cred }
1137  */
1138 static
1139 int
1140 hammer_vop_readlink(struct vop_readlink_args *ap)
1141 {
1142         struct hammer_transaction trans;
1143         struct hammer_cursor cursor;
1144         struct hammer_inode *ip;
1145         int error;
1146
1147         ip = VTOI(ap->a_vp);
1148
1149         /*
1150          * Shortcut if the symlink data was stuffed into ino_data.
1151          */
1152         if (ip->ino_data.size <= HAMMER_INODE_BASESYMLEN) {
1153                 error = uiomove(ip->ino_data.ext.symlink,
1154                                 ip->ino_data.size, ap->a_uio);
1155                 return(error);
1156         }
1157
1158         /*
1159          * Long version
1160          */
1161         hammer_simple_transaction(&trans, ip->hmp);
1162         hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
1163
1164         /*
1165          * Key range (begin and end inclusive) to scan.  Directory keys
1166          * directly translate to a 64 bit 'seek' position.
1167          */
1168         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC; /* XXX */
1169         cursor.key_beg.obj_id = ip->obj_id;
1170         cursor.key_beg.create_tid = 0;
1171         cursor.key_beg.delete_tid = 0;
1172         cursor.key_beg.rec_type = HAMMER_RECTYPE_FIX;
1173         cursor.key_beg.obj_type = 0;
1174         cursor.key_beg.key = HAMMER_FIXKEY_SYMLINK;
1175         cursor.asof = ip->obj_asof;
1176         cursor.flags |= HAMMER_CURSOR_ASOF;
1177
1178         error = hammer_ip_lookup(&cursor);
1179         if (error == 0) {
1180                 error = hammer_ip_resolve_data(&cursor);
1181                 if (error == 0) {
1182                         KKASSERT(cursor.leaf->data_len >=
1183                                  HAMMER_SYMLINK_NAME_OFF);
1184                         error = uiomove(cursor.data->symlink.name,
1185                                         cursor.leaf->data_len -
1186                                                 HAMMER_SYMLINK_NAME_OFF,
1187                                         ap->a_uio);
1188                 }
1189         }
1190         hammer_done_cursor(&cursor);
1191         hammer_done_transaction(&trans);
1192         return(error);
1193 }
1194
1195 /*
1196  * hammer_vop_nremove { nch, dvp, cred }
1197  */
1198 static
1199 int
1200 hammer_vop_nremove(struct vop_nremove_args *ap)
1201 {
1202         struct hammer_transaction trans;
1203         int error;
1204
1205         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1206         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1207         hammer_done_transaction(&trans);
1208
1209         return (error);
1210 }
1211
1212 /*
1213  * hammer_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1214  */
1215 static
1216 int
1217 hammer_vop_nrename(struct vop_nrename_args *ap)
1218 {
1219         struct hammer_transaction trans;
1220         struct namecache *fncp;
1221         struct namecache *tncp;
1222         struct hammer_inode *fdip;
1223         struct hammer_inode *tdip;
1224         struct hammer_inode *ip;
1225         struct hammer_cursor cursor;
1226         int64_t namekey;
1227         int nlen, error;
1228
1229         fdip = VTOI(ap->a_fdvp);
1230         tdip = VTOI(ap->a_tdvp);
1231         fncp = ap->a_fnch->ncp;
1232         tncp = ap->a_tnch->ncp;
1233         ip = VTOI(fncp->nc_vp);
1234         KKASSERT(ip != NULL);
1235
1236         if (fdip->flags & HAMMER_INODE_RO)
1237                 return (EROFS);
1238         if (tdip->flags & HAMMER_INODE_RO)
1239                 return (EROFS);
1240         if (ip->flags & HAMMER_INODE_RO)
1241                 return (EROFS);
1242
1243         hammer_start_transaction(&trans, fdip->hmp);
1244
1245         /*
1246          * Remove tncp from the target directory and then link ip as
1247          * tncp. XXX pass trans to dounlink
1248          *
1249          * Force the inode sync-time to match the transaction so it is
1250          * in-sync with the creation of the target directory entry.
1251          */
1252         error = hammer_dounlink(&trans, ap->a_tnch, ap->a_tdvp, ap->a_cred, 0);
1253         if (error == 0 || error == ENOENT) {
1254                 error = hammer_ip_add_directory(&trans, tdip, tncp, ip);
1255                 if (error == 0) {
1256                         ip->ino_data.parent_obj_id = tdip->obj_id;
1257                         hammer_modify_inode(&trans, ip, HAMMER_INODE_DDIRTY);
1258                 }
1259         }
1260         if (error)
1261                 goto failed; /* XXX */
1262
1263         /*
1264          * Locate the record in the originating directory and remove it.
1265          *
1266          * Calculate the namekey and setup the key range for the scan.  This
1267          * works kinda like a chained hash table where the lower 32 bits
1268          * of the namekey synthesize the chain.
1269          *
1270          * The key range is inclusive of both key_beg and key_end.
1271          */
1272         namekey = hammer_directory_namekey(fncp->nc_name, fncp->nc_nlen);
1273 retry:
1274         hammer_init_cursor(&trans, &cursor, &fdip->cache[0], fdip);
1275         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
1276         cursor.key_beg.obj_id = fdip->obj_id;
1277         cursor.key_beg.key = namekey;
1278         cursor.key_beg.create_tid = 0;
1279         cursor.key_beg.delete_tid = 0;
1280         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
1281         cursor.key_beg.obj_type = 0;
1282
1283         cursor.key_end = cursor.key_beg;
1284         cursor.key_end.key |= 0xFFFFFFFFULL;
1285         cursor.asof = fdip->obj_asof;
1286         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
1287
1288         /*
1289          * Scan all matching records (the chain), locate the one matching
1290          * the requested path component.
1291          *
1292          * The hammer_ip_*() functions merge in-memory records with on-disk
1293          * records for the purposes of the search.
1294          */
1295         error = hammer_ip_first(&cursor);
1296         while (error == 0) {
1297                 if (hammer_ip_resolve_data(&cursor) != 0)
1298                         break;
1299                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
1300                 KKASSERT(nlen > 0);
1301                 if (fncp->nc_nlen == nlen &&
1302                     bcmp(fncp->nc_name, cursor.data->entry.name, nlen) == 0) {
1303                         break;
1304                 }
1305                 error = hammer_ip_next(&cursor);
1306         }
1307
1308         /*
1309          * If all is ok we have to get the inode so we can adjust nlinks.
1310          *
1311          * WARNING: hammer_ip_del_directory() may have to terminate the
1312          * cursor to avoid a recursion.  It's ok to call hammer_done_cursor()
1313          * twice.
1314          */
1315         if (error == 0)
1316                 error = hammer_ip_del_directory(&trans, &cursor, fdip, ip);
1317
1318         /*
1319          * XXX A deadlock here will break rename's atomicy for the purposes
1320          * of crash recovery.
1321          */
1322         if (error == EDEADLK) {
1323                 hammer_done_cursor(&cursor);
1324                 goto retry;
1325         }
1326
1327         /*
1328          * Cleanup and tell the kernel that the rename succeeded.
1329          */
1330         hammer_done_cursor(&cursor);
1331         if (error == 0)
1332                 cache_rename(ap->a_fnch, ap->a_tnch);
1333
1334 failed:
1335         hammer_done_transaction(&trans);
1336         return (error);
1337 }
1338
1339 /*
1340  * hammer_vop_nrmdir { nch, dvp, cred }
1341  */
1342 static
1343 int
1344 hammer_vop_nrmdir(struct vop_nrmdir_args *ap)
1345 {
1346         struct hammer_transaction trans;
1347         int error;
1348
1349         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1350         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp, ap->a_cred, 0);
1351         hammer_done_transaction(&trans);
1352
1353         return (error);
1354 }
1355
1356 /*
1357  * hammer_vop_setattr { vp, vap, cred }
1358  */
1359 static
1360 int
1361 hammer_vop_setattr(struct vop_setattr_args *ap)
1362 {
1363         struct hammer_transaction trans;
1364         struct vattr *vap;
1365         struct hammer_inode *ip;
1366         int modflags;
1367         int error;
1368         int truncating;
1369         off_t aligned_size;
1370         u_int32_t flags;
1371         uuid_t uuid;
1372
1373         vap = ap->a_vap;
1374         ip = ap->a_vp->v_data;
1375         modflags = 0;
1376
1377         if (ap->a_vp->v_mount->mnt_flag & MNT_RDONLY)
1378                 return(EROFS);
1379         if (ip->flags & HAMMER_INODE_RO)
1380                 return (EROFS);
1381
1382         hammer_start_transaction(&trans, ip->hmp);
1383         error = 0;
1384
1385         if (vap->va_flags != VNOVAL) {
1386                 flags = ip->ino_data.uflags;
1387                 error = vop_helper_setattr_flags(&flags, vap->va_flags,
1388                                          hammer_to_unix_xid(&ip->ino_data.uid),
1389                                          ap->a_cred);
1390                 if (error == 0) {
1391                         if (ip->ino_data.uflags != flags) {
1392                                 ip->ino_data.uflags = flags;
1393                                 modflags |= HAMMER_INODE_DDIRTY;
1394                         }
1395                         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1396                                 error = 0;
1397                                 goto done;
1398                         }
1399                 }
1400                 goto done;
1401         }
1402         if (ip->ino_data.uflags & (IMMUTABLE | APPEND)) {
1403                 error = EPERM;
1404                 goto done;
1405         }
1406         if (vap->va_uid != (uid_t)VNOVAL) {
1407                 hammer_guid_to_uuid(&uuid, vap->va_uid);
1408                 if (bcmp(&uuid, &ip->ino_data.uid, sizeof(uuid)) != 0) {
1409                         ip->ino_data.uid = uuid;
1410                         modflags |= HAMMER_INODE_DDIRTY;
1411                 }
1412         }
1413         if (vap->va_gid != (uid_t)VNOVAL) {
1414                 hammer_guid_to_uuid(&uuid, vap->va_gid);
1415                 if (bcmp(&uuid, &ip->ino_data.gid, sizeof(uuid)) != 0) {
1416                         ip->ino_data.gid = uuid;
1417                         modflags |= HAMMER_INODE_DDIRTY;
1418                 }
1419         }
1420         while (vap->va_size != VNOVAL && ip->ino_data.size != vap->va_size) {
1421                 switch(ap->a_vp->v_type) {
1422                 case VREG:
1423                         if (vap->va_size == ip->ino_data.size)
1424                                 break;
1425                         /*
1426                          * XXX break atomicy, we can deadlock the backend
1427                          * if we do not release the lock.  Probably not a
1428                          * big deal here.
1429                          */
1430                         if (vap->va_size < ip->ino_data.size) {
1431                                 vtruncbuf(ap->a_vp, vap->va_size,
1432                                           HAMMER_BUFSIZE);
1433                                 truncating = 1;
1434                         } else {
1435                                 vnode_pager_setsize(ap->a_vp, vap->va_size);
1436                                 truncating = 0;
1437                         }
1438                         ip->ino_data.size = vap->va_size;
1439                         modflags |= HAMMER_INODE_DDIRTY;
1440                         aligned_size = (vap->va_size + HAMMER_BUFMASK) &
1441                                        ~HAMMER_BUFMASK64;
1442
1443                         /*
1444                          * on-media truncation is cached in the inode until
1445                          * the inode is synchronized.
1446                          */
1447                         if (truncating) {
1448                                 if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1449                                         ip->flags |= HAMMER_INODE_TRUNCATED;
1450                                         ip->trunc_off = vap->va_size;
1451                                 } else if (ip->trunc_off > vap->va_size) {
1452                                         ip->trunc_off = vap->va_size;
1453                                 }
1454                         }
1455
1456                         /*
1457                          * If truncating we have to clean out a portion of
1458                          * the last block on-disk.  We do this in the
1459                          * front-end buffer cache.
1460                          */
1461                         if (truncating && vap->va_size < aligned_size) {
1462                                 struct buf *bp;
1463                                 int offset;
1464
1465                                 offset = vap->va_size & HAMMER_BUFMASK;
1466                                 error = bread(ap->a_vp,
1467                                               aligned_size - HAMMER_BUFSIZE,
1468                                               HAMMER_BUFSIZE, &bp);
1469                                 if (error == 0) {
1470                                         bzero(bp->b_data + offset,
1471                                               HAMMER_BUFSIZE - offset);
1472                                         bdwrite(bp);
1473                                 } else {
1474                                         brelse(bp);
1475                                 }
1476                         }
1477                         break;
1478                 case VDATABASE:
1479                         if ((ip->flags & HAMMER_INODE_TRUNCATED) == 0) {
1480                                 ip->flags |= HAMMER_INODE_TRUNCATED;
1481                                 ip->trunc_off = vap->va_size;
1482                         } else if (ip->trunc_off > vap->va_size) {
1483                                 ip->trunc_off = vap->va_size;
1484                         }
1485                         ip->ino_data.size = vap->va_size;
1486                         modflags |= HAMMER_INODE_DDIRTY;
1487                         break;
1488                 default:
1489                         error = EINVAL;
1490                         goto done;
1491                 }
1492                 break;
1493         }
1494         if (vap->va_atime.tv_sec != VNOVAL) {
1495                 ip->ino_leaf.atime =
1496                         hammer_timespec_to_transid(&vap->va_atime);
1497                 modflags |= HAMMER_INODE_ITIMES;
1498         }
1499         if (vap->va_mtime.tv_sec != VNOVAL) {
1500                 ip->ino_data.mtime =
1501                         hammer_timespec_to_transid(&vap->va_mtime);
1502                 modflags |= HAMMER_INODE_ITIMES;
1503                 modflags |= HAMMER_INODE_DDIRTY;        /* XXX mtime */
1504         }
1505         if (vap->va_mode != (mode_t)VNOVAL) {
1506                 if (ip->ino_data.mode != vap->va_mode) {
1507                         ip->ino_data.mode = vap->va_mode;
1508                         modflags |= HAMMER_INODE_DDIRTY;
1509                 }
1510         }
1511 done:
1512         if (error == 0)
1513                 hammer_modify_inode(&trans, ip, modflags);
1514         hammer_done_transaction(&trans);
1515         return (error);
1516 }
1517
1518 /*
1519  * hammer_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1520  */
1521 static
1522 int
1523 hammer_vop_nsymlink(struct vop_nsymlink_args *ap)
1524 {
1525         struct hammer_transaction trans;
1526         struct hammer_inode *dip;
1527         struct hammer_inode *nip;
1528         struct nchandle *nch;
1529         hammer_record_t record;
1530         int error;
1531         int bytes;
1532
1533         ap->a_vap->va_type = VLNK;
1534
1535         nch = ap->a_nch;
1536         dip = VTOI(ap->a_dvp);
1537
1538         if (dip->flags & HAMMER_INODE_RO)
1539                 return (EROFS);
1540
1541         /*
1542          * Create a transaction to cover the operations we perform.
1543          */
1544         hammer_start_transaction(&trans, dip->hmp);
1545
1546         /*
1547          * Create a new filesystem object of the requested type.  The
1548          * returned inode will be referenced but not locked.
1549          */
1550
1551         error = hammer_create_inode(&trans, ap->a_vap, ap->a_cred, dip, &nip);
1552         if (error) {
1553                 hammer_done_transaction(&trans);
1554                 *ap->a_vpp = NULL;
1555                 return (error);
1556         }
1557
1558         /*
1559          * Add a record representing the symlink.  symlink stores the link
1560          * as pure data, not a string, and is no \0 terminated.
1561          */
1562         if (error == 0) {
1563                 bytes = strlen(ap->a_target);
1564
1565                 if (bytes <= HAMMER_INODE_BASESYMLEN) {
1566                         bcopy(ap->a_target, nip->ino_data.ext.symlink, bytes);
1567                 } else {
1568                         record = hammer_alloc_mem_record(nip, bytes);
1569                         record->type = HAMMER_MEM_RECORD_GENERAL;
1570
1571                         record->leaf.base.localization = HAMMER_LOCALIZE_MISC;
1572                         record->leaf.base.key = HAMMER_FIXKEY_SYMLINK;
1573                         record->leaf.base.rec_type = HAMMER_RECTYPE_FIX;
1574                         record->leaf.data_len = bytes;
1575                         KKASSERT(HAMMER_SYMLINK_NAME_OFF == 0);
1576                         bcopy(ap->a_target, record->data->symlink.name, bytes);
1577                         error = hammer_ip_add_record(&trans, record);
1578                 }
1579
1580                 /*
1581                  * Set the file size to the length of the link.
1582                  */
1583                 if (error == 0) {
1584                         nip->ino_data.size = bytes;
1585                         hammer_modify_inode(&trans, nip, HAMMER_INODE_DDIRTY);
1586                 }
1587         }
1588         if (error == 0)
1589                 error = hammer_ip_add_directory(&trans, dip, nch->ncp, nip);
1590
1591         /*
1592          * Finish up.
1593          */
1594         if (error) {
1595                 hammer_rel_inode(nip, 0);
1596                 *ap->a_vpp = NULL;
1597         } else {
1598                 error = hammer_get_vnode(nip, ap->a_vpp);
1599                 hammer_rel_inode(nip, 0);
1600                 if (error == 0) {
1601                         cache_setunresolved(ap->a_nch);
1602                         cache_setvp(ap->a_nch, *ap->a_vpp);
1603                 }
1604         }
1605         hammer_done_transaction(&trans);
1606         return (error);
1607 }
1608
1609 /*
1610  * hammer_vop_nwhiteout { nch, dvp, cred, flags }
1611  */
1612 static
1613 int
1614 hammer_vop_nwhiteout(struct vop_nwhiteout_args *ap)
1615 {
1616         struct hammer_transaction trans;
1617         int error;
1618
1619         hammer_start_transaction(&trans, VTOI(ap->a_dvp)->hmp);
1620         error = hammer_dounlink(&trans, ap->a_nch, ap->a_dvp,
1621                                 ap->a_cred, ap->a_flags);
1622         hammer_done_transaction(&trans);
1623
1624         return (error);
1625 }
1626
1627 /*
1628  * hammer_vop_ioctl { vp, command, data, fflag, cred }
1629  */
1630 static
1631 int
1632 hammer_vop_ioctl(struct vop_ioctl_args *ap)
1633 {
1634         struct hammer_inode *ip = ap->a_vp->v_data;
1635
1636         return(hammer_ioctl(ip, ap->a_command, ap->a_data,
1637                             ap->a_fflag, ap->a_cred));
1638 }
1639
1640 static
1641 int
1642 hammer_vop_mountctl(struct vop_mountctl_args *ap)
1643 {
1644         struct mount *mp;
1645         int error;
1646
1647         mp = ap->a_head.a_ops->head.vv_mount;
1648
1649         switch(ap->a_op) {
1650         case MOUNTCTL_SET_EXPORT:
1651                 if (ap->a_ctllen != sizeof(struct export_args))
1652                         error = EINVAL;
1653                 error = hammer_vfs_export(mp, ap->a_op,
1654                                       (const struct export_args *)ap->a_ctl);
1655                 break;
1656         default:
1657                 error = journal_mountctl(ap);
1658                 break;
1659         }
1660         return(error);
1661 }
1662
1663 /*
1664  * hammer_vop_strategy { vp, bio }
1665  *
1666  * Strategy call, used for regular file read & write only.  Note that the
1667  * bp may represent a cluster.
1668  *
1669  * To simplify operation and allow better optimizations in the future,
1670  * this code does not make any assumptions with regards to buffer alignment
1671  * or size.
1672  */
1673 static
1674 int
1675 hammer_vop_strategy(struct vop_strategy_args *ap)
1676 {
1677         struct buf *bp;
1678         int error;
1679
1680         bp = ap->a_bio->bio_buf;
1681
1682         switch(bp->b_cmd) {
1683         case BUF_CMD_READ:
1684                 error = hammer_vop_strategy_read(ap);
1685                 break;
1686         case BUF_CMD_WRITE:
1687                 error = hammer_vop_strategy_write(ap);
1688                 break;
1689         default:
1690                 bp->b_error = error = EINVAL;
1691                 bp->b_flags |= B_ERROR;
1692                 biodone(ap->a_bio);
1693                 break;
1694         }
1695         return (error);
1696 }
1697
1698 /*
1699  * Read from a regular file.  Iterate the related records and fill in the
1700  * BIO/BUF.  Gaps are zero-filled.
1701  *
1702  * The support code in hammer_object.c should be used to deal with mixed
1703  * in-memory and on-disk records.
1704  *
1705  * XXX atime update
1706  */
1707 static
1708 int
1709 hammer_vop_strategy_read(struct vop_strategy_args *ap)
1710 {
1711         struct hammer_transaction trans;
1712         struct hammer_inode *ip;
1713         struct hammer_cursor cursor;
1714         hammer_base_elm_t base;
1715         struct bio *bio;
1716         struct buf *bp;
1717         int64_t rec_offset;
1718         int64_t ran_end;
1719         int64_t tmp64;
1720         int error;
1721         int boff;
1722         int roff;
1723         int n;
1724
1725         bio = ap->a_bio;
1726         bp = bio->bio_buf;
1727         ip = ap->a_vp->v_data;
1728
1729         hammer_simple_transaction(&trans, ip->hmp);
1730         hammer_init_cursor(&trans, &cursor, &ip->cache[0], ip);
1731
1732         /*
1733          * Key range (begin and end inclusive) to scan.  Note that the key's
1734          * stored in the actual records represent BASE+LEN, not BASE.  The
1735          * first record containing bio_offset will have a key > bio_offset.
1736          */
1737         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
1738         cursor.key_beg.obj_id = ip->obj_id;
1739         cursor.key_beg.create_tid = 0;
1740         cursor.key_beg.delete_tid = 0;
1741         cursor.key_beg.obj_type = 0;
1742         cursor.key_beg.key = bio->bio_offset + 1;
1743         cursor.asof = ip->obj_asof;
1744         cursor.flags |= HAMMER_CURSOR_ASOF | HAMMER_CURSOR_DATAEXTOK;
1745
1746         cursor.key_end = cursor.key_beg;
1747         KKASSERT(ip->ino_data.obj_type == HAMMER_OBJTYPE_REGFILE);
1748 #if 0
1749         if (ip->ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1750                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DB;
1751                 cursor.key_end.rec_type = HAMMER_RECTYPE_DB;
1752                 cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1753         } else
1754 #endif
1755         {
1756                 ran_end = bio->bio_offset + bp->b_bufsize;
1757                 cursor.key_beg.rec_type = HAMMER_RECTYPE_DATA;
1758                 cursor.key_end.rec_type = HAMMER_RECTYPE_DATA;
1759                 tmp64 = ran_end + MAXPHYS + 1;  /* work-around GCC-4 bug */
1760                 if (tmp64 < ran_end)
1761                         cursor.key_end.key = 0x7FFFFFFFFFFFFFFFLL;
1762                 else
1763                         cursor.key_end.key = ran_end + MAXPHYS + 1;
1764         }
1765         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE;
1766
1767         error = hammer_ip_first(&cursor);
1768         boff = 0;
1769
1770         while (error == 0) {
1771                 error = hammer_ip_resolve_data(&cursor);
1772                 if (error)
1773                         break;
1774                 base = &cursor.leaf->base;
1775
1776                 rec_offset = base->key - cursor.leaf->data_len;
1777
1778                 /*
1779                  * Calculate the gap, if any, and zero-fill it.
1780                  */
1781                 n = (int)(rec_offset - (bio->bio_offset + boff));
1782                 if (n > 0) {
1783                         if (n > bp->b_bufsize - boff)
1784                                 n = bp->b_bufsize - boff;
1785                         bzero((char *)bp->b_data + boff, n);
1786                         boff += n;
1787                         n = 0;
1788                 }
1789
1790                 /*
1791                  * Calculate the data offset in the record and the number
1792                  * of bytes we can copy.
1793                  *
1794                  * Note there is a degenerate case here where boff may
1795                  * already be at bp->b_bufsize.
1796                  */
1797                 roff = -n;
1798                 rec_offset += roff;
1799                 n = cursor.leaf->data_len - roff;
1800                 KKASSERT(n > 0);
1801                 if (n > bp->b_bufsize - boff)
1802                         n = bp->b_bufsize - boff;
1803
1804                 /*
1805                  * If we cached a truncation point on our front-end the
1806                  * on-disk version may still have physical records beyond
1807                  * that point.  Truncate visibility.
1808                  */
1809                 if (ip->trunc_off <= rec_offset)
1810                         n = 0;
1811                 else if (ip->trunc_off < rec_offset + n)
1812                         n = (int)(ip->trunc_off - rec_offset);
1813
1814                 /*
1815                  * Copy
1816                  */
1817                 if (n) {
1818                         bcopy((char *)cursor.data + roff,
1819                               (char *)bp->b_data + boff, n);
1820                         boff += n;
1821                 }
1822                 if (boff == bp->b_bufsize)
1823                         break;
1824                 error = hammer_ip_next(&cursor);
1825         }
1826         hammer_done_cursor(&cursor);
1827         hammer_done_transaction(&trans);
1828
1829         /*
1830          * There may have been a gap after the last record
1831          */
1832         if (error == ENOENT)
1833                 error = 0;
1834         if (error == 0 && boff != bp->b_bufsize) {
1835                 KKASSERT(boff < bp->b_bufsize);
1836                 bzero((char *)bp->b_data + boff, bp->b_bufsize - boff);
1837                 /* boff = bp->b_bufsize; */
1838         }
1839         bp->b_resid = 0;
1840         bp->b_error = error;
1841         if (error)
1842                 bp->b_flags |= B_ERROR;
1843         biodone(ap->a_bio);
1844         return(error);
1845 }
1846
1847 /*
1848  * Write to a regular file.   Because this is a strategy call the OS is
1849  * trying to actually sync data to the media.   HAMMER can only flush
1850  * the entire inode (so the TID remains properly synchronized).
1851  *
1852  * Basically all we do here is place the bio on the inode's flush queue
1853  * and activate the flusher.
1854  */
1855 static
1856 int
1857 hammer_vop_strategy_write(struct vop_strategy_args *ap)
1858 {
1859         hammer_inode_t ip;
1860         struct bio *bio;
1861         struct buf *bp;
1862
1863         bio = ap->a_bio;
1864         bp = bio->bio_buf;
1865         ip = ap->a_vp->v_data;
1866
1867         if (ip->flags & HAMMER_INODE_RO) {
1868                 bp->b_error = EROFS;
1869                 bp->b_flags |= B_ERROR;
1870                 biodone(ap->a_bio);
1871                 return(EROFS);
1872         }
1873
1874         /*
1875          * Interlock with inode destruction (no in-kernel or directory
1876          * topology visibility).  If we queue new IO while trying to
1877          * destroy the inode we can deadlock the vtrunc call in
1878          * hammer_inode_unloadable_check().
1879          */
1880         if (ip->flags & (HAMMER_INODE_DELETING|HAMMER_INODE_DELETED)) {
1881                 bp->b_resid = 0;
1882                 biodone(ap->a_bio);
1883                 return(0);
1884         }
1885
1886         /*
1887          * If the inode is being flushed we cannot re-queue buffers
1888          * it may have already flushed, or it could result in duplicate
1889          * records in the database.
1890          */
1891         BUF_KERNPROC(bp);
1892         if (ip->flags & HAMMER_INODE_WRITE_ALT)
1893                 TAILQ_INSERT_TAIL(&ip->bio_alt_list, bio, bio_act);
1894         else
1895                 TAILQ_INSERT_TAIL(&ip->bio_list, bio, bio_act);
1896         ++hammer_bio_count;
1897         hammer_modify_inode(NULL, ip, HAMMER_INODE_BUFS);
1898
1899         hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1900 #if 0
1901         /*
1902          * XXX 
1903          *
1904          * If the write was not part of an integrated flush operation then
1905          * signal a flush.
1906          */
1907         if (ip->flush_state != HAMMER_FST_FLUSH ||
1908             (ip->flags & HAMMER_INODE_WRITE_ALT)) {
1909                 hammer_flush_inode(ip, HAMMER_FLUSH_SIGNAL);
1910         }
1911 #endif
1912         return(0);
1913 }
1914
1915 /*
1916  * Backend code which actually performs the write to the media.  This
1917  * routine is typically called from the flusher.  The bio will be disposed
1918  * of (biodone'd) by this routine.
1919  *
1920  * Iterate the related records and mark for deletion.  If existing edge
1921  * records (left and right side) overlap our write they have to be marked
1922  * deleted and new records created, usually referencing a portion of the
1923  * original data.  Then add a record to represent the buffer.
1924  */
1925 int
1926 hammer_dowrite(hammer_cursor_t cursor, hammer_inode_t ip, struct bio *bio)
1927 {
1928         struct buf *bp = bio->bio_buf;
1929         int error;
1930
1931         KKASSERT(ip->flush_state == HAMMER_FST_FLUSH);
1932
1933         /*
1934          * If the inode is going or gone, just throw away any frontend
1935          * buffers.
1936          */
1937         if (ip->flags & HAMMER_INODE_DELETED) {
1938                 bp->b_resid = 0;
1939                 biodone(bio);
1940                 --hammer_bio_count;
1941                 return(0);
1942         }
1943
1944         /*
1945          * Delete any records overlapping our range.  This function will
1946          * (eventually) properly truncate partial overlaps.
1947          */
1948         if (ip->sync_ino_data.obj_type == HAMMER_OBJTYPE_DBFILE) {
1949                 error = hammer_ip_delete_range(cursor, ip, bio->bio_offset,
1950                                                bio->bio_offset);
1951         } else {
1952                 error = hammer_ip_delete_range(cursor, ip, bio->bio_offset,
1953                                                bio->bio_offset +
1954                                                 bp->b_bufsize - 1);
1955         }
1956
1957         /*
1958          * Add a single record to cover the write.  We can write a record
1959          * with only the actual file data - for example, a small 200 byte
1960          * file does not have to write out a 16K record.
1961          *
1962          * While the data size does not have to be aligned, we still do it
1963          * to reduce fragmentation in a future allocation model.
1964          */
1965         if (error == 0) {
1966                 int limit_size;
1967
1968                 if (ip->sync_ino_data.size - bio->bio_offset > 
1969                     bp->b_bufsize) {
1970                             limit_size = bp->b_bufsize;
1971                 } else {
1972                         limit_size = (int)(ip->sync_ino_data.size -
1973                                            bio->bio_offset);
1974                         KKASSERT(limit_size >= 0);
1975                         limit_size = (limit_size + 63) & ~63;
1976                 }
1977                 if (limit_size) {
1978                         error = hammer_ip_sync_data(cursor, ip, bio->bio_offset,
1979                                                     bp->b_data, limit_size);
1980                 }
1981         }
1982         if (error)
1983                 Debugger("hammer_dowrite: error");
1984
1985         if (error) {
1986                 bp->b_resid = bp->b_bufsize;
1987                 bp->b_error = error;
1988                 bp->b_flags |= B_ERROR;
1989         } else {
1990                 bp->b_resid = 0;
1991         }
1992         biodone(bio);
1993         --hammer_bio_count;
1994         return(error);
1995 }
1996
1997 /*
1998  * dounlink - disconnect a directory entry
1999  *
2000  * XXX whiteout support not really in yet
2001  */
2002 static int
2003 hammer_dounlink(hammer_transaction_t trans, struct nchandle *nch,
2004                 struct vnode *dvp, struct ucred *cred, int flags)
2005 {
2006         struct namecache *ncp;
2007         hammer_inode_t dip;
2008         hammer_inode_t ip;
2009         struct hammer_cursor cursor;
2010         int64_t namekey;
2011         int nlen, error;
2012
2013         /*
2014          * Calculate the namekey and setup the key range for the scan.  This
2015          * works kinda like a chained hash table where the lower 32 bits
2016          * of the namekey synthesize the chain.
2017          *
2018          * The key range is inclusive of both key_beg and key_end.
2019          */
2020         dip = VTOI(dvp);
2021         ncp = nch->ncp;
2022
2023         if (dip->flags & HAMMER_INODE_RO)
2024                 return (EROFS);
2025
2026         namekey = hammer_directory_namekey(ncp->nc_name, ncp->nc_nlen);
2027 retry:
2028         hammer_init_cursor(trans, &cursor, &dip->cache[0], dip);
2029         cursor.key_beg.localization = HAMMER_LOCALIZE_MISC;
2030         cursor.key_beg.obj_id = dip->obj_id;
2031         cursor.key_beg.key = namekey;
2032         cursor.key_beg.create_tid = 0;
2033         cursor.key_beg.delete_tid = 0;
2034         cursor.key_beg.rec_type = HAMMER_RECTYPE_DIRENTRY;
2035         cursor.key_beg.obj_type = 0;
2036
2037         cursor.key_end = cursor.key_beg;
2038         cursor.key_end.key |= 0xFFFFFFFFULL;
2039         cursor.asof = dip->obj_asof;
2040         cursor.flags |= HAMMER_CURSOR_END_INCLUSIVE | HAMMER_CURSOR_ASOF;
2041
2042         /*
2043          * Scan all matching records (the chain), locate the one matching
2044          * the requested path component.  info->last_error contains the
2045          * error code on search termination and could be 0, ENOENT, or
2046          * something else.
2047          *
2048          * The hammer_ip_*() functions merge in-memory records with on-disk
2049          * records for the purposes of the search.
2050          */
2051         error = hammer_ip_first(&cursor);
2052
2053         while (error == 0) {
2054                 error = hammer_ip_resolve_data(&cursor);
2055                 if (error)
2056                         break;
2057                 nlen = cursor.leaf->data_len - HAMMER_ENTRY_NAME_OFF;
2058                 KKASSERT(nlen > 0);
2059                 if (ncp->nc_nlen == nlen &&
2060                     bcmp(ncp->nc_name, cursor.data->entry.name, nlen) == 0) {
2061                         break;
2062                 }
2063                 error = hammer_ip_next(&cursor);
2064         }
2065
2066         /*
2067          * If all is ok we have to get the inode so we can adjust nlinks.
2068          *
2069          * If the target is a directory, it must be empty.
2070          */
2071         if (error == 0) {
2072                 ip = hammer_get_inode(trans, &dip->cache[1],
2073                                       cursor.data->entry.obj_id,
2074                                       dip->hmp->asof, 0, &error);
2075                 if (error == ENOENT) {
2076                         kprintf("obj_id %016llx\n", cursor.data->entry.obj_id);
2077                         Debugger("ENOENT unlinking object that should exist");
2078                 }
2079
2080                 /*
2081                  * If we are trying to remove a directory the directory must
2082                  * be empty.
2083                  *
2084                  * WARNING: hammer_ip_check_directory_empty() may have to
2085                  * terminate the cursor to avoid a deadlock.  It is ok to
2086                  * call hammer_done_cursor() twice.
2087                  */
2088                 if (error == 0 && ip->ino_data.obj_type ==
2089                                   HAMMER_OBJTYPE_DIRECTORY) {
2090                         error = hammer_ip_check_directory_empty(trans, ip);
2091                 }
2092
2093                 /*
2094                  * Delete the directory entry.
2095                  *
2096                  * WARNING: hammer_ip_del_directory() may have to terminate
2097                  * the cursor to avoid a deadlock.  It is ok to call
2098                  * hammer_done_cursor() twice.
2099                  */
2100                 if (error == 0) {
2101                         error = hammer_ip_del_directory(trans, &cursor,
2102                                                         dip, ip);
2103                 }
2104                 if (error == 0) {
2105                         cache_setunresolved(nch);
2106                         cache_setvp(nch, NULL);
2107                         /* XXX locking */
2108                         if (ip->vp)
2109                                 cache_inval_vp(ip->vp, CINV_DESTROY);
2110                 }
2111                 hammer_rel_inode(ip, 0);
2112         }
2113         hammer_done_cursor(&cursor);
2114         if (error == EDEADLK)
2115                 goto retry;
2116
2117         return (error);
2118 }
2119
2120 /************************************************************************
2121  *                          FIFO AND SPECFS OPS                         *
2122  ************************************************************************
2123  *
2124  */
2125
2126 static int
2127 hammer_vop_fifoclose (struct vop_close_args *ap)
2128 {
2129         /* XXX update itimes */
2130         return (VOCALL(&fifo_vnode_vops, &ap->a_head));
2131 }
2132
2133 static int
2134 hammer_vop_fiforead (struct vop_read_args *ap)
2135 {
2136         int error;
2137
2138         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2139         /* XXX update access time */
2140         return (error);
2141 }
2142
2143 static int
2144 hammer_vop_fifowrite (struct vop_write_args *ap)
2145 {
2146         int error;
2147
2148         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2149         /* XXX update access time */
2150         return (error);
2151 }
2152
2153 static int
2154 hammer_vop_specclose (struct vop_close_args *ap)
2155 {
2156         /* XXX update itimes */
2157         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2158 }
2159
2160 static int
2161 hammer_vop_specread (struct vop_read_args *ap)
2162 {
2163         /* XXX update access time */
2164         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2165 }
2166
2167 static int
2168 hammer_vop_specwrite (struct vop_write_args *ap)
2169 {
2170         /* XXX update last change time */
2171         return (VOCALL(&spec_vnode_vops, &ap->a_head));
2172 }
2173