/*
 * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Kernel Filesystem interface
 *
 * NOTE! local ipdata pointers must be reloaded on any modifying operation
 *       to the inode as its underlying chain may have changed.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>
#include <sys/objcache.h>
#include <sys/event.h>
#include <sys/file.h>
#include <vfs/fifofs/fifo.h>

#include "hammer2.h"

static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
                                int seqcount);
static int hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
                                int ioflag, int seqcount);
static void hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize);
static void hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize);

struct objcache *cache_xops;

static __inline
void
hammer2_knote(struct vnode *vp, int flags)
{
        if (flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}
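
/*
 * Usage sketch (illustrative; mirrors hammer2_vop_setattr() below):
 * callers accumulate NOTE_* flags during an operation and post them
 * once at the end.
 *
 *      int kflags = 0;
 *      ...
 *      kflags |= NOTE_ATTRIB;
 *      ...
 *      hammer2_knote(ip->vp, kflags);    (no-op when kflags == 0)
 */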

/*
 * Last reference to a vnode is going away but it is still cached.
 */
static
int
hammer2_vop_inactive(struct vop_inactive_args *ap)
{
        hammer2_inode_t *ip;
        struct vnode *vp;

        vp = ap->a_vp;
        ip = VTOI(vp);

        /*
         * Degenerate case
         */
        if (ip == NULL) {
                vrecycle(vp);
                return (0);
        }

        /*
         * Check for deleted inodes and recycle immediately on the last
         * release.  Be sure to destroy any left-over buffer cache buffers
         * so we do not waste time trying to flush them.
         *
         * Note that deleting the file block chains under the inode chain
         * would just be a waste of energy, so don't do it.
         *
         * WARNING: nvtruncbuf() can only be safely called without the inode
         *          lock held due to the way our write thread works.
         */
        if (ip->flags & HAMMER2_INODE_ISUNLINKED) {
                hammer2_key_t lbase;
                int nblksize;

                /*
                 * Detect updates to the embedded data which may be
                 * synchronized by the strategy code.  Simply mark the
                 * inode modified so it gets picked up by our normal flush.
                 */
                nblksize = hammer2_calc_logical(ip, 0, &lbase, NULL);
                nvtruncbuf(vp, 0, nblksize, 0, 0);
                vrecycle(vp);
        }
        return (0);
}

/*
 * Reclaim a vnode so that it can be reused; after the inode is
 * disassociated, the filesystem must manage it alone.
 */
static
int
hammer2_vop_reclaim(struct vop_reclaim_args *ap)
{
        hammer2_inode_t *ip;
        hammer2_pfs_t *pmp;
        struct vnode *vp;

        vp = ap->a_vp;
        ip = VTOI(vp);
        if (ip == NULL) {
                return(0);
        }
        pmp = ip->pmp;

        /*
         * The final close of a deleted file or directory marks it for
         * destruction.  The DELETED flag allows the flusher to shortcut
         * any modified blocks still unflushed (that is, just ignore them).
         *
         * HAMMER2 usually does not try to optimize the freemap by returning
         * deleted blocks to it as it does not usually know how many snapshots
         * might be referencing portions of the file/dir.
         */
        vp->v_data = NULL;
        ip->vp = NULL;

        /*
         * NOTE! We do not attempt to flush chains here, flushing is
         *       really fragile and could also deadlock.
         */
        vclrisdirty(vp);

        /*
         * Modified inodes will already be on SIDEQ or SYNCQ.  However,
         * unlinked-but-open inodes may already have been synced and might
         * still require deletion-on-reclaim.
         */
        if ((ip->flags & (HAMMER2_INODE_ISUNLINKED |
                          HAMMER2_INODE_DELETING)) ==
            HAMMER2_INODE_ISUNLINKED) {
                hammer2_inode_lock(ip, 0);
                if ((ip->flags & (HAMMER2_INODE_ISUNLINKED |
                                  HAMMER2_INODE_DELETING)) ==
                    HAMMER2_INODE_ISUNLINKED) {
                        atomic_set_int(&ip->flags, HAMMER2_INODE_DELETING);
                        hammer2_inode_delayed_sideq(ip);
                }
                hammer2_inode_unlock(ip);
        }

        /*
         * Modified inodes will already be on SIDEQ or SYNCQ, no further
         * action is needed.
         *
         * We cannot safely synchronize the inode from inside the reclaim
         * due to potentially deep locks held as-of when the reclaim occurs.
         * Interactions and potential deadlocks abound.  We also can't do it
         * here without desynchronizing from the related directory entry(ies).
         */
        hammer2_inode_drop(ip);                 /* vp ref */

        /*
         * XXX handle background sync when ip dirty, kernel will no longer
         * notify us regarding this inode because there is no longer a
         * vnode attached to it.
         */

        return (0);
}

/*
 * Currently this function synchronizes the front-end inode state to the
 * backend chain topology, then flushes the inode's chain and sub-topology
 * to backend media.  This function does not flush the root topology down to
 * the inode.
 */
static
int
hammer2_vop_fsync(struct vop_fsync_args *ap)
{
        hammer2_inode_t *ip;
        struct vnode *vp;
        int error1;
        int error2;

        vp = ap->a_vp;
        ip = VTOI(vp);
        error1 = 0;

        hammer2_trans_init(ip->pmp, 0);

        /*
         * Flush dirty buffers in the file's logical buffer cache.
         * It is best to wait for the strategy code to commit the
         * buffers to the device's backing buffer cache before
         * then trying to flush the inode.
         *
         * This should be quick, but certain inode modifications cached
         * entirely in the hammer2_inode structure may not trigger a
         * buffer read until the flush so the fsync can wind up also
         * doing scattered reads.
         */
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
        bio_track_wait(&vp->v_track_write, 0, 0);

        /*
         * Flush any inode changes
         */
        hammer2_inode_lock(ip, 0);
        if (ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MODIFIED))
                error1 = hammer2_inode_chain_sync(ip);

        /*
         * Flush dirty chains related to the inode.
         *
         * NOTE! We are not in a flush transaction.  The inode remains on
         *       the sideq so the filesystem syncer can synchronize it to
         *       the volume root.
         */
        error2 = hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP);
        if (error2)
                error1 = error2;

        /*
         * We may be able to clear the vnode dirty flag.  The
         * hammer2_pfs_moderate() code depends on this usually working.
         */
        if ((ip->flags & (HAMMER2_INODE_MODIFIED |
                          HAMMER2_INODE_RESIZED |
                          HAMMER2_INODE_DIRTYDATA)) == 0 &&
            RB_EMPTY(&vp->v_rbdirty_tree) &&
            !bio_track_active(&vp->v_track_write)) {
                vclrisdirty(vp);
        }
        hammer2_inode_unlock(ip);
        hammer2_trans_done(ip->pmp, 0);

        return (error1);
}
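
/*
 * Summary sketch of the fsync ordering implemented above (no new API,
 * just the sequence restated):
 *
 *      hammer2_trans_init(ip->pmp, 0);
 *      vfsync(vp, waitfor, 1, NULL, NULL);          (1) logical buffers
 *      bio_track_wait(&vp->v_track_write, 0, 0);    (2) strategy I/O drains
 *      hammer2_inode_chain_sync(ip);                (3) inode meta -> chain
 *      hammer2_inode_chain_flush(ip, ...);          (4) chain -> media
 *      hammer2_trans_done(ip->pmp, 0);
 *
 * The volume root itself is left to the filesystem syncer via the sideq.
 */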

/*
 * No lock needed, just handle ip->update
 */
static
int
hammer2_vop_access(struct vop_access_args *ap)
{
        hammer2_inode_t *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        mode_t mode;
        uint32_t uflags;
        int error;
        int update;

retry:
        update = spin_access_start(&ip->cluster_spin);

        /*hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);*/
        uid = hammer2_to_unix_xid(&ip->meta.uid);
        gid = hammer2_to_unix_xid(&ip->meta.gid);
        mode = ip->meta.mode;
        uflags = ip->meta.uflags;
        /*hammer2_inode_unlock(ip);*/

        if (__predict_false(spin_access_end(&ip->cluster_spin, update)))
                goto retry;

        error = vop_helper_access(ap, uid, gid, mode, uflags);

        return (error);
}
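
/*
 * The spin_access_start()/spin_access_end() pair above is a
 * sequence-lock style read: the reader snapshots the fields without
 * blocking and retries if a writer (bracketing its update with
 * spin_lock_update()/spin_unlock_update() on ip->cluster_spin, as
 * hammer2_vop_setattr() does) raced the snapshot.  Reader-side shape:
 *
 *      int seq;
 *      do {
 *              seq = spin_access_start(&ip->cluster_spin);
 *              mode = ip->meta.mode;    (copy fields out)
 *      } while (__predict_false(spin_access_end(&ip->cluster_spin, seq)));
 */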

static
int
hammer2_vop_getattr(struct vop_getattr_args *ap)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *ip;
        struct vnode *vp;
        struct vattr *vap;
        hammer2_chain_t *chain;
        int update;
        int i;

        vp = ap->a_vp;
        vap = ap->a_vap;

        ip = VTOI(vp);
        pmp = ip->pmp;

retry:
        update = spin_access_start(&ip->cluster_spin);

        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
        vap->va_fileid = ip->meta.inum;
        vap->va_mode = ip->meta.mode;
        vap->va_nlink = ip->meta.nlinks;
        vap->va_uid = hammer2_to_unix_xid(&ip->meta.uid);
        vap->va_gid = hammer2_to_unix_xid(&ip->meta.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->meta.size;   /* protected by shared lock */
        vap->va_blocksize = HAMMER2_PBUFSIZE;
        vap->va_flags = ip->meta.uflags;
        hammer2_time_to_timespec(ip->meta.ctime, &vap->va_ctime);
        hammer2_time_to_timespec(ip->meta.mtime, &vap->va_mtime);
        hammer2_time_to_timespec(ip->meta.mtime, &vap->va_atime);
        vap->va_gen = 1;
        vap->va_bytes = 0;
        if (ip->meta.type == HAMMER2_OBJTYPE_DIRECTORY) {
                /*
                 * Can't really calculate directory use sans the files under
                 * it, just assume one block for now.
                 */
                vap->va_bytes += HAMMER2_INODE_BYTES;
        } else {
                for (i = 0; i < ip->cluster.nchains; ++i) {
                        if ((chain = ip->cluster.array[i].chain) != NULL) {
                                if (vap->va_bytes <
                                    chain->bref.embed.stats.data_count) {
                                        vap->va_bytes =
                                            chain->bref.embed.stats.data_count;
                                }
                        }
                }
        }
        vap->va_type = hammer2_get_vtype(ip->meta.type);
        vap->va_filerev = 0;
        vap->va_uid_uuid = ip->meta.uid;
        vap->va_gid_uuid = ip->meta.gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        if (__predict_false(spin_access_end(&ip->cluster_spin, update)))
                goto retry;

        return (0);
}

static
int
hammer2_vop_getattr_lite(struct vop_getattr_lite_args *ap)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *ip;
        struct vnode *vp;
        struct vattr_lite *lvap;
        int update;

        vp = ap->a_vp;
        lvap = ap->a_lvap;

        ip = VTOI(vp);
        pmp = ip->pmp;

retry:
        update = spin_access_start(&ip->cluster_spin);

#if 0
        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
        vap->va_fileid = ip->meta.inum;
#endif
        lvap->va_mode = ip->meta.mode;
        lvap->va_nlink = ip->meta.nlinks;
        lvap->va_uid = hammer2_to_unix_xid(&ip->meta.uid);
        lvap->va_gid = hammer2_to_unix_xid(&ip->meta.gid);
#if 0
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
#endif
        lvap->va_size = ip->meta.size;
#if 0
        vap->va_blocksize = HAMMER2_PBUFSIZE;
#endif
        lvap->va_flags = ip->meta.uflags;
        lvap->va_type = hammer2_get_vtype(ip->meta.type);
#if 0
        vap->va_filerev = 0;
        vap->va_uid_uuid = ip->meta.uid;
        vap->va_gid_uuid = ip->meta.gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;
#endif

        if (__predict_false(spin_access_end(&ip->cluster_spin, update)))
                goto retry;

        return (0);
}

static
int
hammer2_vop_setattr(struct vop_setattr_args *ap)
{
        hammer2_inode_t *ip;
        struct vnode *vp;
        struct vattr *vap;
        int error;
        int kflags = 0;
        uint64_t ctime;

        vp = ap->a_vp;
        vap = ap->a_vap;
        hammer2_update_time(&ctime);

        ip = VTOI(vp);

        if (ip->pmp->ronly)
                return (EROFS);

        /*
         * Normally disallow setattr if there is no space, unless we
         * are in emergency mode (might be needed to chflags -R noschg
         * files prior to removal).
         */
        if ((ip->pmp->flags & HAMMER2_PMPF_EMERG) == 0 &&
            hammer2_vfs_enospace(ip, 0, ap->a_cred) > 1) {
                return (ENOSPC);
        }

        hammer2_trans_init(ip->pmp, 0);
        hammer2_inode_lock(ip, 0);
        error = 0;

        if (vap->va_flags != VNOVAL) {
                uint32_t flags;

                flags = ip->meta.uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                     hammer2_to_unix_xid(&ip->meta.uid),
                                     ap->a_cred);
                if (error == 0) {
                        if (ip->meta.uflags != flags) {
                                hammer2_inode_modify(ip);
                                spin_lock_update(&ip->cluster_spin);
                                ip->meta.uflags = flags;
                                ip->meta.ctime = ctime;
                                spin_unlock_update(&ip->cluster_spin);
                                kflags |= NOTE_ATTRIB;
                        }
                        if (ip->meta.uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }
        if (ip->meta.uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }
        if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
                mode_t cur_mode = ip->meta.mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ip->meta.uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ip->meta.gid);
                uuid_t uuid_uid;
                uuid_t uuid_gid;

                error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
                                         ap->a_cred,
                                         &cur_uid, &cur_gid, &cur_mode);
                if (error == 0) {
                        hammer2_guid_to_uuid(&uuid_uid, cur_uid);
                        hammer2_guid_to_uuid(&uuid_gid, cur_gid);
                        if (bcmp(&uuid_uid, &ip->meta.uid, sizeof(uuid_uid)) ||
                            bcmp(&uuid_gid, &ip->meta.gid, sizeof(uuid_gid)) ||
                            ip->meta.mode != cur_mode
                        ) {
                                hammer2_inode_modify(ip);
                                spin_lock_update(&ip->cluster_spin);
                                ip->meta.uid = uuid_uid;
                                ip->meta.gid = uuid_gid;
                                ip->meta.mode = cur_mode;
                                ip->meta.ctime = ctime;
                                spin_unlock_update(&ip->cluster_spin);
                        }
                        kflags |= NOTE_ATTRIB;
                }
        }

        /*
         * Resize the file
         */
        if (vap->va_size != VNOVAL && ip->meta.size != vap->va_size) {
                switch(vp->v_type) {
                case VREG:
                        if (vap->va_size == ip->meta.size)
                                break;
                        if (vap->va_size < ip->meta.size) {
                                hammer2_mtx_ex(&ip->truncate_lock);
                                hammer2_truncate_file(ip, vap->va_size);
                                hammer2_mtx_unlock(&ip->truncate_lock);
                                kflags |= NOTE_WRITE;
                        } else {
                                hammer2_extend_file(ip, vap->va_size);
                                kflags |= NOTE_WRITE | NOTE_EXTEND;
                        }
                        hammer2_inode_modify(ip);
                        ip->meta.mtime = ctime;
                        vclrflags(vp, VLASTWRITETS);
                        break;
                default:
                        error = EINVAL;
                        goto done;
                }
        }
#if 0
        /* atime not supported */
        if (vap->va_atime.tv_sec != VNOVAL) {
                hammer2_inode_modify(ip);
                ip->meta.atime = hammer2_timespec_to_time(&vap->va_atime);
                kflags |= NOTE_ATTRIB;
        }
#endif
        if (vap->va_mode != (mode_t)VNOVAL) {
                mode_t cur_mode = ip->meta.mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ip->meta.uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ip->meta.gid);

                error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ip->meta.mode != cur_mode) {
                        hammer2_inode_modify(ip);
                        spin_lock_update(&ip->cluster_spin);
                        ip->meta.mode = cur_mode;
                        ip->meta.ctime = ctime;
                        spin_unlock_update(&ip->cluster_spin);
                        kflags |= NOTE_ATTRIB;
                }
        }

        if (vap->va_mtime.tv_sec != VNOVAL) {
                hammer2_inode_modify(ip);
                ip->meta.mtime = hammer2_timespec_to_time(&vap->va_mtime);
                kflags |= NOTE_ATTRIB;
                vclrflags(vp, VLASTWRITETS);
        }

done:
        /*
         * If a truncation occurred we must call chain_sync() now in order
         * to trim the related data chains, otherwise a later expansion can
         * cause havoc.
         *
         * If an extend occurred that changed the DIRECTDATA state, we must
         * call inode_chain_sync now in order to prepare the inode's indirect
         * block table.
         *
         * WARNING! This means we are making an adjustment to the inode's
         * chain outside of sync/fsync, and not just to inode->meta, which
         * may result in some consistency issues if a crash were to occur
         * at just the wrong time.
         */
        if (ip->flags & HAMMER2_INODE_RESIZED)
                hammer2_inode_chain_sync(ip);

        /*
         * Cleanup.
         */
        hammer2_inode_unlock(ip);
        hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);
        hammer2_knote(ip->vp, kflags);

        return (error);
}
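
/*
 * Writer-side counterpart to the lockless snapshots taken by
 * hammer2_vop_access()/getattr() above: each metadata update in
 * setattr is bracketed so readers can detect the race and retry.
 * The recurring pattern, for reference:
 *
 *      hammer2_inode_modify(ip);
 *      spin_lock_update(&ip->cluster_spin);
 *      ip->meta.mode = cur_mode;    (or uid/gid/uflags...)
 *      ip->meta.ctime = ctime;
 *      spin_unlock_update(&ip->cluster_spin);
 *      kflags |= NOTE_ATTRIB;
 */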

static
int
hammer2_vop_readdir(struct vop_readdir_args *ap)
{
        hammer2_xop_readdir_t *xop;
        hammer2_blockref_t bref;
        hammer2_inode_t *ip;
        hammer2_tid_t inum;
        hammer2_key_t lkey;
        struct uio *uio;
        off_t *cookies;
        off_t saveoff;
        int cookie_index;
        int ncookies;
        int error;
        int eofflag;
        int r;

        ip = VTOI(ap->a_vp);
        uio = ap->a_uio;
        saveoff = uio->uio_offset;
        eofflag = 0;
        error = 0;

        /*
         * Set up directory entry cookies if requested
         */
        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
        } else {
                ncookies = -1;
                cookies = NULL;
        }
        cookie_index = 0;

        hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);

        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
         * quantities are returned to userland we always strip off bit 63.
         * The hash code is designed such that codes 0x0000-0x7FFF are not
         * used, allowing us to use these codes for artificial entries.
         *
         * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
         * allow '..' to cross the mount point into (e.g.) the super-root.
         */
        if (saveoff == 0) {
                inum = ip->meta.inum & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        if (saveoff == 1) {
                inum = ip->meta.inum & HAMMER2_DIRHASH_USERMSK;
                if (ip != ip->pmp->iroot)
                        inum = ip->meta.iparent & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;
        if (hammer2_debug & 0x0020)
                kprintf("readdir: lkey %016jx\n", lkey);
        if (error)
                goto done;

        /*
         * Use XOP for cluster scan.
         *
         * parent is the inode cluster, already locked for us.  Don't
         * double lock shared locks as this will screw up upgrades.
         */
        xop = hammer2_xop_alloc(ip, 0);
        xop->lkey = lkey;
        hammer2_xop_start(&xop->head, &hammer2_readdir_desc);

        for (;;) {
                const hammer2_inode_data_t *ripdata;
                const char *dname;
                int dtype;

                error = hammer2_xop_collect(&xop->head, 0);
                error = hammer2_error_to_errno(error);
                if (error) {
                        break;
                }
                if (cookie_index == ncookies)
                        break;
                if (hammer2_debug & 0x0020)
                        kprintf("cluster chain %p %p\n",
                                xop->head.cluster.focus,
                                (xop->head.cluster.focus ?
                                 xop->head.cluster.focus->data : (void *)-1));
                hammer2_cluster_bref(&xop->head.cluster, &bref);

                if (bref.type == HAMMER2_BREF_TYPE_INODE) {
                        ripdata = &hammer2_xop_gdata(&xop->head)->ipdata;
                        dtype = hammer2_get_dtype(ripdata->meta.type);
                        saveoff = bref.key & HAMMER2_DIRHASH_USERMSK;
                        r = vop_write_dirent(&error, uio,
                                             ripdata->meta.inum &
                                              HAMMER2_DIRHASH_USERMSK,
                                             dtype,
                                             ripdata->meta.name_len,
                                             ripdata->filename);
                        hammer2_xop_pdata(&xop->head);
                        if (r)
                                break;
                        if (cookies)
                                cookies[cookie_index] = saveoff;
                        ++cookie_index;
                } else if (bref.type == HAMMER2_BREF_TYPE_DIRENT) {
                        uint16_t namlen;

                        dtype = hammer2_get_dtype(bref.embed.dirent.type);
                        saveoff = bref.key & HAMMER2_DIRHASH_USERMSK;
                        namlen = bref.embed.dirent.namlen;
                        if (namlen <= sizeof(bref.check.buf)) {
                                dname = bref.check.buf;
                        } else {
                                dname = hammer2_xop_gdata(&xop->head)->buf;
                        }
                        r = vop_write_dirent(&error, uio,
                                             bref.embed.dirent.inum, dtype,
                                             namlen, dname);
                        if (namlen > sizeof(bref.check.buf))
                                hammer2_xop_pdata(&xop->head);
                        if (r)
                                break;
                        if (cookies)
                                cookies[cookie_index] = saveoff;
                        ++cookie_index;
                } else {
                        /* XXX chain error */
                        kprintf("bad chain type readdir %d\n", bref.type);
                }
        }
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        if (error == ENOENT) {
                error = 0;
                eofflag = 1;
                saveoff = (hammer2_key_t)-1;
        } else {
                saveoff = bref.key & HAMMER2_DIRHASH_USERMSK;
        }
done:
        hammer2_inode_unlock(ip);
        if (ap->a_eofflag)
                *ap->a_eofflag = eofflag;
        if (hammer2_debug & 0x0020)
                kprintf("readdir: done at %016jx\n", saveoff);
        uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
        if (error && cookie_index == 0) {
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return (error);
}
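
/*
 * Readdir offset encoding, restating the logic above: offsets 0 and 1
 * are the artificial '.' and '..' entries; real entries resume at the
 * directory-hash key (bref.key & HAMMER2_DIRHASH_USERMSK).  Bit 63
 * (HAMMER2_DIRHASH_VISIBLE) is stripped before the offset is returned
 * in uio->uio_offset so userland only ever sees positive 64 bit
 * quantities.
 */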

/*
 * hammer2_vop_readlink { vp, uio, cred }
 */
static
int
hammer2_vop_readlink(struct vop_readlink_args *ap)
{
        struct vnode *vp;
        hammer2_inode_t *ip;
        int error;

        vp = ap->a_vp;
        if (vp->v_type != VLNK)
                return (EINVAL);
        ip = VTOI(vp);

        error = hammer2_read_file(ip, ap->a_uio, 0);
        return (error);
}

static
int
hammer2_vop_read(struct vop_read_args *ap)
{
        struct vnode *vp;
        hammer2_inode_t *ip;
        struct uio *uio;
        int error;
        int seqcount;
        int bigread;

        /*
         * Read operations supported on this vnode?
         */
        vp = ap->a_vp;
        if (vp->v_type == VDIR)
                return (EISDIR);
        if (vp->v_type != VREG)
                return (EINVAL);

        /*
         * Misc
         */
        ip = VTOI(vp);
        uio = ap->a_uio;
        error = 0;

        seqcount = ap->a_ioflag >> IO_SEQSHIFT;
        bigread = (uio->uio_resid > 100 * 1024 * 1024);

        error = hammer2_read_file(ip, uio, seqcount);
        return (error);
}

static
int
hammer2_vop_write(struct vop_write_args *ap)
{
        hammer2_inode_t *ip;
        thread_t td;
        struct vnode *vp;
        struct uio *uio;
        int error;
        int seqcount;
        int ioflag;

        /*
         * Write operations supported on this vnode?
         */
        vp = ap->a_vp;
        if (vp->v_type != VREG)
                return (EINVAL);

        /*
         * Misc
         */
        ip = VTOI(vp);
        ioflag = ap->a_ioflag;
        uio = ap->a_uio;
        error = 0;
        if (ip->pmp->ronly || (ip->pmp->flags & HAMMER2_PMPF_EMERG))
                return (EROFS);
        switch (hammer2_vfs_enospace(ip, uio->uio_resid, ap->a_cred)) {
        case 2:
                return (ENOSPC);
        case 1:
                ioflag |= IO_DIRECT;    /* semi-synchronous */
                /* fall through */
        default:
                break;
        }

        seqcount = ioflag >> IO_SEQSHIFT;

        /*
         * Check resource limit
         */
        if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
            uio->uio_offset + uio->uio_resid >
             td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
                lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
                return (EFBIG);
        }

        /*
         * The transaction interlocks against flush initiations
         * (note: but will run concurrently with the actual flush).
         *
         * To avoid deadlocking against the VM system, we must flag any
         * transaction related to the buffer cache or other direct
         * VM page manipulation.
         */
        if (uio->uio_segflg == UIO_NOCOPY) {
                hammer2_trans_init(ip->pmp, HAMMER2_TRANS_BUFCACHE);
        } else {
                hammer2_trans_init(ip->pmp, 0);
        }
        error = hammer2_write_file(ip, uio, ioflag, seqcount);
        if (uio->uio_segflg == UIO_NOCOPY)
                hammer2_trans_done(ip->pmp, HAMMER2_TRANS_BUFCACHE |
                                            HAMMER2_TRANS_SIDEQ);
        else
                hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);

        return (error);
}
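
/*
 * Transaction bracketing used above, restated: UIO_NOCOPY writes come
 * from the buffer cache / pageout path and must carry
 * HAMMER2_TRANS_BUFCACHE so they do not deadlock against a flush:
 *
 *      hammer2_trans_init(ip->pmp, nocopy ? HAMMER2_TRANS_BUFCACHE : 0);
 *      error = hammer2_write_file(ip, uio, ioflag, seqcount);
 *      hammer2_trans_done(ip->pmp, (nocopy ? HAMMER2_TRANS_BUFCACHE : 0) |
 *                                  HAMMER2_TRANS_SIDEQ);
 *
 * ('nocopy' is shorthand for uio->uio_segflg == UIO_NOCOPY.)
 */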

/*
 * Perform read operations on a file or symlink given an UNLOCKED
 * inode and uio.
 */
static
int
hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
{
        hammer2_off_t size;
        struct buf *bp;
        int error;

        error = 0;

        /*
         * UIO read loop.
         *
         * WARNING! Assumes that the kernel interlocks size changes at the
         *          vnode level.
         */
        hammer2_mtx_sh(&ip->lock);
        hammer2_mtx_sh(&ip->truncate_lock);
        size = ip->meta.size;
        hammer2_mtx_unlock(&ip->lock);

        while (uio->uio_resid > 0 && uio->uio_offset < size) {
                hammer2_key_t lbase;
                hammer2_key_t leof;
                int lblksize;
                int loff;
                int n;

                lblksize = hammer2_calc_logical(ip, uio->uio_offset,
                                                &lbase, &leof);

#if 1
                bp = NULL;
                error = cluster_readx(ip->vp, leof, lbase, lblksize,
                                      B_NOTMETA | B_KVABIO,
                                      uio->uio_resid,
                                      seqcount * MAXBSIZE,
                                      &bp);
#else
                if (uio->uio_segflg == UIO_NOCOPY) {
                        bp = getblk(ip->vp, lbase, lblksize,
                                    GETBLK_BHEAVY | GETBLK_KVABIO, 0);
                        if (bp->b_flags & B_CACHE) {
                                int i;
                                int j = 0;
                                if (bp->b_xio.xio_npages != 16)
                                        kprintf("NPAGES BAD\n");
                                for (i = 0; i < bp->b_xio.xio_npages; ++i) {
                                        vm_page_t m;
                                        m = bp->b_xio.xio_pages[i];
                                        if (m == NULL || m->valid == 0) {
                                                kprintf("bp %016jx %016jx pg %d inv",
                                                        lbase, leof, i);
                                                if (m)
                                                        kprintf("m->object %p/%p", m->object, ip->vp->v_object);
                                                kprintf("\n");
                                                j = 1;
                                        }
                                }
                                if (j)
                                        kprintf("b_flags %08x, b_error %d\n", bp->b_flags, bp->b_error);
                        }
                        bqrelse(bp);
                }
                error = bread_kvabio(ip->vp, lbase, lblksize, &bp);
#endif
                if (error) {
                        brelse(bp);
                        break;
                }
                bkvasync(bp);
                loff = (int)(uio->uio_offset - lbase);
                n = lblksize - loff;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > size - uio->uio_offset)
                        n = (int)(size - uio->uio_offset);
                bp->b_flags |= B_AGE;
                uiomovebp(bp, (char *)bp->b_data + loff, n, uio);
                bqrelse(bp);
        }
        hammer2_mtx_unlock(&ip->truncate_lock);

        return (error);
}
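
/*
 * Note on the B_KVABIO / bkvasync() pairing in the loop above: a
 * KVABIO buffer's kernel virtual mapping is not guaranteed to be
 * synchronized to the calling CPU, so bkvasync(bp) must be issued
 * before bp->b_data is dereferenced (here, by uiomovebp()).
 */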

/*
 * Write to the file represented by the inode via the logical buffer cache.
 * The inode may represent a regular file or a symlink.
 *
 * The inode must not be locked.
 */
static
int
hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
                   int ioflag, int seqcount)
{
        hammer2_key_t old_eof;
        hammer2_key_t new_eof;
        struct buf *bp;
        int kflags;
        int error;
        int modified;

        /*
         * Setup if append
         *
         * WARNING! Assumes that the kernel interlocks size changes at the
         *          vnode level.
         */
        hammer2_mtx_ex(&ip->lock);
        hammer2_mtx_sh(&ip->truncate_lock);
        if (ioflag & IO_APPEND)
                uio->uio_offset = ip->meta.size;
        old_eof = ip->meta.size;

        /*
         * Extend the file if necessary.  If the write fails at some point
         * we will truncate it back down to cover as much as we were able
         * to write.
         *
         * Doing this now makes it easier to calculate buffer sizes in
         * the loop.
         */
        kflags = 0;
        error = 0;
        modified = 0;

        if (uio->uio_offset + uio->uio_resid > old_eof) {
                new_eof = uio->uio_offset + uio->uio_resid;
                modified = 1;
                hammer2_extend_file(ip, new_eof);
                kflags |= NOTE_EXTEND;
        } else {
                new_eof = old_eof;
        }
        hammer2_mtx_unlock(&ip->lock);

        /*
         * UIO write loop
         */
        while (uio->uio_resid > 0) {
                hammer2_key_t lbase;
                int trivial;
                int endofblk;
                int lblksize;
                int loff;
                int n;

                /*
                 * Don't allow the buffer build to blow out the buffer
                 * cache.
                 */
                if ((ioflag & IO_RECURSE) == 0)
                        bwillwrite(HAMMER2_PBUFSIZE);

                /*
                 * This nominally tells us how much we can cluster and
                 * what the logical buffer size needs to be.  Currently
                 * we don't try to cluster the write and just handle one
                 * block at a time.
                 */
                lblksize = hammer2_calc_logical(ip, uio->uio_offset,
                                                &lbase, NULL);
                loff = (int)(uio->uio_offset - lbase);

                KKASSERT(lblksize <= MAXBSIZE);

                /*
                 * Calculate bytes to copy this transfer and whether the
                 * copy completely covers the buffer or not.
                 */
                trivial = 0;
                n = lblksize - loff;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
                        if (loff == lbase && uio->uio_offset + n == new_eof)
                                trivial = 1;
                        endofblk = 0;
                } else {
                        if (loff == 0)
                                trivial = 1;
                        endofblk = 1;
                }
                if (lbase >= new_eof)
                        trivial = 1;

                /*
                 * Get the buffer
                 */
                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ip->vp, lbase, lblksize,
                                    GETBLK_BHEAVY | GETBLK_KVABIO, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread_kvabio(ip->vp, lbase,
                                                     lblksize, &bp);
                        }
                } else if (trivial) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ip->vp, lbase, lblksize,
                                    GETBLK_BHEAVY | GETBLK_KVABIO, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         *
                         * (The strategy code will detect zero-fill physical
                         * blocks for this case).
                         */
                        error = bread_kvabio(ip->vp, lbase, lblksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }

                if (error) {
                        brelse(bp);
                        break;
                }

                /*
                 * Ok, copy the data in
                 */
                bkvasync(bp);
                error = uiomovebp(bp, bp->b_data + loff, n, uio);
                kflags |= NOTE_WRITE;
                modified = 1;
                if (error) {
                        brelse(bp);
                        break;
                }

                /*
                 * WARNING: Pageout daemon will issue UIO_NOCOPY writes
                 *          with IO_SYNC or IO_ASYNC set.  These writes
                 *          must be handled as the pageout daemon expects.
                 *
                 * NOTE!    H2 relies on cluster_write() here because it
                 *          cannot preallocate disk blocks at the logical
                 *          level due to not knowing what the compression
                 *          size will be at this time.
                 *
                 *          We must use cluster_write() here and we depend
                 *          on the write-behind feature to flush buffers
                 *          appropriately.  If we let the buffer daemons do
                 *          it the block allocations will be all over the
                 *          map.
                 */
                if (ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else if (ioflag & IO_ASYNC) {
                        bawrite(bp);
                } else if (ip->vp->v_mount->mnt_flag & MNT_NOCLUSTERW) {
                        bdwrite(bp);
                } else {
#if 1
                        bp->b_flags |= B_CLUSTEROK;
                        cluster_write(bp, new_eof, lblksize, seqcount);
#else
                        bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
#endif
                }
        }

        /*
         * Cleanup.  If we extended the file EOF but failed to write it
         * through, the entire write is a failure and we have to back up.
         */
        if (error && new_eof != old_eof) {
                hammer2_mtx_unlock(&ip->truncate_lock);
                hammer2_mtx_ex(&ip->lock);              /* note lock order */
                hammer2_mtx_ex(&ip->truncate_lock);     /* note lock order */
                hammer2_truncate_file(ip, old_eof);
                if (ip->flags & HAMMER2_INODE_MODIFIED)
                        hammer2_inode_chain_sync(ip);
                hammer2_mtx_unlock(&ip->lock);
        } else if (modified) {
                struct vnode *vp = ip->vp;

                hammer2_mtx_ex(&ip->lock);
                hammer2_inode_modify(ip);
                if (uio->uio_segflg == UIO_NOCOPY) {
                        if (vp->v_flag & VLASTWRITETS) {
                                ip->meta.mtime =
                                    (unsigned long)vp->v_lastwrite_ts.tv_sec *
                                    1000000 +
                                    vp->v_lastwrite_ts.tv_nsec / 1000;
                        }
                } else {
                        hammer2_update_time(&ip->meta.mtime);
                        vclrflags(vp, VLASTWRITETS);
                }

#if 0
                /*
                 * REMOVED - handled by hammer2_extend_file().  Do not issue
                 * a chain_sync() outside of a sync/fsync except for DIRECTDATA
                 * state changes.
                 *
                 * Under normal conditions we only issue a chain_sync if
                 * the inode's DIRECTDATA state changed.
                 */
                if (ip->flags & HAMMER2_INODE_RESIZED)
                        hammer2_inode_chain_sync(ip);
#endif
                hammer2_mtx_unlock(&ip->lock);
                hammer2_knote(ip->vp, kflags);
        }
        hammer2_trans_assert_strategy(ip->pmp);
        hammer2_mtx_unlock(&ip->truncate_lock);

        return error;
}

/*
 * Truncate the size of a file.  The inode must be locked.
 *
 * We must unconditionally set HAMMER2_INODE_RESIZED to properly
 * ensure that any on-media data beyond the new file EOF has been destroyed.
 *
 * WARNING: nvtruncbuf() can only be safely called without the inode lock
 *          held due to the way our write thread works.  If the truncation
 *          occurs in the middle of a buffer, nvtruncbuf() is responsible
 *          for dirtying that buffer and zeroing out trailing bytes.
 *
 * WARNING! Assumes that the kernel interlocks size changes at the
 *          vnode level.
 *
 * WARNING! Caller assumes responsibility for removing dead blocks
 *          if INODE_RESIZED is set.
 */
static
void
hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
{
        hammer2_key_t lbase;
        int nblksize;

        hammer2_mtx_unlock(&ip->lock);
        if (ip->vp) {
                nblksize = hammer2_calc_logical(ip, nsize, &lbase, NULL);
                nvtruncbuf(ip->vp, nsize,
                           nblksize, (int)nsize & (nblksize - 1),
                           0);
        }
        hammer2_mtx_ex(&ip->lock);
        KKASSERT((ip->flags & HAMMER2_INODE_RESIZED) == 0);
        ip->osize = ip->meta.size;
        ip->meta.size = nsize;
        atomic_set_int(&ip->flags, HAMMER2_INODE_RESIZED);
        hammer2_inode_modify(ip);
}
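
/*
 * Cross-reference: whoever sets HAMMER2_INODE_RESIZED via this routine
 * is responsible for the follow-up hammer2_inode_chain_sync() that
 * destroys the on-media data beyond the new EOF; see the RESIZED check
 * in hammer2_vop_setattr() and the back-out path in
 * hammer2_write_file().
 */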

/*
 * Extend the size of a file.  The inode must be locked.
 *
 * Even though the file size is changing, we do not have to set the
 * INODE_RESIZED bit unless the file size crosses the EMBEDDED_BYTES
 * boundary.  When this occurs a hammer2_inode_chain_sync() is required
 * to prepare the inode cluster's indirect block table, otherwise
 * async execution of the strategy code will implode on us.
 *
 * WARNING! Assumes that the kernel interlocks size changes at the
 *          vnode level.
 *
 * WARNING! Caller assumes responsibility for transitioning out
 *          of the inode DIRECTDATA mode if INODE_RESIZED is set.
 */
static
void
hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
{
        hammer2_key_t lbase;
        hammer2_key_t osize;
        int oblksize;
        int nblksize;
        int error;

        KKASSERT((ip->flags & HAMMER2_INODE_RESIZED) == 0);
        hammer2_inode_modify(ip);
        osize = ip->meta.size;
        ip->osize = osize;
        ip->meta.size = nsize;

        /*
         * We must issue a chain_sync() when the DIRECTDATA state changes
         * to prevent confusion between the flush code and the in-memory
         * state.  This is not perfect because we are doing it outside of
         * a sync/fsync operation, so it might not be fully synchronized
         * with the meta-data topology flush.
         *
         * We must retain and re-dirty the buffer cache buffer containing
         * the direct data so it can be written to a real block.  It should
         * not be possible for a bread error to occur since the original data
         * is extracted from the inode structure directly.
         */
        if (osize <= HAMMER2_EMBEDDED_BYTES && nsize > HAMMER2_EMBEDDED_BYTES) {
                if (osize) {
                        struct buf *bp;

                        oblksize = hammer2_calc_logical(ip, 0, NULL, NULL);
                        error = bread_kvabio(ip->vp, 0, oblksize, &bp);
                        atomic_set_int(&ip->flags, HAMMER2_INODE_RESIZED);
                        hammer2_inode_chain_sync(ip);
                        if (error == 0) {
                                bheavy(bp);
                                bdwrite(bp);
                        } else {
                                brelse(bp);
                        }
                } else {
                        atomic_set_int(&ip->flags, HAMMER2_INODE_RESIZED);
                        hammer2_inode_chain_sync(ip);
                }
        }
        hammer2_mtx_unlock(&ip->lock);
        if (ip->vp) {
                oblksize = hammer2_calc_logical(ip, osize, &lbase, NULL);
                nblksize = hammer2_calc_logical(ip, nsize, &lbase, NULL);
                nvextendbuf(ip->vp,
                            osize, nsize,
                            oblksize, nblksize,
                            -1, -1, 0);
        }
        hammer2_mtx_ex(&ip->lock);
}

static
int
hammer2_vop_nresolve(struct vop_nresolve_args *ap)
{
        hammer2_xop_nresolve_t *xop;
        hammer2_inode_t *ip;
        hammer2_inode_t *dip;
        struct namecache *ncp;
        struct vnode *vp;
        int error;

        dip = VTOI(ap->a_dvp);
        xop = hammer2_xop_alloc(dip, 0);

        ncp = ap->a_nch->ncp;
        hammer2_xop_setname(&xop->head, ncp->nc_name, ncp->nc_nlen);

        /*
         * Note: In DragonFly the kernel handles '.' and '..'.
         */
        hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);
        hammer2_xop_start(&xop->head, &hammer2_nresolve_desc);

        error = hammer2_xop_collect(&xop->head, 0);
        error = hammer2_error_to_errno(error);
        if (error) {
                ip = NULL;
        } else {
                ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
        }
        hammer2_inode_unlock(dip);

        /*
         * Acquire the related vnode
         *
         * NOTE: For error processing, only ENOENT resolves the namecache
         *       entry to NULL, otherwise we just return the error and
         *       leave the namecache unresolved.
         *
         * NOTE: multiple hammer2_inode structures can be aliased to the
         *       same chain element, for example for hardlinks.  This
         *       use case does not 'reattach' inode associations that
         *       might already exist, but always allocates a new one.
         *
         * WARNING: inode structure is locked exclusively via inode_get
         *          but chain was locked shared.  inode_unlock()
         *          will handle it properly.
         */
        if (ip) {
                vp = hammer2_igetv(ip, &error); /* error set to UNIX error */
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                } else if (error == ENOENT) {
                        cache_setvp(ap->a_nch, NULL);
                }
                hammer2_inode_unlock(ip);

                /*
                 * The vp should not be released until after we've disposed
                 * of our locks, because it might cause vop_inactive() to
                 * be called.
                 */
                if (vp)
                        vrele(vp);
        } else {
                error = ENOENT;
                cache_setvp(ap->a_nch, NULL);
        }
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        KASSERT(error || ap->a_nch->ncp->nc_vp != NULL,
                ("resolve error %d/%p ap %p\n",
                 error, ap->a_nch->ncp->nc_vp, ap));

        return error;
}
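
/*
 * The XOP sequence in nresolve above (also visible in readdir) is the
 * general front-end shape of a hammer2 cluster operation:
 *
 *      xop = hammer2_xop_alloc(dip, 0);
 *      hammer2_xop_setname(&xop->head, name, name_len);    (op-specific)
 *      hammer2_xop_start(&xop->head, &hammer2_<op>_desc);  (dispatch)
 *      error = hammer2_xop_collect(&xop->head, 0);         (collect focus)
 *      error = hammer2_error_to_errno(error);              (h2 -> errno)
 *      ...
 *      hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
 */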
1434
1435 static
1436 int
1437 hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
1438 {
1439         hammer2_inode_t *dip;
1440         hammer2_tid_t inum;
1441         int error;
1442
1443         dip = VTOI(ap->a_dvp);
1444         inum = dip->meta.iparent;
1445         *ap->a_vpp = NULL;
1446
1447         if (inum) {
1448                 error = hammer2_vfs_vget(ap->a_dvp->v_mount, NULL,
1449                                          inum, ap->a_vpp);
1450         } else {
1451                 error = ENOENT;
1452         }
1453         return error;
1454 }
1455
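/*
 * hammer2_vop_nmkdir { nch, dvp, vpp, cred, vap }
 */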
1456 static
1457 int
1458 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1459 {
1460         hammer2_inode_t *dip;
1461         hammer2_inode_t *nip;
1462         struct namecache *ncp;
1463         const uint8_t *name;
1464         size_t name_len;
1465         hammer2_tid_t inum;
1466         int error;
1467
1468         dip = VTOI(ap->a_dvp);
1469         if (dip->pmp->ronly || (dip->pmp->flags & HAMMER2_PMPF_EMERG))
1470                 return (EROFS);
1471         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1472                 return (ENOSPC);
1473
1474         ncp = ap->a_nch->ncp;
1475         name = ncp->nc_name;
1476         name_len = ncp->nc_nlen;
1477
1478         hammer2_trans_init(dip->pmp, 0);
1479
1480         inum = hammer2_trans_newinum(dip->pmp);
1481
1482         /*
1483          * Create the actual inode as a hidden file in the iroot, then
1484          * create the directory entry.  The creation of the actual inode
1485          * sets its nlinks to 1 which is the value we desire.
1486          *
1487          * dip must be locked before nip to avoid deadlock.
1488          */
1489         hammer2_inode_lock(dip, 0);
1490         nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
1491                                           inum, &error);
1492         if (error) {
1493                 error = hammer2_error_to_errno(error);
1494         } else {
1495                 error = hammer2_dirent_create(dip, name, name_len,
1496                                               nip->meta.inum, nip->meta.type);
1497                 /* returns UNIX error code */
1498         }
1499         if (error) {
1500                 if (nip) {
1501                         hammer2_inode_unlink_finisher(nip, 0);
1502                         hammer2_inode_unlock(nip);
1503                         nip = NULL;
1504                 }
1505                 *ap->a_vpp = NULL;
1506         } else {
1507                 /*
1508                  * inode_depend() must occur before the igetv() because
1509                  * the igetv() can temporarily release the inode lock.
1510                  */
1511                 hammer2_inode_depend(dip, nip); /* before igetv */
1512                 *ap->a_vpp = hammer2_igetv(nip, &error);
1513                 hammer2_inode_unlock(nip);
1514         }
1515
1516         /*
1517          * Update dip's mtime
1518          *
1519          * We can use a shared inode lock and allow the meta.mtime update
1520          * SMP race.  hammer2_inode_modify() is MPSAFE w/a shared lock.
1521          */
1522         if (error == 0) {
1523                 uint64_t mtime;
1524
1525                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
1526                 hammer2_update_time(&mtime);
1527                 hammer2_inode_modify(dip);
1528                 dip->meta.mtime = mtime;
1529                 /*hammer2_inode_unlock(dip);*/
1530         }
1531         hammer2_inode_unlock(dip);
1532
1533         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1534
1535         if (error == 0) {
1536                 cache_setunresolved(ap->a_nch);
1537                 cache_setvp(ap->a_nch, *ap->a_vpp);
1538                 hammer2_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1539         }
1540         return error;
1541 }
1542
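/*
 * hammer2_vop_open { vp, mode, cred, fp }
 *
 * No hammer2-specific open-time state is needed; the standard open
 * handler suffices.
 */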
1543 static
1544 int
1545 hammer2_vop_open(struct vop_open_args *ap)
1546 {
1547         return vop_stdopen(ap);
1548 }
1549
1550 /*
1551  * hammer2_vop_advlock { vp, id, op, fl, flags }
1552  */
1553 static
1554 int
1555 hammer2_vop_advlock(struct vop_advlock_args *ap)
1556 {
1557         hammer2_inode_t *ip = VTOI(ap->a_vp);
1558         hammer2_off_t size;
1559
1560         size = ip->meta.size;
1561         return (lf_advlock(ap, &ip->advlock, size));
1562 }
1563
1564 static
1565 int
1566 hammer2_vop_close(struct vop_close_args *ap)
1567 {
1568         return vop_stdclose(ap);
1569 }
1570
1571 /*
1572  * hammer2_vop_nlink { nch, dvp, vp, cred }
1573  *
1574  * Create a hardlink from (vp) to {dvp, nch}.
1575  */
1576 static
1577 int
1578 hammer2_vop_nlink(struct vop_nlink_args *ap)
1579 {
1580         hammer2_inode_t *tdip;  /* target directory to create link in */
1581         hammer2_inode_t *ip;    /* inode we are hardlinking to */
1582         struct namecache *ncp;
1583         const uint8_t *name;
1584         size_t name_len;
1585         int error;
1586
1587         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1588                 return(EXDEV);
1589
1590         tdip = VTOI(ap->a_dvp);
1591         if (tdip->pmp->ronly || (tdip->pmp->flags & HAMMER2_PMPF_EMERG))
1592                 return (EROFS);
1593         if (hammer2_vfs_enospace(tdip, 0, ap->a_cred) > 1)
1594                 return (ENOSPC);
1595
1596         ncp = ap->a_nch->ncp;
1597         name = ncp->nc_name;
1598         name_len = ncp->nc_nlen;
1599
1600         /*
1601          * ip represents the file being hardlinked.  The file could be a
1602          * normal file or a hardlink target if it has already been hardlinked
1603          * (with the new semantics, it will almost always be a hardlink
1604          * target).
1605          *
1606          * Bump nlinks and potentially also create or move the hardlink
1607          * target in the parent directory common to (ip) and (tdip).  The
1608          * consolidation code can modify ip->cluster.  The returned cluster
1609          * is locked.
1610          */
1611         ip = VTOI(ap->a_vp);
1612         KASSERT(ip->pmp, ("ip->pmp is NULL %p %p", ip, ip->pmp));
1613         hammer2_trans_init(ip->pmp, 0);
1614
1615         /*
1616          * Target should be an indexed inode or there's no way we will ever
1617          * be able to find it!
1618          */
1619         KKASSERT((ip->meta.name_key & HAMMER2_DIRHASH_VISIBLE) == 0);
1620
1621         error = 0;
1622
1623         /*
1624          * Can return NULL and error == EXDEV if the common parent
1625          * crosses a directory with the xlink flag set.
1626          */
1627         hammer2_inode_lock4(tdip, ip, NULL, NULL);
1628
1629         /*
1630          * Create the directory entry and bump nlinks.
1631          */
1632         if (error == 0) {
1633                 error = hammer2_dirent_create(tdip, name, name_len,
1634                                               ip->meta.inum, ip->meta.type);
1635                 hammer2_inode_modify(ip);
1636                 ++ip->meta.nlinks;
1637         }
1638         if (error == 0) {
1639                 /*
1640                  * Update dip's mtime
1641                  */
1642                 uint64_t mtime;
1643
1644                 hammer2_update_time(&mtime);
1645                 hammer2_inode_modify(tdip);
1646                 tdip->meta.mtime = mtime;
1647
1648                 cache_setunresolved(ap->a_nch);
1649                 cache_setvp(ap->a_nch, ap->a_vp);
1650         }
1651         hammer2_inode_unlock(ip);
1652         hammer2_inode_unlock(tdip);
1653
1654         hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);
1655         hammer2_knote(ap->a_vp, NOTE_LINK);
1656         hammer2_knote(ap->a_dvp, NOTE_WRITE);
1657
1658         return error;
1659 }
1660
1661 /*
1662  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1663  *
1664  * The operating system has already ensured that the directory entry
1665  * does not exist and done all appropriate namespace locking.
1666  */
1667 static
1668 int
1669 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1670 {
1671         hammer2_inode_t *dip;
1672         hammer2_inode_t *nip;
1673         struct namecache *ncp;
1674         const uint8_t *name;
1675         size_t name_len;
1676         hammer2_tid_t inum;
1677         int error;
1678
1679         dip = VTOI(ap->a_dvp);
1680         if (dip->pmp->ronly || (dip->pmp->flags & HAMMER2_PMPF_EMERG))
1681                 return (EROFS);
1682         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1683                 return (ENOSPC);
1684
1685         ncp = ap->a_nch->ncp;
1686         name = ncp->nc_name;
1687         name_len = ncp->nc_nlen;
1688         hammer2_trans_init(dip->pmp, 0);
1689
1690         inum = hammer2_trans_newinum(dip->pmp);
1691
1692         /*
1693          * Create the actual inode as a hidden file in the iroot, then
1694          * create the directory entry.  The creation of the actual inode
1695          * sets its nlinks to 1 which is the value we desire.
1696          *
1697          * dip must be locked before nip to avoid deadlock.
1698          */
1699         hammer2_inode_lock(dip, 0);
1700         nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
1701                                           inum, &error);
1702
1703         if (error) {
1704                 error = hammer2_error_to_errno(error);
1705         } else {
1706                 error = hammer2_dirent_create(dip, name, name_len,
1707                                               nip->meta.inum, nip->meta.type);
1708         }
1709         if (error) {
1710                 if (nip) {
1711                         hammer2_inode_unlink_finisher(nip, 0);
1712                         hammer2_inode_unlock(nip);
1713                         nip = NULL;
1714                 }
1715                 *ap->a_vpp = NULL;
1716         } else {
1717                 hammer2_inode_depend(dip, nip); /* before igetv */
1718                 *ap->a_vpp = hammer2_igetv(nip, &error);
1719                 hammer2_inode_unlock(nip);
1720         }
1721
1722         /*
1723          * Update dip's mtime
1724          */
1725         if (error == 0) {
1726                 uint64_t mtime;
1727
1728                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
1729                 hammer2_update_time(&mtime);
1730                 hammer2_inode_modify(dip);
1731                 dip->meta.mtime = mtime;
1732                 /*hammer2_inode_unlock(dip);*/
1733         }
1734         hammer2_inode_unlock(dip);
1735
1736         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1737
1738         if (error == 0) {
1739                 cache_setunresolved(ap->a_nch);
1740                 cache_setvp(ap->a_nch, *ap->a_vpp);
1741                 hammer2_knote(ap->a_dvp, NOTE_WRITE);
1742         }
1743         return error;
1744 }
1745
1746 /*
1747  * Make a device node (typically a fifo)
1748  */
1749 static
1750 int
1751 hammer2_vop_nmknod(struct vop_nmknod_args *ap)
1752 {
1753         hammer2_inode_t *dip;
1754         hammer2_inode_t *nip;
1755         struct namecache *ncp;
1756         const uint8_t *name;
1757         size_t name_len;
1758         hammer2_tid_t inum;
1759         int error;
1760
1761         dip = VTOI(ap->a_dvp);
1762         if (dip->pmp->ronly || (dip->pmp->flags & HAMMER2_PMPF_EMERG))
1763                 return (EROFS);
1764         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1765                 return (ENOSPC);
1766
1767         ncp = ap->a_nch->ncp;
1768         name = ncp->nc_name;
1769         name_len = ncp->nc_nlen;
1770         hammer2_trans_init(dip->pmp, 0);
1771
1772         /*
1773          * Create the device inode and then create the directory entry.
1774          *
1775          * dip must be locked before nip to avoid deadlock.
1776          */
1777         inum = hammer2_trans_newinum(dip->pmp);
1778
1779         hammer2_inode_lock(dip, 0);
1780         nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
1781                                           inum, &error);
1782         if (error == 0) {
1783                 error = hammer2_dirent_create(dip, name, name_len,
1784                                               nip->meta.inum, nip->meta.type);
1785         }
1786         if (error) {
1787                 if (nip) {
1788                         hammer2_inode_unlink_finisher(nip, 0);
1789                         hammer2_inode_unlock(nip);
1790                         nip = NULL;
1791                 }
1792                 *ap->a_vpp = NULL;
1793         } else {
1794                 hammer2_inode_depend(dip, nip); /* before igetv */
1795                 *ap->a_vpp = hammer2_igetv(nip, &error);
1796                 hammer2_inode_unlock(nip);
1797         }
1798
1799         /*
1800          * Update dip's mtime
1801          */
1802         if (error == 0) {
1803                 uint64_t mtime;
1804
1805                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
1806                 hammer2_update_time(&mtime);
1807                 hammer2_inode_modify(dip);
1808                 dip->meta.mtime = mtime;
1809                 /*hammer2_inode_unlock(dip);*/
1810         }
1811         hammer2_inode_unlock(dip);
1812
1813         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1814
1815         if (error == 0) {
1816                 cache_setunresolved(ap->a_nch);
1817                 cache_setvp(ap->a_nch, *ap->a_vpp);
1818                 hammer2_knote(ap->a_dvp, NOTE_WRITE);
1819         }
1820         return error;
1821 }
1822
1823 /*
1824  * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1825  */
1826 static
1827 int
1828 hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
1829 {
1830         hammer2_inode_t *dip;
1831         hammer2_inode_t *nip;
1832         struct namecache *ncp;
1833         const uint8_t *name;
1834         size_t name_len;
1835         hammer2_tid_t inum;
1836         int error;
1837
1838         dip = VTOI(ap->a_dvp);
1839         if (dip->pmp->ronly || (dip->pmp->flags & HAMMER2_PMPF_EMERG))
1840                 return (EROFS);
1841         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1842                 return (ENOSPC);
1843
1844         ncp = ap->a_nch->ncp;
1845         name = ncp->nc_name;
1846         name_len = ncp->nc_nlen;
1847         hammer2_trans_init(dip->pmp, 0);
1848
1849         ap->a_vap->va_type = VLNK;      /* enforce type */
1850
1851         /*
1852          * Create the softlink as an inode and then create the directory
1853          * entry.
1854          *
1855          * dip must be locked before nip to avoid deadlock.
1856          */
1857         inum = hammer2_trans_newinum(dip->pmp);
1858
1859         hammer2_inode_lock(dip, 0);
1860         nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
1861                                           inum, &error);
1862         if (error == 0) {
1863                 error = hammer2_dirent_create(dip, name, name_len,
1864                                               nip->meta.inum, nip->meta.type);
1865         }
1866         if (error) {
1867                 if (nip) {
1868                         hammer2_inode_unlink_finisher(nip, 0);
1869                         hammer2_inode_unlock(nip);
1870                         nip = NULL;
1871                 }
1872                 *ap->a_vpp = NULL;
1873                 hammer2_inode_unlock(dip);
1874                 hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1875                 return error;
1876         }
1877         hammer2_inode_depend(dip, nip); /* before igetv */
1878         *ap->a_vpp = hammer2_igetv(nip, &error);
1879
1880         /*
1881          * Build the softlink (stored like file data) and finalize the namecache.
1882          */
1883         if (error == 0) {
1884                 size_t bytes;
1885                 struct uio auio;
1886                 struct iovec aiov;
1887
1888                 bytes = strlen(ap->a_target);
1889
1890                 hammer2_inode_unlock(nip);
1891                 bzero(&auio, sizeof(auio));
1892                 bzero(&aiov, sizeof(aiov));
1893                 auio.uio_iov = &aiov;
1894                 auio.uio_segflg = UIO_SYSSPACE;
1895                 auio.uio_rw = UIO_WRITE;
1896                 auio.uio_resid = bytes;
1897                 auio.uio_iovcnt = 1;
1898                 auio.uio_td = curthread;
1899                 aiov.iov_base = ap->a_target;
1900                 aiov.iov_len = bytes;
1901                 error = hammer2_write_file(nip, &auio, IO_APPEND, 0);
1902                 /* XXX handle error (currently discarded) */
1903                 error = 0;
1904         } else {
1905                 hammer2_inode_unlock(nip);
1906         }
1907
1908         /*
1909          * Update dip's mtime
1910          */
1911         if (error == 0) {
1912                 uint64_t mtime;
1913
1914                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
1915                 hammer2_update_time(&mtime);
1916                 hammer2_inode_modify(dip);
1917                 dip->meta.mtime = mtime;
1918                 /*hammer2_inode_unlock(dip);*/
1919         }
1920         hammer2_inode_unlock(dip);
1921
1922         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1923
1924         /*
1925          * Finalize namecache
1926          */
1927         if (error == 0) {
1928                 cache_setunresolved(ap->a_nch);
1929                 cache_setvp(ap->a_nch, *ap->a_vpp);
1930                 hammer2_knote(ap->a_dvp, NOTE_WRITE);
1931         }
1932         return error;
1933 }
1934
1935 /*
1936  * hammer2_vop_nremove { nch, dvp, cred }
1937  */
1938 static
1939 int
1940 hammer2_vop_nremove(struct vop_nremove_args *ap)
1941 {
1942         hammer2_xop_unlink_t *xop;
1943         hammer2_inode_t *dip;
1944         hammer2_inode_t *ip;
1945         struct namecache *ncp;
1946         int error;
1947         int isopen;
1948
1949         dip = VTOI(ap->a_dvp);
1950         if (dip->pmp->ronly)
1951                 return (EROFS);
1952 #if 0
1953         /* allow removals, but expect the user to also bulkfree */
1954         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1955                 return (ENOSPC);
1956 #endif
1957
1958         ncp = ap->a_nch->ncp;
1959
1960         if (hammer2_debug_inode && dip->meta.inum == hammer2_debug_inode) {
1961                 kprintf("hammer2: attempt to delete inside debug inode: %s\n",
1962                         ncp->nc_name);
1963                 while (hammer2_debug_inode &&
1964                        dip->meta.inum == hammer2_debug_inode) {
1965                         tsleep(&hammer2_debug_inode, 0, "h2debug", hz*5);
1966                 }
1967         }
1968
1969         hammer2_trans_init(dip->pmp, 0);
1970         hammer2_inode_lock(dip, 0);
1971
1972         /*
1973          * The unlink XOP unlinks the path from the directory and
1974          * locates and returns the cluster associated with the real inode.
1975          * We have to handle nlinks here on the frontend.
1976          */
1977         xop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
1978         hammer2_xop_setname(&xop->head, ncp->nc_name, ncp->nc_nlen);
1979
1980         /*
1981          * The namecache entry is locked so nobody can use this namespace.
1982          * Calculate isopen to determine if this namespace has an open vp
1983          * associated with it and resolve the vp only if it does.
1984          *
1985          * We try to avoid resolving the vnode if nobody has it open, but
1986          * note that the test is via this namespace only.
1987          */
1988         isopen = cache_isopen(ap->a_nch);
1989         xop->isdir = 0;
1990         xop->dopermanent = 0;
1991         hammer2_xop_start(&xop->head, &hammer2_unlink_desc);
1992
1993         /*
1994          * Collect the real inode and adjust nlinks, destroy the real
1995          * inode if nlinks transitions to 0 and it was the real inode
1996          * (else it has already been removed).
1997          */
1998         error = hammer2_xop_collect(&xop->head, 0);
1999         error = hammer2_error_to_errno(error);
2000
2001         if (error == 0) {
2002                 ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
2003                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
2004                 if (ip) {
2005                         if (hammer2_debug_inode &&
2006                             ip->meta.inum == hammer2_debug_inode) {
2007                                 kprintf("hammer2: attempt to delete debug "
2008                                         "inode!\n");
2009                                 while (hammer2_debug_inode &&
2010                                        ip->meta.inum == hammer2_debug_inode) {
2011                                         tsleep(&hammer2_debug_inode, 0,
2012                                                "h2debug", hz*5);
2013                                 }
2014                         }
2015                         hammer2_inode_unlink_finisher(ip, isopen);
2016                         hammer2_inode_depend(dip, ip); /* after modified */
2017                         hammer2_inode_unlock(ip);
2018                 }
2019         } else {
2020                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
2021         }
2022
2023         /*
2024          * Update dip's mtime
2025          */
2026         if (error == 0) {
2027                 uint64_t mtime;
2028
2029                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
2030                 hammer2_update_time(&mtime);
2031                 hammer2_inode_modify(dip);
2032                 dip->meta.mtime = mtime;
2033                 /*hammer2_inode_unlock(dip);*/
2034         }
2035         hammer2_inode_unlock(dip);
2036
2037         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
2038         if (error == 0) {
2039                 cache_unlink(ap->a_nch);
2040                 hammer2_knote(ap->a_dvp, NOTE_WRITE);
2041         }
2042         return (error);
2043 }
2044
2045 /*
2046  * hammer2_vop_nrmdir { nch, dvp, cred }
2047  */
2048 static
2049 int
2050 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
2051 {
2052         hammer2_xop_unlink_t *xop;
2053         hammer2_inode_t *dip;
2054         hammer2_inode_t *ip;
2055         struct namecache *ncp;
2056         int isopen;
2057         int error;
2058
2059         dip = VTOI(ap->a_dvp);
2060         if (dip->pmp->ronly)
2061                 return (EROFS);
2062 #if 0
2063         /* allow removals, but expect the user to also bulkfree */
2064         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
2065                 return (ENOSPC);
2066 #endif
2067
2068         hammer2_trans_init(dip->pmp, 0);
2069         hammer2_inode_lock(dip, 0);
2070
2071         xop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
2072
2073         ncp = ap->a_nch->ncp;
2074         hammer2_xop_setname(&xop->head, ncp->nc_name, ncp->nc_nlen);
2075         isopen = cache_isopen(ap->a_nch);
2076         xop->isdir = 1;
2077         xop->dopermanent = 0;
2078         hammer2_xop_start(&xop->head, &hammer2_unlink_desc);
2079
2080         /*
2081          * Collect the real inode and adjust nlinks, destroy the real
2082          * inode if nlinks transitions to 0 and it was the real inode
2083          * (else it has already been removed).
2084          */
2085         error = hammer2_xop_collect(&xop->head, 0);
2086         error = hammer2_error_to_errno(error);
2087
2088         if (error == 0) {
2089                 ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
2090                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
2091                 if (ip) {
2092                         hammer2_inode_unlink_finisher(ip, isopen);
2093                         hammer2_inode_depend(dip, ip);  /* after modified */
2094                         hammer2_inode_unlock(ip);
2095                 }
2096         } else {
2097                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
2098         }
2099
2100         /*
2101          * Update dip's mtime
2102          */
2103         if (error == 0) {
2104                 uint64_t mtime;
2105
2106                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
2107                 hammer2_update_time(&mtime);
2108                 hammer2_inode_modify(dip);
2109                 dip->meta.mtime = mtime;
2110                 /*hammer2_inode_unlock(dip);*/
2111         }
2112         hammer2_inode_unlock(dip);
2113
2114         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
2115         if (error == 0) {
2116                 cache_unlink(ap->a_nch);
2117                 hammer2_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2118         }
2119         return (error);
2120 }
2121
2122 /*
2123  * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
2124  */
2125 static
2126 int
2127 hammer2_vop_nrename(struct vop_nrename_args *ap)
2128 {
2129         struct namecache *fncp;
2130         struct namecache *tncp;
2131         hammer2_inode_t *fdip;  /* source directory */
2132         hammer2_inode_t *tdip;  /* target directory */
2133         hammer2_inode_t *ip;    /* file being renamed */
2134         hammer2_inode_t *tip;   /* replaced target during rename or NULL */
2135         const uint8_t *fname;
2136         size_t fname_len;
2137         const uint8_t *tname;
2138         size_t tname_len;
2139         int error;
2140         int update_tdip;
2141         int update_fdip;
2142         hammer2_key_t tlhc;
2143
2144         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
2145                 return(EXDEV);
2146         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
2147                 return(EXDEV);
2148
2149         fdip = VTOI(ap->a_fdvp);        /* source directory */
2150         tdip = VTOI(ap->a_tdvp);        /* target directory */
2151
2152         if (fdip->pmp->ronly || (fdip->pmp->flags & HAMMER2_PMPF_EMERG))
2153                 return (EROFS);
2154         if (hammer2_vfs_enospace(fdip, 0, ap->a_cred) > 1)
2155                 return (ENOSPC);
2156
2157         fncp = ap->a_fnch->ncp;         /* entry name in source */
2158         fname = fncp->nc_name;
2159         fname_len = fncp->nc_nlen;
2160
2161         tncp = ap->a_tnch->ncp;         /* entry name in target */
2162         tname = tncp->nc_name;
2163         tname_len = tncp->nc_nlen;
2164
2165         hammer2_trans_init(tdip->pmp, 0);
2166
2167         update_tdip = 0;
2168         update_fdip = 0;
2169
2170         ip = VTOI(fncp->nc_vp);
2171         hammer2_inode_ref(ip);          /* extra ref */
2172
2173         /*
2174          * Lookup the target name to determine if a directory entry
2175          * is being overwritten.  We only hold related inode locks
2176          * temporarily; the operating system is expected to protect
2177          * against rename races.
2178          */
2179         tip = tncp->nc_vp ? VTOI(tncp->nc_vp) : NULL;
2180         if (tip)
2181                 hammer2_inode_ref(tip); /* extra ref */
2182
2183         /*
2184          * Can return NULL and error == EXDEV if the common parent
2185          * crosses a directory with the xlink flag set.
2186          *
2187          * For now try to avoid deadlocks with a simple pointer address
2188          * test.  (tip) can be NULL.
2189          */
2190         error = 0;
2191         {
2192                 hammer2_inode_t *ip1 = fdip;
2193                 hammer2_inode_t *ip2 = tdip;
2194                 hammer2_inode_t *ip3 = ip;
2195                 hammer2_inode_t *ip4 = tip;     /* may be NULL */
2196
2197                 if (fdip > tdip) {
2198                         ip1 = tdip;
2199                         ip2 = fdip;
2200                 }
2201                 if (tip && ip > tip) {
2202                         ip3 = tip;
2203                         ip4 = ip;
2204                 }
2205                 hammer2_inode_lock4(ip1, ip2, ip3, ip4);
2206         }
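        /*
         * Example: if fdip > tdip the directory pair locks as
         * (tdip, fdip), and if ip > tip the file pair locks as
         * (tip, ip), so each pair is always acquired in ascending
         * address order regardless of the direction of the rename.
         */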
2207
2208         /*
2209          * Resolve the collision space for (tdip, tname, tname_len)
2210          *
2211          * tdip must be held exclusively locked to prevent races since
2212          * multiple filenames can end up in the same collision space.
2213          */
2214         {
2215                 hammer2_xop_scanlhc_t *sxop;
2216                 hammer2_tid_t lhcbase;
2217
2218                 tlhc = hammer2_dirhash(tname, tname_len);
2219                 lhcbase = tlhc;
2220                 sxop = hammer2_xop_alloc(tdip, HAMMER2_XOP_MODIFYING);
2221                 sxop->lhc = tlhc;
2222                 hammer2_xop_start(&sxop->head, &hammer2_scanlhc_desc);
2223                 while ((error = hammer2_xop_collect(&sxop->head, 0)) == 0) {
2224                         if (tlhc != sxop->head.cluster.focus->bref.key)
2225                                 break;
2226                         ++tlhc;
2227                 }
2228                 error = hammer2_error_to_errno(error);
2229                 hammer2_xop_retire(&sxop->head, HAMMER2_XOPMASK_VOP);
2230
2231                 if (error) {
2232                         if (error != ENOENT)
2233                                 goto done2;
2234                         ++tlhc;
2235                         error = 0;
2236                 }
2237                 if ((lhcbase ^ tlhc) & ~HAMMER2_DIRHASH_LOMASK) {
2238                         error = ENOSPC;
2239                         goto done2;
2240                 }
2241         }
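        /*
         * Illustrative sketch of the scan above (values hypothetical):
         * if hammer2_dirhash(tname) yields key K and directory entries
         * already occupy K and K+1, the loop exits with tlhc = K+2.
         * Walking out of the low-bits collision space (the
         * ~HAMMER2_DIRHASH_LOMASK check) means the space is exhausted
         * and ENOSPC is returned.
         */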
2242
2243         /*
2244          * Ready to go, issue the rename to the backend.  Note that meta-data
2245          * updates to the related inodes occur separately from the rename
2246          * operation.
2247          *
2248          * NOTE: While it is not necessary to update ip->meta.name*, doing
2249          *       so aids catastrophic recovery and debugging.
2250          */
2251         if (error == 0) {
2252                 hammer2_xop_nrename_t *xop4;
2253
2254                 xop4 = hammer2_xop_alloc(fdip, HAMMER2_XOP_MODIFYING);
2255                 xop4->lhc = tlhc;
2256                 xop4->ip_key = ip->meta.name_key;
2257                 hammer2_xop_setip2(&xop4->head, ip);
2258                 hammer2_xop_setip3(&xop4->head, tdip);
2259                 hammer2_xop_setname(&xop4->head, fname, fname_len);
2260                 hammer2_xop_setname2(&xop4->head, tname, tname_len);
2261                 hammer2_xop_start(&xop4->head, &hammer2_nrename_desc);
2262
2263                 error = hammer2_xop_collect(&xop4->head, 0);
2264                 error = hammer2_error_to_errno(error);
2265                 hammer2_xop_retire(&xop4->head, HAMMER2_XOPMASK_VOP);
2266
2267                 if (error == ENOENT)
2268                         error = 0;
2269
2270                 /*
2271                  * Update inode meta-data.
2272                  *
2273                  * WARNING!  The in-memory inode (ip) structure does not
2274                  *           maintain a copy of the inode's filename buffer.
2275                  */
2276                 if (error == 0 &&
2277                     (ip->meta.name_key & HAMMER2_DIRHASH_VISIBLE)) {
2278                         hammer2_inode_modify(ip);
2279                         ip->meta.name_len = tname_len;
2280                         ip->meta.name_key = tlhc;
2281                 }
2282                 if (error == 0) {
2283                         hammer2_inode_modify(ip);
2284                         ip->meta.iparent = tdip->meta.inum;
2285                 }
2286                 update_fdip = 1;
2287                 update_tdip = 1;
2288         }
2289
2290 done2:
2291         /*
2292          * If no error, the backend has replaced the target directory entry.
2293          * We must adjust nlinks on the replaced target if it exists.
2294          */
2295         if (error == 0 && tip) {
2296                 int isopen;
2297
2298                 isopen = cache_isopen(ap->a_tnch);
2299                 hammer2_inode_unlink_finisher(tip, isopen);
2300         }
2301
2302         /*
2303          * Update directory mtimes to reflect that something changed.
2304          */
2305         if (update_fdip || update_tdip) {
2306                 uint64_t mtime;
2307
2308                 hammer2_update_time(&mtime);
2309                 if (update_fdip) {
2310                         hammer2_inode_modify(fdip);
2311                         fdip->meta.mtime = mtime;
2312                 }
2313                 if (update_tdip) {
2314                         hammer2_inode_modify(tdip);
2315                         tdip->meta.mtime = mtime;
2316                 }
2317         }
2318         if (tip) {
2319                 hammer2_inode_unlock(tip);
2320                 hammer2_inode_drop(tip);
2321         }
2322         hammer2_inode_unlock(ip);
2323         hammer2_inode_unlock(tdip);
2324         hammer2_inode_unlock(fdip);
2325         hammer2_inode_drop(ip);
2326         hammer2_trans_done(tdip->pmp, HAMMER2_TRANS_SIDEQ);
2327
2328         /*
2329          * Issue the namecache update after unlocking all the internal
2330          * hammer2 structures, otherwise we might deadlock.
2331          *
2332          * WARNING! The target namespace must be updated atomically,
2333          *          and we depend on cache_rename() to handle that for
2334          *          us.  Do not do a separate cache_unlink() because
2335          *          that leaves a small window of opportunity for other
2336          *          threads to allocate the target namespace before we
2337          *          manage to complete our rename.
2338          *
2339          * WARNING! cache_rename() (and cache_unlink()) will properly
2340          *          set VREF_FINALIZE on any attached vnode.  Do not
2341          *          call cache_setunresolved() manually before-hand as
2342          *          this will prevent the flag from being set later via
2343          *          cache_rename().  If VREF_FINALIZE is not properly set
2344          *          and the inode is no longer in the topology, related
2345          *          chains can remain dirty indefinitely.
2346          */
2347         if (error == 0 && tip) {
2348                 /*cache_unlink(ap->a_tnch); see above */
2349                 /*cache_setunresolved(ap->a_tnch); see above */
2350         }
2351         if (error == 0) {
2352                 cache_rename(ap->a_fnch, ap->a_tnch);
2353                 hammer2_knote(ap->a_fdvp, NOTE_WRITE);
2354                 hammer2_knote(ap->a_tdvp, NOTE_WRITE);
2355                 hammer2_knote(fncp->nc_vp, NOTE_RENAME);
2356         }
2357
2358         return (error);
2359 }
2360
2361 /*
2362  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
2363  */
2364 static
2365 int
2366 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
2367 {
2368         hammer2_inode_t *ip;
2369         int error;
2370
2371         ip = VTOI(ap->a_vp);
2372
2373         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
2374                               ap->a_fflag, ap->a_cred);
2375         return (error);
2376 }
2377
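/*
 * hammer2_vop_mountctl { op, ctl, ctllen, ... }
 *
 * Only MOUNTCTL_SET_EXPORT (NFS export configuration) is handled
 * locally; everything else falls through to vop_stdmountctl().
 */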
2378 static
2379 int
2380 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
2381 {
2382         struct mount *mp;
2383         hammer2_pfs_t *pmp;
2384         int rc;
2385
2386         switch (ap->a_op) {
2387         case (MOUNTCTL_SET_EXPORT):
2388                 mp = ap->a_head.a_ops->head.vv_mount;
2389                 pmp = MPTOPMP(mp);
2390
2391                 if (ap->a_ctllen != sizeof(struct export_args))
2392                         rc = (EINVAL);
2393                 else
2394                         rc = vfs_export(mp, &pmp->export,
2395                                         (const struct export_args *)ap->a_ctl);
2396                 break;
2397         default:
2398                 rc = vop_stdmountctl(ap);
2399                 break;
2400         }
2401         return (rc);
2402 }
2403
2404 /*
2405  * KQFILTER
2406  */
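/*
 * Illustrative userland registration (not part of this file):
 * EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 * NOTE_WRITE | NOTE_RENAME, 0, NULL) followed by kevent(2) reaches
 * hammer2_vop_kqfilter() below, which attaches the matching filterops.
 */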
2407 static void filt_hammer2detach(struct knote *kn);
2408 static int filt_hammer2read(struct knote *kn, long hint);
2409 static int filt_hammer2write(struct knote *kn, long hint);
2410 static int filt_hammer2vnode(struct knote *kn, long hint);
2411
2412 static struct filterops hammer2read_filtops =
2413         { FILTEROP_ISFD | FILTEROP_MPSAFE,
2414           NULL, filt_hammer2detach, filt_hammer2read };
2415 static struct filterops hammer2write_filtops =
2416         { FILTEROP_ISFD | FILTEROP_MPSAFE,
2417           NULL, filt_hammer2detach, filt_hammer2write };
2418 static struct filterops hammer2vnode_filtops =
2419         { FILTEROP_ISFD | FILTEROP_MPSAFE,
2420           NULL, filt_hammer2detach, filt_hammer2vnode };
2421
2422 static
2423 int
2424 hammer2_vop_kqfilter(struct vop_kqfilter_args *ap)
2425 {
2426         struct vnode *vp = ap->a_vp;
2427         struct knote *kn = ap->a_kn;
2428
2429         switch (kn->kn_filter) {
2430         case EVFILT_READ:
2431                 kn->kn_fop = &hammer2read_filtops;
2432                 break;
2433         case EVFILT_WRITE:
2434                 kn->kn_fop = &hammer2write_filtops;
2435                 break;
2436         case EVFILT_VNODE:
2437                 kn->kn_fop = &hammer2vnode_filtops;
2438                 break;
2439         default:
2440                 return (EOPNOTSUPP);
2441         }
2442
2443         kn->kn_hook = (caddr_t)vp;
2444
2445         knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
2446
2447         return(0);
2448 }
2449
2450 static void
2451 filt_hammer2detach(struct knote *kn)
2452 {
2453         struct vnode *vp = (void *)kn->kn_hook;
2454
2455         knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
2456 }
2457
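/*
 * EVFILT_READ: report the number of bytes readable at the current
 * file position (size - f_offset), clamped to INTPTR_MAX.
 */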
2458 static int
2459 filt_hammer2read(struct knote *kn, long hint)
2460 {
2461         struct vnode *vp = (void *)kn->kn_hook;
2462         hammer2_inode_t *ip = VTOI(vp);
2463         off_t off;
2464
2465         if (hint == NOTE_REVOKE) {
2466                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
2467                 return(1);
2468         }
2469         off = ip->meta.size - kn->kn_fp->f_offset;
2470         kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
2471         if (kn->kn_sfflags & NOTE_OLDAPI)
2472                 return(1);
2473         return (kn->kn_data != 0);
2474 }
2475
2476
2477 static int
2478 filt_hammer2write(struct knote *kn, long hint)
2479 {
2480         if (hint == NOTE_REVOKE)
2481                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
2482         kn->kn_data = 0;
2483         return (1);
2484 }
2485
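/*
 * EVFILT_VNODE: latch any hinted events the caller subscribed to;
 * NOTE_REVOKE additionally forces EOF/NODATA delivery.
 */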
2486 static int
2487 filt_hammer2vnode(struct knote *kn, long hint)
2488 {
2489         if (kn->kn_sfflags & hint)
2490                 kn->kn_fflags |= hint;
2491         if (hint == NOTE_REVOKE) {
2492                 kn->kn_flags |= (EV_EOF | EV_NODATA);
2493                 return (1);
2494         }
2495         return (kn->kn_fflags != 0);
2496 }
2497
2498 /*
2499  * FIFO VOPS
2500  */
2501 static
2502 int
2503 hammer2_vop_markatime(struct vop_markatime_args *ap)
2504 {
2505         hammer2_inode_t *ip;
2506         struct vnode *vp;
2507
2508         vp = ap->a_vp;
2509         ip = VTOI(vp);
2510
2511         if (ip->pmp->ronly || (ip->pmp->flags & HAMMER2_PMPF_EMERG))
2512                 return (EROFS);
2513         return(0);
2514 }
2515
2516 static
2517 int
2518 hammer2_vop_fifokqfilter(struct vop_kqfilter_args *ap)
2519 {
2520         int error;
2521
2522         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2523         if (error)
2524                 error = hammer2_vop_kqfilter(ap);
2525         return(error);
2526 }
2527
2528 /*
2529  * VOPS vector
2530  */
2531 struct vop_ops hammer2_vnode_vops = {
2532         .vop_default    = vop_defaultop,
2533         .vop_fsync      = hammer2_vop_fsync,
2534         .vop_getpages   = vop_stdgetpages,
2535         .vop_putpages   = vop_stdputpages,
2536         .vop_access     = hammer2_vop_access,
2537         .vop_advlock    = hammer2_vop_advlock,
2538         .vop_close      = hammer2_vop_close,
2539         .vop_nlink      = hammer2_vop_nlink,
2540         .vop_ncreate    = hammer2_vop_ncreate,
2541         .vop_nsymlink   = hammer2_vop_nsymlink,
2542         .vop_nremove    = hammer2_vop_nremove,
2543         .vop_nrmdir     = hammer2_vop_nrmdir,
2544         .vop_nrename    = hammer2_vop_nrename,
2545         .vop_getattr    = hammer2_vop_getattr,
2546         .vop_getattr_lite = hammer2_vop_getattr_lite,
2547         .vop_setattr    = hammer2_vop_setattr,
2548         .vop_readdir    = hammer2_vop_readdir,
2549         .vop_readlink   = hammer2_vop_readlink,
2550         .vop_read       = hammer2_vop_read,
2551         .vop_write      = hammer2_vop_write,
2552         .vop_open       = hammer2_vop_open,
2553         .vop_inactive   = hammer2_vop_inactive,
2554         .vop_reclaim    = hammer2_vop_reclaim,
2555         .vop_nresolve   = hammer2_vop_nresolve,
2556         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2557         .vop_nmkdir     = hammer2_vop_nmkdir,
2558         .vop_nmknod     = hammer2_vop_nmknod,
2559         .vop_ioctl      = hammer2_vop_ioctl,
2560         .vop_mountctl   = hammer2_vop_mountctl,
2561         .vop_bmap       = hammer2_vop_bmap,
2562         .vop_strategy   = hammer2_vop_strategy,
2563         .vop_kqfilter   = hammer2_vop_kqfilter
2564 };
2565
2566 struct vop_ops hammer2_spec_vops = {
2567         .vop_default =          vop_defaultop,
2568         .vop_fsync =            hammer2_vop_fsync,
2569         .vop_read =             vop_stdnoread,
2570         .vop_write =            vop_stdnowrite,
2571         .vop_access =           hammer2_vop_access,
2572         .vop_close =            hammer2_vop_close,
2573         .vop_markatime =        hammer2_vop_markatime,
2574         .vop_getattr =          hammer2_vop_getattr,
2575         .vop_inactive =         hammer2_vop_inactive,
2576         .vop_reclaim =          hammer2_vop_reclaim,
2577         .vop_setattr =          hammer2_vop_setattr
2578 };
2579
2580 struct vop_ops hammer2_fifo_vops = {
2581         .vop_default =          fifo_vnoperate,
2582         .vop_fsync =            hammer2_vop_fsync,
2583 #if 0
2584         .vop_read =             hammer2_vop_fiforead,
2585         .vop_write =            hammer2_vop_fifowrite,
2586 #endif
2587         .vop_access =           hammer2_vop_access,
2588 #if 0
2589         .vop_close =            hammer2_vop_fifoclose,
2590 #endif
2591         .vop_markatime =        hammer2_vop_markatime,
2592         .vop_getattr =          hammer2_vop_getattr,
2593         .vop_inactive =         hammer2_vop_inactive,
2594         .vop_reclaim =          hammer2_vop_reclaim,
2595         .vop_setattr =          hammer2_vop_setattr,
2596         .vop_kqfilter =         hammer2_vop_fifokqfilter
2597 };
2598