hammer2 - refactor filesystem sync 4/N
sys/vfs/hammer2/hammer2_vnops.c
/*
 * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Kernel Filesystem interface
 *
 * NOTE! local ipdata pointers must be reloaded on any modifying operation
 *       to the inode as its underlying chain may have changed.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>
#include <sys/objcache.h>
#include <sys/event.h>
#include <sys/file.h>
#include <vfs/fifofs/fifo.h>

#include "hammer2.h"

static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
                                int seqcount);
static int hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
                                int ioflag, int seqcount);
static void hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize);
static void hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize);

struct objcache *cache_xops;

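/*
 * Post a kqueue event on the vnode if any note flags are set.
 */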
static __inline
void
hammer2_knote(struct vnode *vp, int flags)
{
        if (flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

/*
 * Last reference to a vnode is going away but it is still cached.
 */
static
int
hammer2_vop_inactive(struct vop_inactive_args *ap)
{
        hammer2_inode_t *ip;
        struct vnode *vp;

        vp = ap->a_vp;
        ip = VTOI(vp);

        /*
         * Degenerate case
         */
        if (ip == NULL) {
                vrecycle(vp);
                return (0);
        }

        /*
         * Check for deleted inodes and recycle immediately on the last
         * release.  Be sure to destroy any left-over buffer cache buffers
         * so we do not waste time trying to flush them.
         *
         * Note that deleting the file block chains under the inode chain
         * would just be a waste of energy, so don't do it.
         *
         * WARNING: nvtruncbuf() can only be safely called without the inode
         *          lock held due to the way our write thread works.
         */
        if (ip->flags & HAMMER2_INODE_ISUNLINKED) {
                hammer2_key_t lbase;
                int nblksize;

                /*
                 * Destroy any remaining buffer cache buffers on the
                 * unlinked inode and recycle the vnode immediately so
                 * the inode can be disposed of.
                 */
                nblksize = hammer2_calc_logical(ip, 0, &lbase, NULL);
                nvtruncbuf(vp, 0, nblksize, 0, 0);
                vrecycle(vp);
        }
        return (0);
}

/*
 * Reclaim a vnode so that it can be reused; after the inode is
 * disassociated, the filesystem must manage it alone.
 */
static
int
hammer2_vop_reclaim(struct vop_reclaim_args *ap)
{
        hammer2_inode_t *ip;
        hammer2_pfs_t *pmp;
        struct vnode *vp;

        vp = ap->a_vp;
        ip = VTOI(vp);
        if (ip == NULL) {
                return(0);
        }
        pmp = ip->pmp;

        /*
         * The final close of a deleted file or directory marks it for
         * destruction.  The DELETED flag allows the flusher to shortcut
         * any modified blocks still unflushed (that is, just ignore them).
         *
         * HAMMER2 usually does not try to optimize the freemap by returning
         * deleted blocks to it as it does not usually know how many snapshots
         * might be referencing portions of the file/dir.
         */
        vp->v_data = NULL;
        ip->vp = NULL;

        /*
         * NOTE! We do not attempt to flush chains here; flushing is
         *       really fragile and could also deadlock.
         */
        vclrisdirty(vp);

        /*
         * Modified inodes will already be on SIDEQ or SYNCQ, no further
         * action is needed.
         *
         * We cannot safely synchronize the inode from inside the reclaim
         * due to potentially deep locks held at the time the reclaim
         * occurs.  Interactions and potential deadlocks abound.  We also
         * can't do it here without desynchronizing from the related
         * directory entry(ies).
         */
        hammer2_inode_drop(ip);                 /* vp ref */
#if 0
        if ((ip->flags & (HAMMER2_INODE_ISUNLINKED |
                          HAMMER2_INODE_MODIFIED |
                          HAMMER2_INODE_RESIZED |
                          HAMMER2_INODE_DIRTYDATA |
                          HAMMER2_INODE_CREATING |
                          HAMMER2_INODE_DELETING)) &&
            (ip->flags & HAMMER2_INODE_ISDELETED) == 0) {
                hammer2_spin_ex(&pmp->list_spin);
                if ((ip->flags & (HAMMER2_INODE_SYNCQ |
                                  HAMMER2_INODE_SIDEQ)) == 0) {
                        /* ref -> sideq */
                        atomic_set_int(&ip->flags, HAMMER2_INODE_SIDEQ);
                        TAILQ_INSERT_TAIL(&pmp->sideq, ip, entry);
                        ++pmp->sideq_count;
                        hammer2_spin_unex(&pmp->list_spin);
                        /* retain ip ref for SIDEQ linkage */
                } else {
                        hammer2_spin_unex(&pmp->list_spin);
                        hammer2_inode_drop(ip);         /* vp ref */
                }
        } else {
                hammer2_inode_drop(ip);                 /* vp ref */
        }
#endif

        /*
         * XXX handle background sync when ip dirty; the kernel will no
         * longer notify us regarding this inode because there is no
         * longer a vnode attached to it.
         */

        return (0);
}

/*
 * Currently this function synchronizes the front-end inode state to the
 * backend chain topology, then flushes the inode's chain and sub-topology
 * to backend media.  This function does not flush the root topology down to
 * the inode.
 */
static
int
hammer2_vop_fsync(struct vop_fsync_args *ap)
{
        hammer2_inode_t *ip;
        struct vnode *vp;
        int error1;
        int error2;

        vp = ap->a_vp;
        ip = VTOI(vp);
        error1 = 0;

        hammer2_trans_init(ip->pmp, 0);

        /*
         * Flush dirty buffers in the file's logical buffer cache.
         * It is best to wait for the strategy code to commit the
         * buffers to the device's backing buffer cache before
         * trying to flush the inode.
         *
         * This should be quick, but certain inode modifications cached
         * entirely in the hammer2_inode structure may not trigger a
         * buffer read until the flush, so the fsync can wind up also
         * doing scattered reads.
         */
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
        bio_track_wait(&vp->v_track_write, 0, 0);

        /*
         * Flush any inode changes
         */
        hammer2_inode_lock(ip, 0);
        if (ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MODIFIED))
                error1 = hammer2_inode_chain_sync(ip);

        /*
         * Flush dirty chains related to the inode.
         *
         * NOTE! We are not in a flush transaction, so we should not use the
         *       PARENTONFLUSH flag.  The inode remains on the sideq so the
         *       filesystem syncer can synchronize it to the volume root.
         */
        error2 = hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP);
        if (error2)
                error1 = error2;

        /*
         * We may be able to clear the vnode dirty flag.  The
         * hammer2_pfs_moderate() code depends on this usually working.
         */
        if ((ip->flags & (HAMMER2_INODE_MODIFIED |
                          HAMMER2_INODE_RESIZED |
                          HAMMER2_INODE_DIRTYDATA)) == 0 &&
            RB_EMPTY(&vp->v_rbdirty_tree) &&
            !bio_track_active(&vp->v_track_write)) {
                vclrisdirty(vp);
        }
        hammer2_inode_unlock(ip);
        hammer2_trans_done(ip->pmp, 0);

        return (error1);
}

static
int
hammer2_vop_access(struct vop_access_args *ap)
{
        hammer2_inode_t *ip = VTOI(ap->a_vp);
        uid_t uid;
        gid_t gid;
        int error;

        hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
        uid = hammer2_to_unix_xid(&ip->meta.uid);
        gid = hammer2_to_unix_xid(&ip->meta.gid);
        error = vop_helper_access(ap, uid, gid, ip->meta.mode, ip->meta.uflags);
        hammer2_inode_unlock(ip);

        return (error);
}

static
int
hammer2_vop_getattr(struct vop_getattr_args *ap)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *ip;
        struct vnode *vp;
        struct vattr *vap;
        hammer2_chain_t *chain;
        int i;

        vp = ap->a_vp;
        vap = ap->a_vap;

        ip = VTOI(vp);
        pmp = ip->pmp;

        hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);

        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
        vap->va_fileid = ip->meta.inum;
        vap->va_mode = ip->meta.mode;
        vap->va_nlink = ip->meta.nlinks;
        vap->va_uid = hammer2_to_unix_xid(&ip->meta.uid);
        vap->va_gid = hammer2_to_unix_xid(&ip->meta.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->meta.size;   /* protected by shared lock */
        vap->va_blocksize = HAMMER2_PBUFSIZE;
        vap->va_flags = ip->meta.uflags;
        hammer2_time_to_timespec(ip->meta.ctime, &vap->va_ctime);
        hammer2_time_to_timespec(ip->meta.mtime, &vap->va_mtime);
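        /* atime is not supported; report the mtime as the atime */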
        hammer2_time_to_timespec(ip->meta.mtime, &vap->va_atime);
        vap->va_gen = 1;
        vap->va_bytes = 0;
        if (ip->meta.type == HAMMER2_OBJTYPE_DIRECTORY) {
                /*
                 * Can't really calculate directory use sans the files under
                 * it, just assume one block for now.
                 */
                vap->va_bytes += HAMMER2_INODE_BYTES;
        } else {
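                /*
                 * Report the largest data_count among the cluster's
                 * chains as the approximate storage used by the file.
                 */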
                for (i = 0; i < ip->cluster.nchains; ++i) {
                        if ((chain = ip->cluster.array[i].chain) != NULL) {
                                if (vap->va_bytes <
                                    chain->bref.embed.stats.data_count) {
                                        vap->va_bytes =
                                            chain->bref.embed.stats.data_count;
                                }
                        }
                }
        }
        vap->va_type = hammer2_get_vtype(ip->meta.type);
        vap->va_filerev = 0;
        vap->va_uid_uuid = ip->meta.uid;
        vap->va_gid_uuid = ip->meta.gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        hammer2_inode_unlock(ip);

        return (0);
}

static
int
hammer2_vop_setattr(struct vop_setattr_args *ap)
{
        hammer2_inode_t *ip;
        struct vnode *vp;
        struct vattr *vap;
        int error;
        int kflags = 0;
        uint64_t ctime;

        vp = ap->a_vp;
        vap = ap->a_vap;
        hammer2_update_time(&ctime);

        ip = VTOI(vp);

        if (ip->pmp->ronly)
                return (EROFS);
        if (hammer2_vfs_enospace(ip, 0, ap->a_cred) > 1)
                return (ENOSPC);

        hammer2_pfs_memory_wait(ip, 0);
        hammer2_trans_init(ip->pmp, 0);
        hammer2_inode_lock(ip, 0);
        error = 0;

        if (vap->va_flags != VNOVAL) {
                uint32_t flags;

                flags = ip->meta.uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                     hammer2_to_unix_xid(&ip->meta.uid),
                                     ap->a_cred);
                if (error == 0) {
                        if (ip->meta.uflags != flags) {
                                hammer2_inode_modify(ip);
                                ip->meta.uflags = flags;
                                ip->meta.ctime = ctime;
                                kflags |= NOTE_ATTRIB;
                        }
                        if (ip->meta.uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }
        if (ip->meta.uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }
        if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
                mode_t cur_mode = ip->meta.mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ip->meta.uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ip->meta.gid);
                uuid_t uuid_uid;
                uuid_t uuid_gid;

                error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
                                         ap->a_cred,
                                         &cur_uid, &cur_gid, &cur_mode);
                if (error == 0) {
                        hammer2_guid_to_uuid(&uuid_uid, cur_uid);
                        hammer2_guid_to_uuid(&uuid_gid, cur_gid);
                        if (bcmp(&uuid_uid, &ip->meta.uid, sizeof(uuid_uid)) ||
                            bcmp(&uuid_gid, &ip->meta.gid, sizeof(uuid_gid)) ||
                            ip->meta.mode != cur_mode
                        ) {
                                hammer2_inode_modify(ip);
                                ip->meta.uid = uuid_uid;
                                ip->meta.gid = uuid_gid;
                                ip->meta.mode = cur_mode;
                                ip->meta.ctime = ctime;
                        }
                        kflags |= NOTE_ATTRIB;
                }
        }

        /*
         * Resize the file
         */
        if (vap->va_size != VNOVAL && ip->meta.size != vap->va_size) {
                switch(vp->v_type) {
                case VREG:
                        if (vap->va_size == ip->meta.size)
                                break;
                        if (vap->va_size < ip->meta.size) {
                                hammer2_mtx_ex(&ip->truncate_lock);
                                hammer2_truncate_file(ip, vap->va_size);
                                hammer2_mtx_unlock(&ip->truncate_lock);
                                kflags |= NOTE_WRITE;
                        } else {
                                hammer2_extend_file(ip, vap->va_size);
                                kflags |= NOTE_WRITE | NOTE_EXTEND;
                        }
                        hammer2_inode_modify(ip);
                        ip->meta.mtime = ctime;
                        vclrflags(vp, VLASTWRITETS);
                        break;
                default:
                        error = EINVAL;
                        goto done;
                }
        }
#if 0
        /* atime not supported */
        if (vap->va_atime.tv_sec != VNOVAL) {
                hammer2_inode_modify(ip);
                ip->meta.atime = hammer2_timespec_to_time(&vap->va_atime);
                kflags |= NOTE_ATTRIB;
        }
#endif
        if (vap->va_mode != (mode_t)VNOVAL) {
                mode_t cur_mode = ip->meta.mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ip->meta.uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ip->meta.gid);

                error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ip->meta.mode != cur_mode) {
                        hammer2_inode_modify(ip);
                        ip->meta.mode = cur_mode;
                        ip->meta.ctime = ctime;
                        kflags |= NOTE_ATTRIB;
                }
        }

        if (vap->va_mtime.tv_sec != VNOVAL) {
                hammer2_inode_modify(ip);
                ip->meta.mtime = hammer2_timespec_to_time(&vap->va_mtime);
                kflags |= NOTE_ATTRIB;
                vclrflags(vp, VLASTWRITETS);
        }

done:
        /*
         * If a truncation occurred we must call chain_sync() now in order
         * to trim the related data chains, otherwise a later expansion can
         * cause havoc.
         *
         * If an extend occurred that changed the DIRECTDATA state, we must
         * call inode_fsync now in order to prepare the inode's indirect
         * block table.
         *
         * WARNING! This means we are making an adjustment to the inode's
         * chain outside of sync/fsync, and not just to inode->meta, which
         * may result in some consistency issues if a crash were to occur
         * at just the wrong time.
         */
        if (ip->flags & HAMMER2_INODE_RESIZED)
                hammer2_inode_chain_sync(ip);

        /*
         * Cleanup.
         */
        hammer2_inode_unlock(ip);
        hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);
        hammer2_knote(ip->vp, kflags);

        return (error);
}

static
int
hammer2_vop_readdir(struct vop_readdir_args *ap)
{
        hammer2_xop_readdir_t *xop;
        hammer2_blockref_t bref;
        hammer2_inode_t *ip;
        hammer2_tid_t inum;
        hammer2_key_t lkey;
        struct uio *uio;
        off_t *cookies;
        off_t saveoff;
        int cookie_index;
        int ncookies;
        int error;
        int eofflag;
        int r;

        ip = VTOI(ap->a_vp);
        uio = ap->a_uio;
        saveoff = uio->uio_offset;
        eofflag = 0;
        error = 0;

        /*
         * Set up directory entry cookies if requested
         */
        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
        } else {
                ncookies = -1;
                cookies = NULL;
        }
        cookie_index = 0;

        hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);

        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
         * quantities are returned to userland we always strip off bit 63.
         * The hash code is designed such that codes 0x0000-0x7FFF are not
         * used, allowing us to use these codes for artificial entries.
         *
         * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
         * allow '..' to cross the mount point into (e.g.) the super-root.
         */
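        /*
         * A sketch of the resulting directory offset/cookie layout
         * (illustrative, derived from the code below):
         *
         *      0x0000000000000000      "."  (artificial)
         *      0x0000000000000001      ".." (artificial)
         *      >= 0x8000               real entries, keyed by the directory
         *                              hash with DIRHASH_VISIBLE (bit 63)
         *                              stripped before being returned.
         */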
        if (saveoff == 0) {
                inum = ip->meta.inum & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        if (saveoff == 1) {
                /*
                 * Be careful with lockorder when accessing ".."
                 *
                 * (ip is the current dir; the parent dir's inode number
                 * is taken from ip->meta.iparent).
                 */
                inum = ip->meta.inum & HAMMER2_DIRHASH_USERMSK;
                if (ip != ip->pmp->iroot)
                        inum = ip->meta.iparent & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;
        if (hammer2_debug & 0x0020)
                kprintf("readdir: lkey %016jx\n", lkey);
        if (error)
                goto done;

        /*
         * Use XOP for cluster scan.
         *
         * parent is the inode cluster, already locked for us.  Don't
         * double lock shared locks as this will screw up upgrades.
         */
        xop = hammer2_xop_alloc(ip, 0);
        xop->lkey = lkey;
        hammer2_xop_start(&xop->head, &hammer2_readdir_desc);

        for (;;) {
                const hammer2_inode_data_t *ripdata;
                const char *dname;
                int dtype;

                error = hammer2_xop_collect(&xop->head, 0);
                error = hammer2_error_to_errno(error);
                if (error) {
                        break;
                }
                if (cookie_index == ncookies)
                        break;
                if (hammer2_debug & 0x0020)
                        kprintf("cluster chain %p %p\n",
                                xop->head.cluster.focus,
                                (xop->head.cluster.focus ?
                                 xop->head.cluster.focus->data : (void *)-1));
                hammer2_cluster_bref(&xop->head.cluster, &bref);

                if (bref.type == HAMMER2_BREF_TYPE_INODE) {
                        ripdata = &hammer2_xop_gdata(&xop->head)->ipdata;
                        dtype = hammer2_get_dtype(ripdata->meta.type);
                        saveoff = bref.key & HAMMER2_DIRHASH_USERMSK;
                        r = vop_write_dirent(&error, uio,
                                             ripdata->meta.inum &
                                              HAMMER2_DIRHASH_USERMSK,
                                             dtype,
                                             ripdata->meta.name_len,
                                             ripdata->filename);
                        hammer2_xop_pdata(&xop->head);
                        if (r)
                                break;
                        if (cookies)
                                cookies[cookie_index] = saveoff;
                        ++cookie_index;
                } else if (bref.type == HAMMER2_BREF_TYPE_DIRENT) {
                        uint16_t namlen;

                        dtype = hammer2_get_dtype(bref.embed.dirent.type);
                        saveoff = bref.key & HAMMER2_DIRHASH_USERMSK;
                        namlen = bref.embed.dirent.namlen;
                        if (namlen <= sizeof(bref.check.buf)) {
                                dname = bref.check.buf;
                        } else {
                                dname = hammer2_xop_gdata(&xop->head)->buf;
                        }
                        r = vop_write_dirent(&error, uio,
                                             bref.embed.dirent.inum, dtype,
                                             namlen, dname);
                        if (namlen > sizeof(bref.check.buf))
                                hammer2_xop_pdata(&xop->head);
                        if (r)
                                break;
                        if (cookies)
                                cookies[cookie_index] = saveoff;
                        ++cookie_index;
                } else {
                        /* XXX chain error */
                        kprintf("bad chain type readdir %d\n", bref.type);
                }
        }
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        if (error == ENOENT) {
                error = 0;
                eofflag = 1;
                saveoff = (hammer2_key_t)-1;
        } else {
                saveoff = bref.key & HAMMER2_DIRHASH_USERMSK;
        }
done:
        hammer2_inode_unlock(ip);
        if (ap->a_eofflag)
                *ap->a_eofflag = eofflag;
        if (hammer2_debug & 0x0020)
                kprintf("readdir: done at %016jx\n", saveoff);
        uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
        if (error && cookie_index == 0) {
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        return (error);
}

/*
 * hammer2_vop_readlink { vp, uio, cred }
 */
static
int
hammer2_vop_readlink(struct vop_readlink_args *ap)
{
        struct vnode *vp;
        hammer2_inode_t *ip;
        int error;

        vp = ap->a_vp;
        if (vp->v_type != VLNK)
                return (EINVAL);
        ip = VTOI(vp);

        error = hammer2_read_file(ip, ap->a_uio, 0);
        return (error);
}

static
int
hammer2_vop_read(struct vop_read_args *ap)
{
        struct vnode *vp;
        hammer2_inode_t *ip;
        struct uio *uio;
        int error;
        int seqcount;
        int bigread;

        /*
         * Read operations supported on this vnode?
         */
        vp = ap->a_vp;
        if (vp->v_type != VREG)
                return (EINVAL);

        /*
         * Misc
         */
        ip = VTOI(vp);
        uio = ap->a_uio;
        error = 0;

        seqcount = ap->a_ioflag >> 16;
        bigread = (uio->uio_resid > 100 * 1024 * 1024);

        error = hammer2_read_file(ip, uio, seqcount);
        return (error);
}

static
int
hammer2_vop_write(struct vop_write_args *ap)
{
        hammer2_inode_t *ip;
        thread_t td;
        struct vnode *vp;
        struct uio *uio;
        int error;
        int seqcount;
        int ioflag;

        /*
         * Write operations supported on this vnode?
         */
        vp = ap->a_vp;
        if (vp->v_type != VREG)
                return (EINVAL);

        /*
         * Misc
         */
        ip = VTOI(vp);
        ioflag = ap->a_ioflag;
        uio = ap->a_uio;
        error = 0;
        if (ip->pmp->ronly)
                return (EROFS);
        switch (hammer2_vfs_enospace(ip, uio->uio_resid, ap->a_cred)) {
        case 2:
                return (ENOSPC);
        case 1:
                ioflag |= IO_DIRECT;    /* semi-synchronous */
                /* fall through */
        default:
                break;
        }

        seqcount = ioflag >> 16;

        /*
         * Check resource limit
         */
        if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
            uio->uio_offset + uio->uio_resid >
             td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
                lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
                return (EFBIG);
        }

        /*
         * The transaction interlocks against flush initiations
         * (note: but will run concurrently with the actual flush).
         *
         * To avoid deadlocking against the VM system, we must flag any
         * transaction related to the buffer cache or other direct
         * VM page manipulation.
         */
        if (uio->uio_segflg == UIO_NOCOPY) {
                hammer2_trans_init(ip->pmp, HAMMER2_TRANS_BUFCACHE);
        } else {
                hammer2_pfs_memory_wait(ip, 0);
                hammer2_trans_init(ip->pmp, 0);
        }
        error = hammer2_write_file(ip, uio, ioflag, seqcount);
        if (uio->uio_segflg == UIO_NOCOPY)
                hammer2_trans_done(ip->pmp, HAMMER2_TRANS_BUFCACHE |
                                            HAMMER2_TRANS_SIDEQ);
        else
                hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);

        return (error);
}

/*
 * Perform read operations on a file or symlink given an UNLOCKED
 * inode and uio (the passed ip is not locked).
 */
static
int
hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
{
        hammer2_off_t size;
        struct buf *bp;
        int error;

        error = 0;

        /*
         * UIO read loop.
         *
         * WARNING! Assumes that the kernel interlocks size changes at the
         *          vnode level.
         */
        hammer2_mtx_sh(&ip->lock);
        hammer2_mtx_sh(&ip->truncate_lock);
        size = ip->meta.size;
        hammer2_mtx_unlock(&ip->lock);

        while (uio->uio_resid > 0 && uio->uio_offset < size) {
                hammer2_key_t lbase;
                hammer2_key_t leof;
                int lblksize;
                int loff;
                int n;

                lblksize = hammer2_calc_logical(ip, uio->uio_offset,
                                                &lbase, &leof);

#if 1
                bp = NULL;
                error = cluster_readx(ip->vp, leof, lbase, lblksize,
                                      B_NOTMETA | B_KVABIO,
                                      uio->uio_resid,
                                      seqcount * MAXBSIZE,
                                      &bp);
#else
                if (uio->uio_segflg == UIO_NOCOPY) {
                        bp = getblk(ip->vp, lbase, lblksize,
                                    GETBLK_BHEAVY | GETBLK_KVABIO, 0);
                        if (bp->b_flags & B_CACHE) {
                                int i;
                                int j = 0;
                                if (bp->b_xio.xio_npages != 16)
                                        kprintf("NPAGES BAD\n");
                                for (i = 0; i < bp->b_xio.xio_npages; ++i) {
                                        vm_page_t m;
                                        m = bp->b_xio.xio_pages[i];
                                        if (m == NULL || m->valid == 0) {
                                                kprintf("bp %016jx %016jx pg %d inv",
                                                        lbase, leof, i);
                                                if (m)
                                                        kprintf("m->object %p/%p", m->object, ip->vp->v_object);
                                                kprintf("\n");
                                                j = 1;
                                        }
                                }
                                if (j)
                                        kprintf("b_flags %08x, b_error %d\n", bp->b_flags, bp->b_error);
                        }
                        bqrelse(bp);
                }
                error = bread_kvabio(ip->vp, lbase, lblksize, &bp);
#endif
                if (error) {
                        brelse(bp);
                        break;
                }
                bkvasync(bp);
                loff = (int)(uio->uio_offset - lbase);
                n = lblksize - loff;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > size - uio->uio_offset)
                        n = (int)(size - uio->uio_offset);
                bp->b_flags |= B_AGE;
                uiomovebp(bp, (char *)bp->b_data + loff, n, uio);
                bqrelse(bp);
        }
        hammer2_mtx_unlock(&ip->truncate_lock);

        return (error);
}

/*
 * Write to the file represented by the inode via the logical buffer cache.
 * The inode may represent a regular file or a symlink.
 *
 * The inode must not be locked.
 */
static
int
hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
                   int ioflag, int seqcount)
{
        hammer2_key_t old_eof;
        hammer2_key_t new_eof;
        struct buf *bp;
        int kflags;
        int error;
        int modified;

        /*
         * Setup if append
         *
         * WARNING! Assumes that the kernel interlocks size changes at the
         *          vnode level.
         */
        hammer2_mtx_ex(&ip->lock);
        hammer2_mtx_sh(&ip->truncate_lock);
        if (ioflag & IO_APPEND)
                uio->uio_offset = ip->meta.size;
        old_eof = ip->meta.size;

        /*
         * Extend the file if necessary.  If the write fails at some point
         * we will truncate it back down to cover as much as we were able
         * to write.
         *
         * Doing this now makes it easier to calculate buffer sizes in
         * the loop.
         */
        kflags = 0;
        error = 0;
        modified = 0;

        if (uio->uio_offset + uio->uio_resid > old_eof) {
                new_eof = uio->uio_offset + uio->uio_resid;
                modified = 1;
                hammer2_extend_file(ip, new_eof);
                kflags |= NOTE_EXTEND;
        } else {
                new_eof = old_eof;
        }
        hammer2_mtx_unlock(&ip->lock);

        /*
         * UIO write loop
         */
        while (uio->uio_resid > 0) {
                hammer2_key_t lbase;
                int trivial;
                int endofblk;
                int lblksize;
                int loff;
                int n;

                /*
                 * Don't allow the buffer build to blow out the buffer
                 * cache.
                 */
                if ((ioflag & IO_RECURSE) == 0)
                        bwillwrite(HAMMER2_PBUFSIZE);

                /*
                 * This nominally tells us how much we can cluster and
                 * what the logical buffer size needs to be.  Currently
                 * we don't try to cluster the write and just handle one
                 * block at a time.
                 */
                lblksize = hammer2_calc_logical(ip, uio->uio_offset,
                                                &lbase, NULL);
                loff = (int)(uio->uio_offset - lbase);

                KKASSERT(lblksize <= 65536);

                /*
                 * Calculate bytes to copy this transfer and whether the
                 * copy completely covers the buffer or not.
                 */
                trivial = 0;
                n = lblksize - loff;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
                        if (loff == lbase && uio->uio_offset + n == new_eof)
                                trivial = 1;
                        endofblk = 0;
                } else {
                        if (loff == 0)
                                trivial = 1;
                        endofblk = 1;
                }
                if (lbase >= new_eof)
                        trivial = 1;

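                /*
                 * e.g. a 100 byte write at loff 0 into a 16384 byte
                 * logical buffer which does not reach new_eof yields
                 * n = 100, trivial = 0, endofblk = 0 and requires the
                 * read-modify-write path below.
                 */
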
                /*
                 * Get the buffer
                 */
                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ip->vp, lbase, lblksize,
                                    GETBLK_BHEAVY | GETBLK_KVABIO, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread_kvabio(ip->vp, lbase,
                                                     lblksize, &bp);
                        }
                } else if (trivial) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ip->vp, lbase, lblksize,
                                    GETBLK_BHEAVY | GETBLK_KVABIO, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         *
                         * (The strategy code will detect zero-fill physical
                         * blocks for this case).
                         */
                        error = bread_kvabio(ip->vp, lbase, lblksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }

                if (error) {
                        brelse(bp);
                        break;
                }

                /*
                 * Ok, copy the data in
                 */
                bkvasync(bp);
                error = uiomovebp(bp, bp->b_data + loff, n, uio);
                kflags |= NOTE_WRITE;
                modified = 1;
                if (error) {
                        brelse(bp);
                        break;
                }

                /*
                 * WARNING: Pageout daemon will issue UIO_NOCOPY writes
                 *          with IO_SYNC or IO_ASYNC set.  These writes
                 *          must be handled as the pageout daemon expects.
                 *
                 * NOTE!    H2 relies on cluster_write() here because it
                 *          cannot preallocate disk blocks at the logical
                 *          level due to not knowing what the compression
                 *          size will be at this time.
                 *
                 *          We must use cluster_write() here and we depend
                 *          on the write-behind feature to flush buffers
                 *          appropriately.  If we let the buffer daemons do
                 *          it the block allocations will be all over the
                 *          map.
                 */
                if (ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else if (ioflag & IO_ASYNC) {
                        bawrite(bp);
                } else if (ip->vp->v_mount->mnt_flag & MNT_NOCLUSTERW) {
                        bdwrite(bp);
                } else {
#if 1
                        bp->b_flags |= B_CLUSTEROK;
                        cluster_write(bp, new_eof, lblksize, seqcount);
#else
                        bp->b_flags |= B_CLUSTEROK;
                        bdwrite(bp);
#endif
                }
        }

        /*
         * Cleanup.  If we extended the file EOF but failed to write
         * through, the entire write is a failure and we have to back up.
         */
        if (error && new_eof != old_eof) {
                hammer2_mtx_unlock(&ip->truncate_lock);
                hammer2_mtx_ex(&ip->lock);
                hammer2_mtx_ex(&ip->truncate_lock);
                hammer2_truncate_file(ip, old_eof);
                if (ip->flags & HAMMER2_INODE_MODIFIED)
                        hammer2_inode_chain_sync(ip);
                hammer2_mtx_unlock(&ip->lock);
        } else if (modified) {
                struct vnode *vp = ip->vp;

                hammer2_mtx_ex(&ip->lock);
                hammer2_inode_modify(ip);
                if (uio->uio_segflg == UIO_NOCOPY) {
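                        /*
                         * NOTE: ip->meta.mtime is stored in microseconds
                         *       since the epoch, hence the conversion
                         *       from v_lastwrite_ts below.
                         */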
                        if (vp->v_flag & VLASTWRITETS) {
                                ip->meta.mtime =
                                    (unsigned long)vp->v_lastwrite_ts.tv_sec *
                                    1000000 +
                                    vp->v_lastwrite_ts.tv_nsec / 1000;
                        }
                } else {
                        hammer2_update_time(&ip->meta.mtime);
                        vclrflags(vp, VLASTWRITETS);
                }

#if 0
                /*
                 * REMOVED - handled by hammer2_extend_file().  Do not issue
                 * a chain_sync() outside of a sync/fsync except for DIRECTDATA
                 * state changes.
                 *
                 * Under normal conditions we only issue a chain_sync if
                 * the inode's DIRECTDATA state changed.
                 */
                if (ip->flags & HAMMER2_INODE_RESIZED)
                        hammer2_inode_chain_sync(ip);
#endif
                hammer2_mtx_unlock(&ip->lock);
                hammer2_knote(ip->vp, kflags);
        }
        hammer2_trans_assert_strategy(ip->pmp);
        hammer2_mtx_unlock(&ip->truncate_lock);

        return error;
}

/*
 * Truncate the size of a file.  The inode must not be locked.
 *
 * We must unconditionally set HAMMER2_INODE_RESIZED to properly
 * ensure that any on-media data beyond the new file EOF has been destroyed.
 *
 * WARNING: nvtruncbuf() can only be safely called without the inode lock
 *          held due to the way our write thread works.  If the truncation
 *          occurs in the middle of a buffer, nvtruncbuf() is responsible
 *          for dirtying that buffer and zeroing out trailing bytes.
 *
 * WARNING! Assumes that the kernel interlocks size changes at the
 *          vnode level.
 *
 * WARNING! Caller assumes responsibility for removing dead blocks
 *          if INODE_RESIZED is set.
 */
static
void
hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
{
        hammer2_key_t lbase;
        int nblksize;

        hammer2_mtx_unlock(&ip->lock);
        if (ip->vp) {
                nblksize = hammer2_calc_logical(ip, nsize, &lbase, NULL);
                nvtruncbuf(ip->vp, nsize,
                           nblksize, (int)nsize & (nblksize - 1),
                           0);
        }
        hammer2_mtx_ex(&ip->lock);
        KKASSERT((ip->flags & HAMMER2_INODE_RESIZED) == 0);
        ip->osize = ip->meta.size;
        ip->meta.size = nsize;
        atomic_set_int(&ip->flags, HAMMER2_INODE_RESIZED);
        hammer2_inode_modify(ip);
}

/*
 * Extend the size of a file.  The inode must not be locked.
 *
 * Even though the file size is changing, we do not have to set the
 * INODE_RESIZED bit unless the file size crosses the EMBEDDED_BYTES
 * boundary.  When this occurs a hammer2_inode_chain_sync() is required
 * to prepare the inode cluster's indirect block table, otherwise
 * async execution of the strategy code will implode on us.
 *
 * WARNING! Assumes that the kernel interlocks size changes at the
 *          vnode level.
 *
 * WARNING! Caller assumes responsibility for transitioning out
 *          of the inode DIRECTDATA mode if INODE_RESIZED is set.
 */
static
void
hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
{
        hammer2_key_t lbase;
        hammer2_key_t osize;
        int oblksize;
        int nblksize;

        KKASSERT((ip->flags & HAMMER2_INODE_RESIZED) == 0);
        hammer2_inode_modify(ip);
        osize = ip->meta.size;
        ip->osize = osize;
        ip->meta.size = nsize;

        /*
         * We must issue a chain_sync() when the DIRECTDATA state changes
         * to prevent confusion between the flush code and the in-memory
         * state.  This is not perfect because we are doing it outside of
         * a sync/fsync operation, so it might not be fully synchronized
         * with the meta-data topology flush.
         */
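        /*
         * (HAMMER2_EMBEDDED_BYTES is the amount of file data that can be
         * stored directly in the inode itself -- nominally 512 bytes.)
         */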
        if (osize <= HAMMER2_EMBEDDED_BYTES && nsize > HAMMER2_EMBEDDED_BYTES) {
                atomic_set_int(&ip->flags, HAMMER2_INODE_RESIZED);
                hammer2_inode_chain_sync(ip);
        }

        hammer2_mtx_unlock(&ip->lock);
        if (ip->vp) {
                oblksize = hammer2_calc_logical(ip, osize, &lbase, NULL);
                nblksize = hammer2_calc_logical(ip, nsize, &lbase, NULL);
                nvextendbuf(ip->vp,
                            osize, nsize,
                            oblksize, nblksize,
                            -1, -1, 0);
        }
        hammer2_mtx_ex(&ip->lock);
}

static
int
hammer2_vop_nresolve(struct vop_nresolve_args *ap)
{
        hammer2_xop_nresolve_t *xop;
        hammer2_inode_t *ip;
        hammer2_inode_t *dip;
        struct namecache *ncp;
        struct vnode *vp;
        int error;

        dip = VTOI(ap->a_dvp);
        xop = hammer2_xop_alloc(dip, 0);

        ncp = ap->a_nch->ncp;
        hammer2_xop_setname(&xop->head, ncp->nc_name, ncp->nc_nlen);

        /*
         * Note: In DragonFly the kernel handles '.' and '..'.
         */
        hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);
        hammer2_xop_start(&xop->head, &hammer2_nresolve_desc);

        error = hammer2_xop_collect(&xop->head, 0);
        error = hammer2_error_to_errno(error);
        if (error) {
                ip = NULL;
        } else {
                ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
        }
        hammer2_inode_unlock(dip);

        /*
         * Acquire the related vnode
         *
         * NOTE: For error processing, only ENOENT resolves the namecache
         *       entry to NULL, otherwise we just return the error and
         *       leave the namecache unresolved.
         *
         * NOTE: multiple hammer2_inode structures can be aliased to the
         *       same chain element, for example for hardlinks.  This
         *       use case does not 'reattach' inode associations that
         *       might already exist, but always allocates a new one.
         *
         * WARNING: inode structure is locked exclusively via inode_get
         *          but chain was locked shared.  inode_unlock()
         *          will handle it properly.
         */
        if (ip) {
                vp = hammer2_igetv(ip, &error); /* error set to UNIX error */
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                } else if (error == ENOENT) {
                        cache_setvp(ap->a_nch, NULL);
                }
                hammer2_inode_unlock(ip);

                /*
                 * The vp should not be released until after we've disposed
                 * of our locks, because it might cause vop_inactive() to
                 * be called.
                 */
                if (vp)
                        vrele(vp);
        } else {
                error = ENOENT;
                cache_setvp(ap->a_nch, NULL);
        }
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        KASSERT(error || ap->a_nch->ncp->nc_vp != NULL,
                ("resolve error %d/%p ap %p\n",
                 error, ap->a_nch->ncp->nc_vp, ap));

        return error;
}

static
int
hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
        hammer2_inode_t *dip;
        hammer2_tid_t inum;
        int error;

        dip = VTOI(ap->a_dvp);
        inum = dip->meta.iparent;
        *ap->a_vpp = NULL;

        if (inum) {
                error = hammer2_vfs_vget(ap->a_dvp->v_mount, NULL,
                                         inum, ap->a_vpp);
        } else {
                error = ENOENT;
        }
        return error;
}
1365
1366 static
1367 int
1368 hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
1369 {
1370         hammer2_inode_t *dip;
1371         hammer2_inode_t *nip;
1372         struct namecache *ncp;
1373         const uint8_t *name;
1374         size_t name_len;
1375         hammer2_tid_t inum;
1376         int error;
1377
1378         dip = VTOI(ap->a_dvp);
1379         if (dip->pmp->ronly)
1380                 return (EROFS);
1381         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1382                 return (ENOSPC);
1383
1384         ncp = ap->a_nch->ncp;
1385         name = ncp->nc_name;
1386         name_len = ncp->nc_nlen;
1387
1388         hammer2_pfs_memory_wait(dip, 1);
1389         hammer2_trans_init(dip->pmp, 0);
1390
1391         inum = hammer2_trans_newinum(dip->pmp);
1392
1393         /*
1394          * Create the actual inode as a hidden file in the iroot, then
1395          * create the directory entry.  The creation of the actual inode
1396          * sets its nlinks to 1 which is the value we desire.
1397          *
1398          * dip must be locked before nip to avoid deadlock.
1399          */
1400         hammer2_inode_lock(dip, 0);
1401         nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
1402                                           inum, &error);
1403         if (error) {
1404                 error = hammer2_error_to_errno(error);
1405         } else {
1406                 error = hammer2_dirent_create(dip, name, name_len,
1407                                               nip->meta.inum, nip->meta.type);
1408                 /* returns UNIX error code */
1409         }
1410         if (error) {
1411                 if (nip) {
1412                         hammer2_inode_unlink_finisher(nip, 0);
1413                         hammer2_inode_unlock(nip);
1414                         nip = NULL;
1415                 }
1416                 *ap->a_vpp = NULL;
1417         } else {
1418                 *ap->a_vpp = hammer2_igetv(nip, &error);
1419                 hammer2_inode_depend(dip, nip);
1420                 hammer2_inode_unlock(nip);
1421         }
1422
1423         /*
1424          * Update dip's mtime
1425          *
1426          * We can use a shared inode lock and allow the meta.mtime update
1427          * SMP race.  hammer2_inode_modify() is MPSAFE w/a shared lock.
1428          */
1429         if (error == 0) {
1430                 uint64_t mtime;
1431
1432                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
1433                 hammer2_update_time(&mtime);
1434                 hammer2_inode_modify(dip);
1435                 dip->meta.mtime = mtime;
1436                 /*hammer2_inode_unlock(dip);*/
1437         }
1438         hammer2_inode_unlock(dip);
1439
1440         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1441
1442         if (error == 0) {
1443                 cache_setunresolved(ap->a_nch);
1444                 cache_setvp(ap->a_nch, *ap->a_vpp);
1445                 hammer2_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
1446         }
1447         return error;
1448 }
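/*
 * Editorial sketch (assumption, not in the original source): nmkdir
 * above and ncreate, nmknod and nsymlink below all follow the same
 * two-step pattern, roughly:
 *
 *	hammer2_inode_lock(dip, 0);
 *	nip = hammer2_inode_create_normal(dip, vap, cred, inum, &error);
 *	if (error == 0)
 *		error = hammer2_dirent_create(dip, name, name_len,
 *					      nip->meta.inum, nip->meta.type);
 *
 * The inode is created first (hidden, indexed under the iroot), then
 * the visible directory entry pointing at it, so a crash between the
 * two steps presumably leaves at worst an unreferenced inode rather
 * than a directory entry with no backing inode.
 */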
1449
1450 static
1451 int
1452 hammer2_vop_open(struct vop_open_args *ap)
1453 {
1454         return vop_stdopen(ap);
1455 }
1456
1457 /*
1458  * hammer2_vop_advlock { vp, id, op, fl, flags }
1459  */
1460 static
1461 int
1462 hammer2_vop_advlock(struct vop_advlock_args *ap)
1463 {
1464         hammer2_inode_t *ip = VTOI(ap->a_vp);
1465         hammer2_off_t size;
1466
1467         size = ip->meta.size;
1468         return (lf_advlock(ap, &ip->advlock, size));
1469 }
1470
1471 static
1472 int
1473 hammer2_vop_close(struct vop_close_args *ap)
1474 {
1475         return vop_stdclose(ap);
1476 }
1477
1478 /*
1479  * hammer2_vop_nlink { nch, dvp, vp, cred }
1480  *
1481  * Create a hardlink from (vp) to {dvp, nch}.
1482  */
1483 static
1484 int
1485 hammer2_vop_nlink(struct vop_nlink_args *ap)
1486 {
1487         hammer2_inode_t *tdip;  /* target directory to create link in */
1488         hammer2_inode_t *ip;    /* inode we are hardlinking to */
1489         struct namecache *ncp;
1490         const uint8_t *name;
1491         size_t name_len;
1492         int error;
1493
1494         if (ap->a_dvp->v_mount != ap->a_vp->v_mount)
1495                 return(EXDEV);
1496
1497         tdip = VTOI(ap->a_dvp);
1498         if (tdip->pmp->ronly)
1499                 return (EROFS);
1500         if (hammer2_vfs_enospace(tdip, 0, ap->a_cred) > 1)
1501                 return (ENOSPC);
1502
1503         ncp = ap->a_nch->ncp;
1504         name = ncp->nc_name;
1505         name_len = ncp->nc_nlen;
1506
1507         /*
1508          * ip represents the file being hardlinked.  The file could be a
1509          * normal file or a hardlink target if it has already been
1510          * hardlinked (with the new semantics, it will almost always be
1511          * a hardlink target).
1512          *
1513          * Bump nlinks and potentially also create or move the hardlink
1514          * target in the parent directory common to (ip) and (tdip).  The
1515          * consolidation code can modify ip->cluster.  The returned cluster
1516          * is locked.
1517          */
1518         ip = VTOI(ap->a_vp);
1519         KASSERT(ip->pmp, ("ip->pmp is NULL %p %p", ip, ip->pmp));
1520         hammer2_pfs_memory_wait(ip, 0);
1521         hammer2_trans_init(ip->pmp, 0);
1522
1523         /*
1524          * Target should be an indexed inode or there's no way we will ever
1525          * be able to find it!
1526          */
1527         KKASSERT((ip->meta.name_key & HAMMER2_DIRHASH_VISIBLE) == 0);
1528
1529         error = 0;
1530
1531         /*
1532          * Can return NULL and error == EXDEV if the common parent
1533          * crosses a directory with the xlink flag set.
1534          */
1535         hammer2_inode_lock4(tdip, ip, NULL, NULL);
1536
1537         /*
1538          * Create the directory entry and bump nlinks.
1539          */
1540         if (error == 0) {
1541                 error = hammer2_dirent_create(tdip, name, name_len,
1542                                               ip->meta.inum, ip->meta.type);
1543                 hammer2_inode_modify(ip);
1544                 ++ip->meta.nlinks;
1545         }
1546         if (error == 0) {
1547                 /*
1548                  * Update dip's mtime
1549                  */
1550                 uint64_t mtime;
1551
1552                 hammer2_update_time(&mtime);
1553                 hammer2_inode_modify(tdip);
1554                 tdip->meta.mtime = mtime;
1555
1556                 cache_setunresolved(ap->a_nch);
1557                 cache_setvp(ap->a_nch, ap->a_vp);
1558         }
1559         hammer2_inode_unlock(ip);
1560         hammer2_inode_unlock(tdip);
1561
1562         hammer2_trans_done(ip->pmp, HAMMER2_TRANS_SIDEQ);
1563         hammer2_knote(ap->a_vp, NOTE_LINK);
1564         hammer2_knote(ap->a_dvp, NOTE_WRITE);
1565
1566         return error;
1567 }
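/*
 * Editorial note: because the directory entry created above references
 * the target only by inode number (ip->meta.inum), a hardlink is
 * simply one more dirent plus an nlinks bump; no inode or file data
 * is copied.
 */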
1568
1569 /*
1570  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1571  *
1572  * The operating system has already ensured that the directory entry
1573  * does not exist and done all appropriate namespace locking.
1574  */
1575 static
1576 int
1577 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1578 {
1579         hammer2_inode_t *dip;
1580         hammer2_inode_t *nip;
1581         struct namecache *ncp;
1582         const uint8_t *name;
1583         size_t name_len;
1584         hammer2_tid_t inum;
1585         int error;
1586
1587         dip = VTOI(ap->a_dvp);
1588         if (dip->pmp->ronly)
1589                 return (EROFS);
1590         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1591                 return (ENOSPC);
1592
1593         ncp = ap->a_nch->ncp;
1594         name = ncp->nc_name;
1595         name_len = ncp->nc_nlen;
1596         hammer2_pfs_memory_wait(dip, 1);
1597         hammer2_trans_init(dip->pmp, 0);
1598
1599         inum = hammer2_trans_newinum(dip->pmp);
1600
1601         /*
1602          * Create the actual inode as a hidden file in the iroot, then
1603          * create the directory entry.  The creation of the actual inode
1604          * sets its nlinks to 1, which is the value we desire.
1605          *
1606          * dip must be locked before nip to avoid deadlock.
1607          */
1608         hammer2_inode_lock(dip, 0);
1609         nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
1610                                           inum, &error);
1611
1612         if (error) {
1613                 error = hammer2_error_to_errno(error);
1614         } else {
1615                 error = hammer2_dirent_create(dip, name, name_len,
1616                                               nip->meta.inum, nip->meta.type);
1617         }
1618         if (error) {
1619                 if (nip) {
1620                         hammer2_inode_unlink_finisher(nip, 0);
1621                         hammer2_inode_unlock(nip);
1622                         nip = NULL;
1623                 }
1624                 *ap->a_vpp = NULL;
1625         } else {
1626                 *ap->a_vpp = hammer2_igetv(nip, &error);
1627                 hammer2_inode_depend(dip, nip);
1628                 hammer2_inode_unlock(nip);
1629         }
1630
1631         /*
1632          * Update dip's mtime
1633          */
1634         if (error == 0) {
1635                 uint64_t mtime;
1636
1637                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
1638                 hammer2_update_time(&mtime);
1639                 hammer2_inode_modify(dip);
1640                 dip->meta.mtime = mtime;
1641                 /*hammer2_inode_unlock(dip);*/
1642         }
1643         hammer2_inode_unlock(dip);
1644
1645         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1646
1647         if (error == 0) {
1648                 cache_setunresolved(ap->a_nch);
1649                 cache_setvp(ap->a_nch, *ap->a_vpp);
1650                 hammer2_knote(ap->a_dvp, NOTE_WRITE);
1651         }
1652         return error;
1653 }
1654
1655 /*
1656  * Make a device node (typically a fifo)
1657  */
1658 static
1659 int
1660 hammer2_vop_nmknod(struct vop_nmknod_args *ap)
1661 {
1662         hammer2_inode_t *dip;
1663         hammer2_inode_t *nip;
1664         struct namecache *ncp;
1665         const uint8_t *name;
1666         size_t name_len;
1667         hammer2_tid_t inum;
1668         int error;
1669
1670         dip = VTOI(ap->a_dvp);
1671         if (dip->pmp->ronly)
1672                 return (EROFS);
1673         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1674                 return (ENOSPC);
1675
1676         ncp = ap->a_nch->ncp;
1677         name = ncp->nc_name;
1678         name_len = ncp->nc_nlen;
1679         hammer2_pfs_memory_wait(dip, 1);
1680         hammer2_trans_init(dip->pmp, 0);
1681
1682         /*
1683          * Create the device inode and then create the directory entry.
1684          *
1685          * dip must be locked before nip to avoid deadlock.
1686          */
1687         inum = hammer2_trans_newinum(dip->pmp);
1688
1689         hammer2_inode_lock(dip, 0);
1690         nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
1691                                           inum, &error);
1692         if (error == 0) {
1693                 error = hammer2_dirent_create(dip, name, name_len,
1694                                               nip->meta.inum, nip->meta.type);
1695         }
1696         if (error) {
1697                 if (nip) {
1698                         hammer2_inode_unlink_finisher(nip, 0);
1699                         hammer2_inode_unlock(nip);
1700                         nip = NULL;
1701                 }
1702                 *ap->a_vpp = NULL;
1703         } else {
1704                 *ap->a_vpp = hammer2_igetv(nip, &error);
1705                 hammer2_inode_depend(dip, nip);
1706                 hammer2_inode_unlock(nip);
1707         }
1708
1709         /*
1710          * Update dip's mtime
1711          */
1712         if (error == 0) {
1713                 uint64_t mtime;
1714
1715                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
1716                 hammer2_update_time(&mtime);
1717                 hammer2_inode_modify(dip);
1718                 dip->meta.mtime = mtime;
1719                 /*hammer2_inode_unlock(dip);*/
1720         }
1721         hammer2_inode_unlock(dip);
1722
1723         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1724
1725         if (error == 0) {
1726                 cache_setunresolved(ap->a_nch);
1727                 cache_setvp(ap->a_nch, *ap->a_vpp);
1728                 hammer2_knote(ap->a_dvp, NOTE_WRITE);
1729         }
1730         return error;
1731 }
1732
1733 /*
1734  * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1735  */
1736 static
1737 int
1738 hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
1739 {
1740         hammer2_inode_t *dip;
1741         hammer2_inode_t *nip;
1742         struct namecache *ncp;
1743         const uint8_t *name;
1744         size_t name_len;
1745         hammer2_tid_t inum;
1746         int error;
1747
1748         dip = VTOI(ap->a_dvp);
1749         if (dip->pmp->ronly)
1750                 return (EROFS);
1751         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1752                 return (ENOSPC);
1753
1754         ncp = ap->a_nch->ncp;
1755         name = ncp->nc_name;
1756         name_len = ncp->nc_nlen;
1757         hammer2_pfs_memory_wait(dip, 1);
1758         hammer2_trans_init(dip->pmp, 0);
1759
1760         ap->a_vap->va_type = VLNK;      /* enforce type */
1761
1762         /*
1763          * Create the softlink as an inode and then create the directory
1764          * entry.
1765          *
1766          * dip must be locked before nip to avoid deadlock.
1767          */
1768         inum = hammer2_trans_newinum(dip->pmp);
1769
1770         hammer2_inode_lock(dip, 0);
1771         nip = hammer2_inode_create_normal(dip, ap->a_vap, ap->a_cred,
1772                                           inum, &error);
1773         if (error == 0) {
1774                 error = hammer2_dirent_create(dip, name, name_len,
1775                                               nip->meta.inum, nip->meta.type);
1776         }
1777         if (error) {
1778                 if (nip) {
1779                         hammer2_inode_unlink_finisher(nip, 0);
1780                         hammer2_inode_unlock(nip);
1781                         nip = NULL;
1782                 }
1783                 *ap->a_vpp = NULL;
1784                 hammer2_inode_unlock(dip);
1785                 hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1786                 return error;
1787         }
1788         *ap->a_vpp = hammer2_igetv(nip, &error);
1789         hammer2_inode_depend(dip, nip);
1790
1791         /*
1792          * Build the softlink (stored like file data) and finalize the namecache.
1793          */
1794         if (error == 0) {
1795                 size_t bytes;
1796                 struct uio auio;
1797                 struct iovec aiov;
1798
1799                 bytes = strlen(ap->a_target);
1800
1801                 hammer2_inode_unlock(nip);
1802                 bzero(&auio, sizeof(auio));
1803                 bzero(&aiov, sizeof(aiov));
1804                 auio.uio_iov = &aiov;
1805                 auio.uio_segflg = UIO_SYSSPACE;
1806                 auio.uio_rw = UIO_WRITE;
1807                 auio.uio_resid = bytes;
1808                 auio.uio_iovcnt = 1;
1809                 auio.uio_td = curthread;
1810                 aiov.iov_base = ap->a_target;
1811                 aiov.iov_len = bytes;
1812                 error = hammer2_write_file(nip, &auio, IO_APPEND, 0);
1813                 /* XXX handle error */
1814                 error = 0;
1815         } else {
1816                 hammer2_inode_unlock(nip);
1817         }
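        /*
         * Editorial note: the target string was written above with a
         * kernel-space uio (UIO_SYSSPACE) through the regular file
         * write path, i.e. the softlink target is stored as ordinary
         * file data on the new inode.
         */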
1818
1819         /*
1820          * Update dip's mtime
1821          */
1822         if (error == 0) {
1823                 uint64_t mtime;
1824
1825                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
1826                 hammer2_update_time(&mtime);
1827                 hammer2_inode_modify(dip);
1828                 dip->meta.mtime = mtime;
1829                 /*hammer2_inode_unlock(dip);*/
1830         }
1831         hammer2_inode_unlock(dip);
1832
1833         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1834
1835         /*
1836          * Finalize namecache
1837          */
1838         if (error == 0) {
1839                 cache_setunresolved(ap->a_nch);
1840                 cache_setvp(ap->a_nch, *ap->a_vpp);
1841                 hammer2_knote(ap->a_dvp, NOTE_WRITE);
1842         }
1843         return error;
1844 }
1845
1846 /*
1847  * hammer2_vop_nremove { nch, dvp, cred }
1848  */
1849 static
1850 int
1851 hammer2_vop_nremove(struct vop_nremove_args *ap)
1852 {
1853         hammer2_xop_unlink_t *xop;
1854         hammer2_inode_t *dip;
1855         hammer2_inode_t *ip;
1856         struct namecache *ncp;
1857         int error;
1858         int isopen;
1859
1860         dip = VTOI(ap->a_dvp);
1861         if (dip->pmp->ronly)
1862                 return (EROFS);
1863 #if 0
1864         /* allow removals, expect the user to also bulkfree */
1865         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1866                 return (ENOSPC);
1867 #endif
1868
1869         ncp = ap->a_nch->ncp;
1870
1871         hammer2_pfs_memory_wait(dip, 1);
1872         hammer2_trans_init(dip->pmp, 0);
1873         hammer2_inode_lock(dip, 0);
1874
1875         /*
1876          * The unlink XOP unlinks the path from the directory and
1877          * locates and returns the cluster associated with the real inode.
1878          * We have to handle nlinks here on the frontend.
1879          */
1880         xop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
1881         hammer2_xop_setname(&xop->head, ncp->nc_name, ncp->nc_nlen);
1882
1883         /*
1884          * The namecache entry is locked so nobody can use this namespace.
1885          * Calculate isopen to determine if this namespace has an open vp
1886          * associated with it and resolve the vp only if it does.
1887          *
1888          * We try to avoid resolving the vnode if nobody has it open, but
1889          * note that the test is via this namespace only.
1890          */
1891         isopen = cache_isopen(ap->a_nch);
1892         xop->isdir = 0;
1893         xop->dopermanent = 0;
1894         hammer2_xop_start(&xop->head, &hammer2_unlink_desc);
1895
1896         /*
1897          * Collect the real inode and adjust nlinks; destroy the real
1898          * inode if nlinks transitions to 0 and it was the real inode
1899          * (otherwise it has already been removed).
1900          */
1901         error = hammer2_xop_collect(&xop->head, 0);
1902         error = hammer2_error_to_errno(error);
1903
1904         if (error == 0) {
1905                 ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
1906                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1907                 if (ip) {
1908                         hammer2_inode_unlink_finisher(ip, isopen);
1909                         hammer2_inode_depend(dip, ip); /* after modified */
1910                         hammer2_inode_unlock(ip);
1911                 }
1912         } else {
1913                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1914         }
1915
1916         /*
1917          * Update dip's mtime
1918          */
1919         if (error == 0) {
1920                 uint64_t mtime;
1921
1922                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
1923                 hammer2_update_time(&mtime);
1924                 hammer2_inode_modify(dip);
1925                 dip->meta.mtime = mtime;
1926                 /*hammer2_inode_unlock(dip);*/
1927         }
1928         hammer2_inode_unlock(dip);
1929
1930         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
1931         if (error == 0) {
1932                 cache_unlink(ap->a_nch);
1933                 hammer2_knote(ap->a_dvp, NOTE_WRITE);
1934         }
1935         return (error);
1936 }
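/*
 * Editorial note: nremove shows the frontend/backend XOP split used
 * throughout these VOPs: the frontend allocates an xop, attaches the
 * name, and hammer2_xop_start() fans the operation out to the backend
 * threads; hammer2_xop_collect() then gathers the per-node results
 * into a focus which hammer2_inode_get() turns back into an in-memory
 * inode.  nlinks accounting deliberately stays on the frontend, as
 * the comment above notes.
 */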
1937
1938 /*
1939  * hammer2_vop_nrmdir { nch, dvp, cred }
1940  */
1941 static
1942 int
1943 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
1944 {
1945         hammer2_xop_unlink_t *xop;
1946         hammer2_inode_t *dip;
1947         hammer2_inode_t *ip;
1948         struct namecache *ncp;
1949         int isopen;
1950         int error;
1951
1952         dip = VTOI(ap->a_dvp);
1953         if (dip->pmp->ronly)
1954                 return (EROFS);
1955 #if 0
1956         /* allow removals, expect the user to also bulkfree */
1957         if (hammer2_vfs_enospace(dip, 0, ap->a_cred) > 1)
1958                 return (ENOSPC);
1959 #endif
1960
1961         hammer2_pfs_memory_wait(dip, 1);
1962         hammer2_trans_init(dip->pmp, 0);
1963         hammer2_inode_lock(dip, 0);
1964
1965         xop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
1966
1967         ncp = ap->a_nch->ncp;
1968         hammer2_xop_setname(&xop->head, ncp->nc_name, ncp->nc_nlen);
1969         isopen = cache_isopen(ap->a_nch);
1970         xop->isdir = 1;
1971         xop->dopermanent = 0;
1972         hammer2_xop_start(&xop->head, &hammer2_unlink_desc);
1973
1974         /*
1975          * Collect the real inode and adjust nlinks; destroy the real
1976          * inode if nlinks transitions to 0 and it was the real inode
1977          * (otherwise it has already been removed).
1978          */
1979         error = hammer2_xop_collect(&xop->head, 0);
1980         error = hammer2_error_to_errno(error);
1981
1982         if (error == 0) {
1983                 ip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
1984                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1985                 if (ip) {
1986                         hammer2_inode_unlink_finisher(ip, isopen);
1987                         hammer2_inode_depend(dip, ip);
1988                         hammer2_inode_unlock(ip);
1989                 }
1990         } else {
1991                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1992         }
1993
1994         /*
1995          * Update dip's mtime
1996          */
1997         if (error == 0) {
1998                 uint64_t mtime;
1999
2000                 /*hammer2_inode_lock(dip, HAMMER2_RESOLVE_SHARED);*/
2001                 hammer2_update_time(&mtime);
2002                 hammer2_inode_modify(dip);
2003                 dip->meta.mtime = mtime;
2004                 /*hammer2_inode_unlock(dip);*/
2005         }
2006         hammer2_inode_unlock(dip);
2007
2008         hammer2_trans_done(dip->pmp, HAMMER2_TRANS_SIDEQ);
2009         if (error == 0) {
2010                 cache_unlink(ap->a_nch);
2011                 hammer2_knote(ap->a_dvp, NOTE_WRITE | NOTE_LINK);
2012         }
2013         return (error);
2014 }
2015
2016 /*
2017  * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
2018  */
2019 static
2020 int
2021 hammer2_vop_nrename(struct vop_nrename_args *ap)
2022 {
2023         struct namecache *fncp;
2024         struct namecache *tncp;
2025         hammer2_inode_t *fdip;  /* source directory */
2026         hammer2_inode_t *tdip;  /* target directory */
2027         hammer2_inode_t *ip;    /* file being renamed */
2028         hammer2_inode_t *tip;   /* replaced target during rename or NULL */
2029         const uint8_t *fname;
2030         size_t fname_len;
2031         const uint8_t *tname;
2032         size_t tname_len;
2033         int error;
2034         int update_tdip;
2035         int update_fdip;
2036         hammer2_key_t tlhc;
2037
2038         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
2039                 return(EXDEV);
2040         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
2041                 return(EXDEV);
2042
2043         fdip = VTOI(ap->a_fdvp);        /* source directory */
2044         tdip = VTOI(ap->a_tdvp);        /* target directory */
2045
2046         if (fdip->pmp->ronly)
2047                 return (EROFS);
2048         if (hammer2_vfs_enospace(fdip, 0, ap->a_cred) > 1)
2049                 return (ENOSPC);
2050
2051         fncp = ap->a_fnch->ncp;         /* entry name in source */
2052         fname = fncp->nc_name;
2053         fname_len = fncp->nc_nlen;
2054
2055         tncp = ap->a_tnch->ncp;         /* entry name in target */
2056         tname = tncp->nc_name;
2057         tname_len = tncp->nc_nlen;
2058
2059         hammer2_pfs_memory_wait(tdip, 0);
2060         hammer2_trans_init(tdip->pmp, 0);
2061
2062         update_tdip = 0;
2063         update_fdip = 0;
2064
2065         ip = VTOI(fncp->nc_vp);
2066         hammer2_inode_ref(ip);          /* extra ref */
2067
2068         /*
2069          * Lookup the target name to determine if a directory entry
2070          * is being overwritten.  We only hold related inode locks
2071          * temporarily; the operating system is expected to protect
2072          * against rename races.
2073          */
2074         tip = tncp->nc_vp ? VTOI(tncp->nc_vp) : NULL;
2075         if (tip)
2076                 hammer2_inode_ref(tip); /* extra ref */
2077
2078         /*
2079          * Can return NULL and error == EXDEV if the common parent
2080          * crosses a directory with the xlink flag set.
2081          *
2082          * For now try to avoid deadlocks with a simple pointer address
2083          * test.  (tip) can be NULL.
2084          */
2085         error = 0;
2086         {
2087                 hammer2_inode_t *ip1 = fdip;
2088                 hammer2_inode_t *ip2 = tdip;
2089                 hammer2_inode_t *ip3 = ip;
2090                 hammer2_inode_t *ip4 = tip;     /* may be NULL */
2091
2092                 if (fdip > tdip) {
2093                         ip1 = tdip;
2094                         ip2 = fdip;
2095                 }
2096                 if (tip && ip > tip) {
2097                         ip3 = tip;
2098                         ip4 = ip;
2099                 }
2100                 hammer2_inode_lock4(ip1, ip2, ip3, ip4);
2101         }
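        /*
         * Editorial note: the swaps above only order each pair by
         * kernel address (the two directories, and the two file
         * inodes); e.g. if fdip > tdip the directories are passed as
         * (tdip, fdip).  A consistent ascending order within each pair
         * prevents two racing renames on the same directories from
         * each holding one directory lock while waiting on the other
         * (the classic ABBA deadlock); any further ordering across the
         * four inodes is presumably handled by hammer2_inode_lock4()
         * itself.
         */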
2102
2103         /*
2104          * Resolve the collision space for (tdip, tname, tname_len)
2105          *
2106          * tdip must be held exclusively locked to prevent races since
2107          * multiple filenames can end up in the same collision space.
2108          */
2109         {
2110                 hammer2_xop_scanlhc_t *sxop;
2111                 hammer2_tid_t lhcbase;
2112
2113                 tlhc = hammer2_dirhash(tname, tname_len);
2114                 lhcbase = tlhc;
2115                 sxop = hammer2_xop_alloc(tdip, HAMMER2_XOP_MODIFYING);
2116                 sxop->lhc = tlhc;
2117                 hammer2_xop_start(&sxop->head, &hammer2_scanlhc_desc);
2118                 while ((error = hammer2_xop_collect(&sxop->head, 0)) == 0) {
2119                         if (tlhc != sxop->head.cluster.focus->bref.key)
2120                                 break;
2121                         ++tlhc;
2122                 }
2123                 error = hammer2_error_to_errno(error);
2124                 hammer2_xop_retire(&sxop->head, HAMMER2_XOPMASK_VOP);
2125
2126                 if (error) {
2127                         if (error != ENOENT)
2128                                 goto done2;
2129                         ++tlhc;
2130                         error = 0;
2131                 }
2132                 if ((lhcbase ^ tlhc) & ~HAMMER2_DIRHASH_LOMASK) {
2133                         error = ENOSPC;
2134                         goto done2;
2135                 }
2136         }
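        /*
         * Editorial note on the scan above: hammer2_dirhash() yields a
         * key whose low HAMMER2_DIRHASH_LOMASK bits serve as an
         * iterator for names hashing into the same bucket.  The loop
         * advances tlhc past existing entries until it finds a free
         * key; if the increment would carry out of the low-mask bits
         * (the (lhcbase ^ tlhc) test), the bucket is exhausted and the
         * rename fails with ENOSPC rather than spill into a
         * neighboring hash bucket.
         */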
2137
2138         /*
2139          * Ready to go, issue the rename to the backend.  Note that meta-data
2140          * updates to the related inodes occur separately from the rename
2141          * operation.
2142          *
2143          * NOTE: While it is not necessary to update ip->meta.name*, doing
2144          *       so aids catastrophic recovery and debugging.
2145          */
2146         if (error == 0) {
2147                 hammer2_xop_nrename_t *xop4;
2148
2149                 xop4 = hammer2_xop_alloc(fdip, HAMMER2_XOP_MODIFYING);
2150                 xop4->lhc = tlhc;
2151                 xop4->ip_key = ip->meta.name_key;
2152                 hammer2_xop_setip2(&xop4->head, ip);
2153                 hammer2_xop_setip3(&xop4->head, tdip);
2154                 hammer2_xop_setname(&xop4->head, fname, fname_len);
2155                 hammer2_xop_setname2(&xop4->head, tname, tname_len);
2156                 hammer2_xop_start(&xop4->head, &hammer2_nrename_desc);
2157
2158                 error = hammer2_xop_collect(&xop4->head, 0);
2159                 error = hammer2_error_to_errno(error);
2160                 hammer2_xop_retire(&xop4->head, HAMMER2_XOPMASK_VOP);
2161
2162                 if (error == ENOENT)
2163                         error = 0;
2164
2165                 /*
2166                  * Update inode meta-data.
2167                  *
2168                  * WARNING!  The in-memory inode (ip) structure does not
2169                  *           maintain a copy of the inode's filename buffer.
2170                  */
2171                 if (error == 0 &&
2172                     (ip->meta.name_key & HAMMER2_DIRHASH_VISIBLE)) {
2173                         hammer2_inode_modify(ip);
2174                         ip->meta.name_len = tname_len;
2175                         ip->meta.name_key = tlhc;
2176                 }
2177                 if (error == 0) {
2178                         hammer2_inode_modify(ip);
2179                         ip->meta.iparent = tdip->meta.inum;
2180                 }
2181                 update_fdip = 1;
2182                 update_tdip = 1;
2183         }
2184
2185 done2:
2186         /*
2187          * If no error, the backend has replaced the target directory entry.
2188          * We must adjust nlinks on the replaced target if one existed.
2189          */
2190         if (error == 0 && tip) {
2191                 int isopen;
2192
2193                 isopen = cache_isopen(ap->a_tnch);
2194                 hammer2_inode_unlink_finisher(tip, isopen);
2195         }
2196
2197         /*
2198          * Update directory mtimes to reflect that something changed.
2199          */
2200         if (update_fdip || update_tdip) {
2201                 uint64_t mtime;
2202
2203                 hammer2_update_time(&mtime);
2204                 if (update_fdip) {
2205                         hammer2_inode_modify(fdip);
2206                         fdip->meta.mtime = mtime;
2207                 }
2208                 if (update_tdip) {
2209                         hammer2_inode_modify(tdip);
2210                         tdip->meta.mtime = mtime;
2211                 }
2212         }
2213         if (tip) {
2214                 hammer2_inode_unlock(tip);
2215                 hammer2_inode_drop(tip);
2216         }
2217         hammer2_inode_unlock(ip);
2218         hammer2_inode_unlock(tdip);
2219         hammer2_inode_unlock(fdip);
2220         hammer2_inode_drop(ip);
2221         hammer2_trans_done(tdip->pmp, HAMMER2_TRANS_SIDEQ);
2222
2223         /*
2224          * Issue the namecache update after unlocking all the internal
2225          * hammer2 structures; otherwise we might deadlock.
2226          *
2227          * WARNING! The target namespace must be updated atomically,
2228          *          and we depend on cache_rename() to handle that for
2229          *          us.  Do not do a separate cache_unlink() because
2230          *          that leaves a small window of opportunity for other
2231          *          threads to allocate the target namespace before we
2232          *          manage to complete our rename.
2233          *
2234          * WARNING! cache_rename() (and cache_unlink()) will properly
2235          *          set VREF_FINALIZE on any attached vnode.  Do not
2236          *          call cache_setunresolved() manually before-hand as
2237          *          this will prevent the flag from being set later via
2238          *          cache_rename().  If VREF_FINALIZE is not properly set
2239          *          and the inode is no longer in the topology, related
2240          *          chains can remain dirty indefinitely.
2241          */
2242         if (error == 0 && tip) {
2243                 /*cache_unlink(ap->a_tnch); see above */
2244                 /*cache_setunresolved(ap->a_tnch); see above */
2245         }
2246         if (error == 0) {
2247                 cache_rename(ap->a_fnch, ap->a_tnch);
2248                 hammer2_knote(ap->a_fdvp, NOTE_WRITE);
2249                 hammer2_knote(ap->a_tdvp, NOTE_WRITE);
2250                 hammer2_knote(fncp->nc_vp, NOTE_RENAME);
2251         }
2252
2253         return (error);
2254 }
2255
2256 /*
2257  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
2258  */
2259 static
2260 int
2261 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
2262 {
2263         hammer2_inode_t *ip;
2264         int error;
2265
2266         ip = VTOI(ap->a_vp);
2267
2268         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
2269                               ap->a_fflag, ap->a_cred);
2270         return (error);
2271 }
2272
2273 static
2274 int
2275 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
2276 {
2277         struct mount *mp;
2278         hammer2_pfs_t *pmp;
2279         int rc;
2280
2281         switch (ap->a_op) {
2282         case (MOUNTCTL_SET_EXPORT):
2283                 mp = ap->a_head.a_ops->head.vv_mount;
2284                 pmp = MPTOPMP(mp);
2285
2286                 if (ap->a_ctllen != sizeof(struct export_args))
2287                         rc = (EINVAL);
2288                 else
2289                         rc = vfs_export(mp, &pmp->export,
2290                                         (const struct export_args *)ap->a_ctl);
2291                 break;
2292         default:
2293                 rc = vop_stdmountctl(ap);
2294                 break;
2295         }
2296         return (rc);
2297 }
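/*
 * Editorial note: MOUNTCTL_SET_EXPORT is how NFS export lists reach
 * the filesystem; the caller supplies a struct export_args through
 * mountctl(2) and vfs_export() records it in pmp->export, which is
 * presumably consulted later when the NFS server validates client
 * access.  All other ops fall through to vop_stdmountctl().
 */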
2298
2299 /*
2300  * KQFILTER
2301  */
2302 static void filt_hammer2detach(struct knote *kn);
2303 static int filt_hammer2read(struct knote *kn, long hint);
2304 static int filt_hammer2write(struct knote *kn, long hint);
2305 static int filt_hammer2vnode(struct knote *kn, long hint);
2306
2307 static struct filterops hammer2read_filtops =
2308         { FILTEROP_ISFD | FILTEROP_MPSAFE,
2309           NULL, filt_hammer2detach, filt_hammer2read };
2310 static struct filterops hammer2write_filtops =
2311         { FILTEROP_ISFD | FILTEROP_MPSAFE,
2312           NULL, filt_hammer2detach, filt_hammer2write };
2313 static struct filterops hammer2vnode_filtops =
2314         { FILTEROP_ISFD | FILTEROP_MPSAFE,
2315           NULL, filt_hammer2detach, filt_hammer2vnode };
2316
2317 static
2318 int
2319 hammer2_vop_kqfilter(struct vop_kqfilter_args *ap)
2320 {
2321         struct vnode *vp = ap->a_vp;
2322         struct knote *kn = ap->a_kn;
2323
2324         switch (kn->kn_filter) {
2325         case EVFILT_READ:
2326                 kn->kn_fop = &hammer2read_filtops;
2327                 break;
2328         case EVFILT_WRITE:
2329                 kn->kn_fop = &hammer2write_filtops;
2330                 break;
2331         case EVFILT_VNODE:
2332                 kn->kn_fop = &hammer2vnode_filtops;
2333                 break;
2334         default:
2335                 return (EOPNOTSUPP);
2336         }
2337
2338         kn->kn_hook = (caddr_t)vp;
2339
2340         knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
2341
2342         return(0);
2343 }
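/*
 * Userland view (illustrative sketch, not part of this file): the
 * filterops selected above are what a kevent(2) consumer ends up
 * attached to, roughly:
 *
 *	int kq = kqueue();
 *	int fd = open("/mnt/some/file", O_RDONLY);
 *	struct kevent kev;
 *
 *	EV_SET(&kev, fd, EVFILT_VNODE, EV_ADD | EV_CLEAR,
 *	       NOTE_WRITE | NOTE_DELETE | NOTE_RENAME, 0, NULL);
 *	kevent(kq, &kev, 1, NULL, 0, NULL);
 *
 * The hammer2_knote() calls sprinkled through the VOPs above feed
 * such consumers via KNOTE() on the vnode's ki_note list.
 */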
2344
2345 static void
2346 filt_hammer2detach(struct knote *kn)
2347 {
2348         struct vnode *vp = (void *)kn->kn_hook;
2349
2350         knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
2351 }
2352
2353 static int
2354 filt_hammer2read(struct knote *kn, long hint)
2355 {
2356         struct vnode *vp = (void *)kn->kn_hook;
2357         hammer2_inode_t *ip = VTOI(vp);
2358         off_t off;
2359
2360         if (hint == NOTE_REVOKE) {
2361                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
2362                 return(1);
2363         }
2364         off = ip->meta.size - kn->kn_fp->f_offset;
2365         kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
2366         if (kn->kn_sfflags & NOTE_OLDAPI)
2367                 return(1);
2368         return (kn->kn_data != 0);
2369 }
2370
2371
2372 static int
2373 filt_hammer2write(struct knote *kn, long hint)
2374 {
2375         if (hint == NOTE_REVOKE)
2376                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
2377         kn->kn_data = 0;
2378         return (1);
2379 }
2380
2381 static int
2382 filt_hammer2vnode(struct knote *kn, long hint)
2383 {
2384         if (kn->kn_sfflags & hint)
2385                 kn->kn_fflags |= hint;
2386         if (hint == NOTE_REVOKE) {
2387                 kn->kn_flags |= (EV_EOF | EV_NODATA);
2388                 return (1);
2389         }
2390         return (kn->kn_fflags != 0);
2391 }
2392
2393 /*
2394  * FIFO VOPS
2395  */
2396 static
2397 int
2398 hammer2_vop_markatime(struct vop_markatime_args *ap)
2399 {
2400         hammer2_inode_t *ip;
2401         struct vnode *vp;
2402
2403         vp = ap->a_vp;
2404         ip = VTOI(vp);
2405
2406         if (ip->pmp->ronly)
2407                 return (EROFS);
2408         return(0);
2409 }
2410
2411 static
2412 int
2413 hammer2_vop_fifokqfilter(struct vop_kqfilter_args *ap)
2414 {
2415         int error;
2416
2417         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2418         if (error)
2419                 error = hammer2_vop_kqfilter(ap);
2420         return(error);
2421 }
2422
2423 /*
2424  * VOPS vector
2425  */
2426 struct vop_ops hammer2_vnode_vops = {
2427         .vop_default    = vop_defaultop,
2428         .vop_fsync      = hammer2_vop_fsync,
2429         .vop_getpages   = vop_stdgetpages,
2430         .vop_putpages   = vop_stdputpages,
2431         .vop_access     = hammer2_vop_access,
2432         .vop_advlock    = hammer2_vop_advlock,
2433         .vop_close      = hammer2_vop_close,
2434         .vop_nlink      = hammer2_vop_nlink,
2435         .vop_ncreate    = hammer2_vop_ncreate,
2436         .vop_nsymlink   = hammer2_vop_nsymlink,
2437         .vop_nremove    = hammer2_vop_nremove,
2438         .vop_nrmdir     = hammer2_vop_nrmdir,
2439         .vop_nrename    = hammer2_vop_nrename,
2440         .vop_getattr    = hammer2_vop_getattr,
2441         .vop_setattr    = hammer2_vop_setattr,
2442         .vop_readdir    = hammer2_vop_readdir,
2443         .vop_readlink   = hammer2_vop_readlink,
2444         .vop_read       = hammer2_vop_read,
2445         .vop_write      = hammer2_vop_write,
2446         .vop_open       = hammer2_vop_open,
2447         .vop_inactive   = hammer2_vop_inactive,
2448         .vop_reclaim    = hammer2_vop_reclaim,
2449         .vop_nresolve   = hammer2_vop_nresolve,
2450         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2451         .vop_nmkdir     = hammer2_vop_nmkdir,
2452         .vop_nmknod     = hammer2_vop_nmknod,
2453         .vop_ioctl      = hammer2_vop_ioctl,
2454         .vop_mountctl   = hammer2_vop_mountctl,
2455         .vop_bmap       = hammer2_vop_bmap,
2456         .vop_strategy   = hammer2_vop_strategy,
2457         .vop_kqfilter   = hammer2_vop_kqfilter
2458 };
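/*
 * Editorial note: entries not listed above fall through to
 * vop_defaultop, and paging I/O uses the generic vop_stdgetpages/
 * vop_stdputpages, which presumably reach the media through
 * hammer2_vop_strategy via the buffer cache.
 */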
2459
2460 struct vop_ops hammer2_spec_vops = {
2461         .vop_default =          vop_defaultop,
2462         .vop_fsync =            hammer2_vop_fsync,
2463         .vop_read =             vop_stdnoread,
2464         .vop_write =            vop_stdnowrite,
2465         .vop_access =           hammer2_vop_access,
2466         .vop_close =            hammer2_vop_close,
2467         .vop_markatime =        hammer2_vop_markatime,
2468         .vop_getattr =          hammer2_vop_getattr,
2469         .vop_inactive =         hammer2_vop_inactive,
2470         .vop_reclaim =          hammer2_vop_reclaim,
2471         .vop_setattr =          hammer2_vop_setattr
2472 };
2473
2474 struct vop_ops hammer2_fifo_vops = {
2475         .vop_default =          fifo_vnoperate,
2476         .vop_fsync =            hammer2_vop_fsync,
2477 #if 0
2478         .vop_read =             hammer2_vop_fiforead,
2479         .vop_write =            hammer2_vop_fifowrite,
2480 #endif
2481         .vop_access =           hammer2_vop_access,
2482 #if 0
2483         .vop_close =            hammer2_vop_fifoclose,
2484 #endif
2485         .vop_markatime =        hammer2_vop_markatime,
2486         .vop_getattr =          hammer2_vop_getattr,
2487         .vop_inactive =         hammer2_vop_inactive,
2488         .vop_reclaim =          hammer2_vop_reclaim,
2489         .vop_setattr =          hammer2_vop_setattr,
2490         .vop_kqfilter =         hammer2_vop_fifokqfilter
2491 };
2492