/*
 * sys/vfs/hammer2/hammer2_vnops.c
 *
 * From dragonfly.git, commit "hammer2 - Refactor frontend part 4/many".
 */
/*
 * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*
 * Kernel Filesystem interface
 *
 * NOTE! local ipdata pointers must be reloaded on any modifying operation
 *       to the inode as its underlying chain may have changed.
 */
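/*
 * For example, hammer2_vop_setattr() below reloads its local ripdata
 * pointer (see the "RELOAD" comment there) after the truncate/extend
 * path drops and re-acquires the inode lock.
 */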

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>
#include <sys/objcache.h>
#include <sys/event.h>
#include <sys/file.h>
#include <vfs/fifofs/fifo.h>

#include "hammer2.h"

static int hammer2_read_file(hammer2_inode_t *ip, struct uio *uio,
                                int seqcount);
static int hammer2_write_file(hammer2_inode_t *ip, struct uio *uio,
                                int ioflag, int seqcount);
static void hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize);
static void hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize);

struct objcache *cache_vop_info;

static __inline
void
hammer2_knote(struct vnode *vp, int flags)
{
        if (flags)
                KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}

/*
 * Last reference to a vnode is going away but it is still cached.
 */
static
int
hammer2_vop_inactive(struct vop_inactive_args *ap)
{
        hammer2_inode_t *ip;
        struct vnode *vp;

        LOCKSTART;
        vp = ap->a_vp;
        ip = VTOI(vp);

        /*
         * Degenerate case
         */
        if (ip == NULL) {
                vrecycle(vp);
                LOCKSTOP;
                return (0);
        }

        /*
         * Check for deleted inodes and recycle immediately on the last
         * release.  Be sure to destroy any left-over buffer cache buffers
         * so we do not waste time trying to flush them.
         *
         * WARNING: nvtruncbuf() can only be safely called without the inode
         *          lock held due to the way our write thread works.
         */
        if (ip->flags & HAMMER2_INODE_ISUNLINKED) {
                hammer2_key_t lbase;
                int nblksize;

                /*
                 * Detect updates to the embedded data which may be
                 * synchronized by the strategy code.  Simply mark the
                 * inode modified so it gets picked up by our normal flush.
                 */
                nblksize = hammer2_calc_logical(ip, 0, &lbase, NULL);
                nvtruncbuf(vp, 0, nblksize, 0, 0);
                vrecycle(vp);
        }
        LOCKSTOP;
        return (0);
}
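
/*
 * NOTE: For an ISUNLINKED inode the vrecycle() above funnels into
 *       hammer2_vop_reclaim() below, which queues the inode on
 *       pmp->unlinkq so destruction can be handled later, outside of
 *       vnode context.
 */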

/*
 * Reclaim a vnode so that it can be reused; after the inode is
 * disassociated, the filesystem must manage it alone.
 */
static
int
hammer2_vop_reclaim(struct vop_reclaim_args *ap)
{
        hammer2_inode_t *ip;
        hammer2_pfs_t *pmp;
        struct vnode *vp;

        LOCKSTART;
        vp = ap->a_vp;
        ip = VTOI(vp);
        if (ip == NULL) {
                LOCKSTOP;
                return(0);
        }
        pmp = ip->pmp;

        /*
         * The final close of a deleted file or directory marks it for
         * destruction.  The DELETED flag allows the flusher to shortcut
         * any modified blocks still unflushed (that is, just ignore them).
         *
         * HAMMER2 usually does not try to optimize the freemap by returning
         * deleted blocks to it as it does not usually know how many snapshots
         * might be referencing portions of the file/dir.
         */
        vp->v_data = NULL;
        ip->vp = NULL;

        /*
         * NOTE! We do not attempt to flush chains here, flushing is
         *       really fragile and could also deadlock.
         */
        vclrisdirty(vp);

        /*
         * Once reclaimed the inode is disconnected from the normal flush
         * mechanism and must be tracked separately.
         *
         * A reclaim can occur at any time so we cannot safely start a
         * transaction to handle reclamation of unlinked files.  Instead,
         * the ip is left with a reference and placed on a linked list and
         * handled later on.
         */
        if (ip->flags & HAMMER2_INODE_ISUNLINKED) {
                hammer2_inode_unlink_t *ipul;

                ipul = kmalloc(sizeof(*ipul), pmp->minode, M_WAITOK | M_ZERO);
                ipul->ip = ip;

                hammer2_spin_ex(&pmp->list_spin);
                TAILQ_INSERT_TAIL(&pmp->unlinkq, ipul, entry);
                hammer2_spin_unex(&pmp->list_spin);
                /* retain ref from vp for ipul */
        } else {
                hammer2_inode_drop(ip);                 /* vp ref */
        }

        /*
         * XXX handle background sync when ip dirty, kernel will no longer
         * notify us regarding this inode because there is no longer a
         * vnode attached to it.
         */

        LOCKSTOP;
        return (0);
}

static
int
hammer2_vop_fsync(struct vop_fsync_args *ap)
{
        hammer2_inode_t *ip;
        hammer2_trans_t trans;
        hammer2_cluster_t *cluster;
        struct vnode *vp;

        LOCKSTART;
        vp = ap->a_vp;
        ip = VTOI(vp);

#if 0
        /* XXX can't do this yet */
        hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_ISFLUSH);
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);
#endif
        hammer2_trans_init(&trans, ip->pmp, 0);
        vfsync(vp, ap->a_waitfor, 1, NULL, NULL);

        /*
         * Calling chain_flush here creates a lot of duplicative
         * COW operations due to non-optimal vnode ordering.
         *
         * Only do it for an actual fsync() syscall.  The other forms
         * which call this function will eventually call chain_flush
         * on the volume root as a catch-all, which is far more optimal.
         */
        cluster = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS);
        atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        /*vclrisdirty(vp);*/
        if (ip->flags & (HAMMER2_INODE_RESIZED|HAMMER2_INODE_MTIME))
                hammer2_inode_fsync(&trans, ip, cluster);

        hammer2_inode_unlock(ip, cluster);
        hammer2_trans_done(&trans);

        LOCKSTOP;
        return (0);
}

static
int
hammer2_vop_access(struct vop_access_args *ap)
{
        hammer2_inode_t *ip = VTOI(ap->a_vp);
        const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cluster;
        uid_t uid;
        gid_t gid;
        int error;

        LOCKSTART;
        cluster = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS |
                                         HAMMER2_RESOLVE_SHARED);
        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        uid = hammer2_to_unix_xid(&ripdata->meta.uid);
        gid = hammer2_to_unix_xid(&ripdata->meta.gid);
        error = vop_helper_access(ap, uid, gid,
                                  ripdata->meta.mode, ripdata->meta.uflags);
        hammer2_inode_unlock(ip, cluster);

        LOCKSTOP;
        return (error);
}

static
int
hammer2_vop_getattr(struct vop_getattr_args *ap)
{
        const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cluster;
        hammer2_pfs_t *pmp;
        hammer2_inode_t *ip;
        hammer2_blockref_t bref;
        struct vnode *vp;
        struct vattr *vap;

        LOCKSTART;
        vp = ap->a_vp;
        vap = ap->a_vap;

        ip = VTOI(vp);
        pmp = ip->pmp;

        cluster = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS |
                                         HAMMER2_RESOLVE_SHARED);
        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        KKASSERT(hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE);
        hammer2_cluster_bref(cluster, &bref);

        vap->va_fsid = pmp->mp->mnt_stat.f_fsid.val[0];
        vap->va_fileid = ripdata->meta.inum;
        vap->va_mode = ripdata->meta.mode;
        vap->va_nlink = ripdata->meta.nlinks;
        vap->va_uid = hammer2_to_unix_xid(&ripdata->meta.uid);
        vap->va_gid = hammer2_to_unix_xid(&ripdata->meta.gid);
        vap->va_rmajor = 0;
        vap->va_rminor = 0;
        vap->va_size = ip->meta.size;   /* protected by shared lock */
        vap->va_blocksize = HAMMER2_PBUFSIZE;
        vap->va_flags = ripdata->meta.uflags;
        hammer2_time_to_timespec(ripdata->meta.ctime, &vap->va_ctime);
        hammer2_time_to_timespec(ripdata->meta.mtime, &vap->va_mtime);
        /* atime not supported; report mtime instead */
        hammer2_time_to_timespec(ripdata->meta.mtime, &vap->va_atime);
        vap->va_gen = 1;
        vap->va_bytes = bref.data_count;
        vap->va_type = hammer2_get_vtype(ripdata);
        vap->va_filerev = 0;
        vap->va_uid_uuid = ripdata->meta.uid;
        vap->va_gid_uuid = ripdata->meta.gid;
        vap->va_vaflags = VA_UID_UUID_VALID | VA_GID_UUID_VALID |
                          VA_FSID_UUID_VALID;

        hammer2_inode_unlock(ip, cluster);

        LOCKSTOP;
        return (0);
}

static
int
hammer2_vop_setattr(struct vop_setattr_args *ap)
{
        const hammer2_inode_data_t *ripdata;
        hammer2_inode_data_t *wipdata;
        hammer2_inode_t *ip;
        hammer2_cluster_t *cluster;
        hammer2_trans_t trans;
        struct vnode *vp;
        struct vattr *vap;
        int error;
        int kflags = 0;
        int domtime = 0;
        int dosync = 0;
        uint64_t ctime;

        LOCKSTART;
        vp = ap->a_vp;
        vap = ap->a_vap;
        hammer2_update_time(&ctime);

        ip = VTOI(vp);

        if (ip->pmp->ronly) {
                LOCKSTOP;
                return(EROFS);
        }

        hammer2_pfs_memory_wait(ip->pmp);
        hammer2_trans_init(&trans, ip->pmp, 0);
        cluster = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS);
        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        error = 0;

        if (vap->va_flags != VNOVAL) {
                u_int32_t flags;

                flags = ripdata->meta.uflags;
                error = vop_helper_setattr_flags(&flags, vap->va_flags,
                                     hammer2_to_unix_xid(&ripdata->meta.uid),
                                     ap->a_cred);
                if (error == 0) {
                        if (ripdata->meta.uflags != flags) {
                                wipdata = hammer2_cluster_modify_ip(&trans, ip,
                                                                    cluster, 0);
                                wipdata->meta.uflags = flags;
                                wipdata->meta.ctime = ctime;
                                kflags |= NOTE_ATTRIB;
                                dosync = 1;
                                ripdata = wipdata;
                        }
                        if (ripdata->meta.uflags & (IMMUTABLE | APPEND)) {
                                error = 0;
                                goto done;
                        }
                }
                goto done;
        }
        if (ripdata->meta.uflags & (IMMUTABLE | APPEND)) {
                error = EPERM;
                goto done;
        }
        if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
                mode_t cur_mode = ripdata->meta.mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ripdata->meta.uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ripdata->meta.gid);
                uuid_t uuid_uid;
                uuid_t uuid_gid;

                error = vop_helper_chown(ap->a_vp, vap->va_uid, vap->va_gid,
                                         ap->a_cred,
                                         &cur_uid, &cur_gid, &cur_mode);
                if (error == 0) {
                        hammer2_guid_to_uuid(&uuid_uid, cur_uid);
                        hammer2_guid_to_uuid(&uuid_gid, cur_gid);
                        if (bcmp(&uuid_uid, &ripdata->meta.uid,
                                 sizeof(uuid_uid)) ||
                            bcmp(&uuid_gid, &ripdata->meta.gid,
                                 sizeof(uuid_gid)) ||
                            ripdata->meta.mode != cur_mode
                        ) {
                                wipdata = hammer2_cluster_modify_ip(&trans, ip,
                                                                    cluster, 0);
                                wipdata->meta.uid = uuid_uid;
                                wipdata->meta.gid = uuid_gid;
                                wipdata->meta.mode = cur_mode;
                                wipdata->meta.ctime = ctime;
                                dosync = 1;
                                ripdata = wipdata;
                        }
                        kflags |= NOTE_ATTRIB;
                }
        }

        /*
         * Resize the file
         */
        if (vap->va_size != VNOVAL && ip->meta.size != vap->va_size) {
                switch(vp->v_type) {
                case VREG:
                        if (vap->va_size == ip->meta.size)
                                break;
                        hammer2_inode_unlock(ip, cluster);
                        if (vap->va_size < ip->meta.size) {
                                hammer2_truncate_file(ip, vap->va_size);
                        } else {
                                hammer2_extend_file(ip, vap->va_size);
                        }
                        cluster = hammer2_inode_lock(ip,
                                                     HAMMER2_RESOLVE_ALWAYS);
                        /* RELOAD */
                        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        domtime = 1;
                        break;
                default:
                        error = EINVAL;
                        goto done;
                }
        }
#if 0
        /* atime not supported */
        if (vap->va_atime.tv_sec != VNOVAL) {
                wipdata = hammer2_cluster_modify_ip(&trans, ip, cluster, 0);
                wipdata->meta.atime = hammer2_timespec_to_time(&vap->va_atime);
                kflags |= NOTE_ATTRIB;
                dosync = 1;
                ripdata = wipdata;
        }
#endif
        if (vap->va_mtime.tv_sec != VNOVAL) {
                wipdata = hammer2_cluster_modify_ip(&trans, ip, cluster, 0);
                wipdata->meta.mtime = hammer2_timespec_to_time(&vap->va_mtime);
                kflags |= NOTE_ATTRIB;
                domtime = 0;
                dosync = 1;
                ripdata = wipdata;
        }
        if (vap->va_mode != (mode_t)VNOVAL) {
                mode_t cur_mode = ripdata->meta.mode;
                uid_t cur_uid = hammer2_to_unix_xid(&ripdata->meta.uid);
                gid_t cur_gid = hammer2_to_unix_xid(&ripdata->meta.gid);

                error = vop_helper_chmod(ap->a_vp, vap->va_mode, ap->a_cred,
                                         cur_uid, cur_gid, &cur_mode);
                if (error == 0 && ripdata->meta.mode != cur_mode) {
                        wipdata = hammer2_cluster_modify_ip(&trans, ip,
                                                            cluster, 0);
                        wipdata->meta.mode = cur_mode;
                        wipdata->meta.ctime = ctime;
                        kflags |= NOTE_ATTRIB;
                        dosync = 1;
                        ripdata = wipdata;
                }
        }

        /*
         * If a truncation occurred we must call inode_fsync() now in order
         * to trim the related data chains, otherwise a later expansion can
         * cause havoc.
         */
        if (dosync) {
                hammer2_cluster_modsync(cluster);
                dosync = 0;
        }
        hammer2_inode_fsync(&trans, ip, cluster);

        /*
         * Cleanup.  If domtime is set an additional inode modification
         * must be flagged.  All other modifications will have already
         * set INODE_MODIFIED and called vsetisdirty().
         */
done:
        if (domtime) {
                atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED |
                                           HAMMER2_INODE_MTIME);
                vsetisdirty(ip->vp);
        }
        if (dosync)
                hammer2_cluster_modsync(cluster);
        hammer2_inode_unlock(ip, cluster);
        hammer2_trans_done(&trans);
        hammer2_knote(ip->vp, kflags);

        LOCKSTOP;
        return (error);
}

static
int
hammer2_vop_readdir(struct vop_readdir_args *ap)
{
        const hammer2_inode_data_t *ripdata;
        hammer2_inode_t *ip;
        hammer2_inode_t *xip;
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *xcluster;
        hammer2_blockref_t bref;
        hammer2_tid_t inum;
        hammer2_key_t key_next;
        hammer2_key_t lkey;
        struct uio *uio;
        off_t *cookies;
        off_t saveoff;
        int cookie_index;
        int ncookies;
        int error;
        int dtype;
        int r;

        LOCKSTART;
        ip = VTOI(ap->a_vp);
        uio = ap->a_uio;
        saveoff = uio->uio_offset;

        /*
         * Set up directory entry cookies if requested
         */
        if (ap->a_ncookies) {
                ncookies = uio->uio_resid / 16 + 1;
                if (ncookies > 1024)
                        ncookies = 1024;
                cookies = kmalloc(ncookies * sizeof(off_t), M_TEMP, M_WAITOK);
        } else {
                ncookies = -1;
                cookies = NULL;
        }
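        /*
         * (The divide-by-16 presumably treats 16 bytes as a conservative
         * minimum dirent size, so the cookie array, capped at 1024
         * entries, is large enough for any entries that fit in the uio.)
         */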
        cookie_index = 0;

        cparent = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS |
                                         HAMMER2_RESOLVE_SHARED);

        ripdata = &hammer2_cluster_rdata(cparent)->ipdata;

        /*
         * Handle artificial entries.  To ensure that only positive 64 bit
         * quantities are returned to userland we always strip off bit 63.
         * The hash code is designed such that codes 0x0000-0x7FFF are not
         * used, allowing us to use these codes for artificial entries.
         *
         * Entry 0 is used for '.' and entry 1 is used for '..'.  Do not
         * allow '..' to cross the mount point into (e.g.) the super-root.
         */
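        /*
         * (Illustrative example: a real entry keyed at 0x8000a1b2c3d4e5f6
         * has HAMMER2_DIRHASH_VISIBLE (bit 63) set; the cookie handed to
         * userland is 0x0000a1b2c3d4e5f6, and the unused low hash codes
         * leave offsets 0 and 1 free for the synthesized "." and ".."
         * entries below.)
         */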
        error = 0;
        cluster = (void *)(intptr_t)-1; /* non-NULL for early goto done case */

        if (saveoff == 0) {
                inum = ripdata->meta.inum & HAMMER2_DIRHASH_USERMSK;
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 1, ".");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        if (saveoff == 1) {
                /*
                 * Be careful with lockorder when accessing ".."
                 *
                 * (ip is the current dir. xip is the parent dir).
                 */
                inum = ripdata->meta.inum & HAMMER2_DIRHASH_USERMSK;
                while (ip->pip != NULL && ip != ip->pmp->iroot) {
                        xip = ip->pip;
                        hammer2_inode_ref(xip);
                        hammer2_inode_unlock(ip, cparent);
                        xcluster = hammer2_inode_lock(xip,
                                                      HAMMER2_RESOLVE_ALWAYS |
                                                      HAMMER2_RESOLVE_SHARED);

                        cparent = hammer2_inode_lock(ip,
                                                      HAMMER2_RESOLVE_ALWAYS |
                                                      HAMMER2_RESOLVE_SHARED);
                        hammer2_inode_drop(xip);
                        ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
                        if (xip == ip->pip) {
                                inum = hammer2_cluster_rdata(xcluster)->
                                        ipdata.meta.inum &
                                         HAMMER2_DIRHASH_USERMSK;
                                hammer2_inode_unlock(xip, xcluster);
                                break;
                        }
                        hammer2_inode_unlock(xip, xcluster);
                }
                r = vop_write_dirent(&error, uio, inum, DT_DIR, 2, "..");
                if (r)
                        goto done;
                if (cookies)
                        cookies[cookie_index] = saveoff;
                ++saveoff;
                ++cookie_index;
                if (cookie_index == ncookies)
                        goto done;
        }

        lkey = saveoff | HAMMER2_DIRHASH_VISIBLE;
        if (hammer2_debug & 0x0020)
                kprintf("readdir: lkey %016jx\n", lkey);

        /*
         * parent is the inode cluster, already locked for us.  Don't
         * double lock shared locks as this will screw up upgrades.
         */
        if (error) {
                goto done;
        }
        cluster = hammer2_cluster_lookup(cparent, &key_next, lkey, lkey,
                                     HAMMER2_LOOKUP_SHARED);
        if (cluster == NULL) {
                cluster = hammer2_cluster_lookup(cparent, &key_next,
                                             lkey, (hammer2_key_t)-1,
                                             HAMMER2_LOOKUP_SHARED);
        }
        if (cluster)
                hammer2_cluster_bref(cluster, &bref);
        while (cluster) {
                if (hammer2_debug & 0x0020)
                        kprintf("readdir: p=%p chain=%p %016jx (next %016jx)\n",
                                cparent->focus, cluster->focus,
                                bref.key, key_next);

                if (bref.type == HAMMER2_BREF_TYPE_INODE) {
                        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        dtype = hammer2_get_dtype(ripdata);
                        saveoff = bref.key & HAMMER2_DIRHASH_USERMSK;
                        r = vop_write_dirent(&error, uio,
                                             ripdata->meta.inum &
                                              HAMMER2_DIRHASH_USERMSK,
                                             dtype,
                                             ripdata->meta.name_len,
                                             ripdata->filename);
                        if (r)
                                break;
                        if (cookies)
                                cookies[cookie_index] = saveoff;
                        ++cookie_index;
                } else {
                        /* XXX chain error */
                        kprintf("bad chain type readdir %d\n", bref.type);
                }

                /*
                 * Keys may not be returned in order so once we have a
                 * placemarker (cluster) the scan must allow the full range
                 * or some entries will be missed.
                 */
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                               key_next, (hammer2_key_t)-1,
                                               HAMMER2_LOOKUP_SHARED);
                if (cluster) {
                        hammer2_cluster_bref(cluster, &bref);
                        saveoff = (bref.key & HAMMER2_DIRHASH_USERMSK) + 1;
                } else {
                        saveoff = (hammer2_key_t)-1;
                }
                if (cookie_index == ncookies)
                        break;
        }
        if (cluster) {
                hammer2_cluster_unlock(cluster);
                hammer2_cluster_drop(cluster);
        }
done:
        hammer2_inode_unlock(ip, cparent);
        if (ap->a_eofflag)
                *ap->a_eofflag = (cluster == NULL);
        if (hammer2_debug & 0x0020)
                kprintf("readdir: done at %016jx\n", saveoff);
        uio->uio_offset = saveoff & ~HAMMER2_DIRHASH_VISIBLE;
        if (error && cookie_index == 0) {
                if (cookies) {
                        kfree(cookies, M_TEMP);
                        *ap->a_ncookies = 0;
                        *ap->a_cookies = NULL;
                }
        } else {
                if (cookies) {
                        *ap->a_ncookies = cookie_index;
                        *ap->a_cookies = cookies;
                }
        }
        LOCKSTOP;
        return (error);
}

/*
 * hammer2_vop_readlink { vp, uio, cred }
 */
static
int
hammer2_vop_readlink(struct vop_readlink_args *ap)
{
        struct vnode *vp;
        hammer2_inode_t *ip;
        int error;

        vp = ap->a_vp;
        if (vp->v_type != VLNK)
                return (EINVAL);
        ip = VTOI(vp);

        error = hammer2_read_file(ip, ap->a_uio, 0);
        return (error);
}

static
int
hammer2_vop_read(struct vop_read_args *ap)
{
        struct vnode *vp;
        hammer2_inode_t *ip;
        struct uio *uio;
        int error;
        int seqcount;
        int bigread;

        /*
         * Read operations supported on this vnode?
         */
        vp = ap->a_vp;
        if (vp->v_type != VREG)
                return (EINVAL);

        /*
         * Misc
         */
        ip = VTOI(vp);
        uio = ap->a_uio;
        error = 0;

        seqcount = ap->a_ioflag >> 16;
        bigread = (uio->uio_resid > 100 * 1024 * 1024);

        error = hammer2_read_file(ip, uio, seqcount);
        return (error);
}

static
int
hammer2_vop_write(struct vop_write_args *ap)
{
        hammer2_inode_t *ip;
        hammer2_trans_t trans;
        thread_t td;
        struct vnode *vp;
        struct uio *uio;
        int error;
        int seqcount;
        int bigwrite;

        /*
         * Write operations supported on this vnode?
         */
        vp = ap->a_vp;
        if (vp->v_type != VREG)
                return (EINVAL);

        /*
         * Misc
         */
        ip = VTOI(vp);
        uio = ap->a_uio;
        error = 0;
        if (ip->pmp->ronly) {
                return (EROFS);
        }

        seqcount = ap->a_ioflag >> 16;
        bigwrite = (uio->uio_resid > 100 * 1024 * 1024);

        /*
         * Check resource limit
         */
        if (uio->uio_resid > 0 && (td = uio->uio_td) != NULL && td->td_proc &&
            uio->uio_offset + uio->uio_resid >
             td->td_proc->p_rlimit[RLIMIT_FSIZE].rlim_cur) {
                lwpsignal(td->td_proc, td->td_lwp, SIGXFSZ);
                return (EFBIG);
        }

        /*
         * The transaction interlocks against flush initiations
         * (note: but will run concurrently with the actual flush).
         */
        hammer2_trans_init(&trans, ip->pmp, 0);
        error = hammer2_write_file(ip, uio, ap->a_ioflag, seqcount);
        hammer2_trans_done(&trans);

        return (error);
}

/*
 * Perform read operations on a file or symlink given an UNLOCKED
 * inode and uio.
 *
 * The passed ip is not locked.
 */
static
int
hammer2_read_file(hammer2_inode_t *ip, struct uio *uio, int seqcount)
{
        hammer2_off_t size;
        struct buf *bp;
        int error;

        error = 0;

        /*
         * UIO read loop.
         *
         * WARNING! Assumes that the kernel interlocks size changes at the
         *          vnode level.
         */
        hammer2_mtx_sh(&ip->lock);
        size = ip->meta.size;
        hammer2_mtx_unlock(&ip->lock);

        while (uio->uio_resid > 0 && uio->uio_offset < size) {
                hammer2_key_t lbase;
                hammer2_key_t leof;
                int lblksize;
                int loff;
                int n;

                lblksize = hammer2_calc_logical(ip, uio->uio_offset,
                                                &lbase, &leof);

                error = cluster_read(ip->vp, leof, lbase, lblksize,
                                     uio->uio_resid, seqcount * BKVASIZE,
                                     &bp);

                if (error)
                        break;
                loff = (int)(uio->uio_offset - lbase);
                n = lblksize - loff;
                if (n > uio->uio_resid)
                        n = uio->uio_resid;
                if (n > size - uio->uio_offset)
                        n = (int)(size - uio->uio_offset);
                bp->b_flags |= B_AGE;
                error = uiomove((char *)bp->b_data + loff, n, uio);
                bqrelse(bp);
                if (error)
                        break;
        }
        return (error);
}

/*
 * Write to the file represented by the inode via the logical buffer cache.
 * The inode may represent a regular file or a symlink.
 *
 * The inode must not be locked.
 */
static
int
hammer2_write_file(hammer2_inode_t *ip,
                   struct uio *uio, int ioflag, int seqcount)
{
        hammer2_key_t old_eof;
        hammer2_key_t new_eof;
        struct buf *bp;
        int kflags;
        int error;
        int modified;

        /*
         * Setup if append
         *
         * WARNING! Assumes that the kernel interlocks size changes at the
         *          vnode level.
         */
        hammer2_mtx_ex(&ip->lock);
        if (ioflag & IO_APPEND)
                uio->uio_offset = ip->meta.size;
        old_eof = ip->meta.size;
        hammer2_mtx_unlock(&ip->lock);

        /*
         * Extend the file if necessary.  If the write fails at some point
         * we will truncate it back down to cover as much as we were able
         * to write.
         *
         * Doing this now makes it easier to calculate buffer sizes in
         * the loop.
         */
        kflags = 0;
        error = 0;
        modified = 0;

        if (uio->uio_offset + uio->uio_resid > old_eof) {
                new_eof = uio->uio_offset + uio->uio_resid;
                modified = 1;
                hammer2_extend_file(ip, new_eof);
                kflags |= NOTE_EXTEND;
        } else {
                new_eof = old_eof;
        }

        /*
         * UIO write loop
         */
        while (uio->uio_resid > 0) {
                hammer2_key_t lbase;
                int trivial;
                int endofblk;
                int lblksize;
                int loff;
                int n;

                /*
                 * Don't allow the buffer build to blow out the buffer
                 * cache.
                 */
                if ((ioflag & IO_RECURSE) == 0)
                        bwillwrite(HAMMER2_PBUFSIZE);

                /*
                 * This nominally tells us how much we can cluster and
                 * what the logical buffer size needs to be.  Currently
                 * we don't try to cluster the write and just handle one
                 * block at a time.
                 */
                lblksize = hammer2_calc_logical(ip, uio->uio_offset,
                                                &lbase, NULL);
                loff = (int)(uio->uio_offset - lbase);

                KKASSERT(lblksize <= 65536);
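
                /*
                 * (Illustrative arithmetic, assuming a 65536-byte logical
                 * block: a write starting at uio_offset 70000 yields
                 * lbase 65536 and loff 4464, so at most 65536 - 4464
                 * bytes land in this buffer.)
                 */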

                /*
                 * Calculate bytes to copy this transfer and whether the
                 * copy completely covers the buffer or not.
                 */
                trivial = 0;
                n = lblksize - loff;
                if (n > uio->uio_resid) {
                        n = uio->uio_resid;
                        if (loff == 0 && uio->uio_offset + n == new_eof)
                                trivial = 1;
                        endofblk = 0;
                } else {
                        if (loff == 0)
                                trivial = 1;
                        endofblk = 1;
                }
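
                /*
                 * (Worked example: with lblksize 65536 and loff 0, a
                 * 65536-byte uio_resid exactly covers the buffer, so
                 * trivial = 1 and endofblk = 1; a 100-byte write at
                 * loff 0 that does not reach new_eof leaves both 0.)
                 */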

                /*
                 * Get the buffer
                 */
                if (uio->uio_segflg == UIO_NOCOPY) {
                        /*
                         * Issuing a write with the same data backing the
                         * buffer.  Instantiate the buffer to collect the
                         * backing vm pages, then read-in any missing bits.
                         *
                         * This case is used by vop_stdputpages().
                         */
                        bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0) {
                                bqrelse(bp);
                                error = bread(ip->vp, lbase, lblksize, &bp);
                        }
                } else if (trivial) {
                        /*
                         * Even though we are entirely overwriting the buffer
                         * we may still have to zero it out to avoid a
                         * mmap/write visibility issue.
                         */
                        bp = getblk(ip->vp, lbase, lblksize, GETBLK_BHEAVY, 0);
                        if ((bp->b_flags & B_CACHE) == 0)
                                vfs_bio_clrbuf(bp);
                } else {
                        /*
                         * Partial overwrite, read in any missing bits then
                         * replace the portion being written.
                         *
                         * (The strategy code will detect zero-fill physical
                         * blocks for this case).
                         */
                        error = bread(ip->vp, lbase, lblksize, &bp);
                        if (error == 0)
                                bheavy(bp);
                }

                if (error) {
                        brelse(bp);
                        break;
                }

                /*
                 * Ok, copy the data in
                 */
                error = uiomove(bp->b_data + loff, n, uio);
                kflags |= NOTE_WRITE;
                modified = 1;
                if (error) {
                        brelse(bp);
                        break;
                }

                /*
                 * WARNING: Pageout daemon will issue UIO_NOCOPY writes
                 *          with IO_SYNC or IO_ASYNC set.  These writes
                 *          must be handled as the pageout daemon expects.
                 */
                if (ioflag & IO_SYNC) {
                        bwrite(bp);
                } else if ((ioflag & IO_DIRECT) && endofblk) {
                        bawrite(bp);
                } else if (ioflag & IO_ASYNC) {
                        bawrite(bp);
                } else {
                        bdwrite(bp);
                }
        }

        /*
         * Cleanup.  If we extended the file EOF but failed to write through,
         * the entire write is a failure and we have to back up.
         */
        if (error && new_eof != old_eof) {
                hammer2_truncate_file(ip, old_eof);
        } else if (modified) {
                hammer2_mtx_ex(&ip->lock);
                hammer2_update_time(&ip->meta.mtime);
                atomic_set_int(&ip->flags, HAMMER2_INODE_MTIME);
                hammer2_mtx_unlock(&ip->lock);
        }
        atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        hammer2_knote(ip->vp, kflags);
        vsetisdirty(ip->vp);
        hammer2_trans_assert_strategy(ip->pmp);

        return error;
}

/*
 * Truncate the size of a file.  The inode must not be locked.
 *
 * NOTE:    Caller handles setting HAMMER2_INODE_MODIFIED
 *
 * WARNING: nvtruncbuf() can only be safely called without the inode lock
 *          held due to the way our write thread works.
 *
 * WARNING! Assumes that the kernel interlocks size changes at the
 *          vnode level.
 */
static
void
hammer2_truncate_file(hammer2_inode_t *ip, hammer2_key_t nsize)
{
        hammer2_key_t lbase;
        int nblksize;

        LOCKSTART;
        if (ip->vp) {
                nblksize = hammer2_calc_logical(ip, nsize, &lbase, NULL);
                nvtruncbuf(ip->vp, nsize,
                           nblksize, (int)nsize & (nblksize - 1),
                           0);
        }
        hammer2_mtx_ex(&ip->lock);
        ip->meta.size = nsize;
        atomic_set_int(&ip->flags, HAMMER2_INODE_RESIZED);
        hammer2_mtx_unlock(&ip->lock);
        LOCKSTOP;
}

/*
 * Extend the size of a file.  The inode must not be locked.
 *
 * WARNING! Assumes that the kernel interlocks size changes at the
 *          vnode level.
 *
 * NOTE: Caller handles setting HAMMER2_INODE_MODIFIED
 */
static
void
hammer2_extend_file(hammer2_inode_t *ip, hammer2_key_t nsize)
{
        hammer2_key_t lbase;
        hammer2_key_t osize;
        int oblksize;
        int nblksize;

        LOCKSTART;
        hammer2_mtx_ex(&ip->lock);
        osize = ip->meta.size;
        ip->meta.size = nsize;
        hammer2_mtx_unlock(&ip->lock);

        if (ip->vp) {
                oblksize = hammer2_calc_logical(ip, osize, &lbase, NULL);
                nblksize = hammer2_calc_logical(ip, nsize, &lbase, NULL);
                nvextendbuf(ip->vp,
                            osize, nsize,
                            oblksize, nblksize,
                            -1, -1, 0);
        }
        atomic_set_int(&ip->flags, HAMMER2_INODE_RESIZED);
        LOCKSTOP;
}

static
int
hammer2_vop_nresolve(struct vop_nresolve_args *ap)
{
        hammer2_inode_t *ip;
        hammer2_inode_t *dip;
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        const hammer2_inode_data_t *ripdata;
        hammer2_key_t key_next;
        hammer2_key_t lhc;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
        int error = 0;
        struct vnode *vp;

        LOCKSTART;
        dip = VTOI(ap->a_dvp);
        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
        lhc = hammer2_dirhash(name, name_len);

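        /*
         * (Note: entries sharing the directory hash are scanned below
         * over the lhc..lhc+HAMMER2_DIRHASH_LOMASK collision range and
         * disambiguated by comparing the stored filename.)
         */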
        /*
         * Note: In DragonFly the kernel handles '.' and '..'.
         */
        cparent = hammer2_inode_lock(dip, HAMMER2_RESOLVE_ALWAYS |
                                          HAMMER2_RESOLVE_SHARED);

        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                         HAMMER2_LOOKUP_SHARED);
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
                        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        if (ripdata->meta.name_len == name_len &&
                            bcmp(ripdata->filename, name, name_len) == 0) {
                                break;
                        }
                }
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                               key_next,
                                               lhc + HAMMER2_DIRHASH_LOMASK,
                                               HAMMER2_LOOKUP_SHARED);
        }
        hammer2_inode_unlock(dip, cparent);

        /*
         * Resolve hardlink entries before acquiring the inode.
         */
        if (cluster) {
                ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                if (ripdata->meta.type == HAMMER2_OBJTYPE_HARDLINK) {
                        hammer2_tid_t inum = ripdata->meta.inum;
                        error = hammer2_hardlink_find(dip, NULL, &cluster);
                        if (error) {
                                kprintf("hammer2: unable to find hardlink "
                                        "0x%016jx\n", inum);
                                LOCKSTOP;

                                return error;
                        }
                }
        }

        /*
         * nresolve needs to resolve hardlinks; the original cluster is not
         * sufficient.
         */
        if (cluster) {
                ip = hammer2_inode_get(dip->pmp, dip, cluster);
                ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                if (ripdata->meta.type == HAMMER2_OBJTYPE_HARDLINK) {
                        kprintf("nresolve: fixup hardlink\n");
                        hammer2_inode_ref(ip);
                        hammer2_inode_unlock(ip, NULL);
                        hammer2_cluster_unlock(cluster);
                        hammer2_cluster_drop(cluster);
                        cluster = hammer2_inode_lock(ip,
                                                     HAMMER2_RESOLVE_ALWAYS);
                        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        hammer2_inode_drop(ip);
                        kprintf("nresolve: fixup to type %02x\n",
                                ripdata->meta.type);
                }
        } else {
                ip = NULL;
        }

#if 0
        /*
         * Deconsolidate any hardlink whose nlinks == 1.  Ignore errors.
         * If an error occurs chain and ip are left alone.
         *
         * XXX upgrade shared lock?
         */
        if (ochain && chain &&
            chain->data->ipdata.meta.nlinks == 1 && !dip->pmp->ronly) {
                kprintf("hammer2: need to unconsolidate hardlink for %s\n",
                        chain->data->ipdata.filename);
                /* XXX retain shared lock on dip? (currently not held) */
                hammer2_trans_init(&trans, dip->pmp, 0);
                hammer2_hardlink_deconsolidate(&trans, dip, &chain, &ochain);
                hammer2_trans_done(&trans);
        }
#endif

        /*
         * Acquire the related vnode
         *
         * NOTE: For error processing, only ENOENT resolves the namecache
         *       entry to NULL, otherwise we just return the error and
         *       leave the namecache unresolved.
         *
         * NOTE: multiple hammer2_inode structures can be aliased to the
         *       same chain element, for example for hardlinks.  This
         *       use case does not 'reattach' inode associations that
         *       might already exist, but always allocates a new one.
         *
         * WARNING: inode structure is locked exclusively via inode_get
         *          but chain was locked shared.  inode_unlock()
         *          will handle it properly.
         */
        if (cluster) {
                vp = hammer2_igetv(ip, cluster, &error);
                if (error == 0) {
                        vn_unlock(vp);
                        cache_setvp(ap->a_nch, vp);
                } else if (error == ENOENT) {
                        cache_setvp(ap->a_nch, NULL);
                }
                hammer2_inode_unlock(ip, cluster);

                /*
                 * The vp should not be released until after we've disposed
                 * of our locks, because it might cause vop_inactive() to
                 * be called.
                 */
                if (vp)
                        vrele(vp);
        } else {
                error = ENOENT;
                cache_setvp(ap->a_nch, NULL);
        }
        KASSERT(error || ap->a_nch->ncp->nc_vp != NULL,
                ("resolve error %d/%p ap %p\n",
                 error, ap->a_nch->ncp->nc_vp, ap));
        LOCKSTOP;
        return error;
}

static
int
hammer2_vop_nlookupdotdot(struct vop_nlookupdotdot_args *ap)
{
        hammer2_inode_t *dip;
        hammer2_inode_t *ip;
        hammer2_cluster_t *cparent;
        int error;

        LOCKSTART;
        dip = VTOI(ap->a_dvp);

        if ((ip = dip->pip) == NULL) {
                *ap->a_vpp = NULL;
                LOCKSTOP;
                return ENOENT;
        }
        cparent = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS);
        *ap->a_vpp = hammer2_igetv(ip, cparent, &error);
        hammer2_inode_unlock(ip, cparent);

        LOCKSTOP;
        return error;
}

static
int
hammer2_vop_nmkdir(struct vop_nmkdir_args *ap)
{
        hammer2_inode_t *dip;
        hammer2_inode_t *nip;
        hammer2_trans_t trans;
        hammer2_cluster_t *cluster;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
        int error;

        LOCKSTART;
        dip = VTOI(ap->a_dvp);
        if (dip->pmp->ronly) {
                LOCKSTOP;
                return (EROFS);
        }

        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;
        cluster = NULL;

        hammer2_pfs_memory_wait(dip->pmp);
        hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
        nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
                                   name, name_len,
                                   &cluster, 0, &error);
        if (error) {
                KKASSERT(nip == NULL);
                *ap->a_vpp = NULL;
        } else {
                *ap->a_vpp = hammer2_igetv(nip, cluster, &error);
                hammer2_inode_unlock(nip, cluster);
        }
        hammer2_trans_done(&trans);

        if (error == 0) {
                cache_setunresolved(ap->a_nch);
                cache_setvp(ap->a_nch, *ap->a_vpp);
        }
        LOCKSTOP;
        return error;
}

static
int
hammer2_vop_open(struct vop_open_args *ap)
{
        return vop_stdopen(ap);
}

/*
 * hammer2_vop_advlock { vp, id, op, fl, flags }
 */
static
int
hammer2_vop_advlock(struct vop_advlock_args *ap)
{
        hammer2_inode_t *ip = VTOI(ap->a_vp);
        const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cparent;
        hammer2_off_t size;

        cparent = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS |
                                         HAMMER2_RESOLVE_SHARED);
        ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
        size = ripdata->meta.size;
        hammer2_inode_unlock(ip, cparent);
        return (lf_advlock(ap, &ip->advlock, size));
}


static
int
hammer2_vop_close(struct vop_close_args *ap)
{
        return vop_stdclose(ap);
}

/*
 * hammer2_vop_nlink { nch, dvp, vp, cred }
 *
 * Create a hardlink from (vp) to {dvp, nch}.
 */
static
int
hammer2_vop_nlink(struct vop_nlink_args *ap)
{
        hammer2_inode_t *fdip;  /* directory of the file being linked */
        hammer2_inode_t *tdip;  /* target directory to create link in */
        hammer2_inode_t *cdip;  /* common parent directory */
        hammer2_inode_t *ip;    /* inode we are hardlinking to */
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *fdcluster;
        hammer2_cluster_t *tdcluster;
        hammer2_cluster_t *cdcluster;
        hammer2_trans_t trans;
        struct namecache *ncp;
        const uint8_t *name;
        size_t name_len;
        int error;

        LOCKSTART;
        tdip = VTOI(ap->a_dvp);
        if (tdip->pmp->ronly) {
                LOCKSTOP;
                return (EROFS);
        }

        ncp = ap->a_nch->ncp;
        name = ncp->nc_name;
        name_len = ncp->nc_nlen;

        /*
         * ip represents the file being hardlinked.  The file could be a
         * normal file or a hardlink target if it has already been hardlinked.
         * If ip is a hardlinked target then ip->pip represents the location
         * of the hardlinked target, NOT the location of the hardlink pointer.
         *
         * Bump nlinks and potentially also create or move the hardlink
         * target in the parent directory common to (ip) and (tdip).  The
         * consolidation code can modify ip->cluster and ip->pip.  The
         * returned cluster is locked.
         */
        ip = VTOI(ap->a_vp);
        hammer2_pfs_memory_wait(ip->pmp);
        hammer2_trans_init(&trans, ip->pmp, HAMMER2_TRANS_NEWINODE);

        /*
         * The common parent directory must be locked first to avoid deadlocks.
         * Also note that fdip and/or tdip might match cdip.
         */
        fdip = ip->pip;
        cdip = hammer2_inode_common_parent(fdip, tdip);
        cdcluster = hammer2_inode_lock(cdip, HAMMER2_RESOLVE_ALWAYS);
        fdcluster = hammer2_inode_lock(fdip, HAMMER2_RESOLVE_ALWAYS);
        tdcluster = hammer2_inode_lock(tdip, HAMMER2_RESOLVE_ALWAYS);
        cluster = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS);
        error = hammer2_hardlink_consolidate(&trans, ip, &cluster,
                                             cdip, cdcluster, 1);
        if (error)
                goto done;

        /*
         * Create a directory entry connected to the specified cluster.
         *
         * WARNING! chain can get moved by the connect (indirectly due to
         *          potential indirect block creation).
         */
        error = hammer2_inode_connect(&trans,
                                      ip, &cluster, 1,
                                      tdip, tdcluster,
                                      name, name_len, 0);
        if (error == 0) {
                cache_setunresolved(ap->a_nch);
                cache_setvp(ap->a_nch, ap->a_vp);
        }
done:
        hammer2_inode_unlock(ip, cluster);
        hammer2_inode_unlock(tdip, tdcluster);
        hammer2_inode_unlock(fdip, fdcluster);
        hammer2_inode_unlock(cdip, cdcluster);
        hammer2_inode_drop(cdip);
        hammer2_trans_done(&trans);

        LOCKSTOP;
        return error;
}
1476
1477 /*
1478  * hammer2_vop_ncreate { nch, dvp, vpp, cred, vap }
1479  *
1480  * The operating system has already ensured that the directory entry
1481  * does not exist and done all appropriate namespace locking.
1482  */
1483 static
1484 int
1485 hammer2_vop_ncreate(struct vop_ncreate_args *ap)
1486 {
1487         hammer2_inode_t *dip;
1488         hammer2_inode_t *nip;
1489         hammer2_trans_t trans;
1490         hammer2_cluster_t *ncluster;
1491         struct namecache *ncp;
1492         const uint8_t *name;
1493         size_t name_len;
1494         int error;
1495
1496         LOCKSTART;
1497         dip = VTOI(ap->a_dvp);
1498         if (dip->pmp->ronly) {
1499                 LOCKSTOP;
1500                 return (EROFS);
1501         }
1502
1503         ncp = ap->a_nch->ncp;
1504         name = ncp->nc_name;
1505         name_len = ncp->nc_nlen;
1506         hammer2_pfs_memory_wait(dip->pmp);
1507         hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
1508         ncluster = NULL;
1509
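             /*
              * On success hammer2_inode_create() returns the new inode
              * and its cluster locked; the lock is dropped once the
              * vnode has been acquired via hammer2_igetv().
              */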
1510         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1511                                    name, name_len,
1512                                    &ncluster, 0, &error);
1513         if (error) {
1514                 KKASSERT(nip == NULL);
1515                 *ap->a_vpp = NULL;
1516         } else {
1517                 *ap->a_vpp = hammer2_igetv(nip, ncluster, &error);
1518                 hammer2_inode_unlock(nip, ncluster);
1519         }
1520         hammer2_trans_done(&trans);
1521
1522         if (error == 0) {
1523                 cache_setunresolved(ap->a_nch);
1524                 cache_setvp(ap->a_nch, *ap->a_vpp);
1525         }
1526         LOCKSTOP;
1527         return error;
1528 }
1529
1530 /*
1531  * Make a device node (typically a fifo)
1532  */
1533 static
1534 int
1535 hammer2_vop_nmknod(struct vop_nmknod_args *ap)
1536 {
1537         hammer2_inode_t *dip;
1538         hammer2_inode_t *nip;
1539         hammer2_trans_t trans;
1540         hammer2_cluster_t *ncluster;
1541         struct namecache *ncp;
1542         const uint8_t *name;
1543         size_t name_len;
1544         int error;
1545
1546         LOCKSTART;
1547         dip = VTOI(ap->a_dvp);
1548         if (dip->pmp->ronly) {
1549                 LOCKSTOP;
1550                 return (EROFS);
1551         }
1552
1553         ncp = ap->a_nch->ncp;
1554         name = ncp->nc_name;
1555         name_len = ncp->nc_nlen;
1556         hammer2_pfs_memory_wait(dip->pmp);
1557         hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
1558         ncluster = NULL;
1559
1560         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1561                                    name, name_len,
1562                                    &ncluster, 0, &error);
1563         if (error) {
1564                 KKASSERT(nip == NULL);
1565                 *ap->a_vpp = NULL;
1566         } else {
1567                 *ap->a_vpp = hammer2_igetv(nip, ncluster, &error);
1568                 hammer2_inode_unlock(nip, ncluster);
1569         }
1570         hammer2_trans_done(&trans);
1571
1572         if (error == 0) {
1573                 cache_setunresolved(ap->a_nch);
1574                 cache_setvp(ap->a_nch, *ap->a_vpp);
1575         }
1576         LOCKSTOP;
1577         return error;
1578 }
1579
1580 /*
1581  * hammer2_vop_nsymlink { nch, dvp, vpp, cred, vap, target }
1582  */
1583 static
1584 int
1585 hammer2_vop_nsymlink(struct vop_nsymlink_args *ap)
1586 {
1587         hammer2_inode_t *dip;
1588         hammer2_inode_t *nip;
1589         hammer2_cluster_t *ncparent;
1590         hammer2_trans_t trans;
1591         struct namecache *ncp;
1592         const uint8_t *name;
1593         size_t name_len;
1594         int error;
1595         
1596         dip = VTOI(ap->a_dvp);
1597         if (dip->pmp->ronly)
1598                 return (EROFS);
1599
1600         ncp = ap->a_nch->ncp;
1601         name = ncp->nc_name;
1602         name_len = ncp->nc_nlen;
1603         hammer2_pfs_memory_wait(dip->pmp);
1604         hammer2_trans_init(&trans, dip->pmp, HAMMER2_TRANS_NEWINODE);
1605         ncparent = NULL;
1606
1607         ap->a_vap->va_type = VLNK;      /* enforce type */
1608
1609         nip = hammer2_inode_create(&trans, dip, ap->a_vap, ap->a_cred,
1610                                    name, name_len,
1611                                    &ncparent, 0, &error);
1612         if (error) {
1613                 KKASSERT(nip == NULL);
1614                 *ap->a_vpp = NULL;
1615                 hammer2_trans_done(&trans);
1616                 return error;
1617         }
1618         *ap->a_vpp = hammer2_igetv(nip, ncparent, &error);
1619
1620         /*
1621          * Build the softlink (stored like file data) and finalize the namecache.
1622          */
1623         if (error == 0) {
1624                 size_t bytes;
1625                 struct uio auio;
1626                 struct iovec aiov;
1627                 hammer2_inode_data_t *nipdata;
1628
1629                 nipdata = &hammer2_cluster_wdata(ncparent)->ipdata;
1630                 /* nipdata = &nip->chain->data->ipdata;XXX */
1631                 bytes = strlen(ap->a_target);
1632
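                     /*
                      * Short targets are embedded directly in the inode
                      * (DIRECTDATA); longer targets are written out as
                      * ordinary file data below.
                      */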
1633                 if (bytes <= HAMMER2_EMBEDDED_BYTES) {
1634                         KKASSERT(nipdata->meta.op_flags &
1635                                  HAMMER2_OPFLAG_DIRECTDATA);
1636                         bcopy(ap->a_target, nipdata->u.data, bytes);
1637                         nipdata->meta.size = bytes;
1638                         nip->meta.size = bytes;
1639                         hammer2_cluster_modsync(ncparent);
1640                         hammer2_inode_unlock(nip, ncparent);
1641                         /* nipdata = NULL; not needed */
1642                 } else {
1643                         hammer2_inode_unlock(nip, ncparent);
1644                         /* nipdata = NULL; not needed */
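                             /*
                              * Write the target path through the normal
                              * file write path using a kernel-space uio.
                              */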
1645                         bzero(&auio, sizeof(auio));
1646                         bzero(&aiov, sizeof(aiov));
1647                         auio.uio_iov = &aiov;
1648                         auio.uio_segflg = UIO_SYSSPACE;
1649                         auio.uio_rw = UIO_WRITE;
1650                         auio.uio_resid = bytes;
1651                         auio.uio_iovcnt = 1;
1652                         auio.uio_td = curthread;
1653                         aiov.iov_base = ap->a_target;
1654                         aiov.iov_len = bytes;
1655                         error = hammer2_write_file(nip, &auio, IO_APPEND, 0);
1656                         /* XXX handle error */
1657                         error = 0;
1658                 }
1659         } else {
1660                 hammer2_inode_unlock(nip, ncparent);
1661         }
1662         hammer2_trans_done(&trans);
1663
1664         /*
1665          * Finalize namecache
1666          */
1667         if (error == 0) {
1668                 cache_setunresolved(ap->a_nch);
1669                 cache_setvp(ap->a_nch, *ap->a_vpp);
1670                 /* hammer2_knote(ap->a_dvp, NOTE_WRITE); */
1671         }
1672         return error;
1673 }
1674
1675 /*
1676  * hammer2_vop_nremove { nch, dvp, cred }
1677  */
1678 static
1679 int
1680 hammer2_vop_nremove(struct vop_nremove_args *ap)
1681 {
1682         hammer2_inode_t *dip;
1683         hammer2_trans_t trans;
1684         struct namecache *ncp;
1685         const uint8_t *name;
1686         size_t name_len;
1687         int error;
1688
1689         LOCKSTART;
1690         dip = VTOI(ap->a_dvp);
1691         if (dip->pmp->ronly) {
1692                 LOCKSTOP;
1693                 return(EROFS);
1694         }
1695
1696         ncp = ap->a_nch->ncp;
1697         name = ncp->nc_name;
1698         name_len = ncp->nc_nlen;
1699
1700         hammer2_pfs_memory_wait(dip->pmp);
1701         hammer2_trans_init(&trans, dip->pmp, 0);
1702         error = hammer2_unlink_file(&trans, dip, name, name_len,
1703                                     0, NULL, ap->a_nch, -1);
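             /*
              * Process any deferred deletions of open-but-unlinked
              * files (see hammer2_run_unlinkq() below).
              */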
1704         hammer2_run_unlinkq(&trans, dip->pmp);
1705         hammer2_trans_done(&trans);
1706         if (error == 0)
1707                 cache_unlink(ap->a_nch);
1708         LOCKSTOP;
1709         return (error);
1710 }
1711
1712 /*
1713  * hammer2_vop_nrmdir { nch, dvp, cred }
1714  */
1715 static
1716 int
1717 hammer2_vop_nrmdir(struct vop_nrmdir_args *ap)
1718 {
1719         hammer2_inode_t *dip;
1720         hammer2_trans_t trans;
1721         struct namecache *ncp;
1722         const uint8_t *name;
1723         size_t name_len;
1724         int error;
1725
1726         LOCKSTART;
1727         dip = VTOI(ap->a_dvp);
1728         if (dip->pmp->ronly) {
1729                 LOCKSTOP;
1730                 return(EROFS);
1731         }
1732
1733         ncp = ap->a_nch->ncp;
1734         name = ncp->nc_name;
1735         name_len = ncp->nc_nlen;
1736
1737         hammer2_pfs_memory_wait(dip->pmp);
1738         hammer2_trans_init(&trans, dip->pmp, 0);
1739         hammer2_run_unlinkq(&trans, dip->pmp);
1740         error = hammer2_unlink_file(&trans, dip, name, name_len,
1741                                     1, NULL, ap->a_nch, -1);
1742         hammer2_trans_done(&trans);
1743         if (error == 0)
1744                 cache_unlink(ap->a_nch);
1745         LOCKSTOP;
1746         return (error);
1747 }
1748
1749 /*
1750  * hammer2_vop_nrename { fnch, tnch, fdvp, tdvp, cred }
1751  */
1752 static
1753 int
1754 hammer2_vop_nrename(struct vop_nrename_args *ap)
1755 {
1756         struct namecache *fncp;
1757         struct namecache *tncp;
1758         hammer2_inode_t *cdip;
1759         hammer2_inode_t *fdip;
1760         hammer2_inode_t *tdip;
1761         hammer2_inode_t *ip;
1762         hammer2_cluster_t *cluster;
1763         hammer2_cluster_t *fdcluster;
1764         hammer2_cluster_t *tdcluster;
1765         hammer2_cluster_t *cdcluster;
1766         hammer2_trans_t trans;
1767         const uint8_t *fname;
1768         size_t fname_len;
1769         const uint8_t *tname;
1770         size_t tname_len;
1771         int error;
1772         int tnch_error;
1773         int hlink;
1774
1775         if (ap->a_fdvp->v_mount != ap->a_tdvp->v_mount)
1776                 return(EXDEV);
1777         if (ap->a_fdvp->v_mount != ap->a_fnch->ncp->nc_vp->v_mount)
1778                 return(EXDEV);
1779
1780         fdip = VTOI(ap->a_fdvp);        /* source directory */
1781         tdip = VTOI(ap->a_tdvp);        /* target directory */
1782
1783         if (fdip->pmp->ronly)
1784                 return(EROFS);
1785
1786         LOCKSTART;
1787         fncp = ap->a_fnch->ncp;         /* entry name in source */
1788         fname = fncp->nc_name;
1789         fname_len = fncp->nc_nlen;
1790
1791         tncp = ap->a_tnch->ncp;         /* entry name in target */
1792         tname = tncp->nc_name;
1793         tname_len = tncp->nc_nlen;
1794
1795         hammer2_pfs_memory_wait(tdip->pmp);
1796         hammer2_trans_init(&trans, tdip->pmp, 0);
1797
1798         /*
1799          * ip is the inode being renamed.  If this is a hardlink then
1800          * ip represents the actual file and not the hardlink marker.
1801          */
1802         ip = VTOI(fncp->nc_vp);
1803         cluster = NULL;
1804
1805
1806         /*
1807          * The common parent directory must be locked first to avoid deadlocks.
1808          * Also note that fdip and/or tdip might match cdip.
1809          *
1810          * WARNING! fdip may not match ip->pip.  That is, if the source file
1811          *          is already a hardlink then what we are renaming is the
1812          *          hardlink pointer, not the hardlink itself.  The hardlink
1813          *          directory (ip->pip) will already be at a common parent
1814          *          of fdip.
1815          *
1816          *          Be sure to use ip->pip when finding the common parent
1817          *          against tdip or we might accidentally move the hardlink
1818          *          target into a subdirectory that makes it inaccessible to
1819          *          other pointers.
1820          */
1821         cdip = hammer2_inode_common_parent(ip->pip, tdip);
1822         cdcluster = hammer2_inode_lock(cdip, HAMMER2_RESOLVE_ALWAYS);
1823         fdcluster = hammer2_inode_lock(fdip, HAMMER2_RESOLVE_ALWAYS);
1824         tdcluster = hammer2_inode_lock(tdip, HAMMER2_RESOLVE_ALWAYS);
1825
1826         /*
1827          * Keep a tight grip on the inode so the temporary unlinking from
1828          * the source location prior to linking to the target location
1829          * does not cause the cluster to be destroyed.
1830          *
1831          * NOTE: To avoid deadlocks we cannot lock (ip) while we are
1832          *       unlinking elements from their directories.  Locking
1833          *       the nlinks field does not lock the whole inode.
1834          */
1835         hammer2_inode_ref(ip);
1836
1837         /*
1838          * Remove target if it exists.
1839          */
1840         error = hammer2_unlink_file(&trans, tdip, tname, tname_len,
1841                                     -1, NULL, ap->a_tnch, -1);
1842         tnch_error = error;
1843         if (error && error != ENOENT)
1844                 goto done;
1845
1846         /*
1847          * When renaming a hardlinked file we may have to re-consolidate
1848          * the location of the hardlink target.
1849          *
1850          * If ip represents a regular file the consolidation code essentially
1851          * does nothing other than return the same locked cluster that was
1852          * passed in.
1853          *
1854          * The returned cluster will be locked.
1855          *
1856          * WARNING!  We do not currently have a local copy of ipdata, but
1857          *           if we use one later, remember that it must be reloaded
1858          *           on any modification to the inode, including connects.
1859          */
1860         cluster = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS);
1861         error = hammer2_hardlink_consolidate(&trans, ip, &cluster,
1862                                              cdip, cdcluster, 0);
1863         if (error)
1864                 goto done;
1865
1866         /*
1867          * Disconnect (fdip, fname) from the source directory.  This will
1868          * disconnect (ip) if it represents a direct file.  If (ip) represents
1869          * a hardlink the HARDLINK pointer object will be removed but the
1870          * hardlink will stay intact.
1871          *
1872          * Always pass nch as NULL because we intend to reconnect the inode,
1873          * so we don't want hammer2_unlink_file() to rename it to the hidden
1874          * open-but-unlinked directory.
1875          *
1876          * The target cluster may be marked DELETED but will not be destroyed
1877          * since we retain our hold on ip and cluster.
1878          *
1879          * NOTE: We pass nlinks as 0 (not -1) in order to retain the file's
1880          *       link count.
1881          */
1882         error = hammer2_unlink_file(&trans, fdip, fname, fname_len,
1883                                     -1, &hlink, NULL, 0);
1884         KKASSERT(error != EAGAIN);
1885         if (error)
1886                 goto done;
1887
1888         /*
1889          * Reconnect ip to target directory using cluster.  Chains cannot
1890          * actually be moved, so this will duplicate the cluster in the new
1891          * spot and assign it to the ip, replacing the old cluster.
1892          *
1893          * WARNING: Because recursive locks are allowed and we unlinked the
1894          *          file that we have a cluster-in-hand for just above, the
1895          *          cluster might have been delete-duplicated.  We must
1896          *          refactor the cluster.
1897          *
1898          * WARNING: Chain locks can lock buffer cache buffers; to avoid
1899          *          deadlocks we want to unlock before issuing a cache_*()
1900          *          op (that might have to lock a vnode).
1901          *
1902          * NOTE:    Pass nlinks as 0 because we retained the link count from
1903          *          the unlink, so we do not have to modify it.
1904          */
1905         error = hammer2_inode_connect(&trans,
1906                                       ip, &cluster, hlink,
1907                                       tdip, tdcluster,
1908                                       tname, tname_len, 0);
1909         if (error == 0) {
1910                 KKASSERT(cluster != NULL);
1911                 hammer2_inode_repoint(ip, (hlink ? ip->pip : tdip), cluster);
1912         }
1913 done:
1914         hammer2_inode_unlock(ip, cluster);
1915         hammer2_inode_unlock(tdip, tdcluster);
1916         hammer2_inode_unlock(fdip, fdcluster);
1917         hammer2_inode_unlock(cdip, cdcluster);
1918         hammer2_inode_drop(ip);
1919         hammer2_inode_drop(cdip);
1920         hammer2_run_unlinkq(&trans, fdip->pmp);
1921         hammer2_trans_done(&trans);
1922
1923         /*
1924          * Issue the namecache update after unlocking all the internal
1925          * hammer structures, otherwise we might deadlock.
1926          */
1927         if (tnch_error == 0) {
1928                 cache_unlink(ap->a_tnch);
1929                 cache_setunresolved(ap->a_tnch);
1930         }
1931         if (error == 0)
1932                 cache_rename(ap->a_fnch, ap->a_tnch);
1933
1934         LOCKSTOP;
1935         return (error);
1936 }
1937
1938 /*
1939  * hammer2_vop_ioctl { vp, command, data, fflag, cred }
1940  */
1941 static
1942 int
1943 hammer2_vop_ioctl(struct vop_ioctl_args *ap)
1944 {
1945         hammer2_inode_t *ip;
1946         int error;
1947
1948         LOCKSTART;
1949         ip = VTOI(ap->a_vp);
1950
1951         error = hammer2_ioctl(ip, ap->a_command, (void *)ap->a_data,
1952                               ap->a_fflag, ap->a_cred);
1953         LOCKSTOP;
1954         return (error);
1955 }
1956
1957 static
1958 int 
1959 hammer2_vop_mountctl(struct vop_mountctl_args *ap)
1960 {
1961         struct mount *mp;
1962         hammer2_pfs_t *pmp;
1963         int rc;
1964
1965         LOCKSTART;
1966         switch (ap->a_op) {
1967         case MOUNTCTL_SET_EXPORT:
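                     /*
                      * Update the NFS export information for the PFS
                      * backing this mount.
                      */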
1968                 mp = ap->a_head.a_ops->head.vv_mount;
1969                 pmp = MPTOPMP(mp);
1970
1971                 if (ap->a_ctllen != sizeof(struct export_args))
1972                         rc = (EINVAL);
1973                 else
1974                         rc = vfs_export(mp, &pmp->export,
1975                                         (const struct export_args *)ap->a_ctl);
1976                 break;
1977         default:
1978                 rc = vop_stdmountctl(ap);
1979                 break;
1980         }
1981         LOCKSTOP;
1982         return (rc);
1983 }
1984
1985 /*
1986  * This handles unlinked open files after the vnode is finally dereferenced.
1987  * To avoid deadlocks it cannot be called from the normal vnode recycling
1988  * path, so we call it (1) after an unlink, rmdir, or rename, (2) on every
1989  * flush, and (3) on umount.
1990  */
1991 void
1992 hammer2_run_unlinkq(hammer2_trans_t *trans, hammer2_pfs_t *pmp)
1993 {
1994         const hammer2_inode_data_t *ripdata;
1995         hammer2_inode_unlink_t *ipul;
1996         hammer2_inode_t *ip;
1997         hammer2_cluster_t *cluster;
1998         hammer2_cluster_t *cparent;
1999
2000         if (TAILQ_EMPTY(&pmp->unlinkq))
2001                 return;
2002
2003         LOCKSTART;
2004         hammer2_spin_ex(&pmp->list_spin);
2005         while ((ipul = TAILQ_FIRST(&pmp->unlinkq)) != NULL) {
2006                 TAILQ_REMOVE(&pmp->unlinkq, ipul, entry);
2007                 hammer2_spin_unex(&pmp->list_spin);
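                     /*
                      * The list spinlock cannot be held across the
                      * blocking inode/cluster operations below; it is
                      * reacquired at the bottom of the loop.
                      */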
2008                 ip = ipul->ip;
2009                 kfree(ipul, pmp->minode);
2010
2011                 cluster = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS);
2012                 ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
2013                 if (hammer2_debug & 0x400) {
2014                         kprintf("hammer2: unlink on reclaim: %s refs=%d\n",
2015                                 ripdata->filename, ip->refs);
2016                 }
2017
2018                 /*
2019                  * NOTE: Due to optimizations to avoid I/O on the inode for
2020                  *       the last unlink, ripdata->nlinks is not necessarily
2021                  *       0 here.
2022                  */
2023                 /* KKASSERT(ripdata->nlinks == 0); (see NOTE) */
2024                 cparent = hammer2_cluster_parent(cluster);
2025                 hammer2_cluster_delete(trans, cparent, cluster,
2026                                        HAMMER2_DELETE_PERMANENT);
2027                 hammer2_cluster_unlock(cparent);
2028                 hammer2_cluster_drop(cparent);
2029                 hammer2_inode_unlock(ip, cluster);      /* inode lock */
2030                 hammer2_inode_drop(ip);                 /* ipul ref */
2031
2032                 hammer2_spin_ex(&pmp->list_spin);
2033         }
2034         hammer2_spin_unex(&pmp->list_spin);
2035         LOCKSTOP;
2036 }
2037
2038
2039 /*
2040  * KQFILTER
2041  */
2042 static void filt_hammer2detach(struct knote *kn);
2043 static int filt_hammer2read(struct knote *kn, long hint);
2044 static int filt_hammer2write(struct knote *kn, long hint);
2045 static int filt_hammer2vnode(struct knote *kn, long hint);
2046
2047 static struct filterops hammer2read_filtops =
2048         { FILTEROP_ISFD | FILTEROP_MPSAFE,
2049           NULL, filt_hammer2detach, filt_hammer2read };
2050 static struct filterops hammer2write_filtops =
2051         { FILTEROP_ISFD | FILTEROP_MPSAFE,
2052           NULL, filt_hammer2detach, filt_hammer2write };
2053 static struct filterops hammer2vnode_filtops =
2054         { FILTEROP_ISFD | FILTEROP_MPSAFE,
2055           NULL, filt_hammer2detach, filt_hammer2vnode };
2056
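     /*
      * All three filters share the same detach function and attach
      * their knotes to the vnode's ki_note list.
      */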
2057 static
2058 int
2059 hammer2_vop_kqfilter(struct vop_kqfilter_args *ap)
2060 {
2061         struct vnode *vp = ap->a_vp;
2062         struct knote *kn = ap->a_kn;
2063
2064         switch (kn->kn_filter) {
2065         case EVFILT_READ:
2066                 kn->kn_fop = &hammer2read_filtops;
2067                 break;
2068         case EVFILT_WRITE:
2069                 kn->kn_fop = &hammer2write_filtops;
2070                 break;
2071         case EVFILT_VNODE:
2072                 kn->kn_fop = &hammer2vnode_filtops;
2073                 break;
2074         default:
2075                 return (EOPNOTSUPP);
2076         }
2077
2078         kn->kn_hook = (caddr_t)vp;
2079
2080         knote_insert(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
2081
2082         return(0);
2083 }
2084
2085 static void
2086 filt_hammer2detach(struct knote *kn)
2087 {
2088         struct vnode *vp = (void *)kn->kn_hook;
2089
2090         knote_remove(&vp->v_pollinfo.vpi_kqinfo.ki_note, kn);
2091 }
2092
2093 static int
2094 filt_hammer2read(struct knote *kn, long hint)
2095 {
2096         struct vnode *vp = (void *)kn->kn_hook;
2097         hammer2_inode_t *ip = VTOI(vp);
2098         off_t off;
2099
2100         if (hint == NOTE_REVOKE) {
2101                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
2102                 return(1);
2103         }
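             /*
              * Report the number of bytes readable between the current
              * file offset and EOF, clamped to INTPTR_MAX.
              */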
2104         off = ip->meta.size - kn->kn_fp->f_offset;
2105         kn->kn_data = (off < INTPTR_MAX) ? off : INTPTR_MAX;
2106         if (kn->kn_sfflags & NOTE_OLDAPI)
2107                 return(1);
2108         return (kn->kn_data != 0);
2109 }
2110
2111
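     /*
      * Writes are always reported as ready; kn_data stays 0 because
      * no output buffer state is tracked for regular files.
      */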
2112 static int
2113 filt_hammer2write(struct knote *kn, long hint)
2114 {
2115         if (hint == NOTE_REVOKE)
2116                 kn->kn_flags |= (EV_EOF | EV_NODATA | EV_ONESHOT);
2117         kn->kn_data = 0;
2118         return (1);
2119 }
2120
2121 static int
2122 filt_hammer2vnode(struct knote *kn, long hint)
2123 {
2124         if (kn->kn_sfflags & hint)
2125                 kn->kn_fflags |= hint;
2126         if (hint == NOTE_REVOKE) {
2127                 kn->kn_flags |= (EV_EOF | EV_NODATA);
2128                 return (1);
2129         }
2130         return (kn->kn_fflags != 0);
2131 }
2132
2133 /*
2134  * SPECFS/FIFO VOPS
2135  */
2136 static
2137 int
2138 hammer2_vop_markatime(struct vop_markatime_args *ap)
2139 {
2140         hammer2_inode_t *ip;
2141         struct vnode *vp;
2142
2143         vp = ap->a_vp;
2144         ip = VTOI(vp);
2145
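             /*
              * hammer2 does not update atime here; just refuse the
              * operation on read-only mounts.
              */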
2146         if (ip->pmp->ronly)
2147                 return(EROFS);
2148         return(0);
2149 }
2150
2151 static
2152 int
2153 hammer2_vop_fifokqfilter(struct vop_kqfilter_args *ap)
2154 {
2155         int error;
2156
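             /*
              * Let the fifo filesystem try to attach the filter first,
              * falling back to hammer2's own kqfilter support if the
              * fifo layer rejects it.
              */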
2157         error = VOCALL(&fifo_vnode_vops, &ap->a_head);
2158         if (error)
2159                 error = hammer2_vop_kqfilter(ap);
2160         return(error);
2161 }
2162
2163 /*
2164  * VOPS vector
2165  */
2166 struct vop_ops hammer2_vnode_vops = {
2167         .vop_default    = vop_defaultop,
2168         .vop_fsync      = hammer2_vop_fsync,
2169         .vop_getpages   = vop_stdgetpages,
2170         .vop_putpages   = vop_stdputpages,
2171         .vop_access     = hammer2_vop_access,
2172         .vop_advlock    = hammer2_vop_advlock,
2173         .vop_close      = hammer2_vop_close,
2174         .vop_nlink      = hammer2_vop_nlink,
2175         .vop_ncreate    = hammer2_vop_ncreate,
2176         .vop_nsymlink   = hammer2_vop_nsymlink,
2177         .vop_nremove    = hammer2_vop_nremove,
2178         .vop_nrmdir     = hammer2_vop_nrmdir,
2179         .vop_nrename    = hammer2_vop_nrename,
2180         .vop_getattr    = hammer2_vop_getattr,
2181         .vop_setattr    = hammer2_vop_setattr,
2182         .vop_readdir    = hammer2_vop_readdir,
2183         .vop_readlink   = hammer2_vop_readlink,
2186         .vop_read       = hammer2_vop_read,
2187         .vop_write      = hammer2_vop_write,
2188         .vop_open       = hammer2_vop_open,
2189         .vop_inactive   = hammer2_vop_inactive,
2190         .vop_reclaim    = hammer2_vop_reclaim,
2191         .vop_nresolve   = hammer2_vop_nresolve,
2192         .vop_nlookupdotdot = hammer2_vop_nlookupdotdot,
2193         .vop_nmkdir     = hammer2_vop_nmkdir,
2194         .vop_nmknod     = hammer2_vop_nmknod,
2195         .vop_ioctl      = hammer2_vop_ioctl,
2196         .vop_mountctl   = hammer2_vop_mountctl,
2197         .vop_bmap       = hammer2_vop_bmap,
2198         .vop_strategy   = hammer2_vop_strategy,
2199         .vop_kqfilter   = hammer2_vop_kqfilter
2200 };
2201
2202 struct vop_ops hammer2_spec_vops = {
2203         .vop_default =          vop_defaultop,
2204         .vop_fsync =            hammer2_vop_fsync,
2205         .vop_read =             vop_stdnoread,
2206         .vop_write =            vop_stdnowrite,
2207         .vop_access =           hammer2_vop_access,
2208         .vop_close =            hammer2_vop_close,
2209         .vop_markatime =        hammer2_vop_markatime,
2210         .vop_getattr =          hammer2_vop_getattr,
2211         .vop_inactive =         hammer2_vop_inactive,
2212         .vop_reclaim =          hammer2_vop_reclaim,
2213         .vop_setattr =          hammer2_vop_setattr
2214 };
2215
2216 struct vop_ops hammer2_fifo_vops = {
2217         .vop_default =          fifo_vnoperate,
2218         .vop_fsync =            hammer2_vop_fsync,
2219 #if 0
2220         .vop_read =             hammer2_vop_fiforead,
2221         .vop_write =            hammer2_vop_fifowrite,
2222 #endif
2223         .vop_access =           hammer2_vop_access,
2224 #if 0
2225         .vop_close =            hammer2_vop_fifoclose,
2226 #endif
2227         .vop_markatime =        hammer2_vop_markatime,
2228         .vop_getattr =          hammer2_vop_getattr,
2229         .vop_inactive =         hammer2_vop_inactive,
2230         .vop_reclaim =          hammer2_vop_reclaim,
2231         .vop_setattr =          hammer2_vop_setattr,
2232         .vop_kqfilter =         hammer2_vop_fifokqfilter
2233 };
2234