/*
 * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@dragonflybsd.org>
 * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/cdefs.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/types.h>

#include "hammer2.h"
RB_GENERATE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
             hammer2_tid_t, meta.inum);
int
hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2)
{
        if (ip1->meta.inum < ip2->meta.inum)
                return (-1);
        if (ip1->meta.inum > ip2->meta.inum)
                return (1);
        return (0);
}
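/*
 * Illustrative sketch (not part of the original file): the comparator and
 * RB_GENERATE2() above provide O(log n) lookups keyed on meta.inum, e.g.:
 *
 *      hammer2_inode_t *ip;
 *
 *      hammer2_spin_ex(&pmp->inum_spin);
 *      ip = RB_LOOKUP(hammer2_inode_tree, &pmp->inum_tree, inum);
 *      hammer2_spin_unex(&pmp->inum_spin);
 *
 * hammer2_inode_lookup() below wraps exactly this pattern and adds a ref.
 */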
static void
hammer2_knote(struct vnode *vp, int flags)
{
        KNOTE(&vp->v_pollinfo.vpi_kqinfo.ki_note, flags);
}
/*
 * Queue an inode to the pmp's SIDEQ if it is not already on the SIDEQ or
 * SYNCQ, so a dirty inode without a vnode is still picked up by the next
 * filesystem sync.  The flags are re-checked under pmp->list_spin to
 * interlock against concurrent queueing.
 */
void
hammer2_inode_delayed_sideq(hammer2_inode_t *ip)
{
        hammer2_pfs_t *pmp = ip->pmp;

        if ((ip->flags & (HAMMER2_INODE_SYNCQ | HAMMER2_INODE_SIDEQ)) == 0) {
                hammer2_spin_ex(&pmp->list_spin);
                if ((ip->flags & (HAMMER2_INODE_SYNCQ |
                                  HAMMER2_INODE_SIDEQ)) == 0) {
                        hammer2_inode_ref(ip);
                        atomic_set_int(&ip->flags, HAMMER2_INODE_SIDEQ);
                        TAILQ_INSERT_TAIL(&pmp->sideq, ip, entry);
                        hammer2_spin_unex(&pmp->list_spin);
                } else {
                        hammer2_spin_unex(&pmp->list_spin);
                }
        }
}
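/*
 * Illustrative sketch (not in the original file): callers typically reach
 * this indirectly via hammer2_inode_modify() when an inode has no vnode to
 * carry the dirty state:
 *
 *      hammer2_inode_modify(ip);       (queues to SIDEQ if appropriate)
 *      ip->meta.mtime = some_time;     ("some_time" is hypothetical)
 *
 * A later hammer2_inode_run_sideq() pass then flushes the inode.
 */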
/*
 * HAMMER2 offers shared and exclusive locks on inodes.  Pass a mask of
 * flags for options:
 *
 *      - pass HAMMER2_RESOLVE_SHARED if a shared lock is desired.  The
 *        inode locking function will automatically set the RDONLY flag.
 *
 *      - pass HAMMER2_RESOLVE_ALWAYS if you need the inode's meta-data.
 *        Most front-end inode locks do.
 *
 *      - pass HAMMER2_RESOLVE_NEVER if you do not want to require that
 *        the inode data be resolved.  This is used by the syncthr because
 *        it can run on an unresolved/out-of-sync cluster, and also by the
 *        vnode reclamation code to avoid unnecessary I/O (particularly when
 *        disposing of hundreds of thousands of cached vnodes).
 *
 * When an exclusive lock is obtained on an inode that is on the SYNCQ,
 * HAMMER2 will automatically move the inode to the front of the queue before
 * blocking to avoid long stalls against filesystem sync operations.
 *
 * The inode locking function locks the inode itself, resolves any stale
 * chains in the inode's cluster, and allocates a fresh copy of the
 * cluster with 1 ref and all the underlying chains locked.
 *
 * ip->cluster will be stable while the inode is locked.
 *
 * NOTE: We don't combine the inode/chain lock because putting away an
 *       inode would otherwise confuse multiple lock holders of the inode.
 *
 * NOTE: In-memory inodes always point to hardlink targets (the actual file),
 *       and never point to a hardlink pointer.
 *
 * NOTE: If caller passes HAMMER2_RESOLVE_RDONLY the exclusive locking code
 *       will feel free to reduce the chain set in the cluster as an
 *       optimization.  It will still be validated against the quorum if
 *       appropriate, but the optimization might be able to reduce data
 *       accesses to one node.  This flag is automatically set if the inode
 *       is locked with HAMMER2_RESOLVE_SHARED.
 */
void
hammer2_inode_lock(hammer2_inode_t *ip, int how)
{
        hammer2_pfs_t *pmp;

        hammer2_inode_ref(ip);
        pmp = ip->pmp;

        /*
         * Inode structure mutex - Shared lock
         */
        if (how & HAMMER2_RESOLVE_SHARED) {
                /*how |= HAMMER2_RESOLVE_RDONLY; not used */
                hammer2_mtx_sh(&ip->lock);
                return;
        }

        /*
         * Inode structure mutex - Exclusive lock
         *
         * The exclusive lock must wait for inodes on SYNCQ to flush
         * first, to ensure that meta-data dependencies such as the
         * nlink count and related directory entries are not split
         * across flushes.
         */
        hammer2_mtx_ex(&ip->lock);
        while ((ip->flags & HAMMER2_INODE_SYNCQ) && pmp) {
                hammer2_spin_ex(&pmp->list_spin);
                if (ip->flags & HAMMER2_INODE_SYNCQ) {
                        atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ_WAKEUP);
                        TAILQ_REMOVE(&pmp->syncq, ip, entry);
                        TAILQ_INSERT_HEAD(&pmp->syncq, ip, entry);
                        hammer2_spin_unex(&pmp->list_spin);
                        tsleep_interlock(&ip->flags, 0);
                        hammer2_mtx_unlock(&ip->lock);
                        tsleep(&ip->flags, PINTERLOCKED, "h2sync", 0);
                        hammer2_mtx_ex(&ip->lock);
                        continue;
                }
                hammer2_spin_unex(&pmp->list_spin);
                break;
        }
}
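/*
 * Illustrative usage sketch (not part of the original file): front-end
 * paths that modify ip->meta take the lock exclusively, while read-only
 * paths may take it shared:
 *
 *      hammer2_inode_lock(ip, 0);                      (exclusive)
 *      ...modify ip->meta...
 *      hammer2_inode_unlock(ip);
 *
 *      hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED); (shared)
 *      ...read ip->meta...
 *      hammer2_inode_unlock(ip);
 */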
/*
 * Release an inode lock.  If another thread is blocked on SYNCQ_WAKEUP
 * we wake it up before dropping our ref on the inode.
 */
void
hammer2_inode_unlock(hammer2_inode_t *ip)
{
        if (ip->flags & HAMMER2_INODE_SYNCQ_WAKEUP) {
                atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ_WAKEUP);
                hammer2_mtx_unlock(&ip->lock);
                wakeup(&ip->flags);
        } else {
                hammer2_mtx_unlock(&ip->lock);
        }
        hammer2_inode_drop(ip);
}
/*
 * Select a chain out of an inode's cluster and lock it.
 *
 * The inode does not have to be locked.
 */
hammer2_chain_t *
hammer2_inode_chain(hammer2_inode_t *ip, int clindex, int how)
{
        hammer2_chain_t *chain;
        hammer2_cluster_t *cluster;

        hammer2_spin_sh(&ip->cluster_spin);
        cluster = &ip->cluster;
        if (clindex >= cluster->nchains)
                chain = NULL;
        else
                chain = cluster->array[clindex].chain;
        if (chain) {
                hammer2_chain_ref(chain);
                hammer2_spin_unsh(&ip->cluster_spin);
                hammer2_chain_lock(chain, how);
        } else {
                hammer2_spin_unsh(&ip->cluster_spin);
        }
        return chain;
}
/*
 * Select a chain out of an inode's cluster and lock both it and its
 * parent, observing the (parent, chain) lock order.  Retries until the
 * chain/parent relationship is stable.
 */
hammer2_chain_t *
hammer2_inode_chain_and_parent(hammer2_inode_t *ip, int clindex,
                               hammer2_chain_t **parentp, int how)
{
        hammer2_chain_t *chain;
        hammer2_chain_t *parent;

        for (;;) {
                hammer2_spin_sh(&ip->cluster_spin);
                if (clindex >= ip->cluster.nchains)
                        chain = NULL;
                else
                        chain = ip->cluster.array[clindex].chain;
                if (chain) {
                        hammer2_chain_ref(chain);
                        hammer2_spin_unsh(&ip->cluster_spin);
                        hammer2_chain_lock(chain, how);
                } else {
                        hammer2_spin_unsh(&ip->cluster_spin);
                }

                /*
                 * Get parent, lock order must be (parent, chain).
                 */
                parent = chain->parent;
                if (parent) {
                        hammer2_chain_ref(parent);
                        hammer2_chain_unlock(chain);
                        hammer2_chain_lock(parent, how);
                        hammer2_chain_lock(chain, how);
                }
                if (ip->cluster.array[clindex].chain == chain &&
                    chain->parent == parent) {
                        break;
                }

                /*
                 * Stale relationship, retry.
                 */
                hammer2_chain_unlock(chain);
                hammer2_chain_drop(chain);
                if (parent) {
                        hammer2_chain_unlock(parent);
                        hammer2_chain_drop(parent);
                }
        }
        *parentp = parent;

        return chain;
}
/*
 * Temporarily release a lock held shared or exclusive.  Caller must
 * hold the lock shared or exclusive on call and lock will be released
 * on return.
 *
 * Restore a lock that was temporarily released.
 */
hammer2_mtx_state_t
hammer2_inode_lock_temp_release(hammer2_inode_t *ip)
{
        return hammer2_mtx_temp_release(&ip->lock);
}

void
hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, hammer2_mtx_state_t ostate)
{
        hammer2_mtx_temp_restore(&ip->lock, ostate);
}
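/*
 * Illustrative sketch (not in the original file): the temp-release pair is
 * used when a blocking operation must not hold the inode lock, as
 * hammer2_igetv() does around vget():
 *
 *      hammer2_mtx_state_t ostate;
 *
 *      ostate = hammer2_inode_lock_temp_release(ip);
 *      ...blocking operation such as vget()...
 *      hammer2_inode_lock_temp_restore(ip, ostate);
 */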
/*
 * Upgrade a shared inode lock to exclusive and return.  If the inode lock
 * is already held exclusively this is a NOP.
 *
 * The caller MUST hold the inode lock either shared or exclusive on call
 * and will own the lock exclusively on return.
 *
 * Returns non-zero if the lock was already exclusive prior to the upgrade.
 */
int
hammer2_inode_lock_upgrade(hammer2_inode_t *ip)
{
        int wasexclusive;

        if (mtx_islocked_ex(&ip->lock)) {
                wasexclusive = 1;
        } else {
                hammer2_mtx_unlock(&ip->lock);
                hammer2_mtx_ex(&ip->lock);
                wasexclusive = 0;
        }
        return wasexclusive;
}
/*
 * Downgrade an inode lock from exclusive to shared only if the inode
 * lock was previously shared.  If the inode lock was previously exclusive,
 * this is a NOP.
 */
void
hammer2_inode_lock_downgrade(hammer2_inode_t *ip, int wasexclusive)
{
        if (wasexclusive == 0)
                mtx_downgrade(&ip->lock);
}
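/*
 * Illustrative sketch (not in the original file): upgrade/downgrade are
 * paired so a caller holding the lock in an unknown state can transiently
 * guarantee exclusivity and then restore the original state:
 *
 *      int wasexclusive;
 *
 *      wasexclusive = hammer2_inode_lock_upgrade(ip);
 *      ...code requiring the exclusive lock...
 *      hammer2_inode_lock_downgrade(ip, wasexclusive);
 */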
/*
 * Lookup an inode by inode number.  Returns the inode with a ref applied,
 * or NULL.
 */
hammer2_inode_t *
hammer2_inode_lookup(hammer2_pfs_t *pmp, hammer2_tid_t inum)
{
        hammer2_inode_t *ip;

        hammer2_spin_ex(&pmp->inum_spin);
        ip = RB_LOOKUP(hammer2_inode_tree, &pmp->inum_tree, inum);
        if (ip)
                hammer2_inode_ref(ip);
        hammer2_spin_unex(&pmp->inum_spin);
        return (ip);
}
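/*
 * Illustrative sketch (not in the original file): the returned inode is
 * referenced but not locked, so a typical consumer does:
 *
 *      ip = hammer2_inode_lookup(pmp, inum);
 *      if (ip) {
 *              hammer2_inode_lock(ip, 0);
 *              ...
 *              hammer2_inode_unlock(ip);
 *              hammer2_inode_drop(ip);         (drop the lookup ref)
 *      }
 */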
/*
 * Adding a ref to an inode is only legal if the inode already has at least
 * one ref.
 *
 * (can be called with spinlock held)
 */
void
hammer2_inode_ref(hammer2_inode_t *ip)
{
        atomic_add_int(&ip->refs, 1);
        if (hammer2_debug & 0x80000) {
                kprintf("INODE+1 %p (%d->%d)\n", ip, ip->refs - 1, ip->refs);
        }
}
/*
 * Drop an inode reference, freeing the inode when the last reference goes
 * away.
 */
void
hammer2_inode_drop(hammer2_inode_t *ip)
{
        hammer2_pfs_t *pmp;
        u_int refs;

        while (ip) {
                if (hammer2_debug & 0x80000) {
                        kprintf("INODE-1 %p (%d->%d)\n",
                                ip, ip->refs, ip->refs - 1);
                }
                refs = ip->refs;
                cpu_ccfence();
                if (refs == 1) {
                        /*
                         * Transition to zero, must interlock with
                         * the inode inumber lookup tree (if applicable).
                         * It should not be possible for anyone to race
                         * the transition to 0.
                         */
                        pmp = ip->pmp;
                        KKASSERT(pmp);
                        hammer2_spin_ex(&pmp->inum_spin);

                        if (atomic_cmpset_int(&ip->refs, 1, 0)) {
                                KKASSERT(hammer2_mtx_refs(&ip->lock) == 0);
                                if (ip->flags & HAMMER2_INODE_ONRBTREE) {
                                        atomic_clear_int(&ip->flags,
                                                HAMMER2_INODE_ONRBTREE);
                                        RB_REMOVE(hammer2_inode_tree,
                                                &pmp->inum_tree, ip);
                                }
                                hammer2_spin_unex(&pmp->inum_spin);

                                ip->pmp = NULL;

                                /*
                                 * Cleaning out ip->cluster isn't entirely
                                 * trivial.
                                 */
                                hammer2_inode_repoint(ip, NULL, NULL);

                                kfree(ip, pmp->minode);
                                atomic_add_long(&pmp->inmem_inodes, -1);
                                ip = NULL;      /* will terminate loop */
                        } else {
                                hammer2_spin_unex(&ip->pmp->inum_spin);
                        }
                } else {
                        /*
                         * Non zero transition
                         */
                        if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
                                break;
                }
        }
}
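/*
 * Illustrative sketch (not in the original file): ref/drop always pair, and
 * the final drop both removes the inode from the inum tree and frees it:
 *
 *      hammer2_inode_ref(ip);          (ip->refs must already be >= 1)
 *      ...use ip without holding its lock...
 *      hammer2_inode_drop(ip);         (frees ip on the 1->0 transition)
 */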
/*
 * Get the vnode associated with the given inode, allocating the vnode if
 * necessary.  The vnode will be returned exclusively locked.
 *
 * *errorp is set to a UNIX error, not a HAMMER2 error.
 *
 * The caller must lock the inode (shared or exclusive).
 *
 * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
 * races.
 */
struct vnode *
hammer2_igetv(hammer2_inode_t *ip, int *errorp)
{
        hammer2_pfs_t *pmp;
        struct vnode *vp;

        pmp = ip->pmp;
        KKASSERT(pmp != NULL);
        *errorp = 0;

        for (;;) {
                /*
                 * Attempt to reuse an existing vnode assignment.  It is
                 * possible to race a reclaim so the vget() may fail.  The
                 * inode must be unlocked during the vget() to avoid a
                 * deadlock against a reclaim.
                 */
                int wasexclusive;

                vp = ip->vp;
                if (vp) {
                        /*
                         * Inode must be unlocked during the vget() to avoid
                         * possible deadlocks, but leave the ip ref intact.
                         *
                         * vnode is held to prevent destruction during the
                         * vget().  The vget() can still fail if we lost
                         * a reclaim race on the vnode.
                         */
                        hammer2_mtx_state_t ostate;

                        vhold(vp);
                        ostate = hammer2_inode_lock_temp_release(ip);
                        if (vget(vp, LK_EXCLUSIVE)) {
                                vdrop(vp);
                                hammer2_inode_lock_temp_restore(ip, ostate);
                                continue;
                        }
                        hammer2_inode_lock_temp_restore(ip, ostate);
                        vdrop(vp);
                        /* vp still locked and ref from vget */
                        if (ip->vp != vp) {
                                kprintf("hammer2: igetv race %p/%p\n",
                                        ip->vp, vp);
                                vput(vp);
                                continue;
                        }
                        *errorp = 0;
                        break;
                }

                /*
                 * No vnode exists, allocate a new vnode.  Beware of
                 * allocation races.  This function will return an
                 * exclusively locked and referenced vnode.
                 */
                *errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
                if (*errorp) {
                        kprintf("hammer2: igetv getnewvnode failed %d\n",
                                *errorp);
                        vp = NULL;
                        break;
                }

                /*
                 * Lock the inode and check for an allocation race.
                 */
                wasexclusive = hammer2_inode_lock_upgrade(ip);
                if (ip->vp != NULL) {
                        vp->v_type = VBAD;
                        vx_put(vp);
                        hammer2_inode_lock_downgrade(ip, wasexclusive);
                        continue;
                }

                switch (ip->meta.type) {
                case HAMMER2_OBJTYPE_DIRECTORY:
                        vp->v_type = VDIR;
                        break;
                case HAMMER2_OBJTYPE_REGFILE:
                        /*
                         * Regular file must use buffer cache I/O
                         * (VKVABIO cpu sync semantics supported)
                         */
                        vp->v_type = VREG;
                        vsetflags(vp, VKVABIO);
                        vinitvmio(vp, ip->meta.size,
                                  HAMMER2_LBUFSIZE,
                                  (int)ip->meta.size & HAMMER2_LBUFMASK);
                        break;
                case HAMMER2_OBJTYPE_SOFTLINK:
                        /*
                         * XXX for now we are using the generic file_read
                         * and file_write code so we need a buffer cache
                         * association.
                         *
                         * (VKVABIO cpu sync semantics supported)
                         */
                        vp->v_type = VLNK;
                        vsetflags(vp, VKVABIO);
                        vinitvmio(vp, ip->meta.size,
                                  HAMMER2_LBUFSIZE,
                                  (int)ip->meta.size & HAMMER2_LBUFMASK);
                        break;
                case HAMMER2_OBJTYPE_CDEV:
                        vp->v_type = VCHR;
                        /* fall through */
                case HAMMER2_OBJTYPE_BDEV:
                        vp->v_ops = &pmp->mp->mnt_vn_spec_ops;
                        if (ip->meta.type != HAMMER2_OBJTYPE_CDEV)
                                vp->v_type = VBLK;
                        addaliasu(vp, ip->meta.rmajor, ip->meta.rminor);
                        break;
                case HAMMER2_OBJTYPE_FIFO:
                        vp->v_type = VFIFO;
                        vp->v_ops = &pmp->mp->mnt_vn_fifo_ops;
                        break;
                case HAMMER2_OBJTYPE_SOCKET:
                        vp->v_type = VSOCK;
                        break;
                default:
                        panic("hammer2: unhandled objtype %d",
                              ip->meta.type);
                        break;
                }

                if (ip == pmp->iroot)
                        vsetflags(vp, VROOT);

                vp->v_data = ip;
                ip->vp = vp;
                hammer2_inode_ref(ip);          /* vp association */
                hammer2_inode_lock_downgrade(ip, wasexclusive);
                break;
        }

        /*
         * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
         */
        if (hammer2_debug & 0x0002) {
                kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n",
                        vp, vp->v_refcnt, vp->v_auxrefs);
        }
        return (vp);
}
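/*
 * Illustrative sketch (not in the original file): a VOP-level caller holds
 * the inode lock across igetv() and checks *errorp:
 *
 *      struct vnode *vp;
 *      int error;
 *
 *      hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
 *      vp = hammer2_igetv(ip, &error);
 *      hammer2_inode_unlock(ip);
 *      if (vp == NULL)
 *              return (error);         (UNIX errno, not HAMMER2_ERROR_*)
 */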
/*
 * Returns the inode associated with the passed-in cluster, allocating a new
 * hammer2_inode structure if necessary, then synchronizing it to the passed
 * xop cluster.  When synchronizing, if idx >= 0, only cluster index (idx)
 * is synchronized.  Otherwise the whole cluster is synchronized.  inum will
 * be extracted from the passed-in xop and the inum argument will be ignored.
 *
 * If xop is passed as NULL then a new hammer2_inode is allocated with the
 * specified inum, and returned.   For normal inodes, the inode will be
 * indexed in memory and if it already exists the existing ip will be
 * returned instead of allocating a new one.  The superroot and PFS inodes
 * are not indexed in memory.
 *
 * The passed-in cluster must be locked and will remain locked on return.
 * The returned inode will be locked and the caller may dispose of both
 * via hammer2_inode_unlock() + hammer2_inode_drop().  However, if the caller
 * needs to resolve a hardlink it must ref/unlock/relock/drop the inode.
 *
 * The hammer2_inode structure regulates the interface between the high level
 * kernel VNOPS API and the filesystem backend (the chains).
 *
 * On return the inode is locked with the supplied cluster.
 */
hammer2_inode_t *
hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_xop_head_t *xop,
                  hammer2_tid_t inum, int idx)
{
        hammer2_inode_t *nip;
        const hammer2_inode_data_t *iptmp;
        const hammer2_inode_data_t *nipdata;

        KKASSERT(xop == NULL ||
                 hammer2_cluster_type(&xop->cluster) ==
                 HAMMER2_BREF_TYPE_INODE);

again:
        /*
         * Interlocked lookup/ref of the inode.  This code is only needed
         * when looking up inodes with nlinks != 0 (TODO: optimize out
         * otherwise and test for duplicates).
         *
         * Cluster can be NULL during the initial pfs allocation.
         */
        if (xop) {
                iptmp = &hammer2_xop_gdata(xop)->ipdata;
                inum = iptmp->meta.inum;
                hammer2_xop_pdata(xop);
        }
        nip = hammer2_inode_lookup(pmp, inum);
        if (nip) {
                /*
                 * Handle SMP race (not applicable to the super-root spmp
                 * which can't index inodes due to duplicative inode numbers).
                 */
                hammer2_mtx_ex(&nip->lock);
                if (pmp->spmp_hmp == NULL &&
                    (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) {
                        hammer2_mtx_unlock(&nip->lock);
                        hammer2_inode_drop(nip);
                        goto again;
                }
                if (xop) {
                        if (idx >= 0)
                                hammer2_inode_repoint_one(nip, &xop->cluster,
                                                          idx);
                        else
                                hammer2_inode_repoint(nip, NULL, &xop->cluster);
                }
                return nip;
        }

        /*
         * We couldn't find the inode number, create a new inode and try to
         * insert it, handle insertion races.
         */
        nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
        spin_init(&nip->cluster_spin, "h2clspin");
        atomic_add_long(&pmp->inmem_inodes, 1);
        hammer2_pfs_memory_inc(pmp);
        hammer2_pfs_memory_wakeup(pmp);
        if (pmp->spmp_hmp)
                nip->flags = HAMMER2_INODE_SROOT;

        /*
         * Initialize nip's cluster.  A cluster is provided for normal
         * inodes but typically not for the super-root or PFS inodes.
         */
        nip->cluster.refs = 1;
        nip->cluster.pmp = pmp;
        nip->cluster.flags |= HAMMER2_CLUSTER_INODE;
        if (xop) {
                nipdata = &hammer2_xop_gdata(xop)->ipdata;
                nip->meta = nipdata->meta;
                hammer2_xop_pdata(xop);
                atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);
                hammer2_inode_repoint(nip, NULL, &xop->cluster);
        } else {
                nip->meta.inum = inum;          /* PFS inum is always 1 XXX */
                /* mtime will be updated when a cluster is available */
                atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD); /*XXX*/
        }

        nip->pmp = pmp;

        /*
         * ref and lock on nip gives it state compatible to after a
         * hammer2_inode_lock() call.
         */
        nip->refs = 1;
        hammer2_mtx_init(&nip->lock, "h2inode");
        hammer2_mtx_ex(&nip->lock);
        /* combination of thread lock and chain lock == inode lock */

        /*
         * Attempt to add the inode.  If it fails we raced another inode
         * get.  Undo all the work and try again.
         */
        if (pmp->spmp_hmp == NULL) {
                hammer2_spin_ex(&pmp->inum_spin);
                if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
                        hammer2_spin_unex(&pmp->inum_spin);
                        hammer2_mtx_unlock(&nip->lock);
                        hammer2_inode_drop(nip);
                        goto again;
                }
                atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE);
                hammer2_spin_unex(&pmp->inum_spin);
        }
        return (nip);
}
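/*
 * Illustrative sketch (not in the original file): backend lookups typically
 * run an xop, hand its cluster to hammer2_inode_get(), and dispose of the
 * inode per the comment above:
 *
 *      ip = hammer2_inode_get(pmp, &xop->head, -1, -1);
 *      ...ip is returned locked, xop cluster remains locked...
 *      hammer2_inode_unlock(ip);
 *      hammer2_inode_drop(ip);
 */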
/*
 * Create a PFS inode under the superroot.  This function will create the
 * inode, its media chains, and also insert it into the media.
 *
 * Caller must be in a flush transaction because we are inserting the inode
 * onto the media.
 */
hammer2_inode_t *
hammer2_inode_create_pfs(hammer2_pfs_t *spmp,
                         const uint8_t *name, size_t name_len,
                         int *errorp)
{
        hammer2_xop_create_t *xop;
        hammer2_inode_t *pip;
        hammer2_inode_t *nip;
        int error;
        hammer2_key_t lhc;
        uuid_t pip_uid;
        uuid_t pip_gid;
        uint32_t pip_mode;
        uint8_t pip_comp_algo;
        uint8_t pip_check_algo;
        hammer2_tid_t pip_inum;

        pip = spmp->iroot;
        nip = NULL;

        lhc = hammer2_dirhash(name, name_len);
        *errorp = 0;

        /*
         * Locate the inode or indirect block to create the new
         * entry in.  At the same time check for key collisions
         * and iterate until we don't get one.
         *
         * Lock the directory exclusively for now to guarantee that
         * we can find an unused lhc for the name.  Due to collisions,
         * two different creates can end up with the same lhc so we
         * cannot depend on the OS to prevent the collision.
         */
        hammer2_inode_lock(pip, 0);

        pip_uid = pip->meta.uid;
        pip_gid = pip->meta.gid;
        pip_mode = pip->meta.mode;
        pip_comp_algo = pip->meta.comp_algo;
        pip_check_algo = pip->meta.check_algo;
        pip_inum = (pip == pip->pmp->iroot) ? 1 : pip->meta.inum;

        /*
         * Locate an unused key in the collision space.
         */
        {
                hammer2_xop_scanlhc_t *sxop;
                hammer2_key_t lhcbase;

                lhcbase = lhc;
                sxop = hammer2_xop_alloc(pip, HAMMER2_XOP_MODIFYING);
                sxop->lhc = lhc;
                hammer2_xop_start(&sxop->head, &hammer2_scanlhc_desc);
                while ((error = hammer2_xop_collect(&sxop->head, 0)) == 0) {
                        if (lhc != sxop->head.cluster.focus->bref.key)
                                break;
                        ++lhc;
                }
                hammer2_xop_retire(&sxop->head, HAMMER2_XOPMASK_VOP);

                if (error) {
                        if (error != HAMMER2_ERROR_ENOENT) {
                                *errorp = hammer2_error_to_errno(error);
                                goto done2;
                        }
                        ++lhc;
                        error = 0;
                }
                if ((lhcbase ^ lhc) & ~HAMMER2_DIRHASH_LOMASK) {
                        error = HAMMER2_ERROR_ENOSPC;
                        *errorp = hammer2_error_to_errno(error);
                        goto done2;
                }
        }

        /*
         * Create the inode with the lhc as the key.
         */
        xop = hammer2_xop_alloc(pip, HAMMER2_XOP_MODIFYING);
        xop->lhc = lhc;
        xop->flags = HAMMER2_INSERT_PFSROOT;
        bzero(&xop->meta, sizeof(xop->meta));

        xop->meta.type = HAMMER2_OBJTYPE_DIRECTORY;
        xop->meta.inum = 1;
        xop->meta.iparent = pip_inum;

        /* Inherit parent's inode compression mode. */
        xop->meta.comp_algo = pip_comp_algo;
        xop->meta.check_algo = pip_check_algo;
        xop->meta.version = HAMMER2_INODE_VERSION_ONE;
        hammer2_update_time(&xop->meta.ctime);
        xop->meta.mtime = xop->meta.ctime;
        xop->meta.mode = 0755;
        xop->meta.nlinks = 1;

        /*
         * Regular files and softlinks allow a small amount of data to be
         * directly embedded in the inode.  This flag will be cleared if
         * the size is extended past the embedded limit.
         */
        if (xop->meta.type == HAMMER2_OBJTYPE_REGFILE ||
            xop->meta.type == HAMMER2_OBJTYPE_SOFTLINK) {
                xop->meta.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
        }
        hammer2_xop_setname(&xop->head, name, name_len);
        xop->meta.name_len = name_len;
        xop->meta.name_key = lhc;
        KKASSERT(name_len < HAMMER2_INODE_MAXNAME);

        hammer2_xop_start(&xop->head, &hammer2_inode_create_desc);

        error = hammer2_xop_collect(&xop->head, 0);
#if INODE_DEBUG
        kprintf("CREATE INODE %*.*s\n",
                (int)name_len, (int)name_len, name);
#endif
        if (error) {
                *errorp = hammer2_error_to_errno(error);
                goto done;
        }

        /*
         * Set up the new inode if not a hardlink pointer.
         *
         * NOTE: *_get() integrates chain's lock into the inode lock.
         *
         * NOTE: Only one new inode can currently be created per
         *       transaction.  If the need arises we can adjust
         *       hammer2_trans_init() to allow more.
         *
         * NOTE: nipdata will have chain's blockset data.
         */
        nip = hammer2_inode_get(pip->pmp, &xop->head, -1, -1);
        nip->comp_heuristic = 0;
done:
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
done2:
        hammer2_inode_unlock(pip);

        return (nip);
}
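/*
 * Illustrative sketch (not in the original file): per the comment above,
 * the caller brackets this in a flush transaction; exact transaction calls
 * and flags are elided here:
 *
 *      hammer2_trans_init(...);        (flush transaction)
 *      nip = hammer2_inode_create_pfs(spmp, name, name_len, &error);
 *      if (nip)
 *              hammer2_inode_unlock(nip);
 *      hammer2_trans_done(...);
 */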
/*
 * Create a new, normal inode using (inum) as the key.  The media chains
 * are created by the backend xop; the caller must be in a transaction.
 */
hammer2_inode_t *
hammer2_inode_create_normal(hammer2_inode_t *pip,
                            struct vattr *vap, struct ucred *cred,
                            hammer2_key_t inum, int *errorp)
{
        hammer2_xop_create_t *xop;
        hammer2_inode_t *dip;
        hammer2_inode_t *nip;
        int error;
        uid_t xuid;
        uuid_t pip_uid;
        uuid_t pip_gid;
        uint32_t pip_mode;
        uint8_t pip_comp_algo;
        uint8_t pip_check_algo;
        hammer2_tid_t pip_inum;
        uint8_t type;

        dip = pip->pmp->iroot;
        KKASSERT(dip != NULL);
        nip = NULL;

        *errorp = 0;

        hammer2_inode_lock(dip, 0);

        pip_uid = pip->meta.uid;
        pip_gid = pip->meta.gid;
        pip_mode = pip->meta.mode;
        pip_comp_algo = pip->meta.comp_algo;
        pip_check_algo = pip->meta.check_algo;
        pip_inum = (pip == pip->pmp->iroot) ? 1 : pip->meta.inum;

        /*
         * Create the inode using (inum) as the key.
         */
        xop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
        xop->lhc = inum;
        bzero(&xop->meta, sizeof(xop->meta));

        /*
         * Setup the inode meta-data
         */
        xop->meta.type = hammer2_get_obj_type(vap->va_type);

        switch (xop->meta.type) {
        case HAMMER2_OBJTYPE_CDEV:
        case HAMMER2_OBJTYPE_BDEV:
                xop->meta.rmajor = vap->va_rmajor;
                xop->meta.rminor = vap->va_rminor;
                break;
        default:
                break;
        }
        type = xop->meta.type;

        xop->meta.inum = inum;
        xop->meta.iparent = pip_inum;

        /* Inherit parent's inode compression mode. */
        xop->meta.comp_algo = pip_comp_algo;
        xop->meta.check_algo = pip_check_algo;
        xop->meta.version = HAMMER2_INODE_VERSION_ONE;
        hammer2_update_time(&xop->meta.ctime);
        xop->meta.mtime = xop->meta.ctime;
        xop->meta.mode = vap->va_mode;
        xop->meta.nlinks = 1;

        xuid = hammer2_to_unix_xid(&pip_uid);
        xuid = vop_helper_create_uid(dip->pmp->mp, pip_mode,
                                     xuid, cred,
                                     &vap->va_mode);
        if (vap->va_vaflags & VA_UID_UUID_VALID)
                xop->meta.uid = vap->va_uid_uuid;
        else if (vap->va_uid != (uid_t)VNOVAL)
                hammer2_guid_to_uuid(&xop->meta.uid, vap->va_uid);
        else
                hammer2_guid_to_uuid(&xop->meta.uid, xuid);

        if (vap->va_vaflags & VA_GID_UUID_VALID)
                xop->meta.gid = vap->va_gid_uuid;
        else if (vap->va_gid != (gid_t)VNOVAL)
                hammer2_guid_to_uuid(&xop->meta.gid, vap->va_gid);
        else
                xop->meta.gid = pip_gid;

        /*
         * Regular files and softlinks allow a small amount of data to be
         * directly embedded in the inode.  This flag will be cleared if
         * the size is extended past the embedded limit.
         */
        if (xop->meta.type == HAMMER2_OBJTYPE_REGFILE ||
            xop->meta.type == HAMMER2_OBJTYPE_SOFTLINK) {
                xop->meta.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
        }

        xop->meta.name_len = hammer2_xop_setname_inum(&xop->head, inum);
        xop->meta.name_key = inum;

        /*
         * Create the inode media chains
         */
        hammer2_xop_start(&xop->head, &hammer2_inode_create_desc);

        error = hammer2_xop_collect(&xop->head, 0);
#if INODE_DEBUG
        kprintf("CREATE INODE %*.*s\n",
                (int)name_len, (int)name_len, name);
#endif
        if (error) {
                *errorp = hammer2_error_to_errno(error);
                goto done;
        }

        /*
         * Set up the new inode if not a hardlink pointer.
         *
         * NOTE: *_get() integrates chain's lock into the inode lock.
         *
         * NOTE: Only one new inode can currently be created per
         *       transaction.  If the need arises we can adjust
         *       hammer2_trans_init() to allow more.
         *
         * NOTE: nipdata will have chain's blockset data.
         */
        nip = hammer2_inode_get(dip->pmp, &xop->head, -1, -1);
        nip->comp_heuristic = 0;
done:
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        hammer2_inode_unlock(dip);

        return (nip);
}
/*
 * Create a directory entry under dip with the specified name, inode number,
 * and OBJTYPE (type).
 *
 * This returns a UNIX errno code, not a HAMMER2_ERROR_* code.
 */
int
hammer2_dirent_create(hammer2_inode_t *dip, const char *name, size_t name_len,
                      hammer2_key_t inum, uint8_t type)
{
        hammer2_xop_mkdirent_t *xop;
        hammer2_key_t lhc;
        int error;

        lhc = 0;
        error = 0;

        KKASSERT(name != NULL);
        lhc = hammer2_dirhash(name, name_len);

        /*
         * Locate the inode or indirect block to create the new
         * entry in.  At the same time check for key collisions
         * and iterate until we don't get one.
         *
         * Lock the directory exclusively for now to guarantee that
         * we can find an unused lhc for the name.  Due to collisions,
         * two different creates can end up with the same lhc so we
         * cannot depend on the OS to prevent the collision.
         */
        hammer2_inode_lock(dip, 0);

        /*
         * If name specified, locate an unused key in the collision space.
         * Otherwise use the passed-in lhc directly.
         */
        {
                hammer2_xop_scanlhc_t *sxop;
                hammer2_key_t lhcbase;

                lhcbase = lhc;
                sxop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
                sxop->lhc = lhc;
                hammer2_xop_start(&sxop->head, &hammer2_scanlhc_desc);
                while ((error = hammer2_xop_collect(&sxop->head, 0)) == 0) {
                        if (lhc != sxop->head.cluster.focus->bref.key)
                                break;
                        ++lhc;
                }
                hammer2_xop_retire(&sxop->head, HAMMER2_XOPMASK_VOP);

                if (error) {
                        if (error != HAMMER2_ERROR_ENOENT)
                                goto done2;
                        ++lhc;
                        error = 0;
                }
                if ((lhcbase ^ lhc) & ~HAMMER2_DIRHASH_LOMASK) {
                        error = HAMMER2_ERROR_ENOSPC;
                        goto done2;
                }
        }

        /*
         * Create the directory entry with the lhc as the key.
         */
        xop = hammer2_xop_alloc(dip, HAMMER2_XOP_MODIFYING);
        xop->lhc = lhc;
        bzero(&xop->dirent, sizeof(xop->dirent));
        xop->dirent.inum = inum;
        xop->dirent.type = type;
        xop->dirent.namlen = name_len;

        KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
        hammer2_xop_setname(&xop->head, name, name_len);

        hammer2_xop_start(&xop->head, &hammer2_inode_mkdirent_desc);

        error = hammer2_xop_collect(&xop->head, 0);

        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
done2:
        error = hammer2_error_to_errno(error);
        hammer2_inode_unlock(dip);

        return error;
}
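/*
 * Illustrative sketch (not in the original file): file creation normally
 * pairs a new inode with a directory entry pointing at it; "dvp_ip", "vap"
 * and "cred" stand in for the caller's locals:
 *
 *      nip = hammer2_inode_create_normal(dvp_ip, vap, cred, inum, &error);
 *      if (error == 0)
 *              error = hammer2_dirent_create(dvp_ip, name, name_len,
 *                                            nip->meta.inum,
 *                                            nip->meta.type);
 */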
/*
 * Repoint ip->cluster's chains to cluster's chains and fixup the default
 * focus.  All items, valid or invalid, are repointed.  hammer2_xop_start()
 * filters out invalid or non-matching elements.
 *
 * Caller must hold the inode and cluster exclusive locked, if not NULL,
 * must also be locked.
 *
 * Cluster may be NULL to clean out any chains in ip->cluster.
 */
void
hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
                      hammer2_cluster_t *cluster)
{
        hammer2_chain_t *dropch[HAMMER2_MAXCLUSTER];
        hammer2_chain_t *ochain;
        hammer2_chain_t *nchain;
        int i;

        bzero(dropch, sizeof(dropch));

        /*
         * Replace chains in ip->cluster with chains from cluster and
         * adjust the focus if necessary.
         *
         * NOTE: nchain and/or ochain can be NULL due to gaps
         *       in the cluster arrays.
         */
        hammer2_spin_ex(&ip->cluster_spin);
        for (i = 0; cluster && i < cluster->nchains; ++i) {
                /*
                 * Do not replace elements which are the same.  Also handle
                 * element count discrepancies.
                 */
                nchain = cluster->array[i].chain;
                if (i < ip->cluster.nchains) {
                        ochain = ip->cluster.array[i].chain;
                        if (ochain == nchain)
                                continue;
                } else {
                        ochain = NULL;
                }

                /*
                 * Make adjustments
                 */
                ip->cluster.array[i].chain = nchain;
                ip->cluster.array[i].flags &= ~HAMMER2_CITEM_INVALID;
                ip->cluster.array[i].flags |= cluster->array[i].flags &
                                              HAMMER2_CITEM_INVALID;
                if (nchain)
                        hammer2_chain_ref(nchain);
                dropch[i] = ochain;
        }

        /*
         * Release any left-over chains in ip->cluster.
         */
        while (i < ip->cluster.nchains) {
                nchain = ip->cluster.array[i].chain;
                if (nchain) {
                        ip->cluster.array[i].chain = NULL;
                        ip->cluster.array[i].flags |= HAMMER2_CITEM_INVALID;
                }
                dropch[i] = nchain;
                ++i;
        }

        /*
         * Fixup fields.  Note that the inode-embedded cluster is never
         * directly locked.
         */
        if (cluster) {
                ip->cluster.nchains = cluster->nchains;
                ip->cluster.focus = cluster->focus;
                ip->cluster.flags = cluster->flags & ~HAMMER2_CLUSTER_LOCKED;
        } else {
                ip->cluster.nchains = 0;
                ip->cluster.focus = NULL;
                ip->cluster.flags &= ~HAMMER2_CLUSTER_ZFLAGS;
        }

        hammer2_spin_unex(&ip->cluster_spin);

        /*
         * Cleanup outside of spinlock
         */
        while (--i >= 0) {
                if (dropch[i])
                        hammer2_chain_drop(dropch[i]);
        }
}
/*
 * Repoint a single element from the cluster to the ip.  Used by the
 * synchronization threads to piecemeal update inodes.  Does not change
 * focus and requires inode to be re-locked to clean-up flags (XXX).
 */
void
hammer2_inode_repoint_one(hammer2_inode_t *ip, hammer2_cluster_t *cluster,
                          int idx)
{
        hammer2_chain_t *ochain;
        hammer2_chain_t *nchain;
        int i;

        hammer2_spin_ex(&ip->cluster_spin);
        KKASSERT(idx < cluster->nchains);
        if (idx < ip->cluster.nchains) {
                ochain = ip->cluster.array[idx].chain;
                nchain = cluster->array[idx].chain;
        } else {
                ochain = NULL;
                nchain = cluster->array[idx].chain;
                for (i = ip->cluster.nchains; i <= idx; ++i) {
                        bzero(&ip->cluster.array[i],
                              sizeof(ip->cluster.array[i]));
                        ip->cluster.array[i].flags |= HAMMER2_CITEM_INVALID;
                }
                ip->cluster.nchains = idx + 1;
        }
        if (ochain != nchain) {
                /*
                 * Make adjustments.
                 */
                ip->cluster.array[idx].chain = nchain;
                ip->cluster.array[idx].flags &= ~HAMMER2_CITEM_INVALID;
                ip->cluster.array[idx].flags |= cluster->array[idx].flags &
                                                HAMMER2_CITEM_INVALID;
        }
        hammer2_spin_unex(&ip->cluster_spin);
        if (ochain != nchain) {
                if (nchain)
                        hammer2_chain_ref(nchain);
                if (ochain)
                        hammer2_chain_drop(ochain);
        }
}
/*
 * Called with a locked inode to finish unlinking an inode after xop_unlink
 * had been run.  This function is responsible for decrementing nlinks.
 *
 * We don't bother decrementing nlinks if the file is not open and this was
 * the last link.
 *
 * If the inode is a hardlink target its chain has not yet been deleted,
 * otherwise its chain has been deleted.
 *
 * If isopen then any prior deletion was not permanent and the inode is
 * left intact with nlinks == 0.
 */
int
hammer2_inode_unlink_finisher(hammer2_inode_t *ip, int isopen)
{
        int error;

        /*
         * Decrement nlinks.  If this is the last link and the file is
         * not open we can just delete the inode and not bother dropping
         * nlinks to 0 (avoiding unnecessary block updates).
         */
        if (ip->meta.nlinks == 1) {
                atomic_set_int(&ip->flags, HAMMER2_INODE_ISUNLINKED);
                if (isopen == 0)
                        goto killit;
        }

        hammer2_inode_modify(ip);
        --ip->meta.nlinks;
        if ((int64_t)ip->meta.nlinks < 0)
                ip->meta.nlinks = 0;    /* safety */

        /*
         * If nlinks is not zero we are done.  However, this should only be
         * possible with a hardlink target.  If the inode is an embedded
         * hardlink nlinks should have dropped to zero, warn and proceed
         * with the next step.
         */
        if (ip->meta.nlinks) {
                if ((ip->meta.name_key & HAMMER2_DIRHASH_VISIBLE) == 0)
                        return 0;
                kprintf("hammer2_inode_unlink: nlinks was not 0 (%jd)\n",
                        (intmax_t)ip->meta.nlinks);
                return 0;
        }

        if (ip->vp)
                hammer2_knote(ip->vp, NOTE_DELETE);

        /*
         * nlinks is now an implied zero, delete the inode if not open.
         * We avoid unnecessary media updates by not bothering to actually
         * decrement nlinks for the 1->0 transition.
         *
         * Put the inode on the sideq to ensure that any disconnected chains
         * get properly flushed (so they can be freed).
         */
        if (isopen == 0) {
                hammer2_xop_destroy_t *xop;

killit:
                hammer2_inode_delayed_sideq(ip);
                atomic_set_int(&ip->flags, HAMMER2_INODE_ISDELETED);
                xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
                hammer2_xop_start(&xop->head, &hammer2_inode_destroy_desc);
                error = hammer2_xop_collect(&xop->head, 0);
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        }
        error = 0;      /* XXX */

        return error;
}
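/*
 * Illustrative sketch (not in the original file): a VOP remove path runs
 * the unlink xop first and then finishes here, with "isopen" reflecting
 * whether the file is still open:
 *
 *      error = hammer2_inode_unlink_finisher(ip, isopen);
 *
 * With isopen == 0 the last-link case deletes the inode immediately;
 * otherwise the inode is left intact with nlinks == 0 until last close.
 */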
/*
 * Mark an inode as being modified, meaning that the caller will modify
 * ip->meta.
 *
 * If a vnode is present we set the vnode dirty and the nominal filesystem
 * sync will also handle synchronizing the inode meta-data.  If no vnode
 * is present we must ensure that the inode is on pmp->sideq.
 *
 * NOTE: We must always queue the inode to the sideq.  This allows H2 to
 *       shortcut vsyncscan() and flush inodes and their related vnodes
 *       in two stages.  H2 still calls vfsync() for each vnode.
 *
 * NOTE: No mtid (modify_tid) is passed into this routine.  The caller is
 *       only modifying the in-memory inode.  A modify_tid is synchronized
 *       later when the inode gets flushed.
 *
 * NOTE: As an exception to the general rule, the inode MAY be locked
 *       shared for this particular call.
 */
void
hammer2_inode_modify(hammer2_inode_t *ip)
{
        atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        if (ip->vp)
                vsetisdirty(ip->vp);
        if (ip->pmp && (ip->flags & HAMMER2_INODE_NOSIDEQ) == 0)
                hammer2_inode_delayed_sideq(ip);
}
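/*
 * Illustrative sketch (not in the original file): any code that writes
 * ip->meta brackets the modification as follows ("new_mtime" is
 * hypothetical):
 *
 *      hammer2_inode_lock(ip, 0);
 *      hammer2_inode_modify(ip);
 *      ip->meta.mtime = new_mtime;
 *      hammer2_inode_unlock(ip);
 *
 * The MODIFIED flag set here is later consumed by
 * hammer2_inode_chain_sync().
 */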
/*
 * Synchronize the inode's frontend state with the chain state prior
 * to any explicit flush of the inode or any strategy write call.  This
 * does not flush the inode's chain or its sub-topology to media (higher
 * level layers are responsible for doing that).
 *
 * Called with a locked inode inside a normal transaction.
 *
 * inode must be locked.
 */
int
hammer2_inode_chain_sync(hammer2_inode_t *ip)
{
        int error;

        error = 0;
        if (ip->flags & (HAMMER2_INODE_RESIZED | HAMMER2_INODE_MODIFIED)) {
                hammer2_xop_fsync_t *xop;

                xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
                xop->clear_directdata = 0;
                if (ip->flags & HAMMER2_INODE_RESIZED) {
                        if ((ip->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
                            ip->meta.size > HAMMER2_EMBEDDED_BYTES) {
                                ip->meta.op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
                                xop->clear_directdata = 1;
                        }
                        xop->osize = ip->osize;
                } else {
                        xop->osize = ip->meta.size;     /* safety */
                }
                xop->ipflags = ip->flags;
                xop->meta = ip->meta;

                atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED |
                                             HAMMER2_INODE_MODIFIED);
                hammer2_xop_start(&xop->head, &hammer2_inode_chain_sync_desc);
                error = hammer2_xop_collect(&xop->head, 0);
                hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
                if (error == HAMMER2_ERROR_ENOENT)
                        error = 0;
                if (error) {
                        kprintf("hammer2: unable to fsync inode %p\n", ip);
                        /*
                        atomic_set_int(&ip->flags,
                                       xop->ipflags & (HAMMER2_INODE_RESIZED |
                                                       HAMMER2_INODE_MODIFIED));
                        */
                        /* XXX return error somehow? */
                }
        }
        return error;
}
/*
 * Flushes the inode's chain and its sub-topology to media.  Interlocks
 * HAMMER2_INODE_DIRTYDATA by clearing it prior to the flush.  Any strategy
 * function creating or modifying a chain under this inode will re-set the
 * flag.
 *
 * inode must be locked.
 */
int
hammer2_inode_chain_flush(hammer2_inode_t *ip)
{
        hammer2_xop_fsync_t *xop;
        int error;

        atomic_clear_int(&ip->flags, HAMMER2_INODE_DIRTYDATA);
        xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING |
                                    HAMMER2_XOP_INODE_STOP);
        hammer2_xop_start(&xop->head, &hammer2_inode_flush_desc);
        error = hammer2_xop_collect(&xop->head, HAMMER2_XOP_COLLECT_WAITALL);
        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
        if (error == HAMMER2_ERROR_ENOENT)
                error = 0;

        return error;
}
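/*
 * Illustrative sketch (not in the original file): an fsync-style operation
 * first synchronizes ip->meta to the chains and then flushes the chain
 * topology, mirroring the sequence used by hammer2_inode_run_sideq():
 *
 *      hammer2_inode_lock(ip, 0);
 *      hammer2_inode_chain_sync(ip);
 *      hammer2_inode_chain_flush(ip);
 *      hammer2_inode_unlock(ip);
 */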
/*
 * The normal filesystem sync no longer has visibility to an inode structure
 * after its vnode has been reclaimed.  In this situation a dirty inode may
 * require additional processing to synchronize ip->meta to its underlying
 * cluster nodes.
 *
 * In particular, reclaims can occur in almost any state (for example, when
 * doing operations on unrelated vnodes) and flushing the reclaimed inode
 * in the reclaim path itself is a non-starter.
 *
 * Caller must be in a transaction.
 */
void
hammer2_inode_run_sideq(hammer2_pfs_t *pmp, int doall)
{
        hammer2_xop_destroy_t *xop;
        hammer2_inode_sideq_t *ipul;
        hammer2_inode_t *ip;
        int error;

        /*
         * Nothing to do if sideq is empty or (if doall == 0) there just
         * aren't very many sideq entries.
         */
        if (TAILQ_EMPTY(&pmp->sideq))
                return;
        if (doall == 0) {
                if (pmp->sideq_count > (pmp->inum_count >> 3)) {
                        if (hammer2_debug & 0x0001) {
                                kprintf("hammer2: flush sideq %ld/%ld\n",
                                        pmp->sideq_count, pmp->inum_count);
                        }
                }
        }
        if (doall == 0 && pmp->sideq_count <= (pmp->inum_count >> 3))
                return;

        hammer2_spin_ex(&pmp->list_spin);
        while ((ipul = TAILQ_FIRST(&pmp->sideq)) != NULL) {
                TAILQ_REMOVE(&pmp->sideq, ipul, entry);
                --pmp->sideq_count;
                ip = ipul->ip;
                KKASSERT(ip->flags & HAMMER2_INODE_ONSIDEQ);
                atomic_clear_int(&ip->flags, HAMMER2_INODE_ONSIDEQ);
                hammer2_spin_unex(&pmp->list_spin);
                kfree(ipul, pmp->minode);

                hammer2_inode_lock(ip, 0);
                if (ip->flags & HAMMER2_INODE_ISDELETED) {
                        /*
                         * The inode has already been deleted.  This is a
                         * fairly rare circumstance.  For now we don't rock
                         * the boat and synchronize it normally.
                         */
                        hammer2_inode_chain_sync(ip);
                        hammer2_inode_chain_flush(ip);
                } else if (ip->flags & HAMMER2_INODE_ISUNLINKED) {
                        /*
                         * The inode was unlinked while open.  The inode must
                         * be deleted and destroyed.
                         */
                        xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING);
                        hammer2_xop_start(&xop->head,
                                          &hammer2_inode_destroy_desc);
                        error = hammer2_xop_collect(&xop->head, 0);
                        /* XXX error handling */
                        hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
                } else {
                        /*
                         * The inode was dirty as-of the reclaim, requiring
                         * synchronization of ip->meta with its underlying
                         * chains.
                         */
                        hammer2_inode_chain_sync(ip);
                        hammer2_inode_chain_flush(ip);
                }

                hammer2_inode_unlock(ip);
                hammer2_inode_drop(ip);                 /* ipul ref */

                hammer2_spin_ex(&pmp->list_spin);

                /*
                 * If doall is 0 the original sideq_count was greater than
                 * 1/8 the inode count.  Add some hysteresis in the loop,
                 * don't stop flushing until sideq_count drops below 1/16.
                 */
                if (doall == 0 && pmp->sideq_count <= (pmp->inum_count >> 4)) {
                        if (hammer2_debug & 0x0001) {
                                kprintf("hammer2: flush sideq %ld/%ld (end)\n",
                                        pmp->sideq_count, pmp->inum_count);
                        }
                        break;
                }
        }
        hammer2_spin_unex(&pmp->list_spin);
}