hammer2 - Refactor frontend part 11/many
[dragonfly.git] / sys / vfs / hammer2 / hammer2_inode.c
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/cdefs.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/types.h>
39 #include <sys/lock.h>
40 #include <sys/uuid.h>
41
42 #include "hammer2.h"
43
44 #define INODE_DEBUG     0
45
46 static void hammer2_inode_move_to_hidden(hammer2_cluster_t **cparentp,
47                                          hammer2_cluster_t **clusterp,
48                                          hammer2_tid_t inum);
49
50 RB_GENERATE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
51              hammer2_tid_t, meta.inum);
52
53 int
54 hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2)
55 {
56         if (ip1->meta.inum < ip2->meta.inum)
57                 return(-1);
58         if (ip1->meta.inum > ip2->meta.inum)
59                 return(1);
60         return(0);
61 }
62
63 /*
64  * HAMMER2 inode locks
65  *
66  * HAMMER2 offers shared and exclusive locks on inodes.  Pass a mask of
67  * flags for options:
68  *
69  *      - pass HAMMER2_RESOLVE_SHARED if a shared lock is desired.  The
70  *        inode locking function will automatically set the RDONLY flag.
71  *
72  *      - pass HAMMER2_RESOLVE_ALWAYS if you need the inode's meta-data.
73  *        Most front-end inode locks do.
74  *
75  *      - pass HAMMER2_RESOLVE_NEVER if you do not want to require that
76  *        the inode data be resolved.  This is used by the syncthr because
77  *        it can run on an unresolved/out-of-sync cluster, and also by the
78  *        vnode reclamation code to avoid unnecessary I/O (particularly when
79  *        disposing of hundreds of thousands of cached vnodes).
80  *
81  * The inode locking function locks the inode itself, resolves any stale
82  * chains in the inode's cluster, and allocates a fresh copy of the
83  * cluster with 1 ref and all the underlying chains locked.
84  *
85  * ip->cluster will be stable while the inode is locked.
86  *
87  * NOTE: We don't combine the inode/chain lock because putting away an
88  *       inode would otherwise confuse multiple lock holders of the inode.
89  *
90  * NOTE: In-memory inodes always point to hardlink targets (the actual file),
91  *       and never point to a hardlink pointer.
92  *
93  * NOTE: If caller passes HAMMER2_RESOLVE_RDONLY the exclusive locking code
94  *       will feel free to reduce the chain set in the cluster as an
95  *       optimization.  It will still be validated against the quorum if
96  *       appropriate, but the optimization might be able to reduce data
97  *       accesses to one node.  This flag is automatically set if the inode
98  *       is locked with HAMMER2_RESOLVE_SHARED.
99  */
100 void
101 hammer2_inode_lock(hammer2_inode_t *ip, int how)
102 {
103         hammer2_inode_ref(ip);
104
105         /* 
106          * Inode structure mutex
107          */
108         if (how & HAMMER2_RESOLVE_SHARED) {
109                 /*how |= HAMMER2_RESOLVE_RDONLY; not used */
110                 hammer2_mtx_sh(&ip->lock);
111         } else {
112                 hammer2_mtx_ex(&ip->lock);
113         }
114 }
115
116 /*
117  * Create a locked copy of ip->cluster.  Note that the copy will have a
118  * ref on the cluster AND its chains and we don't want a second ref to
119  * either when we lock it.
120  *
121  * Exclusive inode locks set the template focus chain in (ip)
122  * as a hint.  Cluster locks can ALWAYS replace the focus in the
123  * working copy if the hint does not work out, so beware.
124  */
125 hammer2_cluster_t *
126 hammer2_inode_cluster(hammer2_inode_t *ip, int how)
127 {
128         hammer2_cluster_t *cluster;
129
130         cluster = hammer2_cluster_copy(&ip->cluster);
131         hammer2_cluster_lock(cluster, how);
132         hammer2_cluster_resolve(cluster);
133
134         /*
135          * cluster->focus will be set if resolving RESOLVE_ALWAYS, but
136          * only update the cached focus in the inode structure when taking
137          * out an exclusive lock.
138          */
139         if ((how & HAMMER2_RESOLVE_SHARED) == 0)
140                 ip->cluster.focus = cluster->focus;
141
142         return cluster;
143 }
144
145 /*
146  * Select a chain out of an inode's cluster and lock it.
147  */
148 hammer2_chain_t *
149 hammer2_inode_chain(hammer2_inode_t *ip, int clindex, int how)
150 {
151         hammer2_chain_t *chain;
152
153         if (clindex >= ip->cluster.nchains)
154                 chain = NULL;
155         else
156                 chain = ip->cluster.array[clindex].chain;
157         if (chain) {
158                 hammer2_chain_ref(chain);
159                 hammer2_chain_lock(chain, how);
160         }
161         return chain;
162 }
163
164 void
165 hammer2_inode_unlock(hammer2_inode_t *ip, hammer2_cluster_t *cluster)
166 {
167         if (cluster) {
168                 hammer2_cluster_unlock(cluster);
169                 hammer2_cluster_drop(cluster);
170         }
171         hammer2_mtx_unlock(&ip->lock);
172         hammer2_inode_drop(ip);
173 }
174
175 /*
176  * Temporarily release a lock held shared or exclusive.  Caller must
177  * hold the lock shared or exclusive on call and lock will be released
178  * on return.
179  *
180  * Restore a lock that was temporarily released.
181  */
182 hammer2_mtx_state_t
183 hammer2_inode_lock_temp_release(hammer2_inode_t *ip)
184 {
185         return hammer2_mtx_temp_release(&ip->lock);
186 }
187
188 void
189 hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, hammer2_mtx_state_t ostate)
190 {
191         hammer2_mtx_temp_restore(&ip->lock, ostate);
192 }
193
194 /*
195  * Upgrade a shared inode lock to exclusive and return.  If the inode lock
196  * is already held exclusively this is a NOP.
197  *
198  * The caller MUST hold the inode lock either shared or exclusive on call
199  * and will own the lock exclusively on return.
200  *
201  * Returns non-zero if the lock was already exclusive prior to the upgrade.
202  */
203 int
204 hammer2_inode_lock_upgrade(hammer2_inode_t *ip)
205 {
206         int wasexclusive;
207
208         if (mtx_islocked_ex(&ip->lock)) {
209                 wasexclusive = 1;
210         } else {
211                 hammer2_mtx_unlock(&ip->lock);
212                 hammer2_mtx_ex(&ip->lock);
213                 wasexclusive = 0;
214         }
215         return wasexclusive;
216 }
217
218 /*
219  * Downgrade an inode lock from exclusive to shared only if the inode
220  * lock was previously shared.  If the inode lock was previously exclusive,
221  * this is a NOP.
222  */
223 void
224 hammer2_inode_lock_downgrade(hammer2_inode_t *ip, int wasexclusive)
225 {
226         if (wasexclusive == 0)
227                 mtx_downgrade(&ip->lock);
228 }
229
230 /*
231  * Lookup an inode by inode number
232  */
233 hammer2_inode_t *
234 hammer2_inode_lookup(hammer2_pfs_t *pmp, hammer2_tid_t inum)
235 {
236         hammer2_inode_t *ip;
237
238         KKASSERT(pmp);
239         if (pmp->spmp_hmp) {
240                 ip = NULL;
241         } else {
242                 hammer2_spin_ex(&pmp->inum_spin);
243                 ip = RB_LOOKUP(hammer2_inode_tree, &pmp->inum_tree, inum);
244                 if (ip)
245                         hammer2_inode_ref(ip);
246                 hammer2_spin_unex(&pmp->inum_spin);
247         }
248         return(ip);
249 }
250
251 /*
252  * Adding a ref to an inode is only legal if the inode already has at least
253  * one ref.
254  *
255  * (can be called with spinlock held)
256  */
257 void
258 hammer2_inode_ref(hammer2_inode_t *ip)
259 {
260         atomic_add_int(&ip->refs, 1);
261 }
262
263 /*
264  * Drop an inode reference, freeing the inode when the last reference goes
265  * away.
266  */
267 void
268 hammer2_inode_drop(hammer2_inode_t *ip)
269 {
270         hammer2_pfs_t *pmp;
271         hammer2_inode_t *pip;
272         u_int refs;
273
274         while (ip) {
275                 refs = ip->refs;
276                 cpu_ccfence();
277                 if (refs == 1) {
278                         /*
279                          * Transition to zero, must interlock with
280                          * the inode inumber lookup tree (if applicable).
281                          * It should not be possible for anyone to race
282                          * the transition to 0.
283                          */
284                         pmp = ip->pmp;
285                         KKASSERT(pmp);
286                         hammer2_spin_ex(&pmp->inum_spin);
287
288                         if (atomic_cmpset_int(&ip->refs, 1, 0)) {
289                                 KKASSERT(hammer2_mtx_refs(&ip->lock) == 0);
290                                 if (ip->flags & HAMMER2_INODE_ONRBTREE) {
291                                         atomic_clear_int(&ip->flags,
292                                                      HAMMER2_INODE_ONRBTREE);
293                                         RB_REMOVE(hammer2_inode_tree,
294                                                   &pmp->inum_tree, ip);
295                                 }
296                                 hammer2_spin_unex(&pmp->inum_spin);
297
298                                 pip = ip->pip;
299                                 ip->pip = NULL;
300                                 ip->pmp = NULL;
301
302                                 /*
303                                  * Cleaning out ip->cluster isn't entirely
304                                  * trivial.
305                                  */
306                                 hammer2_inode_repoint(ip, NULL, NULL);
307
308                                 /*
309                                  * We have to drop pip (if non-NULL) to
310                                  * dispose of our implied reference from
311                                  * ip->pip.  We can simply loop on it.
312                                  */
313                                 kfree(ip, pmp->minode);
314                                 atomic_add_long(&pmp->inmem_inodes, -1);
315                                 ip = pip;
316                                 /* continue with pip (can be NULL) */
317                         } else {
318                                 hammer2_spin_unex(&ip->pmp->inum_spin);
319                         }
320                 } else {
321                         /*
322                          * Non zero transition
323                          */
324                         if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
325                                 break;
326                 }
327         }
328 }
329
330 /*
331  * Get the vnode associated with the given inode, allocating the vnode if
332  * necessary.  The vnode will be returned exclusively locked.
333  *
334  * The caller must lock the inode (shared or exclusive).
335  *
336  * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
337  * races.
338  */
339 struct vnode *
340 hammer2_igetv(hammer2_inode_t *ip, int *errorp)
341 {
342         hammer2_pfs_t *pmp;
343         struct vnode *vp;
344
345         pmp = ip->pmp;
346         KKASSERT(pmp != NULL);
347         *errorp = 0;
348
349         for (;;) {
350                 /*
351                  * Attempt to reuse an existing vnode assignment.  It is
352                  * possible to race a reclaim so the vget() may fail.  The
353                  * inode must be unlocked during the vget() to avoid a
354                  * deadlock against a reclaim.
355                  */
356                 int wasexclusive;
357
358                 vp = ip->vp;
359                 if (vp) {
360                         /*
361                          * Inode must be unlocked during the vget() to avoid
362                          * possible deadlocks, but leave the ip ref intact.
363                          *
364                          * vnode is held to prevent destruction during the
365                          * vget().  The vget() can still fail if we lost
366                          * a reclaim race on the vnode.
367                          */
368                         hammer2_mtx_state_t ostate;
369
370                         vhold(vp);
371                         ostate = hammer2_inode_lock_temp_release(ip);
372                         if (vget(vp, LK_EXCLUSIVE)) {
373                                 vdrop(vp);
374                                 hammer2_inode_lock_temp_restore(ip, ostate);
375                                 continue;
376                         }
377                         hammer2_inode_lock_temp_restore(ip, ostate);
378                         vdrop(vp);
379                         /* vp still locked and ref from vget */
380                         if (ip->vp != vp) {
381                                 kprintf("hammer2: igetv race %p/%p\n",
382                                         ip->vp, vp);
383                                 vput(vp);
384                                 continue;
385                         }
386                         *errorp = 0;
387                         break;
388                 }
389
390                 /*
391                  * No vnode exists, allocate a new vnode.  Beware of
392                  * allocation races.  This function will return an
393                  * exclusively locked and referenced vnode.
394                  */
395                 *errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
396                 if (*errorp) {
397                         kprintf("hammer2: igetv getnewvnode failed %d\n",
398                                 *errorp);
399                         vp = NULL;
400                         break;
401                 }
402
403                 /*
404                  * Lock the inode and check for an allocation race.
405                  */
406                 wasexclusive = hammer2_inode_lock_upgrade(ip);
407                 if (ip->vp != NULL) {
408                         vp->v_type = VBAD;
409                         vx_put(vp);
410                         hammer2_inode_lock_downgrade(ip, wasexclusive);
411                         continue;
412                 }
413
414                 switch (ip->meta.type) {
415                 case HAMMER2_OBJTYPE_DIRECTORY:
416                         vp->v_type = VDIR;
417                         break;
418                 case HAMMER2_OBJTYPE_REGFILE:
419                         vp->v_type = VREG;
420                         vinitvmio(vp, ip->meta.size,
421                                   HAMMER2_LBUFSIZE,
422                                   (int)ip->meta.size & HAMMER2_LBUFMASK);
423                         break;
424                 case HAMMER2_OBJTYPE_SOFTLINK:
425                         /*
426                          * XXX for now we are using the generic file_read
427                          * and file_write code so we need a buffer cache
428                          * association.
429                          */
430                         vp->v_type = VLNK;
431                         vinitvmio(vp, ip->meta.size,
432                                   HAMMER2_LBUFSIZE,
433                                   (int)ip->meta.size & HAMMER2_LBUFMASK);
434                         break;
435                 case HAMMER2_OBJTYPE_CDEV:
436                         vp->v_type = VCHR;
437                         /* fall through */
438                 case HAMMER2_OBJTYPE_BDEV:
439                         vp->v_ops = &pmp->mp->mnt_vn_spec_ops;
440                         if (ip->meta.type != HAMMER2_OBJTYPE_CDEV)
441                                 vp->v_type = VBLK;
442                         addaliasu(vp,
443                                   ip->meta.rmajor,
444                                   ip->meta.rminor);
445                         break;
446                 case HAMMER2_OBJTYPE_FIFO:
447                         vp->v_type = VFIFO;
448                         vp->v_ops = &pmp->mp->mnt_vn_fifo_ops;
449                         break;
450                 default:
451                         panic("hammer2: unhandled objtype %d",
452                               ip->meta.type);
453                         break;
454                 }
455
456                 if (ip == pmp->iroot)
457                         vsetflags(vp, VROOT);
458
459                 vp->v_data = ip;
460                 ip->vp = vp;
461                 hammer2_inode_ref(ip);          /* vp association */
462                 hammer2_inode_lock_downgrade(ip, wasexclusive);
463                 break;
464         }
465
466         /*
467          * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
468          */
469         if (hammer2_debug & 0x0002) {
470                 kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n",
471                         vp, vp->v_refcnt, vp->v_auxrefs);
472         }
473         return (vp);
474 }
475
476 /*
477  * Returns the inode associated with the passed-in cluster, creating the
478  * inode if necessary and synchronizing it to the passed-in cluster otherwise.
479  *
480  * The passed-in cluster must be locked and will remain locked on return.
481  * The returned inode will be locked and the caller may dispose of both
482  * via hammer2_inode_unlock() + hammer2_inode_drop().  However, if the caller
483  * needs to resolve a hardlink it must ref/unlock/relock/drop the inode.
484  *
485  * The hammer2_inode structure regulates the interface between the high level
486  * kernel VNOPS API and the filesystem backend (the chains).
487  *
488  * On return the inode is locked with the supplied cluster.
489  */
490 hammer2_inode_t *
491 hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_inode_t *dip,
492                   hammer2_cluster_t *cluster)
493 {
494         hammer2_inode_t *nip;
495         const hammer2_inode_data_t *iptmp;
496         const hammer2_inode_data_t *nipdata;
497
498         KKASSERT(cluster == NULL ||
499                  hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE);
500         KKASSERT(pmp);
501
502         /*
503          * Interlocked lookup/ref of the inode.  This code is only needed
504          * when looking up inodes with nlinks != 0 (TODO: optimize out
505          * otherwise and test for duplicates).
506          *
507          * Cluster can be NULL during the initial pfs allocation.
508          */
509 again:
510         while (cluster) {
511                 iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
512                 nip = hammer2_inode_lookup(pmp, iptmp->meta.inum);
513                 if (nip == NULL)
514                         break;
515
516                 hammer2_mtx_ex(&nip->lock);
517
518                 /*
519                  * Handle SMP race (not applicable to the super-root spmp
520                  * which can't index inodes due to duplicative inode numbers).
521                  */
522                 if (pmp->spmp_hmp == NULL &&
523                     (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) {
524                         hammer2_mtx_unlock(&nip->lock);
525                         hammer2_inode_drop(nip);
526                         continue;
527                 }
528                 hammer2_inode_repoint(nip, NULL, cluster);
529
530                 return nip;
531         }
532
533         /*
534          * We couldn't find the inode number, create a new inode.
535          */
536         nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
537         spin_init(&nip->cluster_spin, "h2clspin");
538         atomic_add_long(&pmp->inmem_inodes, 1);
539         hammer2_pfs_memory_inc(pmp);
540         hammer2_pfs_memory_wakeup(pmp);
541         if (pmp->spmp_hmp)
542                 nip->flags = HAMMER2_INODE_SROOT;
543
544         /*
545          * Initialize nip's cluster.  A cluster is provided for normal
546          * inodes but typically not for the super-root or PFS inodes.
547          */
548         nip->cluster.refs = 1;
549         nip->cluster.pmp = pmp;
550         nip->cluster.flags |= HAMMER2_CLUSTER_INODE;
551         if (cluster) {
552                 nipdata = &hammer2_cluster_rdata(cluster)->ipdata;
553                 nip->meta = nipdata->meta;
554                 hammer2_cluster_bref(cluster, &nip->bref);
555                 atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);
556                 hammer2_inode_repoint(nip, NULL, cluster);
557         } else {
558                 nip->meta.inum = 1;             /* PFS inum is always 1 XXX */
559                 /* mtime will be updated when a cluster is available */
560                 atomic_set_int(&nip->flags, HAMMER2_INODE_METAGOOD);/*XXX*/
561         }
562
563         nip->pip = dip;                         /* can be NULL */
564         if (dip)
565                 hammer2_inode_ref(dip); /* ref dip for nip->pip */
566
567         nip->pmp = pmp;
568
569         /*
570          * ref and lock on nip gives it state compatible to after a
571          * hammer2_inode_lock() call.
572          */
573         nip->refs = 1;
574         hammer2_mtx_init(&nip->lock, "h2inode");
575         hammer2_mtx_ex(&nip->lock);
576         /* combination of thread lock and chain lock == inode lock */
577
578         /*
579          * Attempt to add the inode.  If it fails we raced another inode
580          * get.  Undo all the work and try again.
581          */
582         if (pmp->spmp_hmp == NULL) {
583                 hammer2_spin_ex(&pmp->inum_spin);
584                 if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
585                         hammer2_spin_unex(&pmp->inum_spin);
586                         hammer2_mtx_unlock(&nip->lock);
587                         hammer2_inode_drop(nip);
588                         goto again;
589                 }
590                 atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE);
591                 hammer2_spin_unex(&pmp->inum_spin);
592         }
593
594         return (nip);
595 }
596
597 /*
598  * Create a new inode in the specified directory using the vattr to
599  * figure out the type of inode.
600  *
601  * If no error occurs the new inode with its cluster locked is returned in
602  * *nipp, otherwise an error is returned and *nipp is set to NULL.
603  *
604  * If vap and/or cred are NULL the related fields are not set and the
605  * inode type defaults to a directory.  This is used when creating PFSs
606  * under the super-root, so the inode number is set to 1 in this case.
607  *
608  * dip is not locked on entry.
609  *
610  * NOTE: When used to create a snapshot, the inode is temporarily associated
611  *       with the super-root spmp. XXX should pass new pmp for snapshot.
612  */
613 hammer2_inode_t *
614 hammer2_inode_create(hammer2_inode_t *dip,
615                      struct vattr *vap, struct ucred *cred,
616                      const uint8_t *name, size_t name_len,
617                      int flags, int *errorp)
618 {
619         hammer2_xop_scanlhc_t *sxop;
620         hammer2_xop_create_t *xop;
621         hammer2_inode_t *nip;
622         hammer2_key_t lhc;
623         int error;
624         uid_t xuid;
625         uuid_t dip_uid;
626         uuid_t dip_gid;
627         uint32_t dip_mode;
628         uint8_t dip_comp_algo;
629         uint8_t dip_check_algo;
630
631         lhc = hammer2_dirhash(name, name_len);
632         *errorp = 0;
633         nip = NULL;
634
635         /*
636          * Locate the inode or indirect block to create the new
637          * entry in.  At the same time check for key collisions
638          * and iterate until we don't get one.
639          *
640          * NOTE: hidden inodes do not have iterators.
641          *
642          * Lock the directory exclusively for now to guarantee that
643          * we can find an unused lhc for the name.  Due to collisions,
644          * two different creates can end up with the same lhc so we
645          * cannot depend on the OS to prevent the collision.
646          */
647         hammer2_inode_lock(dip, HAMMER2_RESOLVE_ALWAYS);
648
649         dip_uid = dip->meta.uid;
650         dip_gid = dip->meta.gid;
651         dip_mode = dip->meta.mode;
652         dip_comp_algo = dip->meta.comp_algo;
653         dip_check_algo = dip->meta.check_algo;
654
655         /*
656          * Locate an unused key in the collision space.
657          */
658         sxop = &hammer2_xop_alloc(dip)->xop_scanlhc;
659         sxop->lhc = lhc;
660         hammer2_xop_start(&sxop->head, hammer2_inode_xop_scanlhc);
661         while ((error = hammer2_xop_collect(&sxop->head, 0)) == 0) {
662                 if (lhc != sxop->head.cluster.focus->bref.key)
663                         break;
664                 ++lhc;
665         }
666         hammer2_xop_retire(&sxop->head, HAMMER2_XOPMASK_VOP);
667
668         if (error) {
669                 if (error != ENOENT)
670                         goto done2;
671                 ++lhc;
672                 error = 0;
673         }
674         if ((sxop->lhc ^ lhc) & ~HAMMER2_DIRHASH_LOMASK) {
675                 error = ENOSPC;
676                 goto done2;
677         }
678
679         /*
680          * Create the inode with the lhc as the key.
681          */
682         xop = &hammer2_xop_alloc(dip)->xop_create;
683         xop->lhc = lhc;
684         xop->flags = flags;
685         bzero(&xop->meta, sizeof(xop->meta));
686
687         if (vap) {
688                 xop->meta.type = hammer2_get_obj_type(vap->va_type);
689                 xop->meta.inum = hammer2_trans_newinum(dip->pmp);
690
691                 switch (xop->meta.type) {
692                 case HAMMER2_OBJTYPE_CDEV:
693                 case HAMMER2_OBJTYPE_BDEV:
694                         xop->meta.rmajor = vap->va_rmajor;
695                         xop->meta.rminor = vap->va_rminor;
696                         break;
697                 default:
698                         break;
699                 }
700         } else {
701                 xop->meta.type = HAMMER2_OBJTYPE_DIRECTORY;
702                 xop->meta.inum = 1;
703         }
704         
705         /* Inherit parent's inode compression mode. */
706         xop->meta.comp_algo = dip_comp_algo;
707         xop->meta.check_algo = dip_check_algo;
708         xop->meta.version = HAMMER2_INODE_VERSION_ONE;
709         hammer2_update_time(&xop->meta.ctime);
710         xop->meta.mtime = xop->meta.ctime;
711         if (vap)
712                 xop->meta.mode = vap->va_mode;
713         xop->meta.nlinks = 1;
714         if (vap) {
715                 if (dip && dip->pmp) {
716                         xuid = hammer2_to_unix_xid(&dip_uid);
717                         xuid = vop_helper_create_uid(dip->pmp->mp,
718                                                      dip_mode,
719                                                      xuid,
720                                                      cred,
721                                                      &vap->va_mode);
722                 } else {
723                         /* super-root has no dip and/or pmp */
724                         xuid = 0;
725                 }
726                 if (vap->va_vaflags & VA_UID_UUID_VALID)
727                         xop->meta.uid = vap->va_uid_uuid;
728                 else if (vap->va_uid != (uid_t)VNOVAL)
729                         hammer2_guid_to_uuid(&xop->meta.uid, vap->va_uid);
730                 else
731                         hammer2_guid_to_uuid(&xop->meta.uid, xuid);
732
733                 if (vap->va_vaflags & VA_GID_UUID_VALID)
734                         xop->meta.gid = vap->va_gid_uuid;
735                 else if (vap->va_gid != (gid_t)VNOVAL)
736                         hammer2_guid_to_uuid(&xop->meta.gid, vap->va_gid);
737                 else if (dip)
738                         xop->meta.gid = dip_gid;
739         }
740
741         /*
742          * Regular files and softlinks allow a small amount of data to be
743          * directly embedded in the inode.  This flag will be cleared if
744          * the size is extended past the embedded limit.
745          */
746         if (xop->meta.type == HAMMER2_OBJTYPE_REGFILE ||
747             xop->meta.type == HAMMER2_OBJTYPE_SOFTLINK) {
748                 xop->meta.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
749         }
750
751         xop->name = name;
752         xop->name_len = name_len;
753         xop->meta.name_len = name_len;
754         xop->meta.name_key = lhc;
755         KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
756
757         hammer2_xop_start(&xop->head, hammer2_inode_xop_create);
758
759         error = hammer2_xop_collect(&xop->head, 0);
760 #if INODE_DEBUG
761         kprintf("CREATE INODE %*.*s\n",
762                 (int)name_len, (int)name_len, name);
763 #endif
764
765         if (error) {
766                 *errorp = error;
767                 goto done;
768         }
769
770         /*
771          * Set up the new inode.
772          *
773          * NOTE: *_get() integrates chain's lock into the inode lock.
774          *
775          * NOTE: Only one new inode can currently be created per
776          *       transaction.  If the need arises we can adjust
777          *       hammer2_trans_init() to allow more.
778          *
779          * NOTE: nipdata will have chain's blockset data.
780          */
781         nip = hammer2_inode_get(dip->pmp, dip, &xop->head.cluster);
782         nip->comp_heuristic = 0;
783
784 done:
785         hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
786 done2:
787         hammer2_inode_unlock(dip, NULL);
788
789         return (nip);
790 }
791
792 /*
793  * Connect the target inode represented by (cluster) to the media topology
794  * at (dip, name, len).  The caller can pass a rough *chainp, this function
795  * will issue lookup()s to position the parent chain properly for the
796  * chain insertion.
797  *
798  * If hlink is TRUE this function creates an OBJTYPE_HARDLINK directory
799  * entry instead of connecting (cluster).
800  *
801  * If hlink is FALSE this function expects (cluster) to be unparented.
802  */
803 int
804 hammer2_inode_connect(hammer2_inode_t *ip, hammer2_cluster_t **clusterp,
805                       int hlink,
806                       hammer2_inode_t *dip, hammer2_cluster_t *dcluster,
807                       const uint8_t *name, size_t name_len,
808                       hammer2_key_t lhc)
809 {
810         hammer2_inode_data_t *wipdata;
811         hammer2_cluster_t *ocluster;
812         hammer2_cluster_t *ncluster;
813         hammer2_pfs_t *pmp;
814         hammer2_key_t key_dummy;
815         int error;
816
817         /*
818          * Since ocluster is either disconnected from the topology or
819          * represents a hardlink terminus which is always a parent of or
820          * equal to dip, we should be able to safely lock dip->chain for
821          * our setup.
822          *
823          * WARNING! Must use inode_lock_ex() on dip to handle a stale
824          *          dip->cluster.
825          *
826          * If name is non-NULL we calculate lhc, else we use the passed-in
827          * lhc.
828          */
829         ocluster = *clusterp;
830         pmp = dip->pmp;
831
832         if (name) {
833                 lhc = hammer2_dirhash(name, name_len);
834
835                 /*
836                  * Locate the inode or indirect block to create the new
837                  * entry in.  At the same time check for key collisions
838                  * and iterate until we don't get one.
839                  */
840                 error = 0;
841                 while (error == 0) {
842                         ncluster = hammer2_cluster_lookup(dcluster, &key_dummy,
843                                                       lhc, lhc, 0);
844                         if (ncluster == NULL)
845                                 break;
846                         if ((lhc & HAMMER2_DIRHASH_LOMASK) ==
847                             HAMMER2_DIRHASH_LOMASK) {
848                                 error = ENOSPC;
849                         }
850                         hammer2_cluster_unlock(ncluster);
851                         hammer2_cluster_drop(ncluster);
852                         ncluster = NULL;
853                         ++lhc;
854                 }
855         } else {
856                 /*
857                  * Reconnect to specific key (used when moving
858                  * unlinked-but-open files into the hidden directory).
859                  */
860                 ncluster = hammer2_cluster_lookup(dcluster, &key_dummy,
861                                                   lhc, lhc, 0);
862                 KKASSERT(ncluster == NULL);
863                 error = 0;
864         }
865
866         if (error == 0) {
867                 if (hlink) {
868                         /*
869                          * Hardlink pointer needed, create totally fresh
870                          * directory entry.
871                          *
872                          * We must refactor ocluster because it might have
873                          * been shifted into an indirect cluster by the
874                          * create.
875                          */
876                         KKASSERT(ncluster == NULL);
877                         error = hammer2_cluster_create(pmp,
878                                                        dcluster, &ncluster,
879                                                        lhc, 0,
880                                                        HAMMER2_BREF_TYPE_INODE,
881                                                        HAMMER2_INODE_BYTES,
882                                                        0);
883                 } else {
884                         /*
885                          * Reconnect the original cluster under the new name.
886                          * Original cluster must have already been deleted by
887                          * teh caller.
888                          *
889                          * WARNING! Can cause held-over clusters to require a
890                          *          refactor.  Fortunately we have none (our
891                          *          locked clusters are passed into and
892                          *          modified by the call).
893                          */
894                         ncluster = ocluster;
895                         ocluster = NULL;
896                         error = hammer2_cluster_create(pmp, dcluster, &ncluster,
897                                                        lhc, 0,
898                                                        HAMMER2_BREF_TYPE_INODE,
899                                                        HAMMER2_INODE_BYTES,
900                                                        0);
901                 }
902         }
903
904         /*
905          * Unlock stuff.
906          */
907         KKASSERT(error != EAGAIN);
908
909         /*
910          * ncluster should be NULL on error, leave ocluster
911          * (ocluster == *clusterp) alone.
912          */
913         if (error) {
914                 KKASSERT(ncluster == NULL);
915                 return (error);
916         }
917
918         /*
919          * Directory entries are inodes so if the name has changed we have
920          * to update the inode.
921          *
922          * When creating an OBJTYPE_HARDLINK entry remember to unlock the
923          * cluster, the caller will access the hardlink via the actual hardlink
924          * target file and not the hardlink pointer entry, so we must still
925          * return ocluster.
926          */
927         if (hlink && hammer2_hardlink_enable >= 0) {
928                 /*
929                  * Create the HARDLINK pointer.  oip represents the hardlink
930                  * target in this situation.
931                  *
932                  * We will return ocluster (the hardlink target).
933                  */
934                 hammer2_cluster_modify(ncluster, 0);
935                 KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
936                 wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
937                 bcopy(name, wipdata->filename, name_len);
938                 wipdata->meta.name_key = lhc;
939                 wipdata->meta.name_len = name_len;
940                 wipdata->meta.target_type =
941                             hammer2_cluster_rdata(ocluster)->ipdata.meta.type;
942                 wipdata->meta.type = HAMMER2_OBJTYPE_HARDLINK;
943                 wipdata->meta.inum =
944                             hammer2_cluster_rdata(ocluster)->ipdata.meta.inum;
945                 wipdata->meta.version = HAMMER2_INODE_VERSION_ONE;
946                 wipdata->meta.nlinks = 1;
947                 wipdata->meta.op_flags = HAMMER2_OPFLAG_DIRECTDATA;
948                 hammer2_cluster_modsync(ncluster);
949                 hammer2_cluster_unlock(ncluster);
950                 hammer2_cluster_drop(ncluster);
951                 ncluster = ocluster;
952                 ocluster = NULL;
953         } else {
954                 /*
955                  * ncluster is a duplicate of ocluster at the new location.
956                  * We must fixup the name stored in the inode data.
957                  * The bref key has already been adjusted by inode_connect().
958                  */
959                 hammer2_inode_modify(ip);
960                 hammer2_cluster_modify(ncluster, 0);
961                 wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
962
963                 KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
964                 bcopy(name, wipdata->filename, name_len);
965                 ip->meta.name_key = lhc;
966                 ip->meta.name_len = name_len;
967                 ip->meta.nlinks = 1;
968
969                 /*
970                  * Resync wipdata->meta from the local copy.
971                  */
972                 wipdata->meta = ip->meta;
973                 hammer2_cluster_modsync(ncluster);
974         }
975
976         /*
977          * We are replacing ocluster with ncluster, unlock ocluster.  In the
978          * case where ocluster is left unchanged the code above sets
979          * ncluster to ocluster and ocluster to NULL, resulting in a NOP here.
980          */
981         if (ocluster) {
982                 hammer2_cluster_unlock(ocluster);
983                 hammer2_cluster_drop(ocluster);
984         }
985         *clusterp = ncluster;
986
987         return (0);
988 }
989
990 /*
991  * Repoint ip->cluster's chains to cluster's chains and fixup the default
992  * focus.  Only valid elements are repointed.  Invalid elements have to be
993  * adjusted by the appropriate slave sync threads.
994  *
995  * Caller must hold the inode and cluster exclusive locked, if not NULL,
996  * must also be locked.
997  *
998  * Cluster may be NULL to clean out any chains in ip->cluster.
999  */
1000 void
1001 hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
1002                       hammer2_cluster_t *cluster)
1003 {
1004         hammer2_chain_t *dropch[HAMMER2_MAXCLUSTER];
1005         hammer2_chain_t *ochain;
1006         hammer2_chain_t *nchain;
1007         hammer2_inode_t *opip;
1008         int i;
1009
1010         bzero(dropch, sizeof(dropch));
1011
1012         /*
1013          * Replace chains in ip->cluster with chains from cluster and
1014          * adjust the focus if necessary.
1015          *
1016          * NOTE: nchain and/or ochain can be NULL due to gaps
1017          *       in the cluster arrays.
1018          */
1019         hammer2_spin_ex(&ip->cluster_spin);
1020         for (i = 0; cluster && i < cluster->nchains; ++i) {
1021                 /*
1022                  * Do not replace invalid elements as this might race
1023                  * syncthr replacements.
1024                  */
1025                 if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
1026                         continue;
1027
1028                 /*
1029                  * Do not replace elements which are the same.  Also handle
1030                  * element count discrepancies.
1031                  */
1032                 nchain = cluster->array[i].chain;
1033                 if (i < ip->cluster.nchains) {
1034                         ochain = ip->cluster.array[i].chain;
1035                         if (ochain == nchain)
1036                                 continue;
1037                 } else {
1038                         ochain = NULL;
1039                 }
1040
1041                 /*
1042                  * Make adjustments
1043                  */
1044                 ip->cluster.array[i].chain = nchain;
1045                 ip->cluster.array[i].flags &= ~HAMMER2_CITEM_INVALID;
1046                 ip->cluster.array[i].flags |= cluster->array[i].flags &
1047                                               HAMMER2_CITEM_INVALID;
1048                 if (nchain)
1049                         hammer2_chain_ref(nchain);
1050                 dropch[i] = ochain;
1051         }
1052
1053         /*
1054          * Release any left-over chains in ip->cluster.
1055          */
1056         while (i < ip->cluster.nchains) {
1057                 nchain = ip->cluster.array[i].chain;
1058                 if (nchain) {
1059                         ip->cluster.array[i].chain = NULL;
1060                         ip->cluster.array[i].flags |= HAMMER2_CITEM_INVALID;
1061                 }
1062                 dropch[i] = nchain;
1063                 ++i;
1064         }
1065
1066         /*
1067          * Fixup fields.  Note that the inode-embedded cluster is never
1068          * directly locked.
1069          */
1070         if (cluster) {
1071                 ip->cluster.nchains = cluster->nchains;
1072                 ip->cluster.focus = cluster->focus;
1073                 ip->cluster.flags = cluster->flags & ~HAMMER2_CLUSTER_LOCKED;
1074         } else {
1075                 ip->cluster.nchains = 0;
1076                 ip->cluster.focus = NULL;
1077                 ip->cluster.flags &= ~HAMMER2_CLUSTER_ZFLAGS;
1078         }
1079
1080         /*
1081          * Repoint ip->pip if requested (non-NULL pip).
1082          */
1083         if (pip && ip->pip != pip) {
1084                 opip = ip->pip;
1085                 hammer2_inode_ref(pip);
1086                 ip->pip = pip;
1087         } else {
1088                 opip = NULL;
1089         }
1090         hammer2_spin_unex(&ip->cluster_spin);
1091
1092         /*
1093          * Cleanup outside of spinlock
1094          */
1095         while (--i >= 0) {
1096                 if (dropch[i])
1097                         hammer2_chain_drop(dropch[i]);
1098         }
1099         if (opip)
1100                 hammer2_inode_drop(opip);
1101 }
1102
1103 /*
1104  * Repoint a single element from the cluster to the ip.  Used by the
1105  * synchronization threads to piecemeal update inodes.  Does not change
1106  * focus and requires inode to be re-locked to clean-up flags (XXX).
1107  */
1108 void
1109 hammer2_inode_repoint_one(hammer2_inode_t *ip, hammer2_cluster_t *cluster,
1110                           int idx)
1111 {
1112         hammer2_chain_t *ochain;
1113         hammer2_chain_t *nchain;
1114         int i;
1115
1116         hammer2_spin_ex(&ip->cluster_spin);
1117         KKASSERT(idx < cluster->nchains);
1118         if (idx < ip->cluster.nchains) {
1119                 ochain = ip->cluster.array[idx].chain;
1120                 nchain = cluster->array[idx].chain;
1121         } else {
1122                 ochain = NULL;
1123                 nchain = cluster->array[idx].chain;
1124                 ip->cluster.nchains = idx + 1;
1125                 for (i = ip->cluster.nchains; i <= idx; ++i) {
1126                         bzero(&ip->cluster.array[i],
1127                               sizeof(ip->cluster.array[i]));
1128                         ip->cluster.array[i].flags |= HAMMER2_CITEM_INVALID;
1129                 }
1130         }
1131         if (ochain != nchain) {
1132                 /*
1133                  * Make adjustments.
1134                  */
1135                 ip->cluster.array[idx].chain = nchain;
1136                 ip->cluster.array[idx].flags &= ~HAMMER2_CITEM_INVALID;
1137                 ip->cluster.array[idx].flags |= cluster->array[idx].flags &
1138                                                 HAMMER2_CITEM_INVALID;
1139         }
1140         hammer2_spin_unex(&ip->cluster_spin);
1141         if (ochain != nchain) {
1142                 if (nchain)
1143                         hammer2_chain_ref(nchain);
1144                 if (ochain)
1145                         hammer2_chain_drop(ochain);
1146         }
1147 }
1148
1149 /*
1150  * Unlink the file (dip, name, name_len), from the specified directory inode.
1151  * If the caller also has the hammer2_inode the caller must pass it locked as
1152  * (ip), but may pass NULL if it does not have the inode in hand.
1153  *
1154  * The directory inode does not need to be locked.
1155  *
1156  * isdir determines whether a directory/non-directory check should be made.
1157  * No check is made if isdir is set to -1.
1158  *
1159  * isopen specifies whether special unlink-with-open-descriptor handling
1160  * must be performed.  If set to -1 the caller is deleting a PFS and we
1161  * check whether the chain is mounted or not (chain->pmp != NULL).  1 is
1162  * implied if it is mounted.
1163  *
1164  * If isopen is 1 and nlinks drops to 0 this function must move the chain
1165  * to a special hidden directory until last-close occurs on the file.
1166  *
1167  * NOTE!  The underlying file can still be active with open descriptors
1168  *        or if the inode is being manually held (e.g. for rename).
1169  *
1170  * NOTE!  When unlinking an open file the inode will be temporarily moved to
1171  *        a hidden directory, otherwise the inode will be deleted.
1172  */
1173 int
1174 hammer2_unlink_file(hammer2_inode_t *dip, hammer2_inode_t *ip,
1175                     const uint8_t *name, size_t name_len,
1176                     int isdir, int *hlinkp, struct nchandle *nch,
1177                     int nlinks)
1178 {
1179         const hammer2_inode_data_t *ripdata;
1180         hammer2_cluster_t *cparent;
1181         hammer2_cluster_t *hcluster;
1182         hammer2_cluster_t *hparent;
1183         hammer2_cluster_t *cluster;
1184         hammer2_cluster_t *dparent;
1185         hammer2_cluster_t *dcluster;
1186         hammer2_key_t key_dummy;
1187         hammer2_key_t key_next;
1188         hammer2_key_t lhc;
1189         int last_link;
1190         int error;
1191         int hlink;
1192         int myip;
1193         uint8_t type;
1194
1195         error = 0;
1196         hlink = 0;
1197         myip = 0;
1198         hcluster = NULL;
1199         hparent = NULL;
1200         lhc = hammer2_dirhash(name, name_len);
1201
1202 again:
1203         /*
1204          * Locate the filename in the directory and instantiate the ip
1205          * if necessary.  If the ip is already known we must still locate
1206          * the filename to adjust cparent for possible deletion.
1207          */
1208         hammer2_inode_lock(dip, HAMMER2_RESOLVE_ALWAYS);
1209         cparent = hammer2_inode_cluster(dip, HAMMER2_RESOLVE_ALWAYS);
1210         cluster = hammer2_cluster_lookup(cparent, &key_next,
1211                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK, 0);
1212         while (cluster) {
1213                 if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
1214                         ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1215                         if (ripdata->meta.name_len == name_len &&
1216                             bcmp(ripdata->filename, name, name_len) == 0) {
1217                                 break;
1218                         }
1219                 }
1220                 cluster = hammer2_cluster_next(cparent, cluster, &key_next,
1221                                                key_next,
1222                                                lhc + HAMMER2_DIRHASH_LOMASK,
1223                                                0);
1224         }
1225         hammer2_inode_unlock(dip, NULL);        /* retain cparent */
1226
1227         /*
1228          * Not found or wrong type (isdir < 0 disables the type check).
1229          * If a hardlink pointer, type checks use the hardlink target.
1230          */
1231         if (cluster == NULL) {
1232                 error = ENOENT;
1233                 goto done;
1234         }
1235
1236         ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1237         type = ripdata->meta.type;
1238         if (type == HAMMER2_OBJTYPE_HARDLINK) {
1239                 hlink = 1;
1240                 type = ripdata->meta.target_type;
1241         }
1242
1243         if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 0) {
1244                 error = ENOTDIR;
1245                 goto done;
1246         }
1247         if (type != HAMMER2_OBJTYPE_DIRECTORY && isdir >= 1) {
1248                 error = EISDIR;
1249                 goto done;
1250         }
1251
1252         /*
1253          * Hardlink must be resolved.  We can't hold the parent locked
1254          * while we do this or we could deadlock.  The physical file will
1255          * be located at or above the current directory.
1256          *
1257          * We loop to reacquire the hardlink origination.
1258          *
1259          * NOTE: hammer2_hardlink_find() will locate the hardlink target,
1260          *       returning a modified hparent and hcluster.
1261          */
1262         if (ripdata->meta.type == HAMMER2_OBJTYPE_HARDLINK) {
1263                 if (hcluster == NULL) {
1264                         hcluster = cluster;
1265                         cluster = NULL; /* safety */
1266                         hammer2_cluster_unlock(cparent);
1267                         hammer2_cluster_drop(cparent);
1268                         cparent = NULL; /* safety */
1269                         ripdata = NULL; /* safety (associated w/cparent) */
1270                         error = hammer2_cluster_hardlink_find(dip, &hparent, &hcluster);
1271
1272                         /*
1273                          * If we couldn't find the hardlink target then some
1274                          * parent directory containing the hardlink pointer
1275                          * probably got renamed to above the original target,
1276                          * a case not yet handled by H2.
1277                          */
1278                         if (error) {
1279                                 kprintf("H2 unlink_file: hardlink target for "
1280                                         "\"%s\" not found\n",
1281                                         name);
1282                                 kprintf("(likely due to known directory "
1283                                         "rename bug)\n");
1284                                 goto done;
1285                         }
1286                         goto again;
1287                 }
1288         }
1289
1290         /*
1291          * If this is a directory the directory must be empty.  However, if
1292          * isdir < 0 we are doing a rename and the directory does not have
1293          * to be empty, and if isdir > 1 we are deleting a PFS/snapshot
1294          * and the directory does not have to be empty.
1295          *
1296          * NOTE: We check the full key range here which covers both visible
1297          *       and invisible entries.  Theoretically there should be no
1298          *       invisible (hardlink target) entries if there are no visible
1299          *       entries.
1300          */
1301         if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 1) {
1302                 dparent = hammer2_cluster_lookup_init(cluster, 0);
1303                 dcluster = hammer2_cluster_lookup(dparent, &key_dummy,
1304                                                   0, (hammer2_key_t)-1,
1305                                                   HAMMER2_LOOKUP_NODATA);
1306                 if (dcluster) {
1307                         hammer2_cluster_unlock(dcluster);
1308                         hammer2_cluster_drop(dcluster);
1309                         hammer2_cluster_lookup_done(dparent);
1310                         error = ENOTEMPTY;
1311                         goto done;
1312                 }
1313                 hammer2_cluster_lookup_done(dparent);
1314                 dparent = NULL;
1315                 /* dcluster NULL */
1316         }
1317
1318         /*
1319          * If this was a hardlink then (cparent, cluster) is the hardlink
1320          * pointer, which we can simply destroy outright.  Discard the
1321          * clusters and replace with the hardlink target.
1322          */
1323         if (hcluster) {
1324                 hammer2_cluster_delete(cparent, cluster,
1325                                        HAMMER2_DELETE_PERMANENT);
1326                 hammer2_cluster_unlock(cparent);
1327                 hammer2_cluster_drop(cparent);
1328                 hammer2_cluster_unlock(cluster);
1329                 hammer2_cluster_drop(cluster);
1330                 cparent = hparent;
1331                 cluster = hcluster;
1332                 hparent = NULL;
1333                 hcluster = NULL;
1334         }
1335
1336         /*
1337          * This leaves us with the hardlink target or non-hardlinked file
1338          * or directory in (cparent, cluster).
1339          *
1340          * Delete the target when nlinks reaches 0 with special handling
1341          * to avoid I/O (to avoid actually updating the inode) for the 1->0
1342          * transition, if possible.  This optimization makes rm -rf very
1343          * fast.
1344          *
1345          * NOTE! In DragonFly the vnops function calls cache_unlink() after
1346          *       calling us here to clean out the namecache association,
1347          *       (which does not represent a ref for the open-test), and to
1348          *       force finalization of the vnode if/when the last ref gets
1349          *       dropped.
1350          *
1351          * NOTE! Files are unlinked by rename and then relinked.  nch will be
1352          *       passed as NULL in this situation.  hammer2_inode_connect()
1353          *       will bump nlinks.
1354          */
1355         KKASSERT(cluster != NULL);
1356
1357         /*
1358          * Instantiate ip if necessary for ip->meta data consolidation.
1359          */
1360         if (ip == NULL) {
1361                 ip = hammer2_inode_get(dip->pmp, dip, cluster);
1362                 myip = 1;
1363         }
1364
1365
1366         /*
1367          * Note: nlinks is negative when decrementing, positive when
1368          *       incrementing.
1369          */
1370         last_link = (ip->meta.nlinks + nlinks == 0);
1371
1372         if (last_link) {
1373                 /*
1374                  * Target nlinks has reached 0, file now unlinked (but may
1375                  * still be open).
1376                  *
1377                  * nlinks will be -1 for a normal remove().  If this is the
1378                  * last link we must flag the inode so we can optimally
1379                  * throw away buffer data and destroy the file on reclaim.
1380                  */
1381                 if (nlinks == -1)
1382                         atomic_set_int(&ip->flags, HAMMER2_INODE_ISUNLINKED);
1383
1384                 if (nch && cache_isopen(nch)) {
1385                         /*
1386                          * If an unlinked file is still open we must update
1387                          * the inodes link count.
1388                          */
1389                         /*hammer2_cluster_modify(cluster, 0);*/
1390                         hammer2_inode_modify(ip);
1391                         ip->meta.nlinks += nlinks;
1392                         if ((int64_t)ip->meta.nlinks < 0)       /* safety */
1393                                 ip->meta.nlinks = 0;
1394                         hammer2_inode_move_to_hidden(&cparent, &cluster,
1395                                                      ip->meta.inum);
1396                         /* hammer2_cluster_modsync(cluster); */
1397                 } else {
1398                         /*
1399                          * This won't get everything if a vnode is still
1400                          * present, but the cache_unlink() call the caller
1401                          * makes will.
1402                          */
1403                         hammer2_cluster_delete(cparent, cluster,
1404                                                HAMMER2_DELETE_PERMANENT);
1405                 }
1406         } else if (hlink == 0) {
1407                 /*
1408                  * In this situation a normal non-hardlinked file (which can
1409                  * only have nlinks == 1) still has a non-zero nlinks, the
1410                  * caller must be doing a RENAME operation and so is passing
1411                  * a nlinks adjustment of 0, and only wishes to remove file
1412                  * in order to be able to reconnect it under a different name.
1413                  *
1414                  * In this situation we do a temporary deletion of the
1415                  * chain in order to allow the file to be reconnected in
1416                  * a different location.
1417                  */
1418                 KKASSERT(nlinks == 0);
1419                 hammer2_cluster_delete(cparent, cluster, 0);
1420         } else {
1421                 /*
1422                  * Links remain, must update the inode link count.
1423                  */
1424                 /*hammer2_cluster_modify(cluster, 0);*/
1425                 hammer2_inode_modify(ip);
1426                 ip->meta.nlinks += nlinks;
1427                 if ((int64_t)ip->meta.nlinks < 0)
1428                         ip->meta.nlinks = 0;
1429                 /* hammer2_cluster_modsync(cluster); */
1430         }
1431
1432         if (myip) {
1433                 hammer2_inode_unlock(ip, NULL);
1434         }
1435
1436         error = 0;
1437 done:
1438         if (cparent) {
1439                 hammer2_cluster_unlock(cparent);
1440                 hammer2_cluster_drop(cparent);
1441         }
1442         if (cluster) {
1443                 hammer2_cluster_unlock(cluster);
1444                 hammer2_cluster_drop(cluster);
1445         }
1446         if (hparent) {
1447                 hammer2_cluster_unlock(hparent);
1448                 hammer2_cluster_drop(hparent);
1449         }
1450         if (hcluster) {
1451                 hammer2_cluster_unlock(hcluster);
1452                 hammer2_cluster_drop(hcluster);
1453         }
1454         if (hlinkp)
1455                 *hlinkp = hlink;
1456
1457         return error;
1458 }
1459
1460 /*
1461  * This is called from the mount code to initialize pmp->ihidden
1462  */
1463 void
1464 hammer2_inode_install_hidden(hammer2_pfs_t *pmp)
1465 {
1466         hammer2_cluster_t *cparent;
1467         hammer2_cluster_t *cluster;
1468         hammer2_cluster_t *scan;
1469         const hammer2_inode_data_t *ripdata;
1470         hammer2_inode_data_t *wipdata;
1471         hammer2_key_t key_dummy;
1472         hammer2_key_t key_next;
1473         int error;
1474         int count;
1475         int dip_check_algo;
1476         int dip_comp_algo;
1477
1478         if (pmp->ihidden)
1479                 return;
1480
1481         /*
1482          * Find the hidden directory
1483          */
1484         bzero(&key_dummy, sizeof(key_dummy));
1485         hammer2_trans_init(pmp, 0);
1486
1487         /*
1488          * Setup for lookup, retrieve iroot's check and compression
1489          * algorithm request which was likely generated by newfs_hammer2.
1490          *
1491          * The check/comp fields will probably never be used since inodes
1492          * are renamed into the hidden directory and not created relative to
1493          * the hidden directory, chain creation inherits from bref.methods,
1494          * and data chains inherit from their respective file inode *_algo
1495          * fields.
1496          */
1497         hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_ALWAYS);
1498         cparent = hammer2_inode_cluster(pmp->iroot, HAMMER2_RESOLVE_ALWAYS);
1499         ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
1500         dip_check_algo = ripdata->meta.check_algo;
1501         dip_comp_algo = ripdata->meta.comp_algo;
1502         ripdata = NULL;
1503
1504         cluster = hammer2_cluster_lookup(cparent, &key_dummy,
1505                                          HAMMER2_INODE_HIDDENDIR,
1506                                          HAMMER2_INODE_HIDDENDIR,
1507                                          0);
1508         if (cluster) {
1509                 pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, cluster);
1510                 hammer2_inode_ref(pmp->ihidden);
1511
1512                 /*
1513                  * Remove any unlinked files which were left open as-of
1514                  * any system crash.
1515                  *
1516                  * Don't pass NODATA, we need the inode data so the delete
1517                  * can do proper statistics updates.
1518                  */
1519                 count = 0;
1520                 scan = hammer2_cluster_lookup(cluster, &key_next,
1521                                               0, HAMMER2_TID_MAX, 0);
1522                 while (scan) {
1523                         if (hammer2_cluster_type(scan) ==
1524                             HAMMER2_BREF_TYPE_INODE) {
1525                                 hammer2_cluster_delete(cluster, scan,
1526                                                    HAMMER2_DELETE_PERMANENT);
1527                                 ++count;
1528                         }
1529                         scan = hammer2_cluster_next(cluster, scan, &key_next,
1530                                                     0, HAMMER2_TID_MAX, 0);
1531                 }
1532
1533                 hammer2_inode_unlock(pmp->ihidden, cluster);
1534                 hammer2_inode_unlock(pmp->iroot, cparent);
1535                 hammer2_trans_done(pmp);
1536                 kprintf("hammer2: PFS loaded hidden dir, "
1537                         "removed %d dead entries\n", count);
1538                 return;
1539         }
1540
1541         /*
1542          * Create the hidden directory
1543          */
1544         error = hammer2_cluster_create(pmp, cparent, &cluster,
1545                                        HAMMER2_INODE_HIDDENDIR, 0,
1546                                        HAMMER2_BREF_TYPE_INODE,
1547                                        HAMMER2_INODE_BYTES,
1548                                        0);
1549         hammer2_inode_unlock(pmp->iroot, cparent);
1550
1551         hammer2_cluster_modify(cluster, 0);
1552         wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
1553         wipdata->meta.type = HAMMER2_OBJTYPE_DIRECTORY;
1554         wipdata->meta.inum = HAMMER2_INODE_HIDDENDIR;
1555         wipdata->meta.nlinks = 1;
1556         wipdata->meta.comp_algo = dip_comp_algo;
1557         wipdata->meta.check_algo = dip_check_algo;
1558         hammer2_cluster_modsync(cluster);
1559         kprintf("hammer2: PFS root missing hidden directory, creating\n");
1560
1561         pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, cluster);
1562         hammer2_inode_ref(pmp->ihidden);
1563         hammer2_inode_unlock(pmp->ihidden, cluster);
1564         hammer2_trans_done(pmp);
1565 }
1566
1567 /*
1568  * If an open file is unlinked H2 needs to retain the file in the topology
1569  * to ensure that its backing store is not recovered by the bulk free scan.
1570  * This also allows us to avoid having to special-case the CHAIN_DELETED flag.
1571  *
1572  * To do this the file is moved to a hidden directory in the PFS root and
1573  * renamed.  The hidden directory must be created if it does not exist.
1574  */
1575 static
1576 void
1577 hammer2_inode_move_to_hidden(hammer2_cluster_t **cparentp,
1578                              hammer2_cluster_t **clusterp,
1579                              hammer2_tid_t inum)
1580 {
1581         hammer2_cluster_t *dcluster;
1582         hammer2_pfs_t *pmp;
1583         int error;
1584
1585         pmp = (*clusterp)->pmp;
1586         KKASSERT(pmp != NULL);
1587         KKASSERT(pmp->ihidden != NULL);
1588
1589         hammer2_cluster_delete(*cparentp, *clusterp, 0);
1590         hammer2_inode_lock(pmp->ihidden, HAMMER2_RESOLVE_ALWAYS);
1591         dcluster = hammer2_inode_cluster(pmp->ihidden, HAMMER2_RESOLVE_ALWAYS);
1592         error = hammer2_inode_connect(NULL/*XXX*/, clusterp, 0,
1593                                       pmp->ihidden, dcluster,
1594                                       NULL, 0, inum);
1595         hammer2_inode_unlock(pmp->ihidden, dcluster);
1596         KKASSERT(error == 0);
1597 }
1598
1599 /*
1600  * Find the directory common to both fdip and tdip.
1601  *
1602  * Returns a held but not locked inode.  Caller typically locks the inode,
1603  * and when through unlocks AND drops it.
1604  */
1605 hammer2_inode_t *
1606 hammer2_inode_common_parent(hammer2_inode_t *fdip, hammer2_inode_t *tdip)
1607 {
1608         hammer2_inode_t *scan1;
1609         hammer2_inode_t *scan2;
1610
1611         /*
1612          * We used to have a depth field but it complicated matters too
1613          * much for directory renames.  So now its ugly.  Check for
1614          * simple cases before giving up and doing it the expensive way.
1615          *
1616          * XXX need a bottom-up topology stability lock
1617          */
1618         if (fdip == tdip || fdip == tdip->pip) {
1619                 hammer2_inode_ref(fdip);
1620                 return(fdip);
1621         }
1622         if (fdip->pip == tdip) {
1623                 hammer2_inode_ref(tdip);
1624                 return(tdip);
1625         }
1626
1627         /*
1628          * XXX not MPSAFE
1629          */
1630         for (scan1 = fdip; scan1->pmp == fdip->pmp; scan1 = scan1->pip) {
1631                 scan2 = tdip;
1632                 while (scan2->pmp == tdip->pmp) {
1633                         if (scan1 == scan2) {
1634                                 hammer2_inode_ref(scan1);
1635                                 return(scan1);
1636                         }
1637                         scan2 = scan2->pip;
1638                         if (scan2 == NULL)
1639                                 break;
1640                 }
1641         }
1642         panic("hammer2_inode_common_parent: no common parent %p %p\n",
1643               fdip, tdip);
1644         /* NOT REACHED */
1645         return(NULL);
1646 }
1647
1648 /*
1649  * Set an inode's cluster modified, marking the related chains RW and
1650  * duplicating them if necessary.
1651  *
1652  * The passed-in chain is a localized copy of the chain previously acquired
1653  * when the inode was locked (and possilby replaced in the mean time), and
1654  * must also be updated.  In fact, we update it first and then synchronize
1655  * the inode's cluster cache.
1656  */
1657 void
1658 hammer2_inode_modify(hammer2_inode_t *ip)
1659 {
1660         atomic_set_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1661         if (ip->vp)
1662                 vsetisdirty(ip->vp);
1663 }
1664
1665 /*
1666  * Synchronize the inode's frontend state with the chain state prior
1667  * to any explicit flush of the inode or any strategy write call.
1668  *
1669  * Called with a locked inode.
1670  */
1671 void
1672 hammer2_inode_fsync(hammer2_inode_t *ip, hammer2_cluster_t *cparent)
1673 {
1674         int clear_directdata = 0;
1675
1676         /* temporary hack, allow cparent to be NULL */
1677         if (cparent == NULL) {
1678                 cparent = hammer2_inode_cluster(ip, HAMMER2_RESOLVE_ALWAYS);
1679                 hammer2_inode_fsync(ip, cparent);
1680                 hammer2_cluster_unlock(cparent);
1681                 hammer2_cluster_drop(cparent);
1682                 return;
1683         }
1684
1685         if ((ip->flags & HAMMER2_INODE_RESIZED) == 0) {
1686                 /* do nothing */
1687         } else if (ip->meta.size < ip->osize) {
1688                 /*
1689                  * We must delete any chains beyond the EOF.  The chain
1690                  * straddling the EOF will be pending in the bioq.
1691                  */
1692                 hammer2_cluster_t *dparent;
1693                 hammer2_cluster_t *cluster;
1694                 hammer2_key_t lbase;
1695                 hammer2_key_t key_next;
1696
1697                 lbase = (ip->meta.size + HAMMER2_PBUFMASK64) &
1698                         ~HAMMER2_PBUFMASK64;
1699                 dparent = hammer2_cluster_lookup_init(&ip->cluster, 0);
1700                 cluster = hammer2_cluster_lookup(dparent, &key_next,
1701                                                  lbase, (hammer2_key_t)-1,
1702                                                  HAMMER2_LOOKUP_NODATA);
1703                 while (cluster) {
1704                         /*
1705                          * Degenerate embedded case, nothing to loop on
1706                          */
1707                         switch (hammer2_cluster_type(cluster)) {
1708                         case HAMMER2_BREF_TYPE_INODE:
1709                                 hammer2_cluster_unlock(cluster);
1710                                 hammer2_cluster_drop(cluster);
1711                                 cluster = NULL;
1712                                 break;
1713                         case HAMMER2_BREF_TYPE_DATA:
1714                                 hammer2_cluster_delete(dparent, cluster,
1715                                                    HAMMER2_DELETE_PERMANENT);
1716                                 /* fall through */
1717                         default:
1718                                 cluster = hammer2_cluster_next(dparent, cluster,
1719                                                    &key_next,
1720                                                    key_next, (hammer2_key_t)-1,
1721                                                    HAMMER2_LOOKUP_NODATA);
1722                                 break;
1723                         }
1724                 }
1725                 hammer2_cluster_lookup_done(dparent);
1726                 atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
1727                 KKASSERT(ip->flags & HAMMER2_INODE_MODIFIED);
1728         } else if (ip->meta.size > ip->osize) {
1729                 /*
1730                  * When resizing larger we may not have any direct-data
1731                  * available.
1732                  */
1733                 if ((ip->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
1734                     ip->meta.size > HAMMER2_EMBEDDED_BYTES) {
1735                         ip->meta.op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
1736                         clear_directdata = 1;
1737                 }
1738                 atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
1739                 KKASSERT(ip->flags & HAMMER2_INODE_MODIFIED);
1740         } else {
1741                 /*
1742                  * RESIZED was set but size didn't change.
1743                  */
1744                 atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
1745                 KKASSERT(ip->flags & HAMMER2_INODE_MODIFIED);
1746         }
1747
1748         /*
1749          * Sync inode meta-data
1750          */
1751         if (ip->flags & HAMMER2_INODE_MODIFIED) {
1752                 hammer2_inode_data_t *wipdata;
1753
1754                 atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1755                 hammer2_cluster_modify(cparent, 0);
1756                 hammer2_inode_repoint(ip, NULL, cparent);
1757
1758                 wipdata = &hammer2_cluster_wdata(cparent)->ipdata;
1759                 wipdata->meta = ip->meta;
1760                 if (clear_directdata) {
1761                         bzero(&wipdata->u.blockset,
1762                               sizeof(wipdata->u.blockset));
1763                 }
1764                 hammer2_cluster_modsync(cparent);
1765         }
1766 }