5483334984bf5d6b3ec746abababd4c6016a1115
[dragonfly.git] / sys / vfs / hammer2 / hammer2_inode.c
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/cdefs.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/types.h>
39 #include <sys/lock.h>
40 #include <sys/uuid.h>
41
42 #include "hammer2.h"
43
#define INODE_DEBUG     0

/*
 * Forward declaration; presumably re-homes a deleted-but-still-referenced
 * inode under the PFS hidden directory (defined later in this file) —
 * body not visible here.
 */
static void hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
                                         hammer2_cluster_t **cparentp,
                                         hammer2_cluster_t **clusterp,
                                         hammer2_tid_t inum);

/*
 * Generate the red-black tree support functions for the per-PFS in-memory
 * inode index, keyed on the inode number (see hammer2_inode_cmp below).
 */
RB_GENERATE2(hammer2_inode_tree, hammer2_inode, rbnode, hammer2_inode_cmp,
             hammer2_tid_t, inum);
53
54 int
55 hammer2_inode_cmp(hammer2_inode_t *ip1, hammer2_inode_t *ip2)
56 {
57         if (ip1->inum < ip2->inum)
58                 return(-1);
59         if (ip1->inum > ip2->inum)
60                 return(1);
61         return(0);
62 }
63
64 /*
65  * HAMMER2 inode locks
66  *
67  * HAMMER2 offers shared and exclusive locks on inodes.  Pass a mask of
68  * flags for options:
69  *
70  *      - pass HAMMER2_RESOLVE_SHARED if a shared lock is desired.  The
71  *        inode locking function will automatically set the RDONLY flag.
72  *
73  *      - pass HAMMER2_RESOLVE_ALWAYS if you need the inode's meta-data.
74  *        Most front-end inode locks do.
75  *
76  *      - pass HAMMER2_RESOLVE_NEVER if you do not want to require that
77  *        the inode data be resolved.  This is used by the syncthr because
78  *        it can run on an unresolved/out-of-sync cluster, and also by the
79  *        vnode reclamation code to avoid unnecessary I/O (particularly when
80  *        disposing of hundreds of thousands of cached vnodes).
81  *
82  * The inode locking function locks the inode itself, resolves any stale
83  * chains in the inode's cluster, and allocates a fresh copy of the
84  * cluster with 1 ref and all the underlying chains locked.
85  *
86  * ip->cluster will be stable while the inode is locked.
87  *
88  * NOTE: We don't combine the inode/chain lock because putting away an
89  *       inode would otherwise confuse multiple lock holders of the inode.
90  *
91  * NOTE: In-memory inodes always point to hardlink targets (the actual file),
92  *       and never point to a hardlink pointer.
93  *
94  * NOTE: If caller passes HAMMER2_RESOLVE_RDONLY the exclusive locking code
95  *       will feel free to reduce the chain set in the cluster as an
96  *       optimization.  It will still be validated against the quorum if
97  *       appropriate, but the optimization might be able to reduce data
98  *       accesses to one node.  This flag is automatically set if the inode
99  *       is locked with HAMMER2_RESOLVE_SHARED.
100  */
101 hammer2_cluster_t *
102 hammer2_inode_lock(hammer2_inode_t *ip, int how)
103 {
104         hammer2_cluster_t *cluster;
105
106         hammer2_inode_ref(ip);
107
108         /* 
109          * Inode structure mutex
110          */
111         if (how & HAMMER2_RESOLVE_SHARED) {
112                 how |= HAMMER2_RESOLVE_RDONLY;
113                 hammer2_mtx_sh(&ip->lock);
114         } else {
115                 hammer2_mtx_ex(&ip->lock);
116         }
117
118         /*
119          * Create a copy of ip->cluster and lock it.  Note that the copy
120          * will have a ref on the cluster AND its chains and we don't want
121          * a second ref to either when we lock it.
122          *
123          * The copy will not have a focus until it is locked.
124          *
125          * Exclusive inode locks set the template focus chain in (ip)
126          * as a hint.  Cluster locks can ALWAYS replace the focus in the
127          * working copy if the hint does not work out, so beware.
128          */
129         cluster = hammer2_cluster_copy(&ip->cluster);
130         hammer2_cluster_lock(cluster, how);
131         hammer2_cluster_resolve(cluster);
132
133         /*
134          * cluster->focus will be set if resolving RESOLVE_ALWAYS, but
135          * only update the cached focus in the inode structure when taking
136          * out an exclusive lock.
137          */
138         if ((how & HAMMER2_RESOLVE_SHARED) == 0)
139                 ip->cluster.focus = cluster->focus;
140
141         /*
142          * Initialize pmp->inode_tid and pmp->modify_tid on first access
143          * to the root of mount that resolves good.
144          * XXX probably not the best place for this.
145          */
146         if (ip->pmp->inode_tid == 0 &&
147             cluster->error == 0 && cluster->focus) {
148                 const hammer2_inode_data_t *ripdata;
149                 hammer2_pfs_t *pmp = ip->pmp;
150                 hammer2_blockref_t bref;
151
152                 ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
153                 hammer2_cluster_bref(cluster, &bref);
154                 pmp->inode_tid = ripdata->pfs_inum + 1;
155                 pmp->modify_tid = bref.modify_tid;
156                 kprintf("PMP focus good set nextino=%ld mod=%016jx\n",
157                         pmp->inode_tid, pmp->modify_tid);
158
159         }
160         return (cluster);
161 }
162
163 void
164 hammer2_inode_unlock(hammer2_inode_t *ip, hammer2_cluster_t *cluster)
165 {
166         if (cluster) {
167                 hammer2_cluster_unlock(cluster);
168                 hammer2_cluster_drop(cluster);
169         }
170         hammer2_mtx_unlock(&ip->lock);
171         hammer2_inode_drop(ip);
172 }
173
174 /*
175  * Temporarily release a lock held shared or exclusive.  Caller must
176  * hold the lock shared or exclusive on call and lock will be released
177  * on return.
178  *
179  * Restore a lock that was temporarily released.
180  */
181 hammer2_mtx_state_t
182 hammer2_inode_lock_temp_release(hammer2_inode_t *ip)
183 {
184         return hammer2_mtx_temp_release(&ip->lock);
185 }
186
/*
 * Restore an inode lock that was temporarily released by
 * hammer2_inode_lock_temp_release() to its previous state (ostate).
 */
void
hammer2_inode_lock_temp_restore(hammer2_inode_t *ip, hammer2_mtx_state_t ostate)
{
        hammer2_mtx_temp_restore(&ip->lock, ostate);
}
192
193 /*
194  * Upgrade a shared inode lock to exclusive and return.  If the inode lock
195  * is already held exclusively this is a NOP.
196  *
197  * The caller MUST hold the inode lock either shared or exclusive on call
198  * and will own the lock exclusively on return.
199  *
200  * Returns non-zero if the lock was already exclusive prior to the upgrade.
201  */
202 int
203 hammer2_inode_lock_upgrade(hammer2_inode_t *ip)
204 {
205         int wasexclusive;
206
207         if (mtx_islocked_ex(&ip->lock)) {
208                 wasexclusive = 1;
209         } else {
210                 hammer2_mtx_unlock(&ip->lock);
211                 hammer2_mtx_ex(&ip->lock);
212                 wasexclusive = 0;
213         }
214         return wasexclusive;
215 }
216
217 /*
218  * Downgrade an inode lock from exclusive to shared only if the inode
219  * lock was previously shared.  If the inode lock was previously exclusive,
220  * this is a NOP.
221  */
222 void
223 hammer2_inode_lock_downgrade(hammer2_inode_t *ip, int wasexclusive)
224 {
225         if (wasexclusive == 0)
226                 mtx_downgrade(&ip->lock);
227 }
228
229 /*
230  * Lookup an inode by inode number
231  */
232 hammer2_inode_t *
233 hammer2_inode_lookup(hammer2_pfs_t *pmp, hammer2_tid_t inum)
234 {
235         hammer2_inode_t *ip;
236
237         KKASSERT(pmp);
238         if (pmp->spmp_hmp) {
239                 ip = NULL;
240         } else {
241                 hammer2_spin_ex(&pmp->inum_spin);
242                 ip = RB_LOOKUP(hammer2_inode_tree, &pmp->inum_tree, inum);
243                 if (ip)
244                         hammer2_inode_ref(ip);
245                 hammer2_spin_unex(&pmp->inum_spin);
246         }
247         return(ip);
248 }
249
/*
 * Adding a ref to an inode is only legal if the inode already has at least
 * one ref; there is no 0->1 transition here (initial refs are set up by
 * hammer2_inode_get()).
 *
 * (can be called with spinlock held)
 */
void
hammer2_inode_ref(hammer2_inode_t *ip)
{
        atomic_add_int(&ip->refs, 1);
}
261
/*
 * Drop an inode reference, freeing the inode when the last reference goes
 * away.  May loop: freeing an inode disposes of the implied reference it
 * held on its parent (ip->pip), which is then dropped the same way.
 */
void
hammer2_inode_drop(hammer2_inode_t *ip)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *pip;
        u_int refs;

        while (ip) {
                /* snapshot refs; ccfence prevents the compiler re-reading */
                refs = ip->refs;
                cpu_ccfence();
                if (refs == 1) {
                        /*
                         * Transition to zero, must interlock with
                         * the inode inumber lookup tree (if applicable).
                         * It should not be possible for anyone to race
                         * the transition to 0.
                         *
                         */
                        pmp = ip->pmp;
                        KKASSERT(pmp);
                        hammer2_spin_ex(&pmp->inum_spin);

                        if (atomic_cmpset_int(&ip->refs, 1, 0)) {
                                /* we own the 1->0 transition */
                                KKASSERT(hammer2_mtx_refs(&ip->lock) == 0);
                                if (ip->flags & HAMMER2_INODE_ONRBTREE) {
                                        atomic_clear_int(&ip->flags,
                                                     HAMMER2_INODE_ONRBTREE);
                                        RB_REMOVE(hammer2_inode_tree,
                                                  &pmp->inum_tree, ip);
                                }
                                hammer2_spin_unex(&pmp->inum_spin);

                                pip = ip->pip;
                                ip->pip = NULL;
                                ip->pmp = NULL;

                                /*
                                 * Cleaning out ip->cluster isn't entirely
                                 * trivial.
                                 */
                                hammer2_inode_repoint(ip, NULL, NULL);

                                /*
                                 * We have to drop pip (if non-NULL) to
                                 * dispose of our implied reference from
                                 * ip->pip.  We can simply loop on it.
                                 */
                                kfree(ip, pmp->minode);
                                atomic_add_long(&pmp->inmem_inodes, -1);
                                ip = pip;
                                /* continue with pip (can be NULL) */
                        } else {
                                /* lost the 1->0 race; retry the loop */
                                hammer2_spin_unex(&ip->pmp->inum_spin);
                        }
                } else {
                        /*
                         * Non zero transition
                         */
                        if (atomic_cmpset_int(&ip->refs, refs, refs - 1))
                                break;
                        /* cmpset failed (concurrent ref change): retry */
                }
        }
}
329
330 /*
331  * Get the vnode associated with the given inode, allocating the vnode if
332  * necessary.  The vnode will be returned exclusively locked.
333  *
334  * The caller must lock the inode (shared or exclusive).
335  *
336  * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
337  * races.
338  */
339 struct vnode *
340 hammer2_igetv(hammer2_inode_t *ip, hammer2_cluster_t *cparent, int *errorp)
341 {
342         const hammer2_inode_data_t *ripdata;
343         hammer2_pfs_t *pmp;
344         struct vnode *vp;
345
346         pmp = ip->pmp;
347         KKASSERT(pmp != NULL);
348         *errorp = 0;
349
350         ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
351
352         for (;;) {
353                 /*
354                  * Attempt to reuse an existing vnode assignment.  It is
355                  * possible to race a reclaim so the vget() may fail.  The
356                  * inode must be unlocked during the vget() to avoid a
357                  * deadlock against a reclaim.
358                  */
359                 int wasexclusive;
360
361                 vp = ip->vp;
362                 if (vp) {
363                         /*
364                          * Inode must be unlocked during the vget() to avoid
365                          * possible deadlocks, but leave the ip ref intact.
366                          *
367                          * vnode is held to prevent destruction during the
368                          * vget().  The vget() can still fail if we lost
369                          * a reclaim race on the vnode.
370                          */
371                         hammer2_mtx_state_t ostate;
372
373                         vhold(vp);
374                         ostate = hammer2_inode_lock_temp_release(ip);
375                         if (vget(vp, LK_EXCLUSIVE)) {
376                                 vdrop(vp);
377                                 hammer2_inode_lock_temp_restore(ip, ostate);
378                                 continue;
379                         }
380                         hammer2_inode_lock_temp_restore(ip, ostate);
381                         vdrop(vp);
382                         /* vp still locked and ref from vget */
383                         if (ip->vp != vp) {
384                                 kprintf("hammer2: igetv race %p/%p\n",
385                                         ip->vp, vp);
386                                 vput(vp);
387                                 continue;
388                         }
389                         *errorp = 0;
390                         break;
391                 }
392
393                 /*
394                  * No vnode exists, allocate a new vnode.  Beware of
395                  * allocation races.  This function will return an
396                  * exclusively locked and referenced vnode.
397                  */
398                 *errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
399                 if (*errorp) {
400                         kprintf("hammer2: igetv getnewvnode failed %d\n",
401                                 *errorp);
402                         vp = NULL;
403                         break;
404                 }
405
406                 /*
407                  * Lock the inode and check for an allocation race.
408                  */
409                 wasexclusive = hammer2_inode_lock_upgrade(ip);
410                 if (ip->vp != NULL) {
411                         vp->v_type = VBAD;
412                         vx_put(vp);
413                         hammer2_inode_lock_downgrade(ip, wasexclusive);
414                         continue;
415                 }
416
417                 switch (ripdata->type) {
418                 case HAMMER2_OBJTYPE_DIRECTORY:
419                         vp->v_type = VDIR;
420                         break;
421                 case HAMMER2_OBJTYPE_REGFILE:
422                         vp->v_type = VREG;
423                         vinitvmio(vp, ripdata->size,
424                                   HAMMER2_LBUFSIZE,
425                                   (int)ripdata->size & HAMMER2_LBUFMASK);
426                         break;
427                 case HAMMER2_OBJTYPE_SOFTLINK:
428                         /*
429                          * XXX for now we are using the generic file_read
430                          * and file_write code so we need a buffer cache
431                          * association.
432                          */
433                         vp->v_type = VLNK;
434                         vinitvmio(vp, ripdata->size,
435                                   HAMMER2_LBUFSIZE,
436                                   (int)ripdata->size & HAMMER2_LBUFMASK);
437                         break;
438                 case HAMMER2_OBJTYPE_CDEV:
439                         vp->v_type = VCHR;
440                         /* fall through */
441                 case HAMMER2_OBJTYPE_BDEV:
442                         vp->v_ops = &pmp->mp->mnt_vn_spec_ops;
443                         if (ripdata->type != HAMMER2_OBJTYPE_CDEV)
444                                 vp->v_type = VBLK;
445                         addaliasu(vp, ripdata->rmajor, ripdata->rminor);
446                         break;
447                 case HAMMER2_OBJTYPE_FIFO:
448                         vp->v_type = VFIFO;
449                         vp->v_ops = &pmp->mp->mnt_vn_fifo_ops;
450                         break;
451                 default:
452                         panic("hammer2: unhandled objtype %d", ripdata->type);
453                         break;
454                 }
455
456                 if (ip == pmp->iroot)
457                         vsetflags(vp, VROOT);
458
459                 vp->v_data = ip;
460                 ip->vp = vp;
461                 hammer2_inode_ref(ip);          /* vp association */
462                 hammer2_inode_lock_downgrade(ip, wasexclusive);
463                 break;
464         }
465
466         /*
467          * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
468          */
469         if (hammer2_debug & 0x0002) {
470                 kprintf("igetv vp %p refs 0x%08x aux 0x%08x\n",
471                         vp, vp->v_refcnt, vp->v_auxrefs);
472         }
473         return (vp);
474 }
475
/*
 * Returns the inode associated with the passed-in cluster, creating the
 * inode if necessary and synchronizing it to the passed-in cluster otherwise.
 *
 * The passed-in cluster must be locked and will remain locked on return.
 * The returned inode will be locked and the caller may dispose of both
 * via hammer2_inode_unlock_ex().  However, if the caller needs to resolve
 * a hardlink it must ref/unlock/relock/drop the inode.
 *
 * The hammer2_inode structure regulates the interface between the high level
 * kernel VNOPS API and the filesystem backend (the chains).
 *
 * On return the inode is locked with the supplied cluster.
 */
hammer2_inode_t *
hammer2_inode_get(hammer2_pfs_t *pmp, hammer2_inode_t *dip,
                  hammer2_cluster_t *cluster)
{
        hammer2_inode_t *nip;
        const hammer2_inode_data_t *iptmp;
        const hammer2_inode_data_t *nipdata;

        KKASSERT(cluster == NULL ||
                 hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE);
        KKASSERT(pmp);

        /*
         * Interlocked lookup/ref of the inode.  This code is only needed
         * when looking up inodes with nlinks != 0 (TODO: optimize out
         * otherwise and test for duplicates).
         *
         * Cluster can be NULL during the initial pfs allocation.
         */
again:
        while (cluster) {
                iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
                nip = hammer2_inode_lookup(pmp, iptmp->inum);
                if (nip == NULL)
                        break;

                hammer2_mtx_ex(&nip->lock);

                /*
                 * Handle SMP race (not applicable to the super-root spmp
                 * which can't index inodes due to duplicative inode numbers).
                 *
                 * ONRBTREE cleared means hammer2_inode_drop() is tearing
                 * the inode down concurrently; drop our ref and retry.
                 */
                if (pmp->spmp_hmp == NULL &&
                    (nip->flags & HAMMER2_INODE_ONRBTREE) == 0) {
                        hammer2_mtx_unlock(&nip->lock);
                        hammer2_inode_drop(nip);
                        continue;
                }
                hammer2_inode_repoint(nip, NULL, cluster);

                return nip;
        }

        /*
         * We couldn't find the inode number, create a new inode.
         */
        nip = kmalloc(sizeof(*nip), pmp->minode, M_WAITOK | M_ZERO);
        spin_init(&nip->cluster_spin, "h2clspin");
        atomic_add_long(&pmp->inmem_inodes, 1);
        hammer2_pfs_memory_inc(pmp);
        hammer2_pfs_memory_wakeup(pmp);
        if (pmp->spmp_hmp)
                nip->flags = HAMMER2_INODE_SROOT;

        /*
         * Initialize nip's cluster.  A cluster is provided for normal
         * inodes but typically not for the super-root or PFS inodes.
         */
        nip->cluster.refs = 1;
        nip->cluster.pmp = pmp;
        nip->cluster.flags |= HAMMER2_CLUSTER_INODE;
        if (cluster) {
                /* mirror inum/size/mtime from the media inode data */
                nipdata = &hammer2_cluster_rdata(cluster)->ipdata;
                nip->inum = nipdata->inum;
                nip->size = nipdata->size;
                nip->mtime = nipdata->mtime;
                hammer2_inode_repoint(nip, NULL, cluster);
        } else {
                nip->inum = 1;                  /* PFS inum is always 1 XXX */
                /* mtime will be updated when a cluster is available */
        }

        nip->pip = dip;                         /* can be NULL */
        if (dip)
                hammer2_inode_ref(dip); /* ref dip for nip->pip */

        nip->pmp = pmp;

        /*
         * ref and lock on nip gives it state compatible to after a
         * hammer2_inode_lock() call.
         */
        nip->refs = 1;
        hammer2_mtx_init(&nip->lock, "h2inode");
        hammer2_mtx_ex(&nip->lock);
        /* combination of thread lock and chain lock == inode lock */

        /*
         * Attempt to add the inode.  If it fails we raced another inode
         * get.  Undo all the work and try again.
         */
        if (pmp->spmp_hmp == NULL) {
                hammer2_spin_ex(&pmp->inum_spin);
                if (RB_INSERT(hammer2_inode_tree, &pmp->inum_tree, nip)) {
                        /* duplicate inum already indexed: discard and retry */
                        hammer2_spin_unex(&pmp->inum_spin);
                        hammer2_mtx_unlock(&nip->lock);
                        hammer2_inode_drop(nip);
                        goto again;
                }
                atomic_set_int(&nip->flags, HAMMER2_INODE_ONRBTREE);
                hammer2_spin_unex(&pmp->inum_spin);
        }

        return (nip);
}
595
/*
 * Create a new inode in the specified directory using the vattr to
 * figure out the type of inode.
 *
 * If no error occurs the new inode is returned with its cluster locked
 * in *clusterp, otherwise an error is returned via *errorp and NULL is
 * returned.
 *
 * If vap and/or cred are NULL the related fields are not set and the
 * inode type defaults to a directory.  This is used when creating PFSs
 * under the super-root, so the inode number is set to 1 in this case.
 *
 * dip is not locked on entry.
 *
 * NOTE: When used to create a snapshot, the inode is temporarily associated
 *       with the super-root spmp. XXX should pass new pmp for snapshot.
 */
hammer2_inode_t *
hammer2_inode_create(hammer2_trans_t *trans, hammer2_inode_t *dip,
                     struct vattr *vap, struct ucred *cred,
                     const uint8_t *name, size_t name_len,
                     hammer2_cluster_t **clusterp,
                     int flags, int *errorp)
{
        const hammer2_inode_data_t *dipdata;
        hammer2_inode_data_t *nipdata;
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *cparent;
        hammer2_inode_t *nip;
        hammer2_key_t key_dummy;
        hammer2_key_t lhc;
        int error;
        uid_t xuid;
        uuid_t dip_uid;
        uuid_t dip_gid;
        uint32_t dip_mode;
        uint8_t dip_comp_algo;
        uint8_t dip_check_algo;

        lhc = hammer2_dirhash(name, name_len);
        *errorp = 0;

        /*
         * Locate the inode or indirect block to create the new
         * entry in.  At the same time check for key collisions
         * and iterate until we don't get one.
         *
         * NOTE: hidden inodes do not have iterators.
         */
retry:
        cparent = hammer2_inode_lock(dip, HAMMER2_RESOLVE_ALWAYS);
        dipdata = &hammer2_cluster_rdata(cparent)->ipdata;
        /* snapshot parent attributes while the parent lock is held */
        dip_uid = dipdata->uid;
        dip_gid = dipdata->gid;
        dip_mode = dipdata->mode;
        dip_comp_algo = dipdata->comp_algo;
        dip_check_algo = dipdata->check_algo;

        /*
         * Iterate lhc past dirhash key collisions.  Running out of
         * iteration space within the hash's low-bits window is ENOSPC.
         */
        error = 0;
        while (error == 0) {
                cluster = hammer2_cluster_lookup(cparent, &key_dummy,
                                                 lhc, lhc, 0);
                if (cluster == NULL)
                        break;
                if ((lhc & HAMMER2_DIRHASH_VISIBLE) == 0)
                        error = ENOSPC;
                if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
                        error = ENOSPC;
                hammer2_cluster_unlock(cluster);
                hammer2_cluster_drop(cluster);
                cluster = NULL;
                ++lhc;          /* collision: try the next key */
        }

        if (error == 0) {
                error = hammer2_cluster_create(trans, cparent, &cluster,
                                             lhc, 0,
                                             HAMMER2_BREF_TYPE_INODE,
                                             HAMMER2_INODE_BYTES,
                                             flags);
        }
#if INODE_DEBUG
        kprintf("CREATE INODE %*.*s chain=%p\n",
                (int)name_len, (int)name_len, name,
                (cluster ? cluster->focus : NULL));
#endif

        /*
         * Cleanup and handle retries.
         */
        if (error == EAGAIN) {
                /* hold an extra ref so cparent survives the unlock/wait */
                hammer2_cluster_ref(cparent);
                hammer2_inode_unlock(dip, cparent);
                hammer2_cluster_wait(cparent);
                hammer2_cluster_drop(cparent);
                goto retry;
        }
        hammer2_inode_unlock(dip, cparent);
        cparent = NULL;

        if (error) {
                KKASSERT(cluster == NULL);
                *errorp = error;
                return (NULL);
        }

        /*
         * Set up the new inode.
         *
         * NOTE: *_get() integrates chain's lock into the inode lock.
         *
         * NOTE: Only one new inode can currently be created per
         *       transaction.  If the need arises we can adjust
         *       hammer2_trans_init() to allow more.
         *
         * NOTE: nipdata will have chain's blockset data.
         */
        KKASSERT(cluster->focus->flags & HAMMER2_CHAIN_MODIFIED);
        nipdata = &hammer2_cluster_wdata(cluster)->ipdata;
        nipdata->inum = trans->inode_tid;
        hammer2_cluster_modsync(cluster);
        nip = hammer2_inode_get(dip->pmp, dip, cluster);
        nipdata = &hammer2_cluster_wdata(cluster)->ipdata;

        if (vap) {
                KKASSERT(trans->inodes_created == 0);
                nipdata->type = hammer2_get_obj_type(vap->va_type);
                nipdata->inum = trans->inode_tid;
                ++trans->inodes_created;

                switch (nipdata->type) {
                case HAMMER2_OBJTYPE_CDEV:
                case HAMMER2_OBJTYPE_BDEV:
                        nipdata->rmajor = vap->va_rmajor;
                        nipdata->rminor = vap->va_rminor;
                        break;
                default:
                        break;
                }
        } else {
                /* no vattr: PFS/super-root style directory, inum 1 */
                nipdata->type = HAMMER2_OBJTYPE_DIRECTORY;
                nipdata->inum = 1;
        }

        /* Inherit parent's inode compression mode. */
        nip->comp_heuristic = 0;
        nipdata->comp_algo = dip_comp_algo;
        nipdata->check_algo = dip_check_algo;
        nipdata->version = HAMMER2_INODE_VERSION_ONE;
        hammer2_update_time(&nipdata->ctime);
        nipdata->mtime = nipdata->ctime;
        if (vap)
                nipdata->mode = vap->va_mode;
        nipdata->nlinks = 1;
        if (vap) {
                /*
                 * uid/gid selection: explicit UUID in vattr wins, then an
                 * explicit numeric id, then the computed/inherited default.
                 */
                if (dip && dip->pmp) {
                        xuid = hammer2_to_unix_xid(&dip_uid);
                        xuid = vop_helper_create_uid(dip->pmp->mp,
                                                     dip_mode,
                                                     xuid,
                                                     cred,
                                                     &vap->va_mode);
                } else {
                        /* super-root has no dip and/or pmp */
                        xuid = 0;
                }
                if (vap->va_vaflags & VA_UID_UUID_VALID)
                        nipdata->uid = vap->va_uid_uuid;
                else if (vap->va_uid != (uid_t)VNOVAL)
                        hammer2_guid_to_uuid(&nipdata->uid, vap->va_uid);
                else
                        hammer2_guid_to_uuid(&nipdata->uid, xuid);

                if (vap->va_vaflags & VA_GID_UUID_VALID)
                        nipdata->gid = vap->va_gid_uuid;
                else if (vap->va_gid != (gid_t)VNOVAL)
                        hammer2_guid_to_uuid(&nipdata->gid, vap->va_gid);
                else if (dip)
                        nipdata->gid = dip_gid;
        }

        /*
         * Regular files and softlinks allow a small amount of data to be
         * directly embedded in the inode.  This flag will be cleared if
         * the size is extended past the embedded limit.
         */
        if (nipdata->type == HAMMER2_OBJTYPE_REGFILE ||
            nipdata->type == HAMMER2_OBJTYPE_SOFTLINK) {
                nipdata->op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
        }

        KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
        bcopy(name, nipdata->filename, name_len);
        nipdata->name_key = lhc;
        nipdata->name_len = name_len;
        hammer2_cluster_modsync(cluster);
        *clusterp = cluster;

        return (nip);
}
795
796 /*
797  * The cluster has been removed from the original directory and replaced
798  * with a hardlink pointer.  Move the cluster to the specified parent
799  * directory, change the filename to "0xINODENUMBER", and adjust the key.
800  * The cluster becomes our invisible hardlink target.
801  *
802  * The original cluster must be deleted on entry.
803  */
static
void
hammer2_hardlink_shiftup(hammer2_trans_t *trans, hammer2_cluster_t *cluster,
			hammer2_inode_t *dip, hammer2_cluster_t *dcluster,
			int nlinks, int *errorp)
{
	const hammer2_inode_data_t *iptmp;
	hammer2_inode_data_t *nipdata;
	hammer2_cluster_t *xcluster;
	hammer2_key_t key_dummy;
	hammer2_key_t lhc;
	hammer2_blockref_t bref;

	/*
	 * The invisible hardlink target is keyed under the new parent by
	 * its inode number, which must not have the VISIBLE bit set.
	 */
	iptmp = &hammer2_cluster_rdata(cluster)->ipdata;
	lhc = iptmp->inum;
	KKASSERT((lhc & HAMMER2_DIRHASH_VISIBLE) == 0);

	/*
	 * Locate the inode or indirect block to create the new
	 * entry in.  lhc represents the inode number so there is
	 * no collision iteration.
	 *
	 * There should be no key collisions with invisible inode keys.
	 *
	 * WARNING! Must use inode_lock_ex() on dip to handle a stale
	 *          dip->cluster cache.
	 */
	*errorp = 0;
	xcluster = hammer2_cluster_lookup(dcluster, &key_dummy,
				      lhc, lhc, 0);
	if (xcluster) {
		/*
		 * Unexpected collision on the inode-number key.  Report
		 * diagnostics, release the colliding cluster, and flag
		 * ENOSPC (handled as a panic below).
		 */
		kprintf("X3 chain %p dip %p dchain %p dip->chain %p\n",
			xcluster->focus, dip, dcluster->focus,
			dip->cluster.focus);
		hammer2_cluster_unlock(xcluster);
		hammer2_cluster_drop(xcluster);
		xcluster = NULL;
		*errorp = ENOSPC;
#if 0
		Debugger("X3");
#endif
	}

	/*
	 * Handle the error case
	 */
	if (*errorp) {
		panic("error2");
		KKASSERT(xcluster == NULL);
		return;
	}

	/*
	 * Use xcluster as a placeholder for (lhc).  Duplicate cluster to the
	 * same target bref as xcluster and then delete xcluster.  The
	 * duplication occurs after xcluster in flush order even though
	 * xcluster is deleted after the duplication. XXX
	 *
	 * WARNING! Duplications (to a different parent) can cause indirect
	 *          blocks to be inserted, refactor xcluster.
	 *
	 * WARNING! Only key and keybits is extracted from a passed-in bref.
	 */
	hammer2_cluster_bref(cluster, &bref);
	bref.key = lhc;			/* invisible dir entry key */
	bref.keybits = 0;
	hammer2_cluster_rename(trans, &bref, dcluster, cluster, 0);

	/*
	 * cluster is now 'live' again.. adjust the filename.
	 *
	 * Directory entries are inodes but this is a hidden hardlink
	 * target.  The name isn't used but to ease debugging give it
	 * a name after its inode number.
	 */
	hammer2_cluster_modify(trans, cluster, 0);
	nipdata = &hammer2_cluster_wdata(cluster)->ipdata;
	ksnprintf(nipdata->filename, sizeof(nipdata->filename),
		  "0x%016jx", (intmax_t)nipdata->inum);
	nipdata->name_len = strlen(nipdata->filename);
	nipdata->name_key = lhc;
	nipdata->nlinks += nlinks;
	hammer2_cluster_modsync(cluster);
}
888
889 /*
890  * Connect the target inode represented by (cluster) to the media topology
 * at (dip, name, len).  The caller can pass a rough *clusterp, this function
892  * will issue lookup()s to position the parent chain properly for the
893  * chain insertion.
894  *
895  * If hlink is TRUE this function creates an OBJTYPE_HARDLINK directory
896  * entry instead of connecting (cluster).
897  *
898  * If hlink is FALSE this function expects (cluster) to be unparented.
899  */
int
hammer2_inode_connect(hammer2_trans_t *trans,
		      hammer2_cluster_t **clusterp, int hlink,
		      hammer2_inode_t *dip, hammer2_cluster_t *dcluster,
		      const uint8_t *name, size_t name_len,
		      hammer2_key_t lhc)
{
	hammer2_inode_data_t *wipdata;
	hammer2_cluster_t *ocluster;
	hammer2_cluster_t *ncluster;
	hammer2_key_t key_dummy;
	int error;

	/*
	 * Since ocluster is either disconnected from the topology or
	 * represents a hardlink terminus which is always a parent of or
	 * equal to dip, we should be able to safely lock dip->chain for
	 * our setup.
	 *
	 * WARNING! Must use inode_lock_ex() on dip to handle a stale
	 *          dip->cluster.
	 *
	 * If name is non-NULL we calculate lhc, else we use the passed-in
	 * lhc.
	 */
	ocluster = *clusterp;

	if (name) {
		lhc = hammer2_dirhash(name, name_len);

		/*
		 * Locate the inode or indirect block to create the new
		 * entry in.  At the same time check for key collisions
		 * and iterate until we don't get one.
		 */
		error = 0;
		while (error == 0) {
			ncluster = hammer2_cluster_lookup(dcluster, &key_dummy,
						      lhc, lhc, 0);
			if (ncluster == NULL)
				break;
			/*
			 * ENOSPC only when the iteration space within the
			 * dirhash low-mask is exhausted.
			 */
			if ((lhc & HAMMER2_DIRHASH_LOMASK) ==
			    HAMMER2_DIRHASH_LOMASK) {
				error = ENOSPC;
			}
			hammer2_cluster_unlock(ncluster);
			hammer2_cluster_drop(ncluster);
			ncluster = NULL;
			++lhc;
		}
	} else {
		/*
		 * Reconnect to specific key (used when moving
		 * unlinked-but-open files into the hidden directory).
		 */
		ncluster = hammer2_cluster_lookup(dcluster, &key_dummy,
						  lhc, lhc, 0);
		KKASSERT(ncluster == NULL);
		error = 0;
	}

	if (error == 0) {
		if (hlink) {
			/*
			 * Hardlink pointer needed, create totally fresh
			 * directory entry.
			 *
			 * We must refactor ocluster because it might have
			 * been shifted into an indirect cluster by the
			 * create.
			 */
			KKASSERT(ncluster == NULL);
			error = hammer2_cluster_create(trans,
						       dcluster, &ncluster,
						       lhc, 0,
						       HAMMER2_BREF_TYPE_INODE,
						       HAMMER2_INODE_BYTES,
						       0);
		} else {
			/*
			 * Reconnect the original cluster under the new name.
			 * Original cluster must have already been deleted by
			 * the caller.
			 *
			 * WARNING! Can cause held-over clusters to require a
			 *          refactor.  Fortunately we have none (our
			 *          locked clusters are passed into and
			 *          modified by the call).
			 */
			ncluster = ocluster;
			ocluster = NULL;
			error = hammer2_cluster_create(trans,
						       dcluster, &ncluster,
						       lhc, 0,
						       HAMMER2_BREF_TYPE_INODE,
						       HAMMER2_INODE_BYTES,
						       0);
		}
	}

	/*
	 * Unlock stuff.
	 */
	KKASSERT(error != EAGAIN);

	/*
	 * ncluster should be NULL on error, leave ocluster
	 * (ocluster == *clusterp) alone.
	 */
	if (error) {
		KKASSERT(ncluster == NULL);
		return (error);
	}

	/*
	 * Directory entries are inodes so if the name has changed we have
	 * to update the inode.
	 *
	 * When creating an OBJTYPE_HARDLINK entry remember to unlock the
	 * cluster, the caller will access the hardlink via the actual hardlink
	 * target file and not the hardlink pointer entry, so we must still
	 * return ocluster.
	 */
	if (hlink && hammer2_hardlink_enable >= 0) {
		/*
		 * Create the HARDLINK pointer.  oip represents the hardlink
		 * target in this situation.
		 *
		 * We will return ocluster (the hardlink target).
		 */
		hammer2_cluster_modify(trans, ncluster, 0);
		KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
		wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
		bcopy(name, wipdata->filename, name_len);
		wipdata->name_key = lhc;
		wipdata->name_len = name_len;
		wipdata->target_type =
				hammer2_cluster_rdata(ocluster)->ipdata.type;
		wipdata->type = HAMMER2_OBJTYPE_HARDLINK;
		wipdata->inum = hammer2_cluster_rdata(ocluster)->ipdata.inum;
		wipdata->version = HAMMER2_INODE_VERSION_ONE;
		wipdata->nlinks = 1;
		wipdata->op_flags = HAMMER2_OPFLAG_DIRECTDATA;
		hammer2_cluster_modsync(ncluster);
		hammer2_cluster_unlock(ncluster);
		hammer2_cluster_drop(ncluster);
		ncluster = ocluster;
		ocluster = NULL;
	} else {
		/*
		 * ncluster is a duplicate of ocluster at the new location.
		 * We must fixup the name stored in the inode data.
		 * The bref key has already been adjusted by inode_connect().
		 */
		hammer2_cluster_modify(trans, ncluster, 0);
		wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;

		KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
		bcopy(name, wipdata->filename, name_len);
		wipdata->name_key = lhc;
		wipdata->name_len = name_len;
		wipdata->nlinks = 1;
		hammer2_cluster_modsync(ncluster);
	}

	/*
	 * We are replacing ocluster with ncluster, unlock ocluster.  In the
	 * case where ocluster is left unchanged the code above sets
	 * ncluster to ocluster and ocluster to NULL, resulting in a NOP here.
	 */
	if (ocluster) {
		hammer2_cluster_unlock(ocluster);
		hammer2_cluster_drop(ocluster);
	}
	*clusterp = ncluster;

	return (0);
}
1078
1079 /*
1080  * Repoint ip->cluster's chains to cluster's chains and fixup the default
1081  * focus.  Only valid elements are repointed.  Invalid elements have to be
1082  * adjusted by the appropriate slave sync threads.
1083  *
1084  * Caller must hold the inode and cluster exclusive locked, if not NULL,
1085  * must also be locked.
1086  *
1087  * Cluster may be NULL to clean out any chains in ip->cluster.
1088  */
void
hammer2_inode_repoint(hammer2_inode_t *ip, hammer2_inode_t *pip,
		      hammer2_cluster_t *cluster)
{
	hammer2_chain_t *dropch[HAMMER2_MAXCLUSTER];
	hammer2_chain_t *ochain;
	hammer2_chain_t *nchain;
	hammer2_inode_t *opip;
	int i;

	/*
	 * Chains displaced from ip->cluster are collected in dropch[] and
	 * dropped only after the spinlock is released.
	 */
	bzero(dropch, sizeof(dropch));

	/*
	 * Replace chains in ip->cluster with chains from cluster and
	 * adjust the focus if necessary.
	 *
	 * NOTE: nchain and/or ochain can be NULL due to gaps
	 *       in the cluster arrays.
	 */
	hammer2_spin_ex(&ip->cluster_spin);
	for (i = 0; cluster && i < cluster->nchains; ++i) {
		/*
		 * Do not replace invalid elements as this might race
		 * syncthr replacements.
		 */
		if (cluster->array[i].flags & HAMMER2_CITEM_INVALID)
			continue;

		/*
		 * Do not replace elements which are the same.  Also handle
		 * element count discrepancies.
		 */
		nchain = cluster->array[i].chain;
		if (i < ip->cluster.nchains) {
			ochain = ip->cluster.array[i].chain;
			if (ochain == nchain)
				continue;
		} else {
			ochain = NULL;
		}

		/*
		 * Make adjustments.  The new element inherits the source
		 * element's INVALID flag state.
		 */
		ip->cluster.array[i].chain = nchain;
		ip->cluster.array[i].flags &= ~HAMMER2_CITEM_INVALID;
		ip->cluster.array[i].flags |= cluster->array[i].flags &
					      HAMMER2_CITEM_INVALID;
		if (nchain)
			hammer2_chain_ref(nchain);
		dropch[i] = ochain;
	}

	/*
	 * Release any left-over chains in ip->cluster.
	 */
	while (i < ip->cluster.nchains) {
		nchain = ip->cluster.array[i].chain;
		if (nchain) {
			ip->cluster.array[i].chain = NULL;
			ip->cluster.array[i].flags |= HAMMER2_CITEM_INVALID;
		}
		dropch[i] = nchain;
		++i;
	}

	/*
	 * Fixup fields.  Note that the inode-embedded cluster is never
	 * directly locked.
	 */
	if (cluster) {
		ip->cluster.nchains = cluster->nchains;
		ip->cluster.focus = cluster->focus;
		ip->cluster.flags = cluster->flags & ~HAMMER2_CLUSTER_LOCKED;
	} else {
		ip->cluster.nchains = 0;
		ip->cluster.focus = NULL;
		ip->cluster.flags &= ~HAMMER2_CLUSTER_ZFLAGS;
	}

	/*
	 * Repoint ip->pip if requested (non-NULL pip).
	 */
	if (pip && ip->pip != pip) {
		opip = ip->pip;
		hammer2_inode_ref(pip);
		ip->pip = pip;
	} else {
		opip = NULL;
	}
	hammer2_spin_unex(&ip->cluster_spin);

	/*
	 * Cleanup outside of spinlock
	 */
	while (--i >= 0) {
		if (dropch[i])
			hammer2_chain_drop(dropch[i]);
	}
	if (opip)
		hammer2_inode_drop(opip);
}
1191
1192 /*
1193  * Repoint a single element from the cluster to the ip.  Used by the
1194  * synchronization threads to piecemeal update inodes.  Does not change
1195  * focus and requires inode to be re-locked to clean-up flags (XXX).
1196  */
1197 void
1198 hammer2_inode_repoint_one(hammer2_inode_t *ip, hammer2_cluster_t *cluster,
1199                           int idx)
1200 {
1201         hammer2_chain_t *ochain;
1202         hammer2_chain_t *nchain;
1203         int i;
1204
1205         hammer2_spin_ex(&ip->cluster_spin);
1206         KKASSERT(idx < cluster->nchains);
1207         if (idx < ip->cluster.nchains) {
1208                 ochain = ip->cluster.array[idx].chain;
1209                 nchain = cluster->array[idx].chain;
1210         } else {
1211                 ochain = NULL;
1212                 nchain = cluster->array[idx].chain;
1213                 ip->cluster.nchains = idx + 1;
1214                 for (i = ip->cluster.nchains; i <= idx; ++i) {
1215                         bzero(&ip->cluster.array[i],
1216                               sizeof(ip->cluster.array[i]));
1217                         ip->cluster.array[i].flags |= HAMMER2_CITEM_INVALID;
1218                 }
1219         }
1220         if (ochain != nchain) {
1221                 /*
1222                  * Make adjustments.
1223                  */
1224                 ip->cluster.array[idx].chain = nchain;
1225                 ip->cluster.array[idx].flags &= ~HAMMER2_CITEM_INVALID;
1226                 ip->cluster.array[idx].flags |= cluster->array[idx].flags &
1227                                                 HAMMER2_CITEM_INVALID;
1228         }
1229         hammer2_spin_unex(&ip->cluster_spin);
1230         if (ochain != nchain) {
1231                 if (nchain)
1232                         hammer2_chain_ref(nchain);
1233                 if (ochain)
1234                         hammer2_chain_drop(ochain);
1235         }
1236 }
1237
1238 /*
1239  * Unlink the file from the specified directory inode.  The directory inode
1240  * does not need to be locked.
1241  *
1242  * isdir determines whether a directory/non-directory check should be made.
1243  * No check is made if isdir is set to -1.
1244  *
1245  * isopen specifies whether special unlink-with-open-descriptor handling
1246  * must be performed.  If set to -1 the caller is deleting a PFS and we
1247  * check whether the chain is mounted or not (chain->pmp != NULL).  1 is
1248  * implied if it is mounted.
1249  *
1250  * If isopen is 1 and nlinks drops to 0 this function must move the chain
1251  * to a special hidden directory until last-close occurs on the file.
1252  *
1253  * NOTE!  The underlying file can still be active with open descriptors
1254  *        or if the chain is being manually held (e.g. for rename).
1255  *
1256  *        The caller is responsible for fixing up ip->chain if e.g. a
1257  *        rename occurs (see chain_duplicate()).
1258  *
1259  * NOTE!  The chain is not deleted if it is moved to the hidden directory,
1260  *        but otherwise will be deleted.
1261  */
int
hammer2_unlink_file(hammer2_trans_t *trans, hammer2_inode_t *dip,
		    const uint8_t *name, size_t name_len,
		    int isdir, int *hlinkp, struct nchandle *nch,
		    int nlinks)
{
	const hammer2_inode_data_t *ripdata;
	hammer2_inode_data_t *wipdata;
	hammer2_cluster_t *cparent;
	hammer2_cluster_t *hcluster;
	hammer2_cluster_t *hparent;
	hammer2_cluster_t *cluster;
	hammer2_cluster_t *dparent;
	hammer2_cluster_t *dcluster;
	hammer2_key_t key_dummy;
	hammer2_key_t key_next;
	hammer2_key_t lhc;
	int last_link;
	int error;
	int hlink;
	uint8_t type;

	error = 0;
	hlink = 0;
	hcluster = NULL;
	hparent = NULL;
	lhc = hammer2_dirhash(name, name_len);

again:
	/*
	 * Search for the filename in the directory.  The dirhash only
	 * determines the upper bits of the key, so iterate the low-mask
	 * range and compare names exactly.
	 */
	cparent = hammer2_inode_lock(dip, HAMMER2_RESOLVE_ALWAYS);
	cluster = hammer2_cluster_lookup(cparent, &key_next,
				     lhc, lhc + HAMMER2_DIRHASH_LOMASK, 0);
	while (cluster) {
		if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
			ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
			if (ripdata->name_len == name_len &&
			    bcmp(ripdata->filename, name, name_len) == 0) {
				break;
			}
		}
		cluster = hammer2_cluster_next(cparent, cluster, &key_next,
					       key_next,
					       lhc + HAMMER2_DIRHASH_LOMASK,
					       0);
	}
	hammer2_inode_unlock(dip, NULL);	/* retain cparent */

	/*
	 * Not found or wrong type (isdir < 0 disables the type check).
	 * If a hardlink pointer, type checks use the hardlink target.
	 */
	if (cluster == NULL) {
		error = ENOENT;
		goto done;
	}
	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
	type = ripdata->type;
	if (type == HAMMER2_OBJTYPE_HARDLINK) {
		hlink = 1;
		type = ripdata->target_type;
	}

	if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 0) {
		error = ENOTDIR;
		goto done;
	}
	if (type != HAMMER2_OBJTYPE_DIRECTORY && isdir >= 1) {
		error = EISDIR;
		goto done;
	}

	/*
	 * Hardlink must be resolved.  We can't hold the parent locked
	 * while we do this or we could deadlock.  The physical file will
	 * be located at or above the current directory.
	 *
	 * We loop to reacquire the hardlink origination.
	 *
	 * NOTE: hammer2_hardlink_find() will locate the hardlink target,
	 *       returning a modified hparent and hcluster.
	 */
	if (ripdata->type == HAMMER2_OBJTYPE_HARDLINK) {
		if (hcluster == NULL) {
			hcluster = cluster;
			cluster = NULL;	/* safety */
			hammer2_cluster_unlock(cparent);
			hammer2_cluster_drop(cparent);
			cparent = NULL; /* safety */
			ripdata = NULL;	/* safety (associated w/cparent) */
			error = hammer2_hardlink_find(dip, &hparent, &hcluster);

			/*
			 * If we couldn't find the hardlink target then some
			 * parent directory containing the hardlink pointer
			 * probably got renamed to above the original target,
			 * a case not yet handled by H2.
			 */
			if (error) {
				kprintf("H2 unlink_file: hardlink target for "
					"\"%s\" not found\n",
					name);
				kprintf("(likely due to known directory "
					"rename bug)\n");
				goto done;
			}
			goto again;
		}
	}

	/*
	 * If this is a directory the directory must be empty.  However, if
	 * isdir < 0 we are doing a rename and the directory does not have
	 * to be empty, and if isdir > 1 we are deleting a PFS/snapshot
	 * and the directory does not have to be empty.
	 *
	 * NOTE: We check the full key range here which covers both visible
	 *       and invisible entries.  Theoretically there should be no
	 *       invisible (hardlink target) entries if there are no visible
	 *       entries.
	 */
	if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 1) {
		dparent = hammer2_cluster_lookup_init(cluster, 0);
		dcluster = hammer2_cluster_lookup(dparent, &key_dummy,
						  0, (hammer2_key_t)-1,
						  HAMMER2_LOOKUP_NODATA);
		if (dcluster) {
			/* any entry at all means the directory is not empty */
			hammer2_cluster_unlock(dcluster);
			hammer2_cluster_drop(dcluster);
			hammer2_cluster_lookup_done(dparent);
			error = ENOTEMPTY;
			goto done;
		}
		hammer2_cluster_lookup_done(dparent);
		dparent = NULL;
		/* dcluster NULL */
	}

	/*
	 * If this was a hardlink then (cparent, cluster) is the hardlink
	 * pointer, which we can simply destroy outright.  Discard the
	 * clusters and replace with the hardlink target.
	 */
	if (hcluster) {
		hammer2_cluster_delete(trans, cparent, cluster,
				       HAMMER2_DELETE_PERMANENT);
		hammer2_cluster_unlock(cparent);
		hammer2_cluster_drop(cparent);
		hammer2_cluster_unlock(cluster);
		hammer2_cluster_drop(cluster);
		cparent = hparent;
		cluster = hcluster;
		hparent = NULL;
		hcluster = NULL;
	}

	/*
	 * This leaves us with the hardlink target or non-hardlinked file
	 * or directory in (cparent, cluster).
	 *
	 * Delete the target when nlinks reaches 0 with special handling
	 * to avoid I/O (to avoid actually updating the inode) for the 1->0
	 * transition, if possible.  This optimization makes rm -rf very
	 * fast.
	 *
	 * NOTE! In DragonFly the vnops function calls cache_unlink() after
	 *       calling us here to clean out the namecache association,
	 *       (which does not represent a ref for the open-test), and to
	 *       force finalization of the vnode if/when the last ref gets
	 *       dropped.
	 *
	 * NOTE! Files are unlinked by rename and then relinked.  nch will be
	 *       passed as NULL in this situation.  hammer2_inode_connect()
	 *       will bump nlinks.
	 */
	KKASSERT(cluster != NULL);

	/*
	 * Note: nlinks is negative when decrementing, positive when
	 *       incrementing.
	 */
	ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
	last_link = (ripdata->nlinks + nlinks == 0);

	if (last_link) {
		/*
		 * Target nlinks has reached 0, file now unlinked (but may
		 * still be open).
		 *
		 * nlinks will be -1 for a normal remove().  If this is the
		 * last link we must flag the inode on deactivation. XXX race ?
		 */
		hammer2_inode_t *ip;

		if (nlinks == -1) {
			ip = hammer2_inode_lookup(trans->pmp, ripdata->inum);
			if (ip) {
				atomic_set_int(&ip->flags,
					       HAMMER2_INODE_ISUNLINKED);
				hammer2_inode_drop(ip);
			}
		}

		if (nch && cache_isopen(nch)) {
			/*
			 * If an unlinked file is still open we must update
			 * the inodes link count.
			 */
			hammer2_cluster_modify(trans, cluster, 0);
			wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
			ripdata = wipdata;
			wipdata->nlinks += nlinks;
			/* XXX debugging */
			if ((int64_t)wipdata->nlinks < 0) {
				wipdata->nlinks = 0;
			}
			hammer2_cluster_modsync(cluster);
			hammer2_inode_move_to_hidden(trans, &cparent, &cluster,
						     wipdata->inum);
		} else {
			/*
			 * This won't get everything if a vnode is still
			 * present, but the cache_unlink() call the caller
			 * makes will.
			 */
			hammer2_cluster_delete(trans, cparent, cluster,
					       HAMMER2_DELETE_PERMANENT);
		}
	} else if (hlink == 0) {
		/*
		 * In this situation a normal non-hardlinked file (which can
		 * only have nlinks == 1) still has a non-zero nlinks, the
		 * caller must be doing a RENAME operation and so is passing
		 * a nlinks adjustment of 0, and only wishes to remove file
		 * in order to be able to reconnect it under a different name.
		 *
		 * In this situation we do a temporary deletion of the
		 * chain in order to allow the file to be reconnected in
		 * a different location.
		 */
		KKASSERT(nlinks == 0);
		hammer2_cluster_delete(trans, cparent, cluster, 0);
	} else {
		/*
		 * Links remain, must update the inode link count.
		 */
		hammer2_cluster_modify(trans, cluster, 0);
		wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
		ripdata = wipdata;
		wipdata->nlinks += nlinks;
		if ((int64_t)wipdata->nlinks < 0) {	/* XXX debugging */
			wipdata->nlinks = 0;
		}
		hammer2_cluster_modsync(cluster);
	}

	error = 0;
done:
	/* unlock/drop everything still held, in acquisition order */
	if (cparent) {
		hammer2_cluster_unlock(cparent);
		hammer2_cluster_drop(cparent);
	}
	if (cluster) {
		hammer2_cluster_unlock(cluster);
		hammer2_cluster_drop(cluster);
	}
	if (hparent) {
		hammer2_cluster_unlock(hparent);
		hammer2_cluster_drop(hparent);
	}
	if (hcluster) {
		hammer2_cluster_unlock(hcluster);
		hammer2_cluster_drop(hcluster);
	}
	if (hlinkp)
		*hlinkp = hlink;

	return error;
}
1543
1544 /*
1545  * This is called from the mount code to initialize pmp->ihidden
1546  */
1547 void
1548 hammer2_inode_install_hidden(hammer2_pfs_t *pmp)
1549 {
1550         hammer2_trans_t trans;
1551         hammer2_cluster_t *cparent;
1552         hammer2_cluster_t *cluster;
1553         hammer2_cluster_t *scan;
1554         const hammer2_inode_data_t *ripdata;
1555         hammer2_inode_data_t *wipdata;
1556         hammer2_key_t key_dummy;
1557         hammer2_key_t key_next;
1558         int error;
1559         int count;
1560         int dip_check_algo;
1561         int dip_comp_algo;
1562
1563         if (pmp->ihidden)
1564                 return;
1565
1566         /*
1567          * Find the hidden directory
1568          */
1569         bzero(&key_dummy, sizeof(key_dummy));
1570         hammer2_trans_init(&trans, pmp, 0);
1571
1572         /*
1573          * Setup for lookup, retrieve iroot's check and compression
1574          * algorithm request which was likely generated by newfs_hammer2.
1575          *
1576          * The check/comp fields will probably never be used since inodes
1577          * are renamed into the hidden directory and not created relative to
1578          * the hidden directory, chain creation inherits from bref.methods,
1579          * and data chains inherit from their respective file inode *_algo
1580          * fields.
1581          */
1582         cparent = hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_ALWAYS);
1583         ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
1584         dip_check_algo = ripdata->check_algo;
1585         dip_comp_algo = ripdata->comp_algo;
1586         ripdata = NULL;
1587
1588         cluster = hammer2_cluster_lookup(cparent, &key_dummy,
1589                                          HAMMER2_INODE_HIDDENDIR,
1590                                          HAMMER2_INODE_HIDDENDIR,
1591                                          0);
1592         if (cluster) {
1593                 pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, cluster);
1594                 hammer2_inode_ref(pmp->ihidden);
1595
1596                 /*
1597                  * Remove any unlinked files which were left open as-of
1598                  * any system crash.
1599                  *
1600                  * Don't pass NODATA, we need the inode data so the delete
1601                  * can do proper statistics updates.
1602                  */
1603                 count = 0;
1604                 scan = hammer2_cluster_lookup(cluster, &key_next,
1605                                               0, HAMMER2_TID_MAX, 0);
1606                 while (scan) {
1607                         if (hammer2_cluster_type(scan) ==
1608                             HAMMER2_BREF_TYPE_INODE) {
1609                                 hammer2_cluster_delete(&trans, cluster, scan,
1610                                                    HAMMER2_DELETE_PERMANENT);
1611                                 ++count;
1612                         }
1613                         scan = hammer2_cluster_next(cluster, scan, &key_next,
1614                                                     0, HAMMER2_TID_MAX, 0);
1615                 }
1616
1617                 hammer2_inode_unlock(pmp->ihidden, cluster);
1618                 hammer2_inode_unlock(pmp->iroot, cparent);
1619                 hammer2_trans_done(&trans);
1620                 kprintf("hammer2: PFS loaded hidden dir, "
1621                         "removed %d dead entries\n", count);
1622                 return;
1623         }
1624
1625         /*
1626          * Create the hidden directory
1627          */
1628         error = hammer2_cluster_create(&trans, cparent, &cluster,
1629                                        HAMMER2_INODE_HIDDENDIR, 0,
1630                                        HAMMER2_BREF_TYPE_INODE,
1631                                        HAMMER2_INODE_BYTES,
1632                                        0);
1633         hammer2_inode_unlock(pmp->iroot, cparent);
1634
1635         hammer2_cluster_modify(&trans, cluster, 0);
1636         wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
1637         wipdata->type = HAMMER2_OBJTYPE_DIRECTORY;
1638         wipdata->inum = HAMMER2_INODE_HIDDENDIR;
1639         wipdata->nlinks = 1;
1640         wipdata->comp_algo = dip_comp_algo;
1641         wipdata->check_algo = dip_check_algo;
1642         hammer2_cluster_modsync(cluster);
1643         kprintf("hammer2: PFS root missing hidden directory, creating\n");
1644
1645         pmp->ihidden = hammer2_inode_get(pmp, pmp->iroot, cluster);
1646         hammer2_inode_ref(pmp->ihidden);
1647         hammer2_inode_unlock(pmp->ihidden, cluster);
1648         hammer2_trans_done(&trans);
1649 }
1650
1651 /*
1652  * If an open file is unlinked H2 needs to retain the file in the topology
1653  * to ensure that its backing store is not recovered by the bulk free scan.
1654  * This also allows us to avoid having to special-case the CHAIN_DELETED flag.
1655  *
1656  * To do this the file is moved to a hidden directory in the PFS root and
1657  * renamed.  The hidden directory must be created if it does not exist.
1658  */
1659 static
1660 void
1661 hammer2_inode_move_to_hidden(hammer2_trans_t *trans,
1662                              hammer2_cluster_t **cparentp,
1663                              hammer2_cluster_t **clusterp,
1664                              hammer2_tid_t inum)
1665 {
1666         hammer2_cluster_t *dcluster;
1667         hammer2_pfs_t *pmp;
1668         int error;
1669
1670         pmp = (*clusterp)->pmp;
1671         KKASSERT(pmp != NULL);
1672         KKASSERT(pmp->ihidden != NULL);
1673
1674         hammer2_cluster_delete(trans, *cparentp, *clusterp, 0);
1675         dcluster = hammer2_inode_lock(pmp->ihidden, HAMMER2_RESOLVE_ALWAYS);
1676         error = hammer2_inode_connect(trans, clusterp, 0,
1677                                       pmp->ihidden, dcluster,
1678                                       NULL, 0, inum);
1679         hammer2_inode_unlock(pmp->ihidden, dcluster);
1680         KKASSERT(error == 0);
1681 }
1682
1683 /*
1684  * Given an exclusively locked inode and cluster we consolidate the cluster
1685  * for hardlink creation, adding (nlinks) to the file's link count and
1686  * potentially relocating the inode to (cdip) which is a parent directory
1687  * common to both the current location of the inode and the intended new
1688  * hardlink.
1689  *
1690  * Replaces (*clusterp) if consolidation occurred, unlocking the old cluster
1691  * and returning a new locked cluster.
1692  *
1693  * NOTE!  This function will also replace ip->cluster.
1694  */
1695 int
1696 hammer2_hardlink_consolidate(hammer2_trans_t *trans,
1697                              hammer2_inode_t *ip,
1698                              hammer2_cluster_t **clusterp,
1699                              hammer2_inode_t *cdip,
1700                              hammer2_cluster_t *cdcluster,
1701                              int nlinks)
1702 {
1703         const hammer2_inode_data_t *ripdata;
1704         hammer2_inode_data_t *wipdata;
1705         hammer2_cluster_t *cluster;
1706         hammer2_cluster_t *cparent;
1707         int error;
1708
1709         cluster = *clusterp;
1710         ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1711         if (nlinks == 0 &&                      /* no hardlink needed */
1712             (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE)) {
1713                 return (0);
1714         }
1715
1716         if (hammer2_hardlink_enable == 0) {     /* disallow hardlinks */
1717                 hammer2_cluster_unlock(cluster);
1718                 hammer2_cluster_drop(cluster);
1719                 *clusterp = NULL;
1720                 return (ENOTSUP);
1721         }
1722
1723         cparent = NULL;
1724
1725         /*
1726          * If no change in the hardlink's target directory is required and
1727          * this is already a hardlink target, all we need to do is adjust
1728          * the link count.
1729          */
1730         ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1731         if (cdip == ip->pip &&
1732             (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE) == 0) {
1733                 if (nlinks) {
1734                         hammer2_cluster_modify(trans, cluster, 0);
1735                         wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
1736                         wipdata->nlinks += nlinks;
1737                         hammer2_cluster_modsync(cluster);
1738                         ripdata = wipdata;
1739                 }
1740                 error = 0;
1741                 goto done;
1742         }
1743
1744         /*
1745          * Cluster is the real inode.  The originating directory is locked
1746          * by the caller so we can manipulate it without worrying about races
1747          * against other lookups.
1748          *
1749          * If cluster is visible we need to delete it from the current
1750          * location and create a hardlink pointer in its place.  If it is
1751          * not visible we need only delete it.  Then later cluster will be
1752          * renamed to a parent directory and converted (if necessary) to
1753          * a hidden inode (via shiftup).
1754          *
1755          * NOTE! We must hold cparent locked through the delete/create/rename
1756          *       operation to ensure that other threads block resolving to
1757          *       the same hardlink, otherwise the other threads may not see
1758          *       the hardlink.
1759          */
1760         KKASSERT((cluster->focus->flags & HAMMER2_CHAIN_DELETED) == 0);
1761         cparent = hammer2_cluster_parent(cluster);
1762
1763         hammer2_cluster_delete(trans, cparent, cluster, 0);
1764
1765         ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
1766         KKASSERT(ripdata->type != HAMMER2_OBJTYPE_HARDLINK);
1767         if (ripdata->name_key & HAMMER2_DIRHASH_VISIBLE) {
1768                 hammer2_cluster_t *ncluster;
1769                 hammer2_key_t lhc;
1770
1771                 ncluster = NULL;
1772                 lhc = cluster->focus->bref.key;
1773                 error = hammer2_cluster_create(trans, cparent, &ncluster,
1774                                              lhc, 0,
1775                                              HAMMER2_BREF_TYPE_INODE,
1776                                              HAMMER2_INODE_BYTES,
1777                                              0);
1778                 hammer2_cluster_modify(trans, ncluster, 0);
1779                 wipdata = &hammer2_cluster_wdata(ncluster)->ipdata;
1780
1781                 /* wipdata->comp_algo = ripdata->comp_algo; */
1782                 wipdata->comp_algo = 0;
1783                 wipdata->check_algo = 0;
1784                 wipdata->version = HAMMER2_INODE_VERSION_ONE;
1785                 wipdata->inum = ripdata->inum;
1786                 wipdata->target_type = ripdata->type;
1787                 wipdata->type = HAMMER2_OBJTYPE_HARDLINK;
1788                 wipdata->uflags = 0;
1789                 wipdata->rmajor = 0;
1790                 wipdata->rminor = 0;
1791                 wipdata->ctime = 0;
1792                 wipdata->mtime = 0;
1793                 wipdata->atime = 0;
1794                 wipdata->btime = 0;
1795                 bzero(&wipdata->uid, sizeof(wipdata->uid));
1796                 bzero(&wipdata->gid, sizeof(wipdata->gid));
1797                 wipdata->op_flags = HAMMER2_OPFLAG_DIRECTDATA;
1798                 wipdata->cap_flags = 0;
1799                 wipdata->mode = 0;
1800                 wipdata->size = 0;
1801                 wipdata->nlinks = 1;
1802                 wipdata->iparent = 0;   /* XXX */
1803                 wipdata->pfs_type = 0;
1804                 wipdata->pfs_inum = 0;
1805                 bzero(&wipdata->pfs_clid, sizeof(wipdata->pfs_clid));
1806                 bzero(&wipdata->pfs_fsid, sizeof(wipdata->pfs_fsid));
1807                 wipdata->data_quota = 0;
1808                 /* wipdata->data_count = 0; */
1809                 wipdata->inode_quota = 0;
1810                 /* wipdata->inode_count = 0; */
1811                 wipdata->attr_tid = 0;
1812                 wipdata->dirent_tid = 0;
1813                 bzero(&wipdata->u, sizeof(wipdata->u));
1814                 bcopy(ripdata->filename, wipdata->filename, ripdata->name_len);
1815                 wipdata->name_key = ncluster->focus->bref.key;
1816                 wipdata->name_len = ripdata->name_len;
1817                 /* XXX transaction ids */
1818                 hammer2_cluster_modsync(ncluster);
1819                 hammer2_cluster_unlock(ncluster);
1820                 hammer2_cluster_drop(ncluster);
1821         }
1822         ripdata = wipdata;
1823
1824         /*
1825          * cluster represents the hardlink target and is now flagged deleted.
1826          * duplicate it to the parent directory and adjust nlinks.
1827          *
1828          * WARNING! The shiftup() call can cause ncluster to be moved into
1829          *          an indirect block, and our ncluster will wind up pointing
1830          *          to the older/original version.
1831          */
1832         KKASSERT(cluster->focus->flags & HAMMER2_CHAIN_DELETED);
1833         hammer2_hardlink_shiftup(trans, cluster, cdip, cdcluster,
1834                                  nlinks, &error);
1835
1836         if (error == 0)
1837                 hammer2_inode_repoint(ip, cdip, cluster);
1838
1839 done:
1840         /*
1841          * Cleanup, cluster/ncluster already dealt with.
1842          *
1843          * Return the shifted cluster in *clusterp.
1844          */
1845         if (cparent) {
1846                 hammer2_cluster_unlock(cparent);
1847                 hammer2_cluster_drop(cparent);
1848         }
1849         *clusterp = cluster;
1850
1851         return (error);
1852 }
1853
1854 /*
1855  * If (*ochainp) is non-NULL it points to the forward OBJTYPE_HARDLINK
1856  * inode while (*chainp) points to the resolved (hidden hardlink
1857  * target) inode.  In this situation when nlinks is 1 we wish to
1858  * deconsolidate the hardlink, moving it back to the directory that now
1859  * represents the only remaining link.
1860  */
1861 int
1862 hammer2_hardlink_deconsolidate(hammer2_trans_t *trans,
1863                                hammer2_inode_t *dip,
1864                                hammer2_chain_t **chainp,
1865                                hammer2_chain_t **ochainp)
1866 {
1867         if (*ochainp == NULL)
1868                 return (0);
1869         /* XXX */
1870         return (0);
1871 }
1872
1873 /*
1874  * The caller presents a locked cluster with an obj_type of
1875  * HAMMER2_OBJTYPE_HARDLINK in (*clusterp).  This routine will locate
1876  * the inode and replace (*clusterp) with a new locked cluster containing
1877  * the target hardlink, also locked.  The original cluster will be
1878  * unlocked and released.
1879  *
1880  * If cparentp is not NULL a locked cluster representing the hardlink's
1881  * parent is also returned.
1882  *
1883  * If we are unable to locate the hardlink target EIO is returned,
1884  * (*cparentp) is set to NULL, the original passed-in (*clusterp)
1885  * will be unlocked and released and (*clusterp) will be set to NULL
1886  * as well.
1887  */
1888 int
1889 hammer2_hardlink_find(hammer2_inode_t *dip,
1890                       hammer2_cluster_t **cparentp,
1891                       hammer2_cluster_t **clusterp)
1892 {
1893         const hammer2_inode_data_t *ipdata;
1894         hammer2_cluster_t *cluster;
1895         hammer2_cluster_t *cparent;
1896         hammer2_cluster_t *rcluster;
1897         hammer2_inode_t *ip;
1898         hammer2_inode_t *pip;
1899         hammer2_key_t key_dummy;
1900         hammer2_key_t lhc;
1901
1902         cluster = *clusterp;
1903         pip = dip;
1904         hammer2_inode_ref(pip);         /* for loop */
1905
1906         /*
1907          * Locate the hardlink.  pip is referenced and not locked.
1908          * Unlock and release (*clusterp) after extracting the needed
1909          * data.
1910          */
1911         ipdata = &hammer2_cluster_rdata(cluster)->ipdata;
1912         lhc = ipdata->inum;
1913         ipdata = NULL;                  /* safety */
1914         hammer2_cluster_unlock(cluster);
1915         hammer2_cluster_drop(cluster);
1916         *clusterp = NULL;               /* safety */
1917
1918         rcluster = NULL;
1919         cparent = NULL;
1920
1921         while ((ip = pip) != NULL) {
1922                 cparent = hammer2_inode_lock(ip, HAMMER2_RESOLVE_ALWAYS);
1923                 hammer2_inode_drop(ip);                 /* loop */
1924                 KKASSERT(hammer2_cluster_type(cparent) ==
1925                          HAMMER2_BREF_TYPE_INODE);
1926                 rcluster = hammer2_cluster_lookup(cparent, &key_dummy,
1927                                              lhc, lhc, 0);
1928                 if (rcluster)
1929                         break;
1930                 hammer2_cluster_lookup_done(cparent);   /* discard parent */
1931                 cparent = NULL;                         /* safety */
1932                 pip = ip->pip;          /* safe, ip held locked */
1933                 if (pip)
1934                         hammer2_inode_ref(pip);         /* loop */
1935                 hammer2_inode_unlock(ip, NULL);
1936         }
1937
1938         /*
1939          * chain is locked, ip is locked.  Unlock ip, return the locked
1940          * chain.  *ipp is already set w/a ref count and not locked.
1941          *
1942          * (cparent is already unlocked).
1943          */
1944         *clusterp = rcluster;
1945         if (rcluster) {
1946                 if (cparentp) {
1947                         *cparentp = cparent;
1948                         hammer2_inode_unlock(ip, NULL);
1949                 } else {
1950                         hammer2_inode_unlock(ip, cparent);
1951                 }
1952                 return (0);
1953         } else {
1954                 if (cparentp)
1955                         *cparentp = NULL;
1956                 if (ip)
1957                         hammer2_inode_unlock(ip, cparent);
1958                 return (EIO);
1959         }
1960 }
1961
1962 /*
1963  * Find the directory common to both fdip and tdip.
1964  *
1965  * Returns a held but not locked inode.  Caller typically locks the inode,
1966  * and when through unlocks AND drops it.
1967  */
1968 hammer2_inode_t *
1969 hammer2_inode_common_parent(hammer2_inode_t *fdip, hammer2_inode_t *tdip)
1970 {
1971         hammer2_inode_t *scan1;
1972         hammer2_inode_t *scan2;
1973
1974         /*
1975          * We used to have a depth field but it complicated matters too
1976          * much for directory renames.  So now its ugly.  Check for
1977          * simple cases before giving up and doing it the expensive way.
1978          *
1979          * XXX need a bottom-up topology stability lock
1980          */
1981         if (fdip == tdip || fdip == tdip->pip) {
1982                 hammer2_inode_ref(fdip);
1983                 return(fdip);
1984         }
1985         if (fdip->pip == tdip) {
1986                 hammer2_inode_ref(tdip);
1987                 return(tdip);
1988         }
1989
1990         /*
1991          * XXX not MPSAFE
1992          */
1993         for (scan1 = fdip; scan1->pmp == fdip->pmp; scan1 = scan1->pip) {
1994                 scan2 = tdip;
1995                 while (scan2->pmp == tdip->pmp) {
1996                         if (scan1 == scan2) {
1997                                 hammer2_inode_ref(scan1);
1998                                 return(scan1);
1999                         }
2000                         scan2 = scan2->pip;
2001                         if (scan2 == NULL)
2002                                 break;
2003                 }
2004         }
2005         panic("hammer2_inode_common_parent: no common parent %p %p\n",
2006               fdip, tdip);
2007         /* NOT REACHED */
2008         return(NULL);
2009 }
2010
2011 /*
2012  * Synchronize the inode's frontend state with the chain state prior
2013  * to any explicit flush of the inode or any strategy write call.
2014  *
2015  * Called with a locked inode.
2016  */
2017 void
2018 hammer2_inode_fsync(hammer2_trans_t *trans, hammer2_inode_t *ip, 
2019                     hammer2_cluster_t *cparent)
2020 {
2021         const hammer2_inode_data_t *ripdata;
2022         hammer2_inode_data_t *wipdata;
2023         hammer2_cluster_t *dparent;
2024         hammer2_cluster_t *cluster;
2025         hammer2_key_t lbase;
2026         hammer2_key_t key_next;
2027         int dosync = 0;
2028
2029         ripdata = &hammer2_cluster_rdata(cparent)->ipdata;    /* target file */
2030
2031         if (ip->flags & HAMMER2_INODE_MTIME) {
2032                 wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
2033                 atomic_clear_int(&ip->flags, HAMMER2_INODE_MTIME);
2034                 wipdata->mtime = ip->mtime;
2035                 dosync = 1;
2036                 ripdata = wipdata;
2037         }
2038         if ((ip->flags & HAMMER2_INODE_RESIZED) && ip->size < ripdata->size) {
2039                 wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
2040                 wipdata->size = ip->size;
2041                 dosync = 1;
2042                 ripdata = wipdata;
2043                 atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
2044
2045                 /*
2046                  * We must delete any chains beyond the EOF.  The chain
2047                  * straddling the EOF will be pending in the bioq.
2048                  */
2049                 lbase = (ripdata->size + HAMMER2_PBUFMASK64) &
2050                         ~HAMMER2_PBUFMASK64;
2051                 dparent = hammer2_cluster_lookup_init(&ip->cluster, 0);
2052                 cluster = hammer2_cluster_lookup(dparent, &key_next,
2053                                                  lbase, (hammer2_key_t)-1,
2054                                                  HAMMER2_LOOKUP_NODATA);
2055                 while (cluster) {
2056                         /*
2057                          * Degenerate embedded case, nothing to loop on
2058                          */
2059                         switch (hammer2_cluster_type(cluster)) {
2060                         case HAMMER2_BREF_TYPE_INODE:
2061                                 hammer2_cluster_unlock(cluster);
2062                                 hammer2_cluster_drop(cluster);
2063                                 cluster = NULL;
2064                                 break;
2065                         case HAMMER2_BREF_TYPE_DATA:
2066                                 hammer2_cluster_delete(trans, dparent, cluster,
2067                                                    HAMMER2_DELETE_PERMANENT);
2068                                 /* fall through */
2069                         default:
2070                                 cluster = hammer2_cluster_next(dparent, cluster,
2071                                                    &key_next,
2072                                                    key_next, (hammer2_key_t)-1,
2073                                                    HAMMER2_LOOKUP_NODATA);
2074                                 break;
2075                         }
2076                 }
2077                 hammer2_cluster_lookup_done(dparent);
2078         } else
2079         if ((ip->flags & HAMMER2_INODE_RESIZED) && ip->size > ripdata->size) {
2080                 wipdata = hammer2_cluster_modify_ip(trans, ip, cparent, 0);
2081                 wipdata->size = ip->size;
2082                 atomic_clear_int(&ip->flags, HAMMER2_INODE_RESIZED);
2083
2084                 /*
2085                  * When resizing larger we may not have any direct-data
2086                  * available.
2087                  */
2088                 if ((wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) &&
2089                     ip->size > HAMMER2_EMBEDDED_BYTES) {
2090                         wipdata->op_flags &= ~HAMMER2_OPFLAG_DIRECTDATA;
2091                         bzero(&wipdata->u.blockset,
2092                               sizeof(wipdata->u.blockset));
2093                 }
2094                 dosync = 1;
2095                 ripdata = wipdata;
2096         }
2097         if (dosync)
2098                 hammer2_cluster_modsync(cparent);
2099 }