hammer2 - Cleanup various races, better flush
[dragonfly.git] / sys / vfs / hammer2 / hammer2_inode.c
1 /*
2  * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@dragonflybsd.org>
6  * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/cdefs.h>
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/types.h>
39 #include <sys/lock.h>
40 #include <sys/uuid.h>
41
42 #include "hammer2.h"
43
44 /*
45  * Adding a ref to an inode is only legal if the inode already has at least
46  * one ref.
47  */
48 void
49 hammer2_inode_ref(hammer2_inode_t *ip)
50 {
51         hammer2_chain_ref(ip->hmp, &ip->chain);
52 }
53
54 /*
55  * Drop an inode reference, freeing the inode when the last reference goes
56  * away.
57  */
58 void
59 hammer2_inode_drop(hammer2_inode_t *ip)
60 {
61         hammer2_chain_drop(ip->hmp, &ip->chain);
62 }
63
64 /*
65  * Get the vnode associated with the given inode, allocating the vnode if
66  * necessary.  The vnode will be returned exclusively locked.
67  *
68  * The caller must lock the inode (shared or exclusive).
69  *
70  * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
71  * races.
72  */
73 struct vnode *
74 hammer2_igetv(hammer2_inode_t *ip, int *errorp)
75 {
76         struct vnode *vp;
77         hammer2_pfsmount_t *pmp;
78         ccms_state_t ostate;
79
80         pmp = ip->pmp;
81         KKASSERT(pmp != NULL);
82         *errorp = 0;
83
84         for (;;) {
85                 /*
86                  * Attempt to reuse an existing vnode assignment.  It is
87                  * possible to race a reclaim so the vget() may fail.  The
88                  * inode must be unlocked during the vget() to avoid a
89                  * deadlock against a reclaim.
90                  */
91                 vp = ip->vp;
92                 if (vp) {
93                         /*
94                          * Inode must be unlocked during the vget() to avoid
95                          * possible deadlocks, vnode is held to prevent
96                          * destruction during the vget().  The vget() can
97                          * still fail if we lost a reclaim race on the vnode.
98                          */
99                         vhold_interlocked(vp);
100                         ccms_thread_unlock(&ip->chain.cst);
101                         if (vget(vp, LK_EXCLUSIVE)) {
102                                 vdrop(vp);
103                                 ccms_thread_lock(&ip->chain.cst,
104                                                  CCMS_STATE_EXCLUSIVE);
105                                 continue;
106                         }
107                         ccms_thread_lock(&ip->chain.cst, CCMS_STATE_EXCLUSIVE);
108                         vdrop(vp);
109                         /* vp still locked and ref from vget */
110                         *errorp = 0;
111                         break;
112                 }
113
114                 /*
115                  * No vnode exists, allocate a new vnode.  Beware of
116                  * allocation races.  This function will return an
117                  * exclusively locked and referenced vnode.
118                  */
119                 *errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
120                 if (*errorp) {
121                         vp = NULL;
122                         break;
123                 }
124
125                 /*
126                  * Lock the inode and check for an allocation race.
127                  */
128                 ostate = ccms_thread_lock_upgrade(&ip->chain.cst);
129                 if (ip->vp != NULL) {
130                         vp->v_type = VBAD;
131                         vx_put(vp);
132                         ccms_thread_lock_restore(&ip->chain.cst, ostate);
133                         continue;
134                 }
135
136                 switch (ip->ip_data.type) {
137                 case HAMMER2_OBJTYPE_DIRECTORY:
138                         vp->v_type = VDIR;
139                         break;
140                 case HAMMER2_OBJTYPE_REGFILE:
141                         vp->v_type = VREG;
142                         vinitvmio(vp, ip->ip_data.size,
143                                   HAMMER2_LBUFSIZE,
144                                   (int)ip->ip_data.size & HAMMER2_LBUFMASK);
145                         break;
146                 case HAMMER2_OBJTYPE_SOFTLINK:
147                         /*
148                          * XXX for now we are using the generic file_read
149                          * and file_write code so we need a buffer cache
150                          * association.
151                          */
152                         vp->v_type = VLNK;
153                         vinitvmio(vp, ip->ip_data.size,
154                                   HAMMER2_LBUFSIZE,
155                                   (int)ip->ip_data.size & HAMMER2_LBUFMASK);
156                         break;
157                 /* XXX FIFO */
158                 default:
159                         panic("hammer2: unhandled objtype %d",
160                               ip->ip_data.type);
161                         break;
162                 }
163
164                 if (ip == pmp->iroot)
165                         vsetflags(vp, VROOT);
166
167                 vp->v_data = ip;
168                 ip->vp = vp;
169                 hammer2_chain_ref(ip->hmp, &ip->chain); /* vp association */
170                 ccms_thread_lock_restore(&ip->chain.cst, ostate);
171                 break;
172         }
173
174         /*
175          * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
176          */
177         if (hammer2_debug & 0x0002) {
178                 kprintf("igetv vp %p refs %d aux %d\n",
179                         vp, vp->v_sysref.refcnt, vp->v_auxrefs);
180         }
181         return (vp);
182 }
183
184 /*
185  * Create a new inode in the specified directory using the vattr to
186  * figure out the type of inode.
187  *
188  * If no error occurs the new inode with its chain locked is returned in
189  * *nipp, otherwise an error is returned and *nipp is set to NULL.
190  *
191  * If vap and/or cred are NULL the related fields are not set and the
192  * inode type defaults to a directory.  This is used when creating PFSs
193  * under the super-root, so the inode number is set to 1 in this case.
194  */
195 int
196 hammer2_inode_create(hammer2_inode_t *dip,
197                      struct vattr *vap, struct ucred *cred,
198                      const uint8_t *name, size_t name_len,
199                      hammer2_inode_t **nipp)
200 {
201         hammer2_mount_t *hmp = dip->hmp;
202         hammer2_chain_t *chain;
203         hammer2_chain_t *parent;
204         hammer2_inode_t *nip;
205         hammer2_key_t lhc;
206         int error;
207         uid_t xuid;
208
209         lhc = hammer2_dirhash(name, name_len);
210
211         /*
212          * Locate the inode or indirect block to create the new
213          * entry in.  At the same time check for key collisions
214          * and iterate until we don't get one.
215          */
216         parent = &dip->chain;
217         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
218
219         error = 0;
220         while (error == 0) {
221                 chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
222                 if (chain == NULL)
223                         break;
224                 if ((lhc & HAMMER2_DIRHASH_VISIBLE) == 0)
225                         error = ENOSPC;
226                 if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
227                         error = ENOSPC;
228                 hammer2_chain_unlock(hmp, chain);
229                 chain = NULL;
230                 ++lhc;
231         }
232         if (error == 0) {
233                 chain = hammer2_chain_create(hmp, parent, NULL, lhc, 0,
234                                              HAMMER2_BREF_TYPE_INODE,
235                                              HAMMER2_INODE_BYTES);
236                 if (chain == NULL)
237                         error = EIO;
238         }
239         hammer2_chain_unlock(hmp, parent);
240
241         /*
242          * Handle the error case
243          */
244         if (error) {
245                 KKASSERT(chain == NULL);
246                 *nipp = NULL;
247                 return (error);
248         }
249
250         /*
251          * Set up the new inode
252          */
253         nip = chain->u.ip;
254         *nipp = nip;
255
256         hammer2_voldata_lock(hmp);
257         if (vap) {
258                 nip->ip_data.type = hammer2_get_obj_type(vap->va_type);
259                 nip->ip_data.inum = hmp->voldata.alloc_tid++;
260                 /* XXX modify/lock */
261         } else {
262                 nip->ip_data.type = HAMMER2_OBJTYPE_DIRECTORY;
263                 nip->ip_data.inum = 1;
264         }
265         hammer2_voldata_unlock(hmp);
266         nip->ip_data.version = HAMMER2_INODE_VERSION_ONE;
267         hammer2_update_time(&nip->ip_data.ctime);
268         nip->ip_data.mtime = nip->ip_data.ctime;
269         if (vap)
270                 nip->ip_data.mode = vap->va_mode;
271         nip->ip_data.nlinks = 1;
272         if (vap) {
273                 if (dip) {
274                         xuid = hammer2_to_unix_xid(&dip->ip_data.uid);
275                         xuid = vop_helper_create_uid(dip->pmp->mp,
276                                                      dip->ip_data.mode,
277                                                      xuid,
278                                                      cred,
279                                                      &vap->va_mode);
280                 } else {
281                         xuid = 0;
282                 }
283                 if (vap->va_vaflags & VA_UID_UUID_VALID)
284                         nip->ip_data.uid = vap->va_uid_uuid;
285                 else if (vap->va_uid != (uid_t)VNOVAL)
286                         hammer2_guid_to_uuid(&nip->ip_data.uid, vap->va_uid);
287                 else
288                         hammer2_guid_to_uuid(&nip->ip_data.uid, xuid);
289
290                 if (vap->va_vaflags & VA_GID_UUID_VALID)
291                         nip->ip_data.gid = vap->va_gid_uuid;
292                 else if (vap->va_gid != (gid_t)VNOVAL)
293                         hammer2_guid_to_uuid(&nip->ip_data.gid, vap->va_gid);
294                 else if (dip)
295                         nip->ip_data.gid = dip->ip_data.gid;
296         }
297
298         /*
299          * Regular files and softlinks allow a small amount of data to be
300          * directly embedded in the inode.  This flag will be cleared if
301          * the size is extended past the embedded limit.
302          */
303         if (nip->ip_data.type == HAMMER2_OBJTYPE_REGFILE ||
304             nip->ip_data.type == HAMMER2_OBJTYPE_SOFTLINK) {
305                 nip->ip_data.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
306         }
307
308         KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
309         bcopy(name, nip->ip_data.filename, name_len);
310         nip->ip_data.name_key = lhc;
311         nip->ip_data.name_len = name_len;
312
313         return (0);
314 }
315
316 /*
317  * Duplicate the specified existing inode in the specified target directory.
318  * If name is NULL the inode is duplicated as a hidden directory entry.
319  *
320  * Returns the new inode.  The old inode is left alone.
321  *
322  * XXX name needs to be NULL for now.
323  */
324 int
325 hammer2_inode_duplicate(hammer2_inode_t *dip, hammer2_inode_t *oip,
326                         hammer2_inode_t **nipp,
327                         const uint8_t *name, size_t name_len)
328 {
329         hammer2_mount_t *hmp = dip->hmp;
330         hammer2_inode_t *nip;
331         hammer2_chain_t *parent;
332         hammer2_chain_t *chain;
333         hammer2_key_t lhc;
334         int error;
335
336         if (name) {
337                 lhc = hammer2_dirhash(name, name_len);
338         } else {
339                 lhc = oip->ip_data.inum;
340                 KKASSERT((lhc & HAMMER2_DIRHASH_VISIBLE) == 0);
341         }
342
343         /*
344          * Locate the inode or indirect block to create the new
345          * entry in.  At the same time check for key collisions
346          * and iterate until we don't get one.
347          */
348         nip = NULL;
349         parent = &dip->chain;
350         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
351
352         error = 0;
353         while (error == 0) {
354                 chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
355                 if (chain == NULL)
356                         break;
357                 /* XXX bcmp name if not NULL */
358                 if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
359                         error = ENOSPC;
360                 if ((lhc & HAMMER2_DIRHASH_VISIBLE) == 0) /* shouldn't happen */
361                         error = ENOSPC;
362                 hammer2_chain_unlock(hmp, chain);
363                 chain = NULL;
364                 ++lhc;
365         }
366
367         /*
368          * Create entry in common parent directory.
369          */
370         if (error == 0) {
371                 chain = hammer2_chain_create(hmp, parent, NULL, lhc, 0,
372                                              HAMMER2_BREF_TYPE_INODE /* n/a */,
373                                              HAMMER2_INODE_BYTES);   /* n/a */
374                 if (chain == NULL)
375                         error = EIO;
376         }
377         hammer2_chain_unlock(hmp, parent);
378
379         /*
380          * Handle the error case
381          */
382         if (error) {
383                 KKASSERT(chain == NULL);
384                 return (error);
385         }
386
387         /*
388          * XXX This is currently a horrible hack.  Well, if we wanted to
389          *     duplicate a file, i.e. as in a snapshot, we definitely
390          *     would have to flush it first.
391          *
392          *     For hardlink target generation we can theoretically move any
393          *     active chain structures without flushing, but that gets really
394          *     iffy for code which follows chain->parent and ip->pip links.
395          *
396          * XXX only works with files.  Duplicating a directory hierarchy
397          *     requires a flush but doesn't deal with races post-flush.
398          *     Well, it would work I guess, but you might catch some files
399          *     mid-operation.
400          *
401          * We cannot leave oip with any in-memory chains because (for a
402          * hardlink), oip will become a OBJTYPE_HARDLINK which is just a
403          * pointer to the real hardlink's inum and can't have any sub-chains.
404          * XXX might be 0-ref chains left.
405          */
406         hammer2_inode_lock_ex(oip);
407         hammer2_chain_flush(hmp, &oip->chain, 0);
408         hammer2_inode_unlock_ex(oip);
409         /*KKASSERT(RB_EMPTY(&oip->chain.rbhead));*/
410
411         nip = chain->u.ip;
412         hammer2_chain_modify(hmp, chain, 0);
413         nip->ip_data = oip->ip_data;    /* sync media data after flush */
414
415         if (name) {
416                 /*
417                  * Directory entries are inodes so if the name has changed
418                  * we have to update the inode.
419                  */
420                 KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
421                 bcopy(name, nip->ip_data.filename, name_len);
422                 nip->ip_data.name_key = lhc;
423                 nip->ip_data.name_len = name_len;
424         } else {
425                 /*
426                  * Directory entries are inodes but this is a hidden hardlink
427                  * target.  The name isn't used but to ease debugging give it
428                  * a name after its inode number.
429                  */
430                 ksnprintf(nip->ip_data.filename, sizeof(nip->ip_data.filename),
431                           "0x%016jx", (intmax_t)nip->ip_data.inum);
432                 nip->ip_data.name_len = strlen(nip->ip_data.filename);
433                 nip->ip_data.name_key = lhc;
434         }
435         *nipp = nip;
436
437         return (0);
438 }
439
440
441 /*
442  * Connect inode (oip) to the specified directory using the specified name.
443  * (oip) must be locked.
444  *
445  * If (oip) is not currently connected we simply connect it up.
446  *
447  * If (oip) is already connected we create a OBJTYPE_HARDLINK entry which
448  * points to (oip)'s inode number.  (oip) is expected to be the terminus of
449  * the hardlink sitting as a hidden file in a common parent directory
450  * in this situation.
451  */
452 int
453 hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *oip,
454                       const uint8_t *name, size_t name_len)
455 {
456         hammer2_mount_t *hmp = dip->hmp;
457         hammer2_chain_t *chain;
458         hammer2_chain_t *parent;
459         hammer2_inode_t *nip;
460         hammer2_key_t lhc;
461         int error;
462         int hlink;
463
464         /*
465          * (oip) is the terminus of the hardlink sitting in the common
466          * parent directory.  This means that if oip->pip != dip then
467          * the already locked oip is ABOVE dip.
468          *
469          * But if the common parent directory IS dip, then we would have
470          * a lock-order reversal and must rearrange the lock ordering.
471          * For now the caller deals with this for us by locking dip in
472          * that case (and our lock here winds up just being recursive)
473          */
474         parent = &dip->chain;
475         if (oip->pip == dip) {
476                 hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
477                 hammer2_chain_lock(hmp, &oip->chain, HAMMER2_RESOLVE_ALWAYS);
478         } else {
479                 hammer2_chain_lock(hmp, &oip->chain, HAMMER2_RESOLVE_ALWAYS);
480                 hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
481         }
482
483
484         lhc = hammer2_dirhash(name, name_len);
485         hlink = (oip->chain.parent != NULL);
486
487         /*
488          * In fake mode flush oip so we can just snapshot it downbelow.
489          */
490         if (hlink && hammer2_hardlink_enable < 0)
491                 hammer2_chain_flush(hmp, &oip->chain, 0);
492
493         /*
494          * Locate the inode or indirect block to create the new
495          * entry in.  At the same time check for key collisions
496          * and iterate until we don't get one.
497          */
498         error = 0;
499         while (error == 0) {
500                 chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
501                 if (chain == NULL)
502                         break;
503                 if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
504                         error = ENOSPC;
505                 hammer2_chain_unlock(hmp, chain);
506                 chain = NULL;
507                 ++lhc;
508         }
509
510         /*
511          * Passing a non-NULL chain to hammer2_chain_create() reconnects the
512          * existing chain instead of creating a new one.  The chain's bref
513          * will be properly updated.
514          */
515         if (error == 0) {
516                 if (hlink) {
517                         chain = hammer2_chain_create(hmp, parent,
518                                                      NULL, lhc, 0,
519                                                      HAMMER2_BREF_TYPE_INODE,
520                                                      HAMMER2_INODE_BYTES);
521                 } else {
522                         chain = hammer2_chain_create(hmp, parent,
523                                                      &oip->chain, lhc, 0,
524                                                      HAMMER2_BREF_TYPE_INODE,
525                                                      HAMMER2_INODE_BYTES);
526                         if (chain)
527                                 KKASSERT(chain == &oip->chain);
528                 }
529                 if (chain == NULL)
530                         error = EIO;
531         }
532         hammer2_chain_unlock(hmp, parent);
533
534         /*
535          * Handle the error case
536          */
537         if (error) {
538                 KKASSERT(chain == NULL);
539                 hammer2_chain_unlock(hmp, &oip->chain);
540                 return (error);
541         }
542
543         /*
544          * Directory entries are inodes so if the name has changed we have
545          * to update the inode.
546          *
547          * When creating an OBJTYPE_HARDLINK entry remember to unlock the
548          * chain, the caller will access the hardlink via the actual hardlink
549          * target file and not the hardlink pointer entry.
550          */
551         if (hlink && hammer2_hardlink_enable >= 0) {
552                 /*
553                  * Create the HARDLINK pointer.  oip represents the hardlink
554                  * target in this situation.
555                  */
556                 nip = chain->u.ip;
557                 hammer2_chain_modify(hmp, chain, 0);
558                 KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
559                 bcopy(name, nip->ip_data.filename, name_len);
560                 nip->ip_data.name_key = lhc;
561                 nip->ip_data.name_len = name_len;
562                 nip->ip_data.target_type = oip->ip_data.type;
563                 nip->ip_data.type = HAMMER2_OBJTYPE_HARDLINK;
564                 nip->ip_data.inum = oip->ip_data.inum;
565                 nip->ip_data.nlinks = 1;
566                 kprintf("created hardlink %*.*s\n",
567                         (int)name_len, (int)name_len, name);
568                 hammer2_chain_unlock(hmp, chain);
569         } else if (hlink && hammer2_hardlink_enable < 0) {
570                 /*
571                  * Create a snapshot (hardlink fake mode for debugging).
572                  */
573                 nip = chain->u.ip;
574                 nip->ip_data = oip->ip_data;
575                 hammer2_chain_modify(hmp, chain, 0);
576                 KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
577                 bcopy(name, nip->ip_data.filename, name_len);
578                 nip->ip_data.name_key = lhc;
579                 nip->ip_data.name_len = name_len;
580                 kprintf("created fake hardlink %*.*s\n",
581                         (int)name_len, (int)name_len, name);
582                 hammer2_chain_unlock(hmp, chain);
583         } else {
584                 /*
585                  * Normally disconnected inode (e.g. during a rename) that
586                  * was reconnected.  We must fixup the name stored in
587                  * oip.
588                  *
589                  * We are using oip as chain, already locked by caller,
590                  * do not unlock it.
591                  */
592                 hammer2_chain_modify(hmp, chain, 0);
593                 if (oip->ip_data.name_len != name_len ||
594                     bcmp(oip->ip_data.filename, name, name_len) != 0) {
595                         KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
596                         bcopy(name, oip->ip_data.filename, name_len);
597                         oip->ip_data.name_key = lhc;
598                         oip->ip_data.name_len = name_len;
599                 }
600                 oip->ip_data.nlinks = 1;
601         }
602         hammer2_chain_unlock(hmp, &oip->chain);
603         return (0);
604 }
605
606 /*
607  * Unlink the file from the specified directory inode.  The directory inode
608  * does not need to be locked.
609  *
610  * isdir determines whether a directory/non-directory check should be made.
611  * No check is made if isdir is set to -1.
612  */
613 int
614 hammer2_unlink_file(hammer2_inode_t *dip,
615                     const uint8_t *name, size_t name_len,
616                     int isdir, hammer2_inode_t *retain_ip)
617 {
618         hammer2_mount_t *hmp;
619         hammer2_chain_t *parent;
620         hammer2_chain_t *chain;
621         hammer2_chain_t *dparent;
622         hammer2_chain_t *dchain;
623         hammer2_key_t lhc;
624         hammer2_inode_t *ip;
625         hammer2_inode_t *oip;
626         int error;
627         uint8_t type;
628
629         error = 0;
630         oip = NULL;
631         hmp = dip->hmp;
632         lhc = hammer2_dirhash(name, name_len);
633
634         /*
635          * Search for the filename in the directory
636          */
637         parent = &dip->chain;
638         hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
639         chain = hammer2_chain_lookup(hmp, &parent,
640                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
641                                      HAMMER2_LOOKUP_MAYDELETE);
642         while (chain) {
643                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
644                     chain->u.ip &&
645                     name_len == chain->data->ipdata.name_len &&
646                     bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
647                         break;
648                 }
649                 chain = hammer2_chain_next(hmp, &parent, chain,
650                                            lhc, lhc + HAMMER2_DIRHASH_LOMASK,
651                                            HAMMER2_LOOKUP_MAYDELETE);
652         }
653
654         /*
655          * Not found or wrong type (isdir < 0 disables the type check).
656          */
657         if (chain == NULL) {
658                 hammer2_chain_unlock(hmp, parent);
659                 return ENOENT;
660         }
661         if ((type = chain->data->ipdata.type) == HAMMER2_OBJTYPE_HARDLINK)
662                 type = chain->data->ipdata.target_type;
663
664         if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir == 0) {
665                 error = ENOTDIR;
666                 goto done;
667         }
668         if (type != HAMMER2_OBJTYPE_DIRECTORY && isdir == 1) {
669                 error = EISDIR;
670                 goto done;
671         }
672
673         /*
674          * Hardlink must be resolved.  We can't hold parent locked while we
675          * do this or we could deadlock.
676          */
677         if (chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
678                 hammer2_chain_unlock(hmp, parent);
679                 parent = NULL;
680                 error = hammer2_hardlink_find(dip, &chain, &oip);
681         }
682
683         /*
684          * If this is a directory the directory must be empty.  However, if
685          * isdir < 0 we are doing a rename and the directory does not have
686          * to be empty.
687          *
688          * NOTE: We check the full key range here which covers both visible
689          *       and invisible entries.  Theoretically there should be no
690          *       invisible (hardlink target) entries if there are no visible
691          *       entries.
692          */
693         if (type == HAMMER2_OBJTYPE_DIRECTORY && isdir >= 0) {
694                 dparent = chain;
695                 hammer2_chain_lock(hmp, dparent, HAMMER2_RESOLVE_ALWAYS);
696                 dchain = hammer2_chain_lookup(hmp, &dparent,
697                                               0, (hammer2_key_t)-1,
698                                               HAMMER2_LOOKUP_NODATA);
699                 if (dchain) {
700                         hammer2_chain_unlock(hmp, dchain);
701                         hammer2_chain_unlock(hmp, dparent);
702                         error = ENOTEMPTY;
703                         goto done;
704                 }
705                 hammer2_chain_unlock(hmp, dparent);
706                 dparent = NULL;
707                 /* dchain NULL */
708         }
709
710         /*
711          * Ok, we can now unlink the chain.  We always decrement nlinks even
712          * if the entry can be deleted in case someone has the file open and
713          * does an fstat().
714          *
715          * The chain itself will no longer be in the on-media topology but
716          * can still be flushed to the media (e.g. if an open descriptor
717          * remains).  When the last vnode/ip ref goes away the chain will
718          * be marked unmodified, avoiding any further (now unnecesary) I/O.
719          */
720         if (oip) {
721                 /*
722                  * If this was a hardlink we first delete the hardlink
723                  * pointer entry.
724                  */
725                 parent = oip->chain.parent;
726                 hammer2_chain_lock_pair(hmp, parent, &oip->chain,
727                                         HAMMER2_RESOLVE_ALWAYS |
728                                         HAMMER2_RESOLVE_MAYDELETE);
729                 hammer2_chain_delete(hmp, parent, &oip->chain,
730                                     (retain_ip == oip));
731                 hammer2_chain_unlock(hmp, &oip->chain);
732                 hammer2_chain_unlock(hmp, parent);
733                 parent = NULL;
734
735                 /*
736                  * Then decrement nlinks on hardlink target.
737                  */
738                 ip = chain->u.ip;
739                 if (ip->ip_data.nlinks == 1) {
740                         dparent = chain->parent;
741                         hammer2_chain_ref(hmp, chain);
742                         hammer2_chain_unlock(hmp, chain);
743                         hammer2_chain_lock_pair(hmp, dparent, chain,
744                                            HAMMER2_RESOLVE_ALWAYS |
745                                            HAMMER2_RESOLVE_MAYDELETE);
746                         hammer2_chain_drop(hmp, chain);
747                         hammer2_chain_modify(hmp, chain, 0);
748                         --ip->ip_data.nlinks;
749                         hammer2_chain_delete(hmp, dparent, chain, 0);
750                         hammer2_chain_unlock(hmp, dparent);
751                 } else {
752                         hammer2_chain_modify(hmp, chain, 0);
753                         --ip->ip_data.nlinks;
754                 }
755         } else {
756                 /*
757                  * Otherwise this was not a hardlink and we can just
758                  * remove the entry and decrement nlinks.
759                  */
760                 ip = chain->u.ip;
761                 hammer2_chain_modify(hmp, chain, 0);
762                 --ip->ip_data.nlinks;
763                 hammer2_chain_delete(hmp, parent, chain,
764                                      (retain_ip == ip));
765         }
766
767         error = 0;
768
769 done:
770         if (chain)
771                 hammer2_chain_unlock(hmp, chain);
772         if (parent)
773                 hammer2_chain_unlock(hmp, parent);
774         if (oip)
775                 hammer2_chain_drop(oip->hmp, &oip->chain);
776
777         return error;
778 }
779
780 /*
781  * Calculate the allocation size for the file fragment straddling EOF
782  */
783 int
784 hammer2_inode_calc_alloc(hammer2_key_t filesize)
785 {
786         int frag = (int)filesize & HAMMER2_PBUFMASK;
787         int radix;
788
789         if (frag == 0)
790                 return(0);
791         for (radix = HAMMER2_MINALLOCRADIX; frag > (1 << radix); ++radix)
792                 ;
793         return (radix);
794 }
795
796 void
797 hammer2_inode_lock_nlinks(hammer2_inode_t *ip)
798 {
799         hammer2_chain_ref(ip->hmp, &ip->chain);
800 }
801
802 void
803 hammer2_inode_unlock_nlinks(hammer2_inode_t *ip)
804 {
805         hammer2_chain_drop(ip->hmp, &ip->chain);
806 }
807
808 /*
809  * Consolidate for hard link creation.  This moves the specified terminal
810  * hardlink inode to a directory common to its current directory and tdip
811  * if necessary, replacing *ipp with the new inode chain element and
812  * modifying the original inode chain element to OBJTYPE_HARDLINK.
813  *
814  * If the original inode chain element was a prior incarnation of a hidden
815  * inode it can simply be deleted instead of converted.
816  *
817  * (*ipp)'s nlinks field is locked on entry and the new (*ipp)'s nlinks
818  * field will be locked on return (with the original's unlocked).
819  *
820  * The link count is bumped if requested.
821  */
822 int
823 hammer2_hardlink_consolidate(hammer2_inode_t **ipp, hammer2_inode_t *tdip)
824 {
825         hammer2_mount_t *hmp;
826         hammer2_inode_t *oip = *ipp;
827         hammer2_inode_t *nip = NULL;
828         hammer2_inode_t *fdip;
829         hammer2_inode_t *cdip;
830         hammer2_chain_t *parent;
831         int error;
832
833         hmp = tdip->hmp;
834
835         if (hammer2_hardlink_enable < 0)
836                 return (0);
837         if (hammer2_hardlink_enable == 0)
838                 return (ENOTSUP);
839
840         fdip = oip->pip;
841         cdip = hammer2_inode_common_parent(hmp, fdip, tdip);
842
843         /*
844          * Nothing to do (except bump the link count) if the hardlink has
845          * already been consolidated in the correct place.
846          */
847         if (cdip == fdip &&
848             (oip->ip_data.name_key & HAMMER2_DIRHASH_VISIBLE) == 0) {
849                 kprintf("hardlink already consolidated correctly\n");
850                 nip = oip;
851                 hammer2_inode_lock_ex(nip);
852                 hammer2_chain_modify(hmp, &nip->chain, 0);
853                 ++nip->ip_data.nlinks;
854                 hammer2_inode_unlock_ex(nip);
855                 hammer2_inode_drop(cdip);
856                 return (0);
857         }
858
859         /*
860          * Create a hidden inode directory entry in the parent, copying
861          * (*oip)'s state.  Then replace oip with OBJTYPE_HARDLINK.
862          *
863          * The duplication function will either flush or move any chains
864          * under oip to the new hardlink target inode, retiring all chains
865          * related to oip before returning.  XXX vp->ip races.
866          */
867         error = hammer2_inode_duplicate(cdip, oip, &nip, NULL, 0);
868         if (error == 0) {
869                 /*
870                  * Bump nlinks on duplicated hidden inode.
871                  */
872                 kprintf("hardlink consolidation success in parent dir %s\n",
873                         cdip->ip_data.filename);
874                 hammer2_inode_lock_nlinks(nip);
875                 hammer2_inode_unlock_nlinks(oip);
876                 hammer2_chain_modify(hmp, &nip->chain, 0);
877                 ++nip->ip_data.nlinks;
878                 hammer2_inode_unlock_ex(nip);
879
880                 if (oip->ip_data.name_key & HAMMER2_DIRHASH_VISIBLE) {
881                         /*
882                          * Replace the old inode with an OBJTYPE_HARDLINK
883                          * pointer.
884                          */
885                         hammer2_inode_lock_ex(oip);
886                         hammer2_chain_modify(hmp, &oip->chain, 0);
887                         oip->ip_data.target_type = oip->ip_data.type;
888                         oip->ip_data.type = HAMMER2_OBJTYPE_HARDLINK;
889                         oip->ip_data.uflags = 0;
890                         oip->ip_data.rmajor = 0;
891                         oip->ip_data.rminor = 0;
892                         oip->ip_data.ctime = 0;
893                         oip->ip_data.mtime = 0;
894                         oip->ip_data.atime = 0;
895                         oip->ip_data.btime = 0;
896                         bzero(&oip->ip_data.uid, sizeof(oip->ip_data.uid));
897                         bzero(&oip->ip_data.gid, sizeof(oip->ip_data.gid));
898                         oip->ip_data.op_flags = HAMMER2_OPFLAG_DIRECTDATA;
899                         oip->ip_data.cap_flags = 0;
900                         oip->ip_data.mode = 0;
901                         oip->ip_data.size = 0;
902                         oip->ip_data.nlinks = 1;
903                         oip->ip_data.iparent = 0;       /* XXX */
904                         oip->ip_data.pfs_type = 0;
905                         oip->ip_data.pfs_inum = 0;
906                         bzero(&oip->ip_data.pfs_clid,
907                               sizeof(oip->ip_data.pfs_clid));
908                         bzero(&oip->ip_data.pfs_fsid,
909                               sizeof(oip->ip_data.pfs_fsid));
910                         oip->ip_data.data_quota = 0;
911                         oip->ip_data.data_count = 0;
912                         oip->ip_data.inode_quota = 0;
913                         oip->ip_data.inode_count = 0;
914                         oip->ip_data.attr_tid = 0;
915                         oip->ip_data.dirent_tid = 0;
916                         bzero(&oip->ip_data.u, sizeof(oip->ip_data.u));
917                         /* XXX transaction ids */
918
919                         hammer2_inode_unlock_ex(oip);
920                 } else {
921                         /*
922                          * The old inode was a hardlink target, which we
923                          * have now moved.  We must delete it so the new
924                          * hardlink target at a higher directory level
925                          * becomes the only hardlink target for this inode.
926                          */
927                         kprintf("DELETE INVISIBLE\n");
928                         parent = oip->chain.parent;
929                         hammer2_chain_lock(hmp, parent,
930                                            HAMMER2_RESOLVE_ALWAYS);
931                         hammer2_chain_lock(hmp, &oip->chain,
932                                            HAMMER2_RESOLVE_ALWAYS);
933                         hammer2_chain_delete(hmp, parent, &oip->chain, 0);
934                         hammer2_chain_unlock(hmp, &oip->chain);
935                         hammer2_chain_unlock(hmp, parent);
936                 }
937                 *ipp = nip;
938         } else {
939                 KKASSERT(nip == NULL);
940         }
941         hammer2_inode_drop(cdip);
942
943         return (error);
944 }
945
946 /*
947  * If (*ipp) is non-NULL it points to the forward OBJTYPE_HARDLINK inode while
948  * (*chainp) points to the resolved (hidden hardlink target) inode.  In this
949  * situation when nlinks is 1 we wish to deconsolidate the hardlink, moving
950  * it back to the directory that now represents the only remaining link.
951  */
952 int
953 hammer2_hardlink_deconsolidate(hammer2_inode_t *dip, hammer2_chain_t **chainp,
954                                hammer2_inode_t **ipp)
955 {
956         if (*ipp == NULL)
957                 return (0);
958         /* XXX */
959         return (0);
960 }
961
962 /*
963  * When presented with a (*chainp) representing an inode of type
964  * OBJTYPE_HARDLINK this code will save the original inode (with a ref)
965  * in (*ipp), and then locate the hidden hardlink target in (dip) or
966  * any parent directory above (dip).  The locked (*chainp) is replaced
967  * with a new locked (*chainp) representing the hardlink target.
968  */
969 int
970 hammer2_hardlink_find(hammer2_inode_t *dip, hammer2_chain_t **chainp,
971                       hammer2_inode_t **ipp)
972 {
973         hammer2_mount_t *hmp = dip->hmp;
974         hammer2_chain_t *chain = *chainp;
975         hammer2_chain_t *parent;
976         hammer2_inode_t *pip;
977         hammer2_key_t lhc;
978
979         *ipp = chain->u.ip;
980         hammer2_inode_ref(chain->u.ip);
981         lhc = chain->u.ip->ip_data.inum;
982
983         hammer2_inode_unlock_ex(chain->u.ip);
984         pip = chain->u.ip->pip;
985
986         chain = NULL;
987         while (pip) {
988                 parent = &pip->chain;
989                 KKASSERT(parent->bref.type == HAMMER2_BREF_TYPE_INODE);
990
991                 hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
992                 chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
993                 hammer2_chain_unlock(hmp, parent);
994                 if (chain)
995                         break;
996                 pip = pip->pip; /* XXX SMP RACE */
997         }
998         *chainp = chain;
999         if (chain) {
1000                 KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INODE);
1001                 /* already locked */
1002                 return (0);
1003         } else {
1004                 return (EIO);
1005         }
1006 }
1007
1008 /*
1009  * Find the directory common to both fdip and tdip, hold and return
1010  * its inode.
1011  */
1012 hammer2_inode_t *
1013 hammer2_inode_common_parent(hammer2_mount_t *hmp,
1014                             hammer2_inode_t *fdip, hammer2_inode_t *tdip)
1015 {
1016         hammer2_inode_t *scan1;
1017         hammer2_inode_t *scan2;
1018
1019         /*
1020          * We used to have a depth field but it complicated matters too
1021          * much for directory renames.  So now its ugly.  Check for
1022          * simple cases before giving up and doing it the expensive way.
1023          *
1024          * XXX need a bottom-up topology stability lock
1025          */
1026         if (fdip == tdip || fdip == tdip->pip) {
1027                 hammer2_inode_ref(fdip);
1028                 return(fdip);
1029         }
1030         if (fdip->pip == tdip) {
1031                 hammer2_inode_ref(tdip);
1032                 return(tdip);
1033         }
1034         for (scan1 = fdip; scan1->pmp == fdip->pmp; scan1 = scan1->pip) {
1035                 scan2 = tdip;
1036                 while (scan2->pmp == tdip->pmp) {
1037                         if (scan1 == scan2) {
1038                                 hammer2_inode_ref(scan1);
1039                                 return(scan1);
1040                         }
1041                         scan2 = scan2->pip;
1042                 }
1043         }
1044         panic("hammer2_inode_common_parent: no common parent %p %p\n",
1045               fdip, tdip);
1046         /* NOT REACHED */
1047         return(NULL);
1048 }