usr.sbin/makefs/hammer2/hammer2_vfsops.c

   1 /*
   2  * SPDX-License-Identifier: BSD-3-Clause
   3  *
   4  * Copyright (c) 2022 Tomohiro Kusumi <tkusumi@netbsd.org>
   5  * Copyright (c) 2011-2022 The DragonFly Project.  All rights reserved.
   6  *
   7  * This code is derived from software contributed to The DragonFly Project
   8  * by Matthew Dillon <dillon@dragonflybsd.org>
   9  *
  10  * Redistribution and use in source and binary forms, with or without
  11  * modification, are permitted provided that the following conditions
  12  * are met:
  13  *
  14  * 1. Redistributions of source code must retain the above copyright
  15  *    notice, this list of conditions and the following disclaimer.
  16  * 2. Redistributions in binary form must reproduce the above copyright
  17  *    notice, this list of conditions and the following disclaimer in
  18  *    the documentation and/or other materials provided with the
  19  *    distribution.
  20  * 3. Neither the name of The DragonFly Project nor the names of its
  21  *    contributors may be used to endorse or promote products derived
  22  *    from this software without specific, prior written permission.
  23  *
  24  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  27  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
  28  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
  32  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  33  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  34  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  35  * SUCH DAMAGE.
  36  */
  37 /*
  38 #include <sys/param.h>
  39 #include <sys/systm.h>
  40 #include <sys/kernel.h>
  41 #include <sys/nlookup.h>
  42 #include <sys/vnode.h>
  43 #include <sys/mount.h>
  44 #include <sys/fcntl.h>
  45 #include <sys/vfsops.h>
  46 #include <sys/sysctl.h>
  47 #include <sys/socket.h>
  48 #include <sys/objcache.h>
  49 #include <sys/proc.h>
  50 #include <sys/lock.h>
  51 #include <sys/file.h>
  52 */
  53
  54 #include "hammer2.h"
  55
  56 TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
  57 static struct hammer2_mntlist hammer2_mntlist;
  58
  59 struct hammer2_pfslist hammer2_pfslist;
  60 struct hammer2_pfslist hammer2_spmplist;
  61 struct lock hammer2_mntlk;
  62
  63 int hammer2_supported_version = HAMMER2_VOL_VERSION_DEFAULT;
  64 int hammer2_debug;
  65 int hammer2_aux_flags;
  66 int hammer2_xop_nthreads;
  67 int hammer2_xop_sgroups;
  68 int hammer2_xop_xgroups;
  69 int hammer2_xop_xbase;
  70 int hammer2_xop_mod;
  71 long hammer2_debug_inode;
  72 int hammer2_cluster_meta_read = 1;      /* physical read-ahead */
  73 int hammer2_cluster_data_read = 4;      /* physical read-ahead */
  74 int hammer2_cluster_write = 0;          /* physical write clustering */
  75 int hammer2_dedup_enable = 1;
  76 int hammer2_always_compress = 0;        /* always try to compress */
  77 int hammer2_flush_pipe = 100;
  78 int hammer2_dio_count;
  79 int hammer2_dio_limit = 256;
  80 int hammer2_bulkfree_tps = 5000;
  81 int hammer2_spread_workers;
  82 int hammer2_limit_saved_depth;
  83 long hammer2_chain_allocs;
  84 long hammer2_limit_saved_chains;
  85 long hammer2_limit_dirty_chains;
  86 long hammer2_limit_dirty_inodes;
  87 long hammer2_count_modified_chains;
  88 long hammer2_iod_file_read;
  89 long hammer2_iod_meta_read;
  90 long hammer2_iod_indr_read;
  91 long hammer2_iod_fmap_read;
  92 long hammer2_iod_volu_read;
  93 long hammer2_iod_file_write;
  94 long hammer2_iod_file_wembed;
  95 long hammer2_iod_file_wzero;
  96 long hammer2_iod_file_wdedup;
  97 long hammer2_iod_meta_write;
  98 long hammer2_iod_indr_write;
  99 long hammer2_iod_fmap_write;
 100 long hammer2_iod_volu_write;
 101 static long hammer2_iod_inode_creates;
 102 static long hammer2_iod_inode_deletes;
 103
 104 long hammer2_process_icrc32;
 105 long hammer2_process_xxhash64;
 106
 107 int hz;
 108 int ticks;
 109 int64_t vnode_count;
 110
 111 MALLOC_DECLARE(M_HAMMER2_CBUFFER);
 112 MALLOC_DEFINE(M_HAMMER2_CBUFFER, "HAMMER2-compbuffer",
 113                 "Buffer used for compression.");
 114
 115 MALLOC_DECLARE(M_HAMMER2_DEBUFFER);
 116 MALLOC_DEFINE(M_HAMMER2_DEBUFFER, "HAMMER2-decompbuffer",
 117                 "Buffer used for decompression.");
 118
 119 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
 120
 121 SYSCTL_INT(_vfs_hammer2, OID_AUTO, supported_version, CTLFLAG_RD,
 122            &hammer2_supported_version, 0, "");
 123 SYSCTL_INT(_vfs_hammer2, OID_AUTO, aux_flags, CTLFLAG_RW,
 124            &hammer2_aux_flags, 0, "");
 125 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
 126            &hammer2_debug, 0, "");
 127 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, debug_inode, CTLFLAG_RW,
 128            &hammer2_debug_inode, 0, "");
 129 SYSCTL_INT(_vfs_hammer2, OID_AUTO, spread_workers, CTLFLAG_RW,
 130            &hammer2_spread_workers, 0, "");
 131 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_meta_read, CTLFLAG_RW,
 132            &hammer2_cluster_meta_read, 0, "");
 133 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_data_read, CTLFLAG_RW,
 134            &hammer2_cluster_data_read, 0, "");
 135 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_write, CTLFLAG_RW,
 136            &hammer2_cluster_write, 0, "");
 137 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW,
 138            &hammer2_dedup_enable, 0, "");
 139 SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW,
 140            &hammer2_always_compress, 0, "");
 141 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
 142            &hammer2_flush_pipe, 0, "");
 143 SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW,
 144            &hammer2_bulkfree_tps, 0, "");
 145 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW,
 146            &hammer2_chain_allocs, 0, "");
 147 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_saved_chains, CTLFLAG_RW,
 148            &hammer2_limit_saved_chains, 0, "");
 149 SYSCTL_INT(_vfs_hammer2, OID_AUTO, limit_saved_depth, CTLFLAG_RW,
 150            &hammer2_limit_saved_depth, 0, "");
 151 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
 152            &hammer2_limit_dirty_chains, 0, "");
 153 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_inodes, CTLFLAG_RW,
 154            &hammer2_limit_dirty_inodes, 0, "");
 155 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW,
 156            &hammer2_count_modified_chains, 0, "");
 157 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
 158            &hammer2_dio_count, 0, "");
 159 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW,
 160            &hammer2_dio_limit, 0, "");
 161
 162 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
 163            &hammer2_iod_file_read, 0, "");
 164 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
 165            &hammer2_iod_meta_read, 0, "");
 166 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
 167            &hammer2_iod_indr_read, 0, "");
 168 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
 169            &hammer2_iod_fmap_read, 0, "");
 170 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
 171            &hammer2_iod_volu_read, 0, "");
 172
 173 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
 174            &hammer2_iod_file_write, 0, "");
 175 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wembed, CTLFLAG_RW,
 176            &hammer2_iod_file_wembed, 0, "");
 177 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wzero, CTLFLAG_RW,
 178            &hammer2_iod_file_wzero, 0, "");
 179 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wdedup, CTLFLAG_RW,
 180            &hammer2_iod_file_wdedup, 0, "");
 181 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
 182            &hammer2_iod_meta_write, 0, "");
 183 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
 184            &hammer2_iod_indr_write, 0, "");
 185 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
 186            &hammer2_iod_fmap_write, 0, "");
 187 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
 188            &hammer2_iod_volu_write, 0, "");
 189 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_creates, CTLFLAG_RW,
 190            &hammer2_iod_inode_creates, 0, "");
 191 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_deletes, CTLFLAG_RW,
 192            &hammer2_iod_inode_deletes, 0, "");
 193
 194 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW,
 195            &hammer2_process_icrc32, 0, "");
 196 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW,
 197            &hammer2_process_xxhash64, 0, "");
 198
 199 /*
 200 static int hammer2_vfs_init(struct vfsconf *conf);
 201 static int hammer2_vfs_uninit(struct vfsconf *vfsp);
 202 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
 203                                 struct ucred *cred);
 204 static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
 205                                 struct ucred *);
 206 */
 207 static int hammer2_recovery(hammer2_dev_t *hmp);
 208 /*
 209 static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
 210 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
 211 */
 212 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
 213                                 struct ucred *cred);
 214 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
 215                                 struct ucred *cred);
 216 /*
 217 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
 218                                 struct fid *fhp, struct vnode **vpp);
 219 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
 220 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
 221                                 int *exflagsp, struct ucred **credanonp);
 222 static int hammer2_vfs_modifying(struct mount *mp);
 223 */
 224
 225 static void hammer2_update_pmps(hammer2_dev_t *hmp);
 226
 227 static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
 228 static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
 229                                 hammer2_dev_t *hmp);
 230 static int hammer2_fixup_pfses(hammer2_dev_t *hmp);
 231
 232 /*
 233  * HAMMER2 vfs operations.
 234  */
 235 /*
 236 static struct vfsops hammer2_vfsops = {
 237         .vfs_flags      = 0,
 238         .vfs_init       = hammer2_vfs_init,
 239         .vfs_uninit     = hammer2_vfs_uninit,
 240         .vfs_sync       = hammer2_vfs_sync,
 241         .vfs_mount      = hammer2_vfs_mount,
 242         .vfs_unmount    = hammer2_vfs_unmount,
 243         .vfs_root       = hammer2_vfs_root,
 244         .vfs_statfs     = hammer2_vfs_statfs,
 245         .vfs_statvfs    = hammer2_vfs_statvfs,
 246         .vfs_vget       = hammer2_vfs_vget,
 247         .vfs_vptofh     = hammer2_vfs_vptofh,
 248         .vfs_fhtovp     = hammer2_vfs_fhtovp,
 249         .vfs_checkexp   = hammer2_vfs_checkexp,
 250         .vfs_modifying  = hammer2_vfs_modifying
 251 };
 252 */
 253
 254 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
 255
 256 VFS_SET(hammer2_vfsops, hammer2, VFCF_MPSAFE);
 257 MODULE_VERSION(hammer2, 1);
 258
 259 int
 260 hammer2_vfs_init(void)
 261 {
 262         /*
 263         static struct objcache_malloc_args margs_read;
 264         static struct objcache_malloc_args margs_write;
 265         static struct objcache_malloc_args margs_vop;
 266         */
 267
 268         int error;
 269         int mod;
 270
 271         error = 0;
 272         kmalloc_raise_limit(M_HAMMER2, 0);      /* unlimited */
 273
 274         /*
 275          * hammer2_xop_nthreads must be a multiple of ncpus,
 276          * minimum 2 * ncpus.
 277          */
 278         const int ncpus = 1;
 279         mod = ncpus;
 280         hammer2_xop_mod = mod;
 281         hammer2_xop_nthreads = mod * 2;
 282         /*
 283         while (hammer2_xop_nthreads / mod < HAMMER2_XOPGROUPS_MIN ||
 284                hammer2_xop_nthreads < HAMMER2_XOPTHREADS_MIN)
 285         {
 286                 hammer2_xop_nthreads += mod;
 287         }
 288         hammer2_xop_sgroups = hammer2_xop_nthreads / mod / 2;
 289         hammer2_xop_xgroups = hammer2_xop_nthreads / mod - hammer2_xop_sgroups;
 290         hammer2_xop_xbase = hammer2_xop_sgroups * mod;
 291         */
 292
 293         /*
 294          * A large DIO cache is needed to retain dedup enablement masks.
 295          * The bulkfree code clears related masks as part of the disk block
 296          * recycling algorithm, preventing it from being used for a later
 297          * dedup.
 298          *
 299          * NOTE: A large buffer cache can actually interfere with dedup
 300          *       operation because we dedup based on media physical buffers
 301          *       and not logical buffers.  Try to make the DIO case large
 302          *       enough to avoid this problem, but also cap it.
 303          */
 304         const long nbuf = 100000; /* XXX */
 305         hammer2_dio_limit = nbuf * 2;
 306         if (hammer2_dio_limit > 100000)
 307                 hammer2_dio_limit = 100000;
 308
 309         if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
 310                 error = EINVAL;
 311         if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
 312                 error = EINVAL;
 313         if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
 314                 error = EINVAL;
 315
 316         if (error) {
 317                 kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
 318                 return (error);
 319         }
 320
 321 #if 0
 322         margs_read.objsize = 65536;
 323         margs_read.mtype = M_HAMMER2_DEBUFFER;
 324
 325         margs_write.objsize = 32768;
 326         margs_write.mtype = M_HAMMER2_CBUFFER;
 327
 328         margs_vop.objsize = sizeof(hammer2_xop_t);
 329         margs_vop.mtype = M_HAMMER2;
 330
 331         /*
 332          * Note thaht for the XOPS cache we want backing store allocations
 333          * to use M_ZERO.  This is not allowed in objcache_get() (to avoid
 334          * confusion), so use the backing store function that does it.  This
 335          * means that initial XOPS objects are zerod but REUSED objects are
 336          * not.  So we are responsible for cleaning the object up sufficiently
 337          * for our needs before objcache_put()ing it back (typically just the
 338          * FIFO indices).
 339          */
 340         cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
 341                                 0, 1, NULL, NULL, NULL,
 342                                 objcache_malloc_alloc,
 343                                 objcache_malloc_free,
 344                                 &margs_read);
 345         cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
 346                                 0, 1, NULL, NULL, NULL,
 347                                 objcache_malloc_alloc,
 348                                 objcache_malloc_free,
 349                                 &margs_write);
 350         cache_xops = objcache_create(margs_vop.mtype->ks_shortdesc,
 351                                 0, 1, NULL, NULL, NULL,
 352                                 objcache_malloc_alloc_zero,
 353                                 objcache_malloc_free,
 354                                 &margs_vop);
 355 #endif
 356
 357
 358         lockinit(&hammer2_mntlk, "mntlk", 0, 0);
 359         TAILQ_INIT(&hammer2_mntlist);
 360         TAILQ_INIT(&hammer2_pfslist);
 361         TAILQ_INIT(&hammer2_spmplist);
 362
 363         const int maxvnodes = 100000; /* XXX */
 364         hammer2_limit_dirty_chains = maxvnodes / 10;
 365         if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS)
 366                 hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS;
 367         if (hammer2_limit_dirty_chains < 1000)
 368                 hammer2_limit_dirty_chains = 1000;
 369
 370         hammer2_limit_dirty_inodes = maxvnodes / 25;
 371         if (hammer2_limit_dirty_inodes < 100)
 372                 hammer2_limit_dirty_inodes = 100;
 373         if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES)
 374                 hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES;
 375
 376         hammer2_limit_saved_chains = hammer2_limit_dirty_chains * 5;
 377
 378         return (error);
 379 }
 380
 381 int
 382 hammer2_vfs_uninit(void)
 383 {
 384         /*
 385         objcache_destroy(cache_buffer_read);
 386         objcache_destroy(cache_buffer_write);
 387         objcache_destroy(cache_xops);
 388         */
 389         return 0;
 390 }
 391
 392 /*
 393  * Core PFS allocator.  Used to allocate or reference the pmp structure
 394  * for PFS cluster mounts and the spmp structure for media (hmp) structures.
 395  * The pmp can be passed in or loaded by this function using the chain and
 396  * inode data.
 397  *
 398  * pmp->modify_tid tracks new modify_tid transaction ids for front-end
 399  * transactions.  Note that synchronization does not use this field.
 400  * (typically frontend operations and synchronization cannot run on the
 401  * same PFS node at the same time).
 402  *
 403  * XXX check locking
 404  */
 405 hammer2_pfs_t *
 406 hammer2_pfsalloc(hammer2_chain_t *chain,
 407                  const hammer2_inode_data_t *ripdata,
 408                  hammer2_tid_t modify_tid, hammer2_dev_t *force_local)
 409 {
 410         hammer2_pfs_t *pmp;
 411         hammer2_inode_t *iroot;
 412         int count;
 413         int i;
 414         int j;
 415
 416         pmp = NULL;
 417
 418         /*
 419          * Locate or create the PFS based on the cluster id.  If ripdata
 420          * is NULL this is a spmp which is unique and is always allocated.
 421          *
 422          * If the device is mounted in local mode all PFSs are considered
 423          * independent and not part of any cluster (for debugging only).
 424          */
 425         if (ripdata) {
 426                 TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
 427                         if (force_local != pmp->force_local)
 428                                 continue;
 429                         if (force_local == NULL &&
 430                             bcmp(&pmp->pfs_clid, &ripdata->meta.pfs_clid,
 431                                  sizeof(pmp->pfs_clid)) == 0) {
 432                                         break;
 433                         } else if (force_local && pmp->pfs_names[0] &&
 434                             strcmp(pmp->pfs_names[0], ripdata->filename) == 0) {
 435                                         break;
 436                         }
 437                 }
 438         }
 439
 440         if (pmp == NULL) {
 441                 pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
 442                 pmp->force_local = force_local;
 443                 hammer2_trans_manage_init(pmp);
 444                 kmalloc_create_obj(&pmp->minode, "HAMMER2-inodes",
 445                                    sizeof(struct hammer2_inode));
 446                 lockinit(&pmp->lock, "pfslk", 0, 0);
 447                 hammer2_spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
 448                 hammer2_spin_init(&pmp->xop_spin, "h2xop");
 449                 hammer2_spin_init(&pmp->lru_spin, "h2lru");
 450                 RB_INIT(&pmp->inum_tree);
 451                 TAILQ_INIT(&pmp->syncq);
 452                 TAILQ_INIT(&pmp->depq);
 453                 TAILQ_INIT(&pmp->lru_list);
 454                 hammer2_spin_init(&pmp->list_spin, "h2pfsalloc_list");
 455
 456                 /*
 457                  * Save the last media transaction id for the flusher.  Set
 458                  * initial
 459                  */
 460                 if (ripdata) {
 461                         pmp->pfs_clid = ripdata->meta.pfs_clid;
 462                         TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);
 463                 } else {
 464                         pmp->flags |= HAMMER2_PMPF_SPMP;
 465                         TAILQ_INSERT_TAIL(&hammer2_spmplist, pmp, mntentry);
 466                 }
 467
 468                 /*
 469                  * The synchronization thread may start too early, make
 470                  * sure it stays frozen until we are ready to let it go.
 471                  * XXX
 472                  */
 473                 /*
 474                 pmp->primary_thr.flags = HAMMER2_THREAD_FROZEN |
 475                                          HAMMER2_THREAD_REMASTER;
 476                 */
 477         }
 478
 479         /*
 480          * Create the PFS's root inode and any missing XOP helper threads.
 481          */
 482         if ((iroot = pmp->iroot) == NULL) {
 483                 iroot = hammer2_inode_get(pmp, NULL, 1, -1);
 484                 if (ripdata)
 485                         iroot->meta = ripdata->meta;
 486                 pmp->iroot = iroot;
 487                 hammer2_inode_ref(iroot);
 488                 hammer2_inode_unlock(iroot);
 489         }
 490
 491         /*
 492          * Stop here if no chain is passed in.
 493          */
 494         if (chain == NULL)
 495                 goto done;
 496
 497         /*
 498          * When a chain is passed in we must add it to the PFS's root
 499          * inode, update pmp->pfs_types[], and update the syncronization
 500          * threads.
 501          *
 502          * When forcing local mode, mark the PFS as a MASTER regardless.
 503          *
 504          * At the moment empty spots can develop due to removals or failures.
 505          * Ultimately we want to re-fill these spots but doing so might
 506          * confused running code. XXX
 507          */
 508         hammer2_inode_ref(iroot);
 509         hammer2_mtx_ex(&iroot->lock);
 510         j = iroot->cluster.nchains;
 511
 512         if (j == HAMMER2_MAXCLUSTER) {
 513                 kprintf("hammer2_pfsalloc: cluster full!\n");
 514                 /* XXX fatal error? */
 515         } else {
 516                 KKASSERT(chain->pmp == NULL);
 517                 chain->pmp = pmp;
 518                 hammer2_chain_ref(chain);
 519                 iroot->cluster.array[j].chain = chain;
 520                 if (force_local)
 521                         pmp->pfs_types[j] = HAMMER2_PFSTYPE_MASTER;
 522                 else
 523                         pmp->pfs_types[j] = ripdata->meta.pfs_type;
 524                 pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2);
 525                 pmp->pfs_hmps[j] = chain->hmp;
 526                 hammer2_spin_ex(&pmp->inum_spin);
 527                 pmp->pfs_iroot_blocksets[j] = chain->data->ipdata.u.blockset;
 528                 hammer2_spin_unex(&pmp->inum_spin);
 529
 530                 /*
 531                  * If the PFS is already mounted we must account
 532                  * for the mount_count here.
 533                  */
 534                 if (pmp->mp)
 535                         ++chain->hmp->mount_count;
 536
 537                 /*
 538                  * May have to fixup dirty chain tracking.  Previous
 539                  * pmp was NULL so nothing to undo.
 540                  */
 541                 if (chain->flags & HAMMER2_CHAIN_MODIFIED)
 542                         hammer2_pfs_memory_inc(pmp);
 543                 ++j;
 544         }
 545         iroot->cluster.nchains = j;
 546
 547         /*
 548          * Update nmasters from any PFS inode which is part of the cluster.
 549          * It is possible that this will result in a value which is too
 550          * high.  MASTER PFSs are authoritative for pfs_nmasters and will
 551          * override this value later on.
 552          *
 553          * (This informs us of masters that might not currently be
 554          *  discoverable by this mount).
 555          */
 556         if (ripdata && pmp->pfs_nmasters < ripdata->meta.pfs_nmasters) {
 557                 pmp->pfs_nmasters = ripdata->meta.pfs_nmasters;
 558         }
 559
 560         /*
 561          * Count visible masters.  Masters are usually added with
 562          * ripdata->meta.pfs_nmasters set to 1.  This detects when there
 563          * are more (XXX and must update the master inodes).
 564          */
 565         count = 0;
 566         for (i = 0; i < iroot->cluster.nchains; ++i) {
 567                 if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)
 568                         ++count;
 569         }
 570         if (pmp->pfs_nmasters < count)
 571                 pmp->pfs_nmasters = count;
 572
 573         /*
 574          * Create missing synchronization and support threads.
 575          *
 576          * Single-node masters (including snapshots) have nothing to
 577          * synchronize and do not require this thread.
 578          *
 579          * Multi-node masters or any number of soft masters, slaves, copy,
 580          * or other PFS types need the thread.
 581          *
 582          * Each thread is responsible for its particular cluster index.
 583          * We use independent threads so stalls or mismatches related to
 584          * any given target do not affect other targets.
 585          */
 586         for (i = 0; i < iroot->cluster.nchains; ++i) {
 587                 /*
 588                  * Single-node masters (including snapshots) have nothing
 589                  * to synchronize and will make direct xops support calls,
 590                  * thus they do not require this thread.
 591                  *
 592                  * Note that there can be thousands of snapshots.  We do not
 593                  * want to create thousands of threads.
 594                  */
 595                 if (pmp->pfs_nmasters <= 1 &&
 596                     pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) {
 597                         continue;
 598                 }
 599
 600                 /*
 601                  * Sync support thread
 602                  */
 603                 /*
 604                 if (pmp->sync_thrs[i].td == NULL) {
 605                         hammer2_thr_create(&pmp->sync_thrs[i], pmp, NULL,
 606                                            "h2nod", i, -1,
 607                                            hammer2_primary_sync_thread);
 608                 }
 609                 */
 610         }
 611
 612         /*
 613          * Create missing Xop threads
 614          *
 615          * NOTE: We create helper threads for all mounted PFSs or any
 616          *       PFSs with 2+ nodes (so the sync thread can update them,
 617          *       even if not mounted).
 618          */
 619         if (pmp->mp || iroot->cluster.nchains >= 2)
 620                 hammer2_xop_helper_create(pmp);
 621
 622         hammer2_mtx_unlock(&iroot->lock);
 623         hammer2_inode_drop(iroot);
 624 done:
 625         return pmp;
 626 }
 627
 628 /*
 629  * Deallocate an element of a probed PFS.  If destroying and this is a
 630  * MASTER, adjust nmasters.
 631  *
 632  * This function does not physically destroy the PFS element in its device
 633  * under the super-root  (see hammer2_ioctl_pfs_delete()).
 634  */
 635 void
 636 hammer2_pfsdealloc(hammer2_pfs_t *pmp, int clindex, int destroying)
 637 {
 638         hammer2_inode_t *iroot;
 639         hammer2_chain_t *chain;
 640         int j;
 641
 642         /*
 643          * Cleanup our reference on iroot.  iroot is (should) not be needed
 644          * by the flush code.
 645          */
 646         iroot = pmp->iroot;
 647         if (iroot) {
 648                 /*
 649                  * Stop synchronizing
 650                  *
 651                  * XXX flush after acquiring the iroot lock.
 652                  * XXX clean out the cluster index from all inode structures.
 653                  */
 654                 hammer2_thr_delete(&pmp->sync_thrs[clindex]);
 655
 656                 /*
 657                  * Remove the cluster index from the group.  If destroying
 658                  * the PFS and this is a master, adjust pfs_nmasters.
 659                  */
 660                 hammer2_mtx_ex(&iroot->lock);
 661                 chain = iroot->cluster.array[clindex].chain;
 662                 iroot->cluster.array[clindex].chain = NULL;
 663
 664                 switch(pmp->pfs_types[clindex]) {
 665                 case HAMMER2_PFSTYPE_MASTER:
 666                         if (destroying && pmp->pfs_nmasters > 0)
 667                                 --pmp->pfs_nmasters;
 668                         /* XXX adjust ripdata->meta.pfs_nmasters */
 669                         break;
 670                 default:
 671                         break;
 672                 }
 673                 pmp->pfs_types[clindex] = HAMMER2_PFSTYPE_NONE;
 674
 675                 hammer2_mtx_unlock(&iroot->lock);
 676
 677                 /*
 678                  * Release the chain.
 679                  */
 680                 if (chain) {
 681                         atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
 682                         hammer2_chain_drop(chain);
 683                 }
 684
 685                 /*
 686                  * Terminate all XOP threads for the cluster index.
 687                  */
 688                 if (pmp->xop_groups) {
 689                         for (j = 0; j < hammer2_xop_nthreads; ++j) {
 690                                 hammer2_thr_delete(
 691                                         &pmp->xop_groups[j].thrs[clindex]);
 692                         }
 693                 }
 694         }
 695 }
 696
 697 /*
 698  * Destroy a PFS, typically only occurs after the last mount on a device
 699  * has gone away.
 700  */
 701 static void
 702 hammer2_pfsfree(hammer2_pfs_t *pmp)
 703 {
 704         hammer2_inode_t *iroot;
 705         hammer2_chain_t *chain;
 706         int chains_still_present = 0;
 707         int i;
 708         //int j;
 709
 710         /*
 711          * Cleanup our reference on iroot.  iroot is (should) not be needed
 712          * by the flush code.
 713          */
 714         if (pmp->flags & HAMMER2_PMPF_SPMP)
 715                 TAILQ_REMOVE(&hammer2_spmplist, pmp, mntentry);
 716         else
 717                 TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);
 718
 719         /*
 720          * Cleanup chains remaining on LRU list.
 721          */
 722         hammer2_spin_ex(&pmp->lru_spin);
 723         while ((chain = TAILQ_FIRST(&pmp->lru_list)) != NULL) {
 724                 KKASSERT(chain->flags & HAMMER2_CHAIN_ONLRU);
 725                 atomic_add_int(&pmp->lru_count, -1);
 726                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
 727                 TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
 728                 hammer2_chain_ref(chain);
 729                 hammer2_spin_unex(&pmp->lru_spin);
 730                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
 731                 hammer2_chain_drop(chain);
 732                 hammer2_spin_ex(&pmp->lru_spin);
 733         }
 734         hammer2_spin_unex(&pmp->lru_spin);
 735
 736         /*
 737          * Clean up iroot
 738          */
 739         iroot = pmp->iroot;
 740         if (iroot) {
 741                 for (i = 0; i < iroot->cluster.nchains; ++i) {
 742                         /*
 743                         hammer2_thr_delete(&pmp->sync_thrs[i]);
 744                         if (pmp->xop_groups) {
 745                                 for (j = 0; j < hammer2_xop_nthreads; ++j)
 746                                         hammer2_thr_delete(
 747                                                 &pmp->xop_groups[j].thrs[i]);
 748                         }
 749                         */
 750                         chain = iroot->cluster.array[i].chain;
 751                         if (chain && !RB_EMPTY(&chain->core.rbtree)) {
 752                                 kprintf("hammer2: Warning pmp %p still "
 753                                         "has active chains\n", pmp);
 754                                 chains_still_present = 1;
 755                         }
 756                 }
 757                 KASSERT(iroot->refs == 1,
 758                         ("PMP->IROOT %p REFS WRONG %d", iroot, iroot->refs));
 759
 760                 /* ref for iroot */
 761                 hammer2_inode_drop(iroot);
 762                 pmp->iroot = NULL;
 763         }
 764
 765         /*
 766          * Free remaining pmp resources
 767          */
 768         if (chains_still_present) {
 769                 kprintf("hammer2: cannot free pmp %p, still in use\n", pmp);
 770         } else {
 771                 /*
 772                  * In makefs HAMMER2, all inodes must be gone at this point.
 773                  * XXX vnode_count may not be 0 at this point.
 774                  */
 775                 assert(hammer2_pfs_inode_count(pmp) == 0);
 776
 777                 kmalloc_destroy_obj(&pmp->minode);
 778                 kfree(pmp, M_HAMMER2);
 779         }
 780 }
 781
 782 /*
 783  * Remove all references to hmp from the pfs list.  Any PFS which becomes
 784  * empty is terminated and freed.
 785  *
 786  * XXX inefficient.
 787  */
 788 static void
 789 hammer2_pfsfree_scan(hammer2_dev_t *hmp, int which)
 790 {
 791         hammer2_pfs_t *pmp;
 792         hammer2_inode_t *iroot;
 793         hammer2_chain_t *rchain;
 794         int i;
 795         //int j;
 796         struct hammer2_pfslist *wlist;
 797
 798         if (which == 0)
 799                 wlist = &hammer2_pfslist;
 800         else
 801                 wlist = &hammer2_spmplist;
 802 again:
 803         TAILQ_FOREACH(pmp, wlist, mntentry) {
 804                 if ((iroot = pmp->iroot) == NULL)
 805                         continue;
 806
 807                 /*
 808                  * Determine if this PFS is affected.  If it is we must
 809                  * freeze all management threads and lock its iroot.
 810                  *
 811                  * Freezing a management thread forces it idle, operations
 812                  * in-progress will be aborted and it will have to start
 813                  * over again when unfrozen, or exit if told to exit.
 814                  */
 815                 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
 816                         if (pmp->pfs_hmps[i] == hmp)
 817                                 break;
 818                 }
 819                 if (i == HAMMER2_MAXCLUSTER)
 820                         continue;
 821
 822                 hammer2_vfs_sync_pmp(pmp, MNT_WAIT);
 823
 824                 /*
 825                  * Make sure all synchronization threads are locked
 826                  * down.
 827                  */
 828                 /*
 829                 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
 830                         if (pmp->pfs_hmps[i] == NULL)
 831                                 continue;
 832                         hammer2_thr_freeze_async(&pmp->sync_thrs[i]);
 833                         if (pmp->xop_groups) {
 834                                 for (j = 0; j < hammer2_xop_nthreads; ++j) {
 835                                         hammer2_thr_freeze_async(
 836                                                 &pmp->xop_groups[j].thrs[i]);
 837                                 }
 838                         }
 839                 }
 840                 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
 841                         if (pmp->pfs_hmps[i] == NULL)
 842                                 continue;
 843                         hammer2_thr_freeze(&pmp->sync_thrs[i]);
 844                         if (pmp->xop_groups) {
 845                                 for (j = 0; j < hammer2_xop_nthreads; ++j) {
 846                                         hammer2_thr_freeze(
 847                                                 &pmp->xop_groups[j].thrs[i]);
 848                                 }
 849                         }
 850                 }
 851                 */
 852
 853                 /*
 854                  * Lock the inode and clean out matching chains.
 855                  * Note that we cannot use hammer2_inode_lock_*()
 856                  * here because that would attempt to validate the
 857                  * cluster that we are in the middle of ripping
 858                  * apart.
 859                  *
 860                  * WARNING! We are working directly on the inodes
 861                  *          embedded cluster.
 862                  */
 863                 hammer2_mtx_ex(&iroot->lock);
 864
 865                 /*
 866                  * Remove the chain from matching elements of the PFS.
 867                  */
 868                 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
 869                         if (pmp->pfs_hmps[i] != hmp)
 870                                 continue;
 871                         /*
 872                         hammer2_thr_delete(&pmp->sync_thrs[i]);
 873                         if (pmp->xop_groups) {
 874                                 for (j = 0; j < hammer2_xop_nthreads; ++j) {
 875                                         hammer2_thr_delete(
 876                                                 &pmp->xop_groups[j].thrs[i]);
 877                                 }
 878                         }
 879                         */
 880                         rchain = iroot->cluster.array[i].chain;
 881                         iroot->cluster.array[i].chain = NULL;
 882                         pmp->pfs_types[i] = 0;
 883                         if (pmp->pfs_names[i]) {
 884                                 kfree(pmp->pfs_names[i], M_HAMMER2);
 885                                 pmp->pfs_names[i] = NULL;
 886                         }
 887                         if (rchain) {
 888                                 hammer2_chain_drop(rchain);
 889                                 /* focus hint */
 890                                 if (iroot->cluster.focus == rchain)
 891                                         iroot->cluster.focus = NULL;
 892                         }
 893                         pmp->pfs_hmps[i] = NULL;
 894                 }
 895                 hammer2_mtx_unlock(&iroot->lock);
 896
 897                 /*
 898                  * Cleanup trailing chains.  Gaps may remain.
 899                  */
 900                 for (i = HAMMER2_MAXCLUSTER - 1; i >= 0; --i) {
 901                         if (pmp->pfs_hmps[i])
 902                                 break;
 903                 }
 904                 iroot->cluster.nchains = i + 1;
 905
 906                 /*
 907                  * If the PMP has no elements remaining we can destroy it.
 908                  * (this will transition management threads from frozen->exit).
 909                  */
 910                 if (iroot->cluster.nchains == 0) {
 911                         /*
 912                          * If this was the hmp's spmp, we need to clean
 913                          * a little more stuff out.
 914                          */
 915                         if (hmp->spmp == pmp) {
 916                                 hmp->spmp = NULL;
 917                                 hmp->vchain.pmp = NULL;
 918                                 hmp->fchain.pmp = NULL;
 919                         }
 920
 921                         /*
 922                          * Free the pmp and restart the loop
 923                          */
 924                         KKASSERT(TAILQ_EMPTY(&pmp->syncq));
 925                         KKASSERT(TAILQ_EMPTY(&pmp->depq));
 926                         hammer2_pfsfree(pmp);
 927                         goto again;
 928                 }
 929
 930                 /*
 931                  * If elements still remain we need to set the REMASTER
 932                  * flag and unfreeze it.
 933                  */
 934                 for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
 935                         if (pmp->pfs_hmps[i] == NULL)
 936                                 continue;
 937                         /*
 938                         hammer2_thr_remaster(&pmp->sync_thrs[i]);
 939                         hammer2_thr_unfreeze(&pmp->sync_thrs[i]);
 940                         if (pmp->xop_groups) {
 941                                 for (j = 0; j < hammer2_xop_nthreads; ++j) {
 942                                         hammer2_thr_remaster(
 943                                                 &pmp->xop_groups[j].thrs[i]);
 944                                         hammer2_thr_unfreeze(
 945                                                 &pmp->xop_groups[j].thrs[i]);
 946                                 }
 947                         }
 948                         */
 949                 }
 950         }
 951 }
 952
 953 /*
 954  * Mount or remount HAMMER2 filesystem from physical media
 955  *
 956  *      mountroot
 957  *              mp              mount point structure
 958  *              path            NULL
 959  *              data            <unused>
 960  *              cred            <unused>
 961  *
 962  *      mount
 963  *              mp              mount point structure
 964  *              path            path to mount point
 965  *              data            pointer to argument structure in user space
 966  *                      volume  volume path (device@LABEL form)
 967  *                      hflags  user mount flags
 968  *              cred            user credentials
 969  *
 970  * RETURNS:     0       Success
 971  *              !0      error number
 972  */
 973 int
 974 hammer2_vfs_mount(struct vnode *makefs_devvp, struct mount *mp,
 975                   const char *label, const struct hammer2_mount_info *mi)
 976 {
 977         struct hammer2_mount_info info = *mi;
 978         hammer2_pfs_t *pmp;
 979         hammer2_pfs_t *spmp;
 980         hammer2_dev_t *hmp, *hmp_tmp;
 981         hammer2_dev_t *force_local;
 982         hammer2_key_t key_next;
 983         hammer2_key_t key_dummy;
 984         hammer2_key_t lhc;
 985         hammer2_chain_t *parent;
 986         hammer2_chain_t *chain;
 987         const hammer2_inode_data_t *ripdata;
 988         hammer2_blockref_t bref;
 989         hammer2_devvp_list_t devvpl;
 990         hammer2_devvp_t *e, *e_tmp;
 991         char *dev;
 992         int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
 993         int error;
 994         int i;
 995
 996         hmp = NULL;
 997         pmp = NULL;
 998         dev = NULL;
 999
1000         kprintf("hammer2_mount: dev=\"%s\" label=\"%s\" rdonly=%d\n",
1001                 dev, label, ronly);
1002
1003         /*
1004          * Initialize all device vnodes.
1005          */
1006         TAILQ_INIT(&devvpl);
1007         error = hammer2_init_devvp(makefs_devvp, &devvpl);
1008         if (error) {
1009                 kprintf("hammer2: failed to initialize devvp in %s\n", dev);
1010                 hammer2_cleanup_devvp(&devvpl);
1011                 return error;
1012         }
1013
1014         /*
1015          * Determine if the device has already been mounted.  After this
1016          * check hmp will be non-NULL if we are doing the second or more
1017          * hammer2 mounts from the same device.
1018          */
1019         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1020         if (!TAILQ_EMPTY(&devvpl)) {
1021                 /*
1022                  * Match the device.  Due to the way devfs works,
1023                  * we may not be able to directly match the vnode pointer,
1024                  * so also check to see if the underlying device matches.
1025                  */
1026                 TAILQ_FOREACH(hmp_tmp, &hammer2_mntlist, mntentry) {
1027                         TAILQ_FOREACH(e_tmp, &hmp_tmp->devvpl, entry) {
1028                                 int devvp_found = 0;
1029                                 TAILQ_FOREACH(e, &devvpl, entry) {
1030                                         KKASSERT(e->devvp);
1031                                         if (e_tmp->devvp == e->devvp)
1032                                                 devvp_found = 1;
1033                                         /*
1034                                         if (e_tmp->devvp->v_rdev &&
1035                                             e_tmp->devvp->v_rdev == e->devvp->v_rdev)
1036                                                 devvp_found = 1;
1037                                         */
1038                                 }
1039                                 if (!devvp_found)
1040                                         goto next_hmp;
1041                         }
1042                         hmp = hmp_tmp;
1043                         kprintf("hammer2_mount: hmp=%p matched\n", hmp);
1044                         break;
1045 next_hmp:
1046                         continue;
1047                 }
1048
1049                 /*
1050                  * If no match this may be a fresh H2 mount, make sure
1051                  * the device is not mounted on anything else.
1052                  */
1053                 if (hmp == NULL) {
1054                         TAILQ_FOREACH(e, &devvpl, entry) {
1055                                 struct vnode *devvp = e->devvp;
1056                                 KKASSERT(devvp);
1057                                 error = vfs_mountedon(devvp);
1058                                 if (error) {
1059                                         kprintf("hammer2_mount: %s mounted %d\n",
1060                                                 e->path, error);
1061                                         hammer2_cleanup_devvp(&devvpl);
1062                                         lockmgr(&hammer2_mntlk, LK_RELEASE);
1063                                         return error;
1064                                 }
1065                         }
1066                 }
1067         } else {
1068                 /*
1069                  * Match the label to a pmp already probed.
1070                  */
1071                 TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
1072                         for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
1073                                 if (pmp->pfs_names[i] &&
1074                                     strcmp(pmp->pfs_names[i], label) == 0) {
1075                                         hmp = pmp->pfs_hmps[i];
1076                                         break;
1077                                 }
1078                         }
1079                         if (hmp)
1080                                 break;
1081                 }
1082                 if (hmp == NULL) {
1083                         kprintf("hammer2_mount: PFS label \"%s\" not found\n",
1084                                 label);
1085                         hammer2_cleanup_devvp(&devvpl);
1086                         lockmgr(&hammer2_mntlk, LK_RELEASE);
1087                         return ENOENT;
1088                 }
1089         }
1090
1091         /*
1092          * Open the device if this isn't a secondary mount and construct
1093          * the H2 device mount (hmp).
1094          */
1095         if (hmp == NULL) {
1096                 hammer2_chain_t *schain;
1097                 hammer2_xop_head_t xop;
1098
1099                 /*
1100                  * Now open the device
1101                  */
1102                 KKASSERT(!TAILQ_EMPTY(&devvpl));
1103                 if (error == 0) {
1104                         error = hammer2_open_devvp(&devvpl, ronly);
1105                         if (error) {
1106                                 hammer2_close_devvp(&devvpl, ronly);
1107                                 hammer2_cleanup_devvp(&devvpl);
1108                                 lockmgr(&hammer2_mntlk, LK_RELEASE);
1109                                 return error;
1110                         }
1111                 }
1112
1113                 /*
1114                  * Construct volumes and link with device vnodes.
1115                  */
1116                 hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
1117                 hmp->devvp = NULL;
1118                 error = hammer2_init_vfsvolumes(mp, &devvpl, hmp->volumes,
1119                                              &hmp->voldata, &hmp->volhdrno,
1120                                              &hmp->devvp);
1121                 if (error) {
1122                         hammer2_close_devvp(&devvpl, ronly);
1123                         hammer2_cleanup_devvp(&devvpl);
1124                         lockmgr(&hammer2_mntlk, LK_RELEASE);
1125                         kfree(hmp, M_HAMMER2);
1126                         return error;
1127                 }
1128                 if (!hmp->devvp) {
1129                         kprintf("hammer2: failed to initialize root volume\n");
1130                         hammer2_unmount_helper(mp, NULL, hmp);
1131                         lockmgr(&hammer2_mntlk, LK_RELEASE);
1132                         hammer2_vfs_unmount(mp, MNT_FORCE);
1133                         return EINVAL;
1134                 }
1135
1136                 ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev);
1137                 hmp->ronly = ronly;
1138                 hmp->hflags = info.hflags & HMNT2_DEVFLAGS;
1139                 kmalloc_create_obj(&hmp->mchain, "HAMMER2-chains",
1140                                    sizeof(struct hammer2_chain));
1141                 kmalloc_create_obj(&hmp->mio, "HAMMER2-dio",
1142                                    sizeof(struct hammer2_io));
1143                 kmalloc_create(&hmp->mmsg, "HAMMER2-msg");
1144                 TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
1145                 RB_INIT(&hmp->iotree);
1146                 hammer2_spin_init(&hmp->io_spin, "h2mount_io");
1147                 hammer2_spin_init(&hmp->list_spin, "h2mount_list");
1148
1149                 lockinit(&hmp->vollk, "h2vol", 0, 0);
1150                 lockinit(&hmp->bulklk, "h2bulk", 0, 0);
1151                 lockinit(&hmp->bflock, "h2bflk", 0, 0);
1152
1153                 /*
1154                  * vchain setup. vchain.data is embedded.
1155                  * vchain.refs is initialized and will never drop to 0.
1156                  *
1157                  * NOTE! voldata is not yet loaded.
1158                  */
1159                 hmp->vchain.hmp = hmp;
1160                 hmp->vchain.refs = 1;
1161                 hmp->vchain.data = (void *)&hmp->voldata;
1162                 hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
1163                 hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
1164                 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
1165                 hammer2_chain_core_init(&hmp->vchain);
1166
1167                 /*
1168                  * fchain setup.  fchain.data is embedded.
1169                  * fchain.refs is initialized and will never drop to 0.
1170                  *
1171                  * The data is not used but needs to be initialized to
1172                  * pass assertion muster.  We use this chain primarily
1173                  * as a placeholder for the freemap's top-level radix tree
1174                  * so it does not interfere with the volume's topology
1175                  * radix tree.
1176                  */
1177                 hmp->fchain.hmp = hmp;
1178                 hmp->fchain.refs = 1;
1179                 hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
1180                 hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
1181                 hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
1182                 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
1183                 hmp->fchain.bref.methods =
1184                         HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
1185                         HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
1186                 hammer2_chain_core_init(&hmp->fchain);
1187
1188                 /*
1189                  * Initialize volume header related fields.
1190                  */
1191                 KKASSERT(hmp->voldata.magic == HAMMER2_VOLUME_ID_HBO ||
1192                          hmp->voldata.magic == HAMMER2_VOLUME_ID_ABO);
1193                 hmp->volsync = hmp->voldata;
1194                 hmp->free_reserved = hmp->voldata.allocator_size / 20;
1195                 /*
1196                  * Must use hmp instead of volume header for these two
1197                  * in order to handle volume versions transparently.
1198                  */
1199                 if (hmp->voldata.version >= HAMMER2_VOL_VERSION_MULTI_VOLUMES) {
1200                         hmp->nvolumes = hmp->voldata.nvolumes;
1201                         hmp->total_size = hmp->voldata.total_size;
1202                 } else {
1203                         hmp->nvolumes = 1;
1204                         hmp->total_size = hmp->voldata.volu_size;
1205                 }
1206                 KKASSERT(hmp->nvolumes > 0);
1207
1208                 /*
1209                  * Move devvpl entries to hmp.
1210                  */
1211                 TAILQ_INIT(&hmp->devvpl);
1212                 while ((e = TAILQ_FIRST(&devvpl)) != NULL) {
1213                         TAILQ_REMOVE(&devvpl, e, entry);
1214                         TAILQ_INSERT_TAIL(&hmp->devvpl, e, entry);
1215                 }
1216                 KKASSERT(TAILQ_EMPTY(&devvpl));
1217                 KKASSERT(!TAILQ_EMPTY(&hmp->devvpl));
1218
1219                 /*
1220                  * Really important to get these right or the flush and
1221                  * teardown code will get confused.
1222                  */
1223                 hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0, NULL);
1224                 spmp = hmp->spmp;
1225                 spmp->pfs_hmps[0] = hmp;
1226
1227                 /*
1228                  * Dummy-up vchain and fchain's modify_tid.  mirror_tid
1229                  * is inherited from the volume header.
1230                  */
1231                 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
1232                 hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
1233                 hmp->vchain.pmp = spmp;
1234                 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
1235                 hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
1236                 hmp->fchain.pmp = spmp;
1237
1238                 /*
1239                  * First locate the super-root inode, which is key 0
1240                  * relative to the volume header's blockset.
1241                  *
1242                  * Then locate the root inode by scanning the directory keyspace
1243                  * represented by the label.
1244                  */
1245                 parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
1246                 schain = hammer2_chain_lookup(&parent, &key_dummy,
1247                                       HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
1248                                       &error, 0);
1249                 hammer2_chain_lookup_done(parent);
1250                 if (schain == NULL) {
1251                         kprintf("hammer2_mount: invalid super-root\n");
1252                         hammer2_unmount_helper(mp, NULL, hmp);
1253                         lockmgr(&hammer2_mntlk, LK_RELEASE);
1254                         hammer2_vfs_unmount(mp, MNT_FORCE);
1255                         return EINVAL;
1256                 }
1257                 if (schain->error) {
1258                         kprintf("hammer2_mount: error %s reading super-root\n",
1259                                 hammer2_error_str(schain->error));
1260                         hammer2_chain_unlock(schain);
1261                         hammer2_chain_drop(schain);
1262                         schain = NULL;
1263                         hammer2_unmount_helper(mp, NULL, hmp);
1264                         lockmgr(&hammer2_mntlk, LK_RELEASE);
1265                         hammer2_vfs_unmount(mp, MNT_FORCE);
1266                         return EINVAL;
1267                 }
1268
1269                 /*
1270                  * The super-root always uses an inode_tid of 1 when
1271                  * creating PFSs.
1272                  */
1273                 spmp->inode_tid = 1;
1274                 spmp->modify_tid = schain->bref.modify_tid + 1;
1275
1276                 /*
1277                  * Sanity-check schain's pmp and finish initialization.
1278                  * Any chain belonging to the super-root topology should
1279                  * have a NULL pmp (not even set to spmp).
1280                  */
1281                 ripdata = &schain->data->ipdata;
1282                 KKASSERT(schain->pmp == NULL);
1283                 spmp->pfs_clid = ripdata->meta.pfs_clid;
1284
1285                 /*
1286                  * Replace the dummy spmp->iroot with a real one.  It's
1287                  * easier to just do a wholesale replacement than to try
1288                  * to update the chain and fixup the iroot fields.
1289                  *
1290                  * The returned inode is locked with the supplied cluster.
1291                  */
1292                 hammer2_dummy_xop_from_chain(&xop, schain);
1293                 hammer2_inode_drop(spmp->iroot);
1294                 spmp->iroot = NULL;
1295                 spmp->iroot = hammer2_inode_get(spmp, &xop, -1, -1);
1296                 spmp->spmp_hmp = hmp;
1297                 spmp->pfs_types[0] = ripdata->meta.pfs_type;
1298                 spmp->pfs_hmps[0] = hmp;
1299                 hammer2_inode_ref(spmp->iroot);
1300                 hammer2_inode_unlock(spmp->iroot);
1301                 hammer2_cluster_unlock(&xop.cluster);
1302                 hammer2_chain_drop(schain);
1303                 /* do not call hammer2_cluster_drop() on an embedded cluster */
1304                 schain = NULL;  /* now invalid */
1305                 /* leave spmp->iroot with one ref */
1306
1307                 if (!hmp->ronly) {
1308                         error = hammer2_recovery(hmp);
1309                         if (error == 0)
1310                                 error |= hammer2_fixup_pfses(hmp);
1311                         /* XXX do something with error */
1312                 }
1313                 hammer2_update_pmps(hmp);
1314                 hammer2_iocom_init(hmp);
1315                 hammer2_bulkfree_init(hmp);
1316
1317                 /*
1318                  * Ref the cluster management messaging descriptor.  The mount
1319                  * program deals with the other end of the communications pipe.
1320                  *
1321                  * Root mounts typically do not supply one.
1322                  */
1323                 /*
1324                 if (info.cluster_fd >= 0) {
1325                         fp = holdfp(curthread, info.cluster_fd, -1);
1326                         if (fp) {
1327                                 hammer2_cluster_reconnect(hmp, fp);
1328                         } else {
1329                                 kprintf("hammer2_mount: bad cluster_fd!\n");
1330                         }
1331                 }
1332                 */
1333         } else {
1334                 spmp = hmp->spmp;
1335                 if (info.hflags & HMNT2_DEVFLAGS) {
1336                         kprintf("hammer2_mount: Warning: mount flags pertaining "
1337                                 "to the whole device may only be specified "
1338                                 "on the first mount of the device: %08x\n",
1339                                 info.hflags & HMNT2_DEVFLAGS);
1340                 }
1341         }
1342
1343         /*
1344          * Force local mount (disassociate all PFSs from their clusters).
1345          * Used primarily for debugging.
1346          */
1347         force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;
1348
1349         /*
1350          * Lookup the mount point under the media-localized super-root.
1351          * Scanning hammer2_pfslist doesn't help us because it represents
1352          * PFS cluster ids which can aggregate several named PFSs together.
1353          *
1354          * cluster->pmp will incorrectly point to spmp and must be fixed
1355          * up later on.
1356          */
1357         hammer2_inode_lock(spmp->iroot, 0);
1358         parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
1359         lhc = hammer2_dirhash(label, strlen(label));
1360         chain = hammer2_chain_lookup(&parent, &key_next,
1361                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
1362                                      &error, 0);
1363         while (chain) {
1364                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
1365                     strcmp(label, chain->data->ipdata.filename) == 0) {
1366                         break;
1367                 }
1368                 chain = hammer2_chain_next(&parent, chain, &key_next,
1369                                             key_next,
1370                                             lhc + HAMMER2_DIRHASH_LOMASK,
1371                                             &error, 0);
1372         }
1373         if (parent) {
1374                 hammer2_chain_unlock(parent);
1375                 hammer2_chain_drop(parent);
1376         }
1377         hammer2_inode_unlock(spmp->iroot);
1378
1379         /*
1380          * PFS could not be found?
1381          */
1382         if (chain == NULL) {
1383                 hammer2_unmount_helper(mp, NULL, hmp);
1384                 lockmgr(&hammer2_mntlk, LK_RELEASE);
1385                 hammer2_vfs_unmount(mp, MNT_FORCE);
1386
1387                 if (error) {
1388                         kprintf("hammer2_mount: PFS label I/O error\n");
1389                         return EINVAL;
1390                 } else {
1391                         kprintf("hammer2_mount: PFS label \"%s\" not found\n",
1392                                 label);
1393                         return ENOENT;
1394                 }
1395         }
1396
1397         /*
1398          * Acquire the pmp structure (it should have already been allocated
1399          * via hammer2_update_pmps() so do not pass cluster in to add to
1400          * available chains).
1401          *
1402          * Check if the cluster has already been mounted.  A cluster can
1403          * only be mounted once, use null mounts to mount additional copies.
1404          */
1405         if (chain->error) {
1406                 kprintf("hammer2_mount: PFS label I/O error\n");
1407         } else {
1408                 ripdata = &chain->data->ipdata;
1409                 bref = chain->bref;
1410                 pmp = hammer2_pfsalloc(NULL, ripdata,
1411                                        bref.modify_tid, force_local);
1412         }
1413         hammer2_chain_unlock(chain);
1414         hammer2_chain_drop(chain);
1415
1416         /*
1417          * Finish the mount
1418          */
1419         kprintf("hammer2_mount: hmp=%p pmp=%p\n", hmp, pmp);
1420
1421         if (pmp->mp) {
1422                 kprintf("hammer2_mount: PFS already mounted!\n");
1423                 hammer2_unmount_helper(mp, NULL, hmp);
1424                 lockmgr(&hammer2_mntlk, LK_RELEASE);
1425                 hammer2_vfs_unmount(mp, MNT_FORCE);
1426
1427                 return EBUSY;
1428         }
1429
1430         pmp->hflags = info.hflags;
1431         mp->mnt_flag |= MNT_LOCAL;
1432         mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
1433         mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */
1434
1435         /*
1436          * required mount structure initializations
1437          */
1438         mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
1439         mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;
1440
1441         mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
1442         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1443
1444         /*
1445          * Optional fields
1446          */
1447         mp->mnt_iosize_max = MAXPHYS;
1448
1449         /*
1450          * Connect up mount pointers.
1451          */
1452         hammer2_mount_helper(mp, pmp);
1453         lockmgr(&hammer2_mntlk, LK_RELEASE);
1454
1455 #if 0
1456         /*
1457          * Finish setup
1458          */
1459         vfs_getnewfsid(mp);
1460         vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
1461         vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
1462         vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);
1463
1464         if (path) {
1465                 copyinstr(info.volume, mp->mnt_stat.f_mntfromname,
1466                           MNAMELEN - 1, &size);
1467                 bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
1468         } /* else root mount, already in there */
1469
1470         bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
1471         if (path) {
1472                 copyinstr(path, mp->mnt_stat.f_mntonname,
1473                           sizeof(mp->mnt_stat.f_mntonname) - 1,
1474                           &size);
1475         } else {
1476                 /* root mount */
1477                 mp->mnt_stat.f_mntonname[0] = '/';
1478         }
1479 #endif
1480
1481         /*
1482          * Initial statfs to prime mnt_stat.
1483          */
1484         hammer2_vfs_statfs(mp, &mp->mnt_stat, NULL);
1485         hammer2_vfs_statvfs(mp, &mp->mnt_vstat, NULL);
1486
1487         return 0;
1488 }
1489
1490 /*
1491  * Scan PFSs under the super-root and create hammer2_pfs structures.
1492  */
1493 static
1494 void
1495 hammer2_update_pmps(hammer2_dev_t *hmp)
1496 {
1497         const hammer2_inode_data_t *ripdata;
1498         hammer2_chain_t *parent;
1499         hammer2_chain_t *chain;
1500         hammer2_blockref_t bref;
1501         hammer2_dev_t *force_local;
1502         hammer2_pfs_t *spmp;
1503         hammer2_pfs_t *pmp;
1504         hammer2_key_t key_next;
1505         int error;
1506
1507         /*
1508          * Force local mount (disassociate all PFSs from their clusters).
1509          * Used primarily for debugging.
1510          */
1511         force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;
1512
1513         /*
1514          * Lookup mount point under the media-localized super-root.
1515          *
1516          * cluster->pmp will incorrectly point to spmp and must be fixed
1517          * up later on.
1518          */
1519         spmp = hmp->spmp;
1520         hammer2_inode_lock(spmp->iroot, 0);
1521         parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
1522         chain = hammer2_chain_lookup(&parent, &key_next,
1523                                          HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
1524                                          &error, 0);
1525         while (chain) {
1526                 if (chain->error) {
1527                         kprintf("I/O error scanning PFS labels\n");
1528                 } else if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
1529                         kprintf("Non inode chain type %d under super-root\n",
1530                                 chain->bref.type);
1531                 } else {
1532                         ripdata = &chain->data->ipdata;
1533                         bref = chain->bref;
1534                         pmp = hammer2_pfsalloc(chain, ripdata,
1535                                                bref.modify_tid, force_local);
1536                 }
1537                 chain = hammer2_chain_next(&parent, chain, &key_next,
1538                                            key_next, HAMMER2_KEY_MAX,
1539                                            &error, 0);
1540         }
1541         if (parent) {
1542                 hammer2_chain_unlock(parent);
1543                 hammer2_chain_drop(parent);
1544         }
1545         hammer2_inode_unlock(spmp->iroot);
1546 }
1547
#if 0
/*
 * Switch a read-only device mount to read-write (compiled out).
 *
 * Nothing is done unless the device is currently read-only and the
 * mount carries MNTK_WANTRDWR.  Every volume's device vnode is opened
 * with FREAD | FWRITE; for the root volume hammer2_recovery() and, if
 * that succeeds, hammer2_fixup_pfses() are run.  A volume that saw no
 * error is then closed with FREAD, a failing one with FREAD | FWRITE.
 *
 * Returns the OR of all per-volume errors; hmp->ronly is cleared only
 * when that result is 0.
 */
static
int
hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path __unused,
		struct ucred *cred)
{
	hammer2_volume_t *vol;
	struct vnode *devvp;
	int vol_error;
	int result;
	int n;

	/* Only a read-only -> read-write upgrade is of interest. */
	if (!hmp->ronly || (mp->mnt_kern_flag & MNTK_WANTRDWR) == 0)
		return 0;

	result = 0;
	for (n = 0; n < hmp->nvolumes; ++n) {
		vol = &hmp->volumes[n];
		devvp = vol->dev->devvp;
		KKASSERT(devvp);

		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, NULL);
		vn_unlock(devvp);

		/* Recovery and PFS fixup are driven from the root volume. */
		vol_error = 0;
		if (vol->id == HAMMER2_ROOT_VOLUME) {
			vol_error = hammer2_recovery(hmp);
			if (vol_error == 0)
				vol_error = hammer2_fixup_pfses(hmp);
		}

		vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
		VOP_CLOSE(devvp,
			  (vol_error == 0) ? FREAD : (FREAD | FWRITE),
			  NULL);
		vn_unlock(devvp);

		result |= vol_error;
	}

	if (result == 0) {
		kprintf("hammer2: enable read/write\n");
		hmp->ronly = 0;
	}
	return result;
}
#endif
1591
1592 int
1593 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1594 {
1595         hammer2_pfs_t *pmp;
1596         int flags;
1597         int error = 0;
1598
1599         pmp = MPTOPMP(mp);
1600
1601         if (pmp == NULL)
1602                 return(0);
1603
1604         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1605
1606         /*
1607          * If mount initialization proceeded far enough we must flush
1608          * its vnodes and sync the underlying mount points.  Three syncs
1609          * are required to fully flush the filesystem (freemap updates lag
1610          * by one flush, and one extra for safety).
1611          */
1612         if (mntflags & MNT_FORCE)
1613                 flags = FORCECLOSE;
1614         else
1615                 flags = 0;
1616         if (pmp->iroot) {
1617                 error = vflush(mp, 0, flags);
1618                 if (error)
1619                         goto failed;
1620                 hammer2_vfs_sync(mp, MNT_WAIT);
1621                 hammer2_vfs_sync(mp, MNT_WAIT);
1622                 hammer2_vfs_sync(mp, MNT_WAIT);
1623         }
1624
1625         /*
1626          * Cleanup the frontend support XOPS threads
1627          */
1628         hammer2_xop_helper_cleanup(pmp);
1629
1630         if (pmp->mp)
1631                 hammer2_unmount_helper(mp, pmp, NULL);
1632
1633         error = 0;
1634 failed:
1635         lockmgr(&hammer2_mntlk, LK_RELEASE);
1636
1637         return (error);
1638 }
1639
1640 /*
1641  * Mount helper, hook the system mount into our PFS.
1642  * The mount lock is held.
1643  *
1644  * We must bump the mount_count on related devices for any
1645  * mounted PFSs.
1646  */
1647 static
1648 void
1649 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
1650 {
1651         hammer2_cluster_t *cluster;
1652         hammer2_chain_t *rchain;
1653         int i;
1654
1655         mp->mnt_data = (qaddr_t)pmp;
1656         pmp->mp = mp;
1657
1658         /*
1659          * After pmp->mp is set we have to adjust hmp->mount_count.
1660          */
1661         cluster = &pmp->iroot->cluster;
1662         for (i = 0; i < cluster->nchains; ++i) {
1663                 rchain = cluster->array[i].chain;
1664                 if (rchain == NULL)
1665                         continue;
1666                 ++rchain->hmp->mount_count;
1667         }
1668
1669         /*
1670          * Create missing Xop threads
1671          */
1672         hammer2_xop_helper_create(pmp);
1673 }
1674
1675 /*
1676  * Unmount helper, unhook the system mount from our PFS.
1677  * The mount lock is held.
1678  *
1679  * If hmp is supplied a mount responsible for being the first to open
1680  * the block device failed and the block device and all PFSs using the
1681  * block device must be cleaned up.
1682  *
1683  * If pmp is supplied multiple devices might be backing the PFS and each
1684  * must be disconnected.  This might not be the last PFS using some of the
1685  * underlying devices.  Also, we have to adjust our hmp->mount_count
1686  * accounting for the devices backing the pmp which is now undergoing an
1687  * unmount.
1688  */
1689 static
1690 void
1691 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
1692 {
1693         hammer2_cluster_t *cluster;
1694         hammer2_chain_t *rchain;
1695         int dumpcnt;
1696         int i;
1697
1698         /*
1699          * If no device supplied this is a high-level unmount and we have to
1700          * to disconnect the mount, adjust mount_count, and locate devices
1701          * that might now have no mounts.
1702          */
1703         if (pmp) {
1704                 KKASSERT(hmp == NULL);
1705                 KKASSERT(MPTOPMP(mp) == pmp);
1706                 pmp->mp = NULL;
1707                 mp->mnt_data = NULL;
1708
1709                 /*
1710                  * After pmp->mp is cleared we have to account for
1711                  * mount_count.
1712                  */
1713                 cluster = &pmp->iroot->cluster;
1714                 for (i = 0; i < cluster->nchains; ++i) {
1715                         rchain = cluster->array[i].chain;
1716                         if (rchain == NULL)
1717                                 continue;
1718                         --rchain->hmp->mount_count;
1719                         /* scrapping hmp now may invalidate the pmp */
1720                 }
1721 again:
1722                 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
1723                         if (hmp->mount_count == 0) {
1724                                 hammer2_unmount_helper(NULL, NULL, hmp);
1725                                 goto again;
1726                         }
1727                 }
1728                 return;
1729         }
1730
1731         /*
1732          * Try to terminate the block device.  We can't terminate it if
1733          * there are still PFSs referencing it.
1734          */
1735         if (hmp->mount_count)
1736                 return;
1737
1738         /*
1739          * Decomission the network before we start messing with the
1740          * device and PFS.
1741          */
1742         hammer2_iocom_uninit(hmp);
1743
1744         hammer2_bulkfree_uninit(hmp);
1745         hammer2_pfsfree_scan(hmp, 0);
1746
1747         /*
1748          * Cycle the volume data lock as a safety (probably not needed any
1749          * more).  To ensure everything is out we need to flush at least
1750          * three times.  (1) The running of the sideq can dirty the
1751          * filesystem, (2) A normal flush can dirty the freemap, and
1752          * (3) ensure that the freemap is fully synchronized.
1753          *
1754          * The next mount's recovery scan can clean everything up but we want
1755          * to leave the filesystem in a 100% clean state on a normal unmount.
1756          */
1757 #if 0
1758         hammer2_voldata_lock(hmp);
1759         hammer2_voldata_unlock(hmp);
1760 #endif
1761
1762         /*
1763          * Flush whatever is left.  Unmounted but modified PFS's might still
1764          * have some dirty chains on them.
1765          */
1766         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1767         hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1768
1769         if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1770                 hammer2_voldata_modify(hmp);
1771                 hammer2_flush(&hmp->fchain, HAMMER2_FLUSH_TOP |
1772                                             HAMMER2_FLUSH_ALL);
1773         }
1774         hammer2_chain_unlock(&hmp->fchain);
1775
1776         if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1777                 hammer2_flush(&hmp->vchain, HAMMER2_FLUSH_TOP |
1778                                             HAMMER2_FLUSH_ALL);
1779         }
1780         hammer2_chain_unlock(&hmp->vchain);
1781
1782         if ((hmp->vchain.flags | hmp->fchain.flags) &
1783             HAMMER2_CHAIN_FLUSH_MASK) {
1784                 kprintf("hammer2_unmount: chains left over after final sync\n");
1785                 kprintf("    vchain %08x\n", hmp->vchain.flags);
1786                 kprintf("    fchain %08x\n", hmp->fchain.flags);
1787
1788                 if (hammer2_debug & 0x0010)
1789                         Debugger("entered debugger");
1790         }
1791
1792         hammer2_pfsfree_scan(hmp, 1);
1793
1794         KKASSERT(hmp->spmp == NULL);
1795
1796         /*
1797          * Finish up with the device vnode
1798          */
1799         if (!TAILQ_EMPTY(&hmp->devvpl)) {
1800                 hammer2_close_devvp(&hmp->devvpl, hmp->ronly);
1801                 hammer2_cleanup_devvp(&hmp->devvpl);
1802         }
1803         KKASSERT(TAILQ_EMPTY(&hmp->devvpl));
1804
1805         /*
1806          * Clear vchain/fchain flags that might prevent final cleanup
1807          * of these chains.
1808          */
1809         if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
1810                 atomic_add_long(&hammer2_count_modified_chains, -1);
1811                 atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
1812                 hammer2_pfs_memory_wakeup(hmp->vchain.pmp, -1);
1813         }
1814         if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
1815                 atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_UPDATE);
1816         }
1817
1818         if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
1819                 atomic_add_long(&hammer2_count_modified_chains, -1);
1820                 atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_MODIFIED);
1821                 hammer2_pfs_memory_wakeup(hmp->fchain.pmp, -1);
1822         }
1823         if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
1824                 atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_UPDATE);
1825         }
1826
1827         /*
1828          * Final drop of embedded freemap root chain to
1829          * clean up fchain.core (fchain structure is not
1830          * flagged ALLOCATED so it is cleaned out and then
1831          * left to rot).
1832          */
1833         hammer2_chain_drop(&hmp->fchain);
1834
1835         /*
1836          * Final drop of embedded volume root chain to clean
1837          * up vchain.core (vchain structure is not flagged
1838          * ALLOCATED so it is cleaned out and then left to
1839          * rot).
1840          */
1841         dumpcnt = 50;
1842         hammer2_dump_chain(&hmp->vchain, 0, 0, &dumpcnt, 'v', (u_int)-1);
1843         dumpcnt = 50;
1844         hammer2_dump_chain(&hmp->fchain, 0, 0, &dumpcnt, 'f', (u_int)-1);
1845
1846         hammer2_chain_drop(&hmp->vchain);
1847
1848         hammer2_io_cleanup(hmp, &hmp->iotree);
1849         if (hmp->iofree_count) {
1850                 kprintf("io_cleanup: %d I/O's left hanging\n",
1851                         hmp->iofree_count);
1852         }
1853
1854         TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
1855         kmalloc_destroy_obj(&hmp->mchain);
1856         kmalloc_destroy_obj(&hmp->mio);
1857         kmalloc_destroy(&hmp->mmsg);
1858         kfree(hmp, M_HAMMER2);
1859 }
1860
1861 int
1862 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
1863                  ino_t ino, struct vnode **vpp)
1864 {
1865         hammer2_xop_lookup_t *xop;
1866         hammer2_pfs_t *pmp;
1867         hammer2_inode_t *ip;
1868         hammer2_tid_t inum;
1869         int error;
1870
1871         inum = (hammer2_tid_t)ino & HAMMER2_DIRHASH_USERMSK;
1872
1873         error = 0;
1874         pmp = MPTOPMP(mp);
1875
1876         /*
1877          * Easy if we already have it cached
1878          */
1879         ip = hammer2_inode_lookup(pmp, inum);
1880         if (ip) {
1881                 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
1882                 *vpp = hammer2_igetv(ip, &error);
1883                 hammer2_inode_unlock(ip);
1884                 hammer2_inode_drop(ip);         /* from lookup */
1885
1886                 return error;
1887         }
1888
1889         /*
1890          * Otherwise we have to find the inode
1891          */
1892         xop = hammer2_xop_alloc(pmp->iroot, 0);
1893         xop->lhc = inum;
1894         hammer2_xop_start(&xop->head, &hammer2_lookup_desc);
1895         error = hammer2_xop_collect(&xop->head, 0);
1896
1897         if (error == 0)
1898                 ip = hammer2_inode_get(pmp, &xop->head, -1, -1);
1899         hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1900
1901         if (ip) {
1902                 *vpp = hammer2_igetv(ip, &error);
1903                 hammer2_inode_unlock(ip);
1904         } else {
1905                 *vpp = NULL;
1906                 error = ENOENT;
1907         }
1908         return (error);
1909 }
1910
1911 int
1912 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
1913 {
1914         hammer2_pfs_t *pmp;
1915         struct vnode *vp;
1916         int error;
1917
1918         pmp = MPTOPMP(mp);
1919         if (pmp->iroot == NULL) {
1920                 kprintf("hammer2 (%s): no root inode\n",
1921                         mp->mnt_stat.f_mntfromname);
1922                 *vpp = NULL;
1923                 return EINVAL;
1924         }
1925
1926         error = 0;
1927         hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED);
1928
1929         while (pmp->inode_tid == 0) {
1930                 hammer2_xop_ipcluster_t *xop;
1931                 const hammer2_inode_meta_t *meta;
1932
1933                 xop = hammer2_xop_alloc(pmp->iroot, HAMMER2_XOP_MODIFYING);
1934                 hammer2_xop_start(&xop->head, &hammer2_ipcluster_desc);
1935                 error = hammer2_xop_collect(&xop->head, 0);
1936
1937                 if (error == 0) {
1938                         meta = &hammer2_xop_gdata(&xop->head)->ipdata.meta;
1939                         pmp->iroot->meta = *meta;
1940                         pmp->inode_tid = meta->pfs_inum + 1;
1941                         hammer2_xop_pdata(&xop->head);
1942                         /* meta invalid */
1943
1944                         if (pmp->inode_tid < HAMMER2_INODE_START)
1945                                 pmp->inode_tid = HAMMER2_INODE_START;
1946                         pmp->modify_tid =
1947                                 xop->head.cluster.focus->bref.modify_tid + 1;
1948 #if 0
1949                         kprintf("PFS: Starting inode %jd\n",
1950                                 (intmax_t)pmp->inode_tid);
1951                         kprintf("PMP focus good set nextino=%ld mod=%016jx\n",
1952                                 pmp->inode_tid, pmp->modify_tid);
1953 #endif
1954                         //wakeup(&pmp->iroot); XXX
1955
1956                         hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1957
1958                         /*
1959                          * Prime the mount info.
1960                          */
1961                         hammer2_vfs_statfs(mp, &mp->mnt_stat, NULL);
1962                         break;
1963                 }
1964
1965                 /*
1966                  * Loop, try again
1967                  */
1968                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1969                 hammer2_inode_unlock(pmp->iroot);
1970                 error = tsleep(&pmp->iroot, PCATCH, "h2root", hz);
1971                 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED);
1972                 if (error == EINTR)
1973                         break;
1974         }
1975
1976         if (error) {
1977                 hammer2_inode_unlock(pmp->iroot);
1978                 *vpp = NULL;
1979         } else {
1980                 vp = hammer2_igetv(pmp->iroot, &error);
1981                 hammer2_inode_unlock(pmp->iroot);
1982                 *vpp = vp;
1983         }
1984
1985         return (error);
1986 }
1987
1988 /*
1989  * Filesystem status
1990  *
1991  * XXX incorporate ipdata->meta.inode_quota and data_quota
1992  */
1993 static
1994 int
1995 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
1996 {
1997         hammer2_pfs_t *pmp;
1998         hammer2_dev_t *hmp;
1999         hammer2_blockref_t bref;
2000         struct statfs tmp;
2001         int i;
2002
2003         /*
2004          * NOTE: iroot might not have validated the cluster yet.
2005          */
2006         pmp = MPTOPMP(mp);
2007
2008         bzero(&tmp, sizeof(tmp));
2009
2010         for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
2011                 hmp = pmp->pfs_hmps[i];
2012                 if (hmp == NULL)
2013                         continue;
2014                 if (pmp->iroot->cluster.array[i].chain)
2015                         bref = pmp->iroot->cluster.array[i].chain->bref;
2016                 else
2017                         bzero(&bref, sizeof(bref));
2018
2019                 tmp.f_files = bref.embed.stats.inode_count;
2020                 tmp.f_ffree = 0;
2021                 tmp.f_blocks = hmp->voldata.allocator_size /
2022                                mp->mnt_vstat.f_bsize;
2023                 tmp.f_bfree = hmp->voldata.allocator_free /
2024                               mp->mnt_vstat.f_bsize;
2025                 tmp.f_bavail = tmp.f_bfree;
2026
2027                 if (cred && cred->cr_uid != 0) {
2028                         uint64_t adj;
2029
2030                         /* 5% */
2031                         adj = hmp->free_reserved / mp->mnt_vstat.f_bsize;
2032                         tmp.f_blocks -= adj;
2033                         tmp.f_bfree -= adj;
2034                         tmp.f_bavail -= adj;
2035                 }
2036
2037                 mp->mnt_stat.f_blocks = tmp.f_blocks;
2038                 mp->mnt_stat.f_bfree = tmp.f_bfree;
2039                 mp->mnt_stat.f_bavail = tmp.f_bavail;
2040                 mp->mnt_stat.f_files = tmp.f_files;
2041                 mp->mnt_stat.f_ffree = tmp.f_ffree;
2042
2043                 *sbp = mp->mnt_stat;
2044         }
2045         return (0);
2046 }
2047
2048 static
2049 int
2050 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
2051 {
2052         hammer2_pfs_t *pmp;
2053         hammer2_dev_t *hmp;
2054         hammer2_blockref_t bref;
2055         struct statvfs tmp;
2056         int i;
2057
2058         /*
2059          * NOTE: iroot might not have validated the cluster yet.
2060          */
2061         pmp = MPTOPMP(mp);
2062         bzero(&tmp, sizeof(tmp));
2063
2064         for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
2065                 hmp = pmp->pfs_hmps[i];
2066                 if (hmp == NULL)
2067                         continue;
2068                 if (pmp->iroot->cluster.array[i].chain)
2069                         bref = pmp->iroot->cluster.array[i].chain->bref;
2070                 else
2071                         bzero(&bref, sizeof(bref));
2072
2073                 tmp.f_files = bref.embed.stats.inode_count;
2074                 tmp.f_ffree = 0;
2075                 tmp.f_blocks = hmp->voldata.allocator_size /
2076                                mp->mnt_vstat.f_bsize;
2077                 tmp.f_bfree = hmp->voldata.allocator_free /
2078                               mp->mnt_vstat.f_bsize;
2079                 tmp.f_bavail = tmp.f_bfree;
2080
2081                 if (cred && cred->cr_uid != 0) {
2082                         uint64_t adj;
2083
2084                         /* 5% */
2085                         adj = hmp->free_reserved / mp->mnt_vstat.f_bsize;
2086                         tmp.f_blocks -= adj;
2087                         tmp.f_bfree -= adj;
2088                         tmp.f_bavail -= adj;
2089                 }
2090
2091                 mp->mnt_vstat.f_blocks = tmp.f_blocks;
2092                 mp->mnt_vstat.f_bfree = tmp.f_bfree;
2093                 mp->mnt_vstat.f_bavail = tmp.f_bavail;
2094                 mp->mnt_vstat.f_files = tmp.f_files;
2095                 mp->mnt_vstat.f_ffree = tmp.f_ffree;
2096
2097                 *sbp = mp->mnt_vstat;
2098         }
2099         return (0);
2100 }
2101
2102 /*
2103  * Mount-time recovery (RW mounts)
2104  *
2105  * Updates to the free block table are allowed to lag flushes by one
2106  * transaction.  In case of a crash, then on a fresh mount we must do an
2107  * incremental scan of the last committed transaction id and make sure that
2108  * all related blocks have been marked allocated.
2109  */
2110 struct hammer2_recovery_elm {
2111         TAILQ_ENTRY(hammer2_recovery_elm) entry;
2112         hammer2_chain_t *chain;
2113         hammer2_tid_t sync_tid;
2114 };
2115
2116 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
2117
2118 struct hammer2_recovery_info {
2119         struct hammer2_recovery_list list;
2120         hammer2_tid_t   mtid;
2121         int     depth;
2122 };
2123
2124 static int hammer2_recovery_scan(hammer2_dev_t *hmp,
2125                         hammer2_chain_t *parent,
2126                         struct hammer2_recovery_info *info,
2127                         hammer2_tid_t sync_tid);
2128
2129 #define HAMMER2_RECOVERY_MAXDEPTH       10
2130
2131 static
2132 int
2133 hammer2_recovery(hammer2_dev_t *hmp)
2134 {
2135         struct hammer2_recovery_info info;
2136         struct hammer2_recovery_elm *elm;
2137         hammer2_chain_t *parent;
2138         hammer2_tid_t sync_tid;
2139         hammer2_tid_t mirror_tid;
2140         int error;
2141
2142         hammer2_trans_init(hmp->spmp, 0);
2143
2144         sync_tid = hmp->voldata.freemap_tid;
2145         mirror_tid = hmp->voldata.mirror_tid;
2146
2147         kprintf("hammer2_mount: \"%s\": ", hmp->devrepname);
2148         if (sync_tid >= mirror_tid) {
2149                 kprintf("no recovery needed\n");
2150         } else {
2151                 kprintf("freemap recovery %016jx-%016jx\n",
2152                         sync_tid + 1, mirror_tid);
2153         }
2154
2155         TAILQ_INIT(&info.list);
2156         info.depth = 0;
2157         parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
2158         error = hammer2_recovery_scan(hmp, parent, &info, sync_tid);
2159         hammer2_chain_lookup_done(parent);
2160
2161         while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
2162                 TAILQ_REMOVE(&info.list, elm, entry);
2163                 parent = elm->chain;
2164                 sync_tid = elm->sync_tid;
2165                 kfree(elm, M_HAMMER2);
2166
2167                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2168                 error |= hammer2_recovery_scan(hmp, parent, &info,
2169                                               hmp->voldata.freemap_tid);
2170                 hammer2_chain_unlock(parent);
2171                 hammer2_chain_drop(parent);     /* drop elm->chain ref */
2172         }
2173
2174         hammer2_trans_done(hmp->spmp, 0);
2175
2176         return error;
2177 }
2178
2179 static
2180 int
2181 hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent,
2182                       struct hammer2_recovery_info *info,
2183                       hammer2_tid_t sync_tid)
2184 {
2185         const hammer2_inode_data_t *ripdata;
2186         hammer2_chain_t *chain;
2187         hammer2_blockref_t bref;
2188         int tmp_error;
2189         int rup_error;
2190         int error;
2191         int first;
2192
2193         /*
2194          * Adjust freemap to ensure that the block(s) are marked allocated.
2195          */
2196         if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
2197                 hammer2_freemap_adjust(hmp, &parent->bref,
2198                                        HAMMER2_FREEMAP_DORECOVER);
2199         }
2200
2201         /*
2202          * Check type for recursive scan
2203          */
2204         switch(parent->bref.type) {
2205         case HAMMER2_BREF_TYPE_VOLUME:
2206                 /* data already instantiated */
2207                 break;
2208         case HAMMER2_BREF_TYPE_INODE:
2209                 /*
2210                  * Must instantiate data for DIRECTDATA test and also
2211                  * for recursion.
2212                  */
2213                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2214                 ripdata = &parent->data->ipdata;
2215                 if (ripdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2216                         /* not applicable to recovery scan */
2217                         hammer2_chain_unlock(parent);
2218                         return 0;
2219                 }
2220                 hammer2_chain_unlock(parent);
2221                 break;
2222         case HAMMER2_BREF_TYPE_INDIRECT:
2223                 /*
2224                  * Must instantiate data for recursion
2225                  */
2226                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2227                 hammer2_chain_unlock(parent);
2228                 break;
2229         case HAMMER2_BREF_TYPE_DIRENT:
2230         case HAMMER2_BREF_TYPE_DATA:
2231         case HAMMER2_BREF_TYPE_FREEMAP:
2232         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2233         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2234                 /* not applicable to recovery scan */
2235                 return 0;
2236                 break;
2237         default:
2238                 return HAMMER2_ERROR_BADBREF;
2239         }
2240
2241         /*
2242          * Defer operation if depth limit reached.
2243          */
2244         if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) {
2245                 struct hammer2_recovery_elm *elm;
2246
2247                 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
2248                 elm->chain = parent;
2249                 elm->sync_tid = sync_tid;
2250                 hammer2_chain_ref(parent);
2251                 TAILQ_INSERT_TAIL(&info->list, elm, entry);
2252                 /* unlocked by caller */
2253
2254                 return(0);
2255         }
2256
2257
2258         /*
2259          * Recursive scan of the last flushed transaction only.  We are
2260          * doing this without pmp assignments so don't leave the chains
2261          * hanging around after we are done with them.
2262          *
2263          * error        Cumulative error this level only
2264          * rup_error    Cumulative error for recursion
2265          * tmp_error    Specific non-cumulative recursion error
2266          */
2267         chain = NULL;
2268         first = 1;
2269         rup_error = 0;
2270         error = 0;
2271
2272         for (;;) {
2273                 error |= hammer2_chain_scan(parent, &chain, &bref,
2274                                             &first,
2275                                             HAMMER2_LOOKUP_NODATA);
2276
2277                 /*
2278                  * Problem during scan or EOF
2279                  */
2280                 if (error)
2281                         break;
2282
2283                 /*
2284                  * If this is a leaf
2285                  */
2286                 if (chain == NULL) {
2287                         if (bref.mirror_tid > sync_tid) {
2288                                 hammer2_freemap_adjust(hmp, &bref,
2289                                                      HAMMER2_FREEMAP_DORECOVER);
2290                         }
2291                         continue;
2292                 }
2293
2294                 /*
2295                  * This may or may not be a recursive node.
2296                  */
2297                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
2298                 if (bref.mirror_tid > sync_tid) {
2299                         ++info->depth;
2300                         tmp_error = hammer2_recovery_scan(hmp, chain,
2301                                                            info, sync_tid);
2302                         --info->depth;
2303                 } else {
2304                         tmp_error = 0;
2305                 }
2306
2307                 /*
2308                  * Flush the recovery at the PFS boundary to stage it for
2309                  * the final flush of the super-root topology.
2310                  */
2311                 if (tmp_error == 0 &&
2312                     (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
2313                     (chain->flags & HAMMER2_CHAIN_ONFLUSH)) {
2314                         hammer2_flush(chain, HAMMER2_FLUSH_TOP |
2315                                              HAMMER2_FLUSH_ALL);
2316                 }
2317                 rup_error |= tmp_error;
2318         }
2319         return ((error | rup_error) & ~HAMMER2_ERROR_EOF);
2320 }
2321
2322 /*
2323  * This fixes up an error introduced in earlier H2 implementations where
2324  * moving a PFS inode into an indirect block wound up causing the
2325  * HAMMER2_BREF_FLAG_PFSROOT flag in the bref to get cleared.
2326  */
2327 static
2328 int
2329 hammer2_fixup_pfses(hammer2_dev_t *hmp)
2330 {
2331         const hammer2_inode_data_t *ripdata;
2332         hammer2_chain_t *parent;
2333         hammer2_chain_t *chain;
2334         hammer2_key_t key_next;
2335         hammer2_pfs_t *spmp;
2336         int error;
2337
2338         error = 0;
2339
2340         /*
2341          * Lookup mount point under the media-localized super-root.
2342          *
2343          * cluster->pmp will incorrectly point to spmp and must be fixed
2344          * up later on.
2345          */
2346         spmp = hmp->spmp;
2347         hammer2_inode_lock(spmp->iroot, 0);
2348         parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
2349         chain = hammer2_chain_lookup(&parent, &key_next,
2350                                          HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
2351                                          &error, 0);
2352         while (chain) {
2353                 if (chain->bref.type != HAMMER2_BREF_TYPE_INODE)
2354                         continue;
2355                 if (chain->error) {
2356                         kprintf("I/O error scanning PFS labels\n");
2357                         error |= chain->error;
2358                 } else if ((chain->bref.flags &
2359                             HAMMER2_BREF_FLAG_PFSROOT) == 0) {
2360                         int error2;
2361
2362                         ripdata = &chain->data->ipdata;
2363                         hammer2_trans_init(hmp->spmp, 0);
2364                         error2 = hammer2_chain_modify(chain,
2365                                                       chain->bref.modify_tid,
2366                                                       0, 0);
2367                         if (error2 == 0) {
2368                                 kprintf("hammer2: Correct mis-flagged PFS %s\n",
2369                                         ripdata->filename);
2370                                 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
2371                         } else {
2372                                 error |= error2;
2373                         }
2374                         hammer2_flush(chain, HAMMER2_FLUSH_TOP |
2375                                              HAMMER2_FLUSH_ALL);
2376                         hammer2_trans_done(hmp->spmp, 0);
2377                 }
2378                 chain = hammer2_chain_next(&parent, chain, &key_next,
2379                                            key_next, HAMMER2_KEY_MAX,
2380                                            &error, 0);
2381         }
2382         if (parent) {
2383                 hammer2_chain_unlock(parent);
2384                 hammer2_chain_drop(parent);
2385         }
2386         hammer2_inode_unlock(spmp->iroot);
2387
2388         return error;
2389 }
2390
/*
 * VFS sync entry point for a mount.  Called periodically on a per-mount
 * basis from the filesystem syncer and whenever a user issues a sync.
 *
 * Resolves the mount to its PFS and hands off to hammer2_vfs_sync_pmp(),
 * returning whatever that returns.
 */
int
hammer2_vfs_sync(struct mount *mp, int waitfor)
{
        return hammer2_vfs_sync_pmp(MPTOPMP(mp), waitfor);
}
2404
/*
 * Flush every dirty inode of a PFS, then the PFS root, inside a single
 * HAMMER2_TRANS_ISFLUSH transaction.
 *
 * Stage 1 moves the inodes queued on the depq/sideq lists onto pmp->syncq
 * and flushes them one at a time.  If any inode had to be requeued (its
 * vnode could not be acquired, or it wound up on a SIDEQ again), or if
 * HAMMER2_TRANS_RESCAN got set on the transaction, stage 1 is repeated,
 * this time only picking up depends marked pass2.  Stage 2 flushes
 * pmp->iroot with HAMMER2_XOP_VOLHDR and then syncs the device bioq.
 *
 * waitfor is not examined by this function.  The return value is
 * currently always 0 (see the XXX near the end).
 *
 * Because frontend operations lock vnodes before we get a chance to
 * lock the related inode, we can't just acquire a vnode lock without
 * risking a deadlock.  The frontend may be holding a vnode lock while
 * also blocked on our SYNCQ flag while trying to get the inode lock.
 *
 * To deal with this situation we can check the vnode lock situation
 * after locking the inode and perform a work-around.
 */
int
hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
{
        hammer2_inode_t *ip;
        hammer2_depend_t *depend;
        hammer2_depend_t *depend_next;
        struct vnode *vp;
        uint32_t pass2;
        int error;
        int wakecount;
        int dorestart;

        /*
         * Move all inodes on sideq to syncq.  This will clear sideq.
         * This should represent all flushable inodes.  These inodes
         * will already have refs due to being on syncq or sideq.  We
         * must do this all at once with the spinlock held to ensure that
         * all inode dependencies are part of the same flush.
         *
         * We should be able to do this asynchronously from frontend
         * operations because we will be locking the inodes later on
         * to actually flush them, and that will partition any frontend
         * op using the same inode.  Either it has already locked the
         * inode and we will block, or it has not yet locked the inode
         * and it will block until we are finished flushing that inode.
         *
         * When restarting, only move the inodes flagged as PASS2 from
         * SIDEQ to SYNCQ.  PASS2 propagation by inode_lock4() and
         * inode_depend() are atomic with the spin-lock.
         */
        hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
#ifdef HAMMER2_DEBUG_SYNC
        kprintf("FILESYSTEM SYNC BOUNDARY\n");
#endif
        dorestart = 0;

        /*
         * Move inodes from depq to syncq, releasing the related
         * depend structures.
         */
restart:
#ifdef HAMMER2_DEBUG_SYNC
        kprintf("FILESYSTEM SYNC RESTART (%d)\n", dorestart);
#endif
        hammer2_trans_setflags(pmp, 0/*HAMMER2_TRANS_COPYQ*/);
        hammer2_trans_clearflags(pmp, HAMMER2_TRANS_RESCAN);

        /*
         * Move inodes from depq to syncq.  When restarting, only depq's
         * marked pass2 are moved.
         */
        hammer2_spin_ex(&pmp->list_spin);
        depend_next = TAILQ_FIRST(&pmp->depq);
        wakecount = 0;

        while ((depend = depend_next) != NULL) {
                /*
                 * Fetch the successor up front, depend may be unlinked
                 * from depq further below.
                 */
                depend_next = TAILQ_NEXT(depend, entry);
                if (dorestart && depend->pass2 == 0)
                        continue;
                TAILQ_FOREACH(ip, &depend->sideq, entry) {
                        KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ);
                        atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ);
                        atomic_clear_int(&ip->flags, HAMMER2_INODE_SIDEQ);
                        ip->depend = NULL;
                }

                /*
                 * NOTE: pmp->sideq_count includes both sideq and syncq
                 */
                TAILQ_CONCAT(&pmp->syncq, &depend->sideq, entry);

                depend->count = 0;
                depend->pass2 = 0;
                TAILQ_REMOVE(&pmp->depq, depend, entry);
        }

        hammer2_spin_unex(&pmp->list_spin);
        hammer2_trans_clearflags(pmp, /*HAMMER2_TRANS_COPYQ |*/
                                      HAMMER2_TRANS_WAITING);
        dorestart = 0;

        /*
         * sideq_count may have dropped enough to allow us to unstall
         * the frontend.
         */
        hammer2_pfs_memory_wakeup(pmp, 0);

        /*
         * Now run through all inodes on syncq.
         *
         * Flush transactions only interlock with other flush transactions.
         * Any conflicting frontend operations will block on the inode, but
         * may hold a vnode lock while doing so.
         *
         * The list spinlock is held at the top of every iteration and is
         * released while an individual inode is being flushed; every path
         * that loops back reacquires it first.
         */
        hammer2_spin_ex(&pmp->list_spin);
        while ((ip = TAILQ_FIRST(&pmp->syncq)) != NULL) {
                /*
                 * Remove the inode from the SYNCQ, transfer the syncq ref
                 * to us.  We must clear SYNCQ to allow any potential
                 * front-end deadlock to proceed.  We must set PASS2 so
                 * the dependency code knows what to do.
                 *
                 * pass2 holds the snapshot of ip->flags from before the
                 * update.  If the compare-and-set fails the flags changed
                 * underneath us, so loop and retry with a fresh snapshot.
                 */
                pass2 = ip->flags;
                cpu_ccfence();
                if (atomic_cmpset_int(&ip->flags,
                              pass2,
                              (pass2 & ~(HAMMER2_INODE_SYNCQ |
                                         HAMMER2_INODE_SYNCQ_WAKEUP)) |
                              HAMMER2_INODE_SYNCQ_PASS2) == 0)
                {
                        continue;
                }
                TAILQ_REMOVE(&pmp->syncq, ip, entry);
                --pmp->sideq_count;
                hammer2_spin_unex(&pmp->list_spin);

                /*
                 * Tickle anyone waiting on ip->flags or the hysteresis
                 * on the dirty inode count.
                 */
                if (pass2 & HAMMER2_INODE_SYNCQ_WAKEUP)
                        wakeup(&ip->flags);
                if (++wakecount >= hammer2_limit_dirty_inodes / 20 + 1) {
                        wakecount = 0;
                        hammer2_pfs_memory_wakeup(pmp, 0);
                }

                /*
                 * Relock the inode, and we inherit a ref from the above.
                 * We will check for a race after we acquire the vnode.
                 */
                hammer2_mtx_ex(&ip->lock);

                /*
                 * We need the vp in order to vfsync() dirty buffers, so if
                 * one isn't attached we can skip it.
                 *
                 * Ordering the inode lock and then the vnode lock has the
                 * potential to deadlock.  If we had left SYNCQ set that could
                 * also deadlock us against the frontend even if we don't hold
                 * any locks, but the latter is not a problem now since we
                 * cleared it.  igetv will temporarily release the inode lock
                 * in a safe manner to work-around the deadlock.
                 *
                 * Unfortunately it is still possible to deadlock when the
                 * frontend obtains multiple inode locks, because all the
                 * related vnodes are already locked (nor can the vnode locks
                 * be released and reacquired without messing up RECLAIM and
                 * INACTIVE sequencing).
                 *
                 * The solution for now is to move the vp back onto SIDEQ
                 * and set dorestart, which will restart the flush after we
                 * exhaust the current SYNCQ.  Note that additional
                 * dependencies may build up, so we definitely need to move
                 * the whole SIDEQ back to SYNCQ when we restart.
                 */
                vp = ip->vp;
                if (vp) {
                        if (vget(vp, LK_EXCLUSIVE|LK_NOWAIT)) {
                                /*
                                 * Failed to get the vnode, requeue the inode
                                 * (PASS2 is already set so it will be found
                                 * again on the restart).
                                 *
                                 * Then unlock, possibly sleep, and retry
                                 * later.  We sleep if PASS2 was *previously*
                                 * set, before we set it again above.
                                 */
                                vp = NULL;
                                dorestart = 1;
#ifdef HAMMER2_DEBUG_SYNC
                                kprintf("inum %ld (sync delayed by vnode)\n",
                                        (long)ip->meta.inum);
#endif
                                hammer2_inode_delayed_sideq(ip);

                                hammer2_mtx_unlock(&ip->lock);
                                hammer2_inode_drop(ip);

                                if (pass2 & HAMMER2_INODE_SYNCQ_PASS2) {
                                        tsleep(&dorestart, 0, "h2syndel", 2);
                                }
                                hammer2_spin_ex(&pmp->list_spin);
                                continue;
                        }
                } else {
                        vp = NULL;
                }

                /*
                 * If the inode wound up on a SIDEQ again it will already be
                 * prepped for another PASS2.  In this situation if we flush
                 * it now we will just wind up flushing it again in the same
                 * syncer run, so we might as well not flush it now.
                 */
                if (ip->flags & HAMMER2_INODE_SIDEQ) {
                        hammer2_mtx_unlock(&ip->lock);
                        hammer2_inode_drop(ip);
                        if (vp)
                                vput(vp);
                        dorestart = 1;
                        hammer2_spin_ex(&pmp->list_spin);
                        continue;
                }

                /*
                 * Ok we have the inode exclusively locked and if vp is
                 * not NULL that will also be exclusively locked.  Do the
                 * meat of the flush.
                 *
                 * vp token needed for v_rbdirty_tree check / vclrisdirty
                 * sequencing.  Though we hold the vnode exclusively so
                 * we shouldn't need to hold the token also in this case.
                 */
                if (vp) {
                        vfsync(vp, MNT_WAIT, 1, NULL, NULL);
                        bio_track_wait(NULL, 0, 0); /* XXX */
                }

                /*
                 * If the inode has not yet been inserted into the tree
                 * we must do so.  Then sync and flush it.  The flush should
                 * update the parent.
                 *
                 * DELETING takes precedence over CREATING when both flags
                 * are set.
                 */
                if (ip->flags & HAMMER2_INODE_DELETING) {
#ifdef HAMMER2_DEBUG_SYNC
                        kprintf("inum %ld destroy\n", (long)ip->meta.inum);
#endif
                        hammer2_inode_chain_des(ip);
                        atomic_add_long(&hammer2_iod_inode_deletes, 1);
                } else if (ip->flags & HAMMER2_INODE_CREATING) {
#ifdef HAMMER2_DEBUG_SYNC
                        kprintf("inum %ld insert\n", (long)ip->meta.inum);
#endif
                        hammer2_inode_chain_ins(ip);
                        atomic_add_long(&hammer2_iod_inode_creates, 1);
                }
#ifdef HAMMER2_DEBUG_SYNC
                kprintf("inum %ld chain-sync\n", (long)ip->meta.inum);
#endif

                /*
                 * Because I kinda messed up the design and index the inodes
                 * under the root inode, along side the directory entries,
                 * we can't flush the inode index under the iroot until the
                 * end.  If we do it now we might miss effects created by
                 * other inodes on the SYNCQ.
                 *
                 * Do a normal (non-FSSYNC) flush instead, which allows the
                 * vnode code to work the same.  We don't want to force iroot
                 * back onto the SIDEQ, and we also don't want the flush code
                 * to update pfs_iroot_blocksets until the final flush later.
                 *
                 * XXX at the moment this will likely result in a double-flush
                 * of the iroot chain.
                 */
                hammer2_inode_chain_sync(ip);
                if (ip == pmp->iroot) {
                        hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP);
                } else {
                        hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
                                                      HAMMER2_XOP_FSSYNC);
                }
                if (vp) {
                        /*
                         * Clear the vnode's dirty state only if the inode
                         * came out of the flush with none of the dirty
                         * flags set, otherwise requeue the inode.
                         */
                        lwkt_gettoken(NULL);
                        if ((ip->flags & (HAMMER2_INODE_MODIFIED |
                                          HAMMER2_INODE_RESIZED |
                                          HAMMER2_INODE_DIRTYDATA)) == 0) {
                            //RB_EMPTY(&vp->v_rbdirty_tree) &&
                            //!bio_track_active(&vp->v_track_write)) {
                                vclrisdirty(vp);
                        } else {
                                hammer2_inode_delayed_sideq(ip);
                        }
                        lwkt_reltoken(NULL);
                        vput(vp);
                        vp = NULL;      /* safety */
                }
                atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ_PASS2);
                hammer2_inode_unlock(ip);       /* unlock+drop */
                /* ip pointer invalid */

                /*
                 * If the inode got dirtied after we dropped our locks,
                 * it will have already been moved back to the SIDEQ.
                 */
                hammer2_spin_ex(&pmp->list_spin);
        }
        hammer2_spin_unex(&pmp->list_spin);
        hammer2_pfs_memory_wakeup(pmp, 0);

        /*
         * Repeat stage 1 if anything was requeued or a rescan was requested.
         * dorestart is forced to 1 so the restarted pass only picks up
         * depends flagged pass2.
         */
        if (dorestart || (pmp->trans.flags & HAMMER2_TRANS_RESCAN)) {
#ifdef HAMMER2_DEBUG_SYNC
                kprintf("FILESYSTEM SYNC STAGE 1 RESTART\n");
                /*tsleep(&dorestart, 0, "h2STG1-R", hz*20);*/
#endif
                dorestart = 1;
                goto restart;
        }
#ifdef HAMMER2_DEBUG_SYNC
        kprintf("FILESYSTEM SYNC STAGE 2 BEGIN\n");
        /*tsleep(&dorestart, 0, "h2STG2", hz*20);*/
#endif

        /*
         * We have to flush the PFS root last, even if it does not appear to
         * be dirty, because all the inodes in the PFS are indexed under it.
         * The normal flushing of iroot above would only occur if directory
         * entries under the root were changed.
         *
         * Specifying VOLHDR will cause an additional flush of hmp->spmp
         * for the media making up the cluster.
         */
        if ((ip = pmp->iroot) != NULL) {
                hammer2_inode_ref(ip);
                hammer2_mtx_ex(&ip->lock);
                hammer2_inode_chain_sync(ip);
                hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
                                              HAMMER2_XOP_FSSYNC |
                                              HAMMER2_XOP_VOLHDR);
                hammer2_inode_unlock(ip);       /* unlock+drop */
        }
#ifdef HAMMER2_DEBUG_SYNC
        kprintf("FILESYSTEM SYNC STAGE 2 DONE\n");
#endif

        /*
         * device bioq sync
         */
        hammer2_bioq_sync(pmp);

        /* No error from the flush calls above is propagated to the caller. */
        error = 0;      /* XXX */
        hammer2_trans_done(pmp, HAMMER2_TRANS_ISFLUSH);

        return (error);
}
2750
2751 #if 0
2752 static
2753 int
2754 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
2755 {
2756         hammer2_inode_t *ip;
2757
2758         KKASSERT(MAXFIDSZ >= 16);
2759         ip = VTOI(vp);
2760         fhp->fid_len = offsetof(struct fid, fid_data[16]);
2761         fhp->fid_ext = 0;
2762         ((hammer2_tid_t *)fhp->fid_data)[0] = ip->meta.inum;
2763         ((hammer2_tid_t *)fhp->fid_data)[1] = 0;
2764
2765         return 0;
2766 }
2767
2768 static
2769 int
2770 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
2771                struct fid *fhp, struct vnode **vpp)
2772 {
2773         hammer2_pfs_t *pmp;
2774         hammer2_tid_t inum;
2775         int error;
2776
2777         pmp = MPTOPMP(mp);
2778         inum = ((hammer2_tid_t *)fhp->fid_data)[0] & HAMMER2_DIRHASH_USERMSK;
2779         if (vpp) {
2780                 if (inum == 1)
2781                         error = hammer2_vfs_root(mp, vpp);
2782                 else
2783                         error = hammer2_vfs_vget(mp, NULL, inum, vpp);
2784         } else {
2785                 error = 0;
2786         }
2787         return error;
2788 }
2789
2790 static
2791 int
2792 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
2793                  int *exflagsp, struct ucred **credanonp)
2794 {
2795         hammer2_pfs_t *pmp;
2796         struct netcred *np;
2797         int error;
2798
2799         pmp = MPTOPMP(mp);
2800         np = vfs_export_lookup(mp, &pmp->export, nam);
2801         if (np) {
2802                 *exflagsp = np->netc_exflags;
2803                 *credanonp = &np->netc_anon;
2804                 error = 0;
2805         } else {
2806                 error = EACCES;
2807         }
2808         return error;
2809 }
2810 #endif
2811
/*
 * This handles hysteresis on regular file flushes.  Because the BIOs are
 * routed to a thread it is possible for an excessive number to build up
 * and cause long front-end stalls long before the runningbuffspace limit
 * is hit, so we implement hammer2_flush_pipe to control the
 * hysteresis.
 *
 * This is a particular problem when compression is used.
 *
 * hammer2_lwinprog_ref() atomically adds one to pmp->count_lwinprog.
 */
void
hammer2_lwinprog_ref(hammer2_pfs_t *pmp)
{
        atomic_add_int(&pmp->count_lwinprog, 1);
}
2826
/*
 * Release side of the count_lwinprog accounting.
 *
 * The entire body is compiled out with #if 0, so in this build the call
 * is a no-op: pmp->count_lwinprog is not decremented and nobody is woken
 * up.  The disabled code is kept for reference; it decrements the counter
 * and wakes sleepers once the count drops to 2/3 of hammer2_flush_pipe
 * (WAITING) or to zero (WAITING0).
 */
void
hammer2_lwinprog_drop(hammer2_pfs_t *pmp)
{
#if 0
        int lwinprog;

        lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
        if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
            (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
                atomic_clear_int(&pmp->count_lwinprog,
                                 HAMMER2_LWINPROG_WAITING);
                wakeup(&pmp->count_lwinprog);
        }
        if ((lwinprog & HAMMER2_LWINPROG_WAITING0) &&
            (lwinprog & HAMMER2_LWINPROG_MASK) <= 0) {
                atomic_clear_int(&pmp->count_lwinprog,
                                 HAMMER2_LWINPROG_WAITING0);
                wakeup(&pmp->count_lwinprog);
        }
#endif
}
2848
/*
 * Wait for the count_lwinprog level to drop to flush_pipe or below.
 *
 * The entire body is compiled out with #if 0, so in this build the call
 * returns immediately regardless of flush_pipe.  The disabled code is
 * kept for reference; it sleeps on &pmp->count_lwinprog in up-to-hz
 * intervals, flagging WAITING when flush_pipe is non-zero and WAITING0
 * otherwise.
 */
void
hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe)
{
#if 0
        int lwinprog;
        int lwflag = (flush_pipe) ? HAMMER2_LWINPROG_WAITING :
                                    HAMMER2_LWINPROG_WAITING0;

        for (;;) {
                lwinprog = pmp->count_lwinprog;
                cpu_ccfence();
                if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe)
                        break;
                tsleep_interlock(&pmp->count_lwinprog, 0);
                atomic_set_int(&pmp->count_lwinprog, lwflag);
                lwinprog = pmp->count_lwinprog;
                if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe)
                        break;
                tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
        }
#endif
}
2871
2872 #if 0
2873 /*
2874  * It is possible for an excessive number of dirty chains or dirty inodes
2875  * to build up.  When this occurs we start an asynchronous filesystem sync.
2876  * If the level continues to build up, we stall, waiting for it to drop,
2877  * with some hysteresis.
2878  *
2879  * This relies on the kernel calling hammer2_vfs_modifying() prior to
2880  * obtaining any vnode locks before making a modifying VOP call.
2881  */
2882 static int
2883 hammer2_vfs_modifying(struct mount *mp)
2884 {
2885         if (mp->mnt_flag & MNT_RDONLY)
2886                 return EROFS;
2887         hammer2_pfs_memory_wait(MPTOPMP(mp));
2888
2889         return 0;
2890 }
2891 #endif
2892
2893 /*
2894  * Initiate an asynchronous filesystem sync and, with hysteresis,
2895  * stall if the internal data structure count becomes too bloated.
2896  */
2897 void
2898 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
2899 {
2900         uint32_t waiting;
2901         int pcatch;
2902         int error;
2903
2904         if (pmp == NULL || pmp->mp == NULL)
2905                 return;
2906
2907         for (;;) {
2908                 waiting = pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK;
2909                 cpu_ccfence();
2910
2911                 /*
2912                  * Start the syncer running at 1/2 the limit
2913                  */
2914                 if (waiting > hammer2_limit_dirty_chains / 2 ||
2915                     pmp->sideq_count > hammer2_limit_dirty_inodes / 2) {
2916                         trigger_syncer(pmp->mp);
2917                 }
2918
2919                 /*
2920                  * Stall at the limit waiting for the counts to drop.
2921                  * This code will typically be woken up once the count
2922                  * drops below 3/4 the limit, or in one second.
2923                  */
2924                 if (waiting < hammer2_limit_dirty_chains &&
2925                     pmp->sideq_count < hammer2_limit_dirty_inodes) {
2926                         break;
2927                 }
2928
2929                 pcatch = curthread->td_proc ? PCATCH : 0;
2930
2931                 tsleep_interlock(&pmp->inmem_dirty_chains, pcatch);
2932                 atomic_set_int(&pmp->inmem_dirty_chains,
2933                                HAMMER2_DIRTYCHAIN_WAITING);
2934                 if (waiting < hammer2_limit_dirty_chains &&
2935                     pmp->sideq_count < hammer2_limit_dirty_inodes) {
2936                         break;
2937                 }
2938                 trigger_syncer(pmp->mp);
2939                 error = tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED | pcatch,
2940                                "h2memw", hz);
2941                 if (error == ERESTART)
2942                         break;
2943         }
2944 }
2945
2946 /*
2947  * Wake up any stalled frontend ops waiting, with hysteresis, using
2948  * 2/3 of the limit.
2949  */
2950 void
2951 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp, int count)
2952 {
2953         uint32_t waiting;
2954
2955         if (pmp) {
2956                 waiting = atomic_fetchadd_int(&pmp->inmem_dirty_chains, count);
2957                 /* don't need --waiting to test flag */
2958
2959                 if ((waiting & HAMMER2_DIRTYCHAIN_WAITING) &&
2960                     (pmp->inmem_dirty_chains & HAMMER2_DIRTYCHAIN_MASK) <=
2961                     hammer2_limit_dirty_chains * 2 / 3 &&
2962                     pmp->sideq_count <= hammer2_limit_dirty_inodes * 2 / 3) {
2963                         atomic_clear_int(&pmp->inmem_dirty_chains,
2964                                          HAMMER2_DIRTYCHAIN_WAITING);
2965                         wakeup(&pmp->inmem_dirty_chains);
2966                 }
2967         }
2968 }
2969
2970 void
2971 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp)
2972 {
2973         if (pmp) {
2974                 atomic_add_int(&pmp->inmem_dirty_chains, 1);
2975         }
2976 }
2977
/*
 * Volume header data locks
 *
 * hammer2_voldata_lock() acquires hmp->vollk exclusively via lockmgr();
 * release it with hammer2_voldata_unlock().
 */
void
hammer2_voldata_lock(hammer2_dev_t *hmp)
{
        lockmgr(&hmp->vollk, LK_EXCLUSIVE);
}
2986
/*
 * Release hmp->vollk, the lock taken by hammer2_voldata_lock().
 */
void
hammer2_voldata_unlock(hammer2_dev_t *hmp)
{
        lockmgr(&hmp->vollk, LK_RELEASE);
}
2992
2993 /*
2994  * Caller indicates that the volume header is being modified.  Flag
2995  * the related chain and adjust its transaction id.
2996  *
2997  * The transaction id is set to voldata.mirror_tid + 1, similar to
2998  * what hammer2_chain_modify() does.  Be very careful here, volume
2999  * data can be updated independently of the rest of the filesystem.
3000  */
3001 void
3002 hammer2_voldata_modify(hammer2_dev_t *hmp)
3003 {
3004         if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) == 0) {
3005                 atomic_add_long(&hammer2_count_modified_chains, 1);
3006                 atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
3007                 hammer2_pfs_memory_inc(hmp->vchain.pmp);
3008                 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid + 1;
3009         }
3010 }
3011
3012 /*
3013  * Returns 0 if the filesystem has tons of free space
3014  * Returns 1 if the filesystem has less than 10% remaining
3015  * Returns 2 if the filesystem has less than 2%/5% (user/root) remaining.
3016  */
3017 int
3018 hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred)
3019 {
3020         hammer2_pfs_t *pmp;
3021         hammer2_dev_t *hmp;
3022         hammer2_off_t free_reserved;
3023         hammer2_off_t free_nominal;
3024         int i;
3025
3026         pmp = ip->pmp;
3027
3028         if (/*XXX*/ 1 || pmp->free_ticks == 0 || pmp->free_ticks != ticks) {
3029                 free_reserved = HAMMER2_SEGSIZE;
3030                 free_nominal = 0x7FFFFFFFFFFFFFFFLLU;
3031                 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
3032                         hmp = pmp->pfs_hmps[i];
3033                         if (hmp == NULL)
3034                                 continue;
3035                         if (pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER &&
3036                             pmp->pfs_types[i] != HAMMER2_PFSTYPE_SOFT_MASTER)
3037                                 continue;
3038
3039                         if (free_nominal > hmp->voldata.allocator_free)
3040                                 free_nominal = hmp->voldata.allocator_free;
3041                         if (free_reserved < hmp->free_reserved)
3042                                 free_reserved = hmp->free_reserved;
3043                 }
3044
3045                 /*
3046                  * SMP races ok
3047                  */
3048                 pmp->free_reserved = free_reserved;
3049                 pmp->free_nominal = free_nominal;
3050                 pmp->free_ticks = ticks;
3051         } else {
3052                 free_reserved = pmp->free_reserved;
3053                 free_nominal = pmp->free_nominal;
3054         }
3055         if (cred && cred->cr_uid != 0) {
3056                 if ((int64_t)(free_nominal - bytes) <
3057                     (int64_t)free_reserved) {
3058                         return 2;
3059                 }
3060         } else {
3061                 if ((int64_t)(free_nominal - bytes) <
3062                     (int64_t)free_reserved / 2) {
3063                         return 2;
3064                 }
3065         }
3066         if ((int64_t)(free_nominal - bytes) < (int64_t)free_reserved * 2)
3067                 return 1;
3068         return 0;
3069 }
3070
3071 /*
3072  * Debugging
3073  */
3074 void
3075 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int bi, int *countp,
3076                    char pfx, u_int flags)
3077 {
3078         hammer2_chain_t *scan;
3079         hammer2_chain_t *parent;
3080
3081         if (hammer2_debug & 0x80000000)
3082                 *countp = INT_MAX;
3083
3084         --*countp;
3085         if (*countp == 0) {
3086                 kprintf("%*.*s...\n", tab, tab, "");
3087                 return;
3088         }
3089         if (*countp < 0)
3090                 return;
3091         kprintf("%*.*s%c-chain %p %s.%-3d %016jx %016jx/%-2d mir=%016jx\n",
3092                 tab, tab, "", pfx, chain,
3093                 hammer2_bref_type_str(chain->bref.type), bi,
3094                 chain->bref.data_off, chain->bref.key, chain->bref.keybits,
3095                 chain->bref.mirror_tid);
3096
3097         kprintf("%*.*s      [%08x] (%s) refs=%d",
3098                 tab, tab, "",
3099                 chain->flags,
3100                 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
3101                 chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
3102                 chain->refs);
3103
3104         parent = chain->parent;
3105         if (parent)
3106                 kprintf("\n%*.*s      p=%p [pflags %08x prefs %d]",
3107                         tab, tab, "",
3108                         parent, parent->flags, parent->refs);
3109         if (RB_EMPTY(&chain->core.rbtree)) {
3110                 kprintf("\n");
3111         } else {
3112                 int bi = 0;
3113                 kprintf(" {\n");
3114                 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) {
3115                         if ((scan->flags & flags) || flags == (u_int)-1) {
3116                                 hammer2_dump_chain(scan, tab + 4, bi, countp,
3117                                                    'a', flags);
3118                         }
3119                         bi++;
3120                 }
3121                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
3122                         kprintf("%*.*s}(%s)\n", tab, tab, "",
3123                                 chain->data->ipdata.filename);
3124                 else
3125                         kprintf("%*.*s}\n", tab, tab, "");
3126         }
3127 }
3128
3129 void
3130 hammer2_dump_chains(hammer2_dev_t *hmp, char vpfx, char fpfx)
3131 {
3132         int dumpcnt;
3133
3134         dumpcnt = 50;
3135         hammer2_dump_chain(&hmp->vchain, 0, 0, &dumpcnt, vpfx, (u_int)-1);
3136
3137         dumpcnt = 50;
3138         hammer2_dump_chain(&hmp->fchain, 0, 0, &dumpcnt, fpfx, (u_int)-1);
3139 }