/*
 * Copyright (c) 2011-2018 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/uuid.h>
#include <sys/vfsops.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/objcache.h>

#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>

#include <sys/mutex.h>
#include <sys/mutex2.h>

#include "hammer2.h"
#include "hammer2_disk.h"
#include "hammer2_mount.h"
#include "hammer2_lz4.h"

#include "zlib/hammer2_zlib.h"

#define REPORT_REFS_ERRORS 1    /* XXX remove me */

MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");

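/*
 * Per-pass state handed to the filesystem sync scan code.
 */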
struct hammer2_sync_info {
        int error;
        int waitfor;
        int pass;
};

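/*
 * Global lists of device (hmp) and PFS (pmp) mounts, manipulated
 * under hammer2_mntlk.
 */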
TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
static struct hammer2_mntlist hammer2_mntlist;

struct hammer2_pfslist hammer2_pfslist;
struct hammer2_pfslist hammer2_spmplist;
struct lock hammer2_mntlk;

int hammer2_supported_version = HAMMER2_VOL_VERSION_DEFAULT;
int hammer2_debug;
int hammer2_cluster_meta_read = 1;      /* physical read-ahead */
int hammer2_cluster_data_read = 4;      /* physical read-ahead */
int hammer2_cluster_write = 0;          /* physical write clustering */
int hammer2_dedup_enable = 1;
int hammer2_always_compress = 0;        /* always try to compress */
int hammer2_inval_enable = 0;
int hammer2_flush_pipe = 100;
int hammer2_dio_count;
int hammer2_dio_limit = 256;
int hammer2_bulkfree_tps = 5000;
long hammer2_chain_allocs;
long hammer2_chain_frees;
long hammer2_limit_dirty_chains;
long hammer2_limit_dirty_inodes;
long hammer2_count_modified_chains;
long hammer2_iod_invals;
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;
long hammer2_iod_file_write;
long hammer2_iod_file_wembed;
long hammer2_iod_file_wzero;
long hammer2_iod_file_wdedup;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;
long hammer2_iod_inode_creates;
long hammer2_iod_inode_deletes;

MALLOC_DECLARE(M_HAMMER2_CBUFFER);
MALLOC_DEFINE(M_HAMMER2_CBUFFER, "HAMMER2-compbuffer",
                "Buffer used for compression.");

MALLOC_DECLARE(M_HAMMER2_DEBUFFER);
MALLOC_DEFINE(M_HAMMER2_DEBUFFER, "HAMMER2-decompbuffer",
                "Buffer used for decompression.");

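/*
 * Tunables and statistics exported under the vfs.hammer2 sysctl tree.
 */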
SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");

SYSCTL_INT(_vfs_hammer2, OID_AUTO, supported_version, CTLFLAG_RD,
           &hammer2_supported_version, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
           &hammer2_debug, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_meta_read, CTLFLAG_RW,
           &hammer2_cluster_meta_read, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_data_read, CTLFLAG_RW,
           &hammer2_cluster_data_read, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_write, CTLFLAG_RW,
           &hammer2_cluster_write, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dedup_enable, CTLFLAG_RW,
           &hammer2_dedup_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, always_compress, CTLFLAG_RW,
           &hammer2_always_compress, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, inval_enable, CTLFLAG_RW,
           &hammer2_inval_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, bulkfree_tps, CTLFLAG_RW,
           &hammer2_bulkfree_tps, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_allocs, CTLFLAG_RW,
           &hammer2_chain_allocs, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, chain_frees, CTLFLAG_RW,
           &hammer2_chain_frees, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
           &hammer2_limit_dirty_chains, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_inodes, CTLFLAG_RW,
           &hammer2_limit_dirty_inodes, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, count_modified_chains, CTLFLAG_RW,
           &hammer2_count_modified_chains, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
           &hammer2_dio_count, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_limit, CTLFLAG_RW,
           &hammer2_dio_limit, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_invals, CTLFLAG_RW,
           &hammer2_iod_invals, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
           &hammer2_iod_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
           &hammer2_iod_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
           &hammer2_iod_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
           &hammer2_iod_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
           &hammer2_iod_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
           &hammer2_iod_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wembed, CTLFLAG_RW,
           &hammer2_iod_file_wembed, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wzero, CTLFLAG_RW,
           &hammer2_iod_file_wzero, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_wdedup, CTLFLAG_RW,
           &hammer2_iod_file_wdedup, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
           &hammer2_iod_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
           &hammer2_iod_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
           &hammer2_iod_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_creates, CTLFLAG_RW,
           &hammer2_iod_inode_creates, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_inode_deletes, CTLFLAG_RW,
           &hammer2_iod_inode_deletes, 0, "");

long hammer2_process_icrc32;
long hammer2_process_xxhash64;
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_icrc32, CTLFLAG_RW,
           &hammer2_process_icrc32, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, process_xxhash64, CTLFLAG_RW,
           &hammer2_process_xxhash64, 0, "");

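/*
 * Forward declarations for the VFS entry points and local helpers.
 */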
static int hammer2_vfs_init(struct vfsconf *conf);
static int hammer2_vfs_uninit(struct vfsconf *vfsp);
static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                struct ucred *cred);
static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
                                struct vnode *, struct ucred *);
static int hammer2_recovery(hammer2_dev_t *hmp);
static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                                struct fid *fhp, struct vnode **vpp);
static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);

static int hammer2_install_volume_header(hammer2_dev_t *hmp);
#if 0
static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
#endif

static void hammer2_update_pmps(hammer2_dev_t *hmp);

static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
                                hammer2_dev_t *hmp);
static int hammer2_fixup_pfses(hammer2_dev_t *hmp);

/*
 * HAMMER2 vfs operations.
 */
static struct vfsops hammer2_vfsops = {
        .vfs_init       = hammer2_vfs_init,
        .vfs_uninit     = hammer2_vfs_uninit,
        .vfs_sync       = hammer2_vfs_sync,
        .vfs_mount      = hammer2_vfs_mount,
        .vfs_unmount    = hammer2_vfs_unmount,
        .vfs_root       = hammer2_vfs_root,
        .vfs_statfs     = hammer2_vfs_statfs,
        .vfs_statvfs    = hammer2_vfs_statvfs,
        .vfs_vget       = hammer2_vfs_vget,
        .vfs_vptofh     = hammer2_vfs_vptofh,
        .vfs_fhtovp     = hammer2_vfs_fhtovp,
        .vfs_checkexp   = hammer2_vfs_checkexp
};

MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");

VFS_SET(hammer2_vfsops, hammer2, VFCF_MPSAFE);
MODULE_VERSION(hammer2, 1);

static
int
hammer2_vfs_init(struct vfsconf *conf)
{
        static struct objcache_malloc_args margs_read;
        static struct objcache_malloc_args margs_write;
        static struct objcache_malloc_args margs_vop;

        int error;

        error = 0;

        /*
         * A large DIO cache is needed to retain dedup enablement masks.
         * The bulkfree code clears related masks as part of the disk block
         * recycling algorithm, preventing it from being used for a later
         * dedup.
         *
         * NOTE: A large buffer cache can actually interfere with dedup
         *       operation because we dedup based on media physical buffers
         *       and not logical buffers.  Try to make the DIO case large
         *       enough to avoid this problem, but also cap it.
         */
        hammer2_dio_limit = nbuf * 2;
        if (hammer2_dio_limit > 100000)
                hammer2_dio_limit = 100000;

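        /*
         * Sanity-check that the compiled structure sizes match the
         * on-media layout constants; refuse to continue on a mismatch.
         */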
        if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
                error = EINVAL;
        if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
                error = EINVAL;
        if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
                error = EINVAL;

        if (error)
                kprintf("HAMMER2 structure size mismatch; cannot continue.\n");

        margs_read.objsize = 65536;
        margs_read.mtype = M_HAMMER2_DEBUFFER;

        margs_write.objsize = 32768;
        margs_write.mtype = M_HAMMER2_CBUFFER;

        margs_vop.objsize = sizeof(hammer2_xop_t);
        margs_vop.mtype = M_HAMMER2;

        /*
         * Note that for the XOPS cache we want backing store allocations
         * to use M_ZERO.  This is not allowed in objcache_get() (to avoid
         * confusion), so use the backing store function that does it.  This
         * means that initial XOPS objects are zeroed but REUSED objects are
         * not.  So we are responsible for cleaning the object up sufficiently
         * for our needs before objcache_put()ing it back (typically just the
         * FIFO indices).
         */
        cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL,
                                objcache_malloc_alloc,
                                objcache_malloc_free,
                                &margs_read);
        cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL,
                                objcache_malloc_alloc,
                                objcache_malloc_free,
                                &margs_write);
        cache_xops = objcache_create(margs_vop.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL,
                                objcache_malloc_alloc_zero,
                                objcache_malloc_free,
                                &margs_vop);

        lockinit(&hammer2_mntlk, "mntlk", 0, 0);
        TAILQ_INIT(&hammer2_mntlist);
        TAILQ_INIT(&hammer2_pfslist);
        TAILQ_INIT(&hammer2_spmplist);

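        /*
         * Scale the dirty chain and dirty inode limits off maxvnodes,
         * with floors and hard caps, so writeback thresholds track
         * system size.
         */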
        hammer2_limit_dirty_chains = maxvnodes / 10;
        if (hammer2_limit_dirty_chains > HAMMER2_LIMIT_DIRTY_CHAINS)
                hammer2_limit_dirty_chains = HAMMER2_LIMIT_DIRTY_CHAINS;

        hammer2_limit_dirty_inodes = maxvnodes / 100;
        if (hammer2_limit_dirty_inodes < 100)
                hammer2_limit_dirty_inodes = 100;
        if (hammer2_limit_dirty_inodes > HAMMER2_LIMIT_DIRTY_INODES)
                hammer2_limit_dirty_inodes = HAMMER2_LIMIT_DIRTY_INODES;

        return (error);
}

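/*
 * Tear down the object caches created by hammer2_vfs_init().
 */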
static
int
hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
{
        objcache_destroy(cache_buffer_read);
        objcache_destroy(cache_buffer_write);
        objcache_destroy(cache_xops);
        return 0;
}

/*
 * Core PFS allocator.  Used to allocate or reference the pmp structure
 * for PFS cluster mounts and the spmp structure for media (hmp) structures.
 * The pmp can be passed in or loaded by this function using the chain and
 * inode data.
 *
 * pmp->modify_tid tracks new modify_tid transaction ids for front-end
 * transactions.  Note that synchronization does not use this field.
 * (typically frontend operations and synchronization cannot run on the
 * same PFS node at the same time).
 *
 * XXX check locking
 */
hammer2_pfs_t *
hammer2_pfsalloc(hammer2_chain_t *chain,
                 const hammer2_inode_data_t *ripdata,
                 hammer2_tid_t modify_tid, hammer2_dev_t *force_local)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *iroot;
        int count;
        int i;
        int j;

        pmp = NULL;

        /*
         * Locate or create the PFS based on the cluster id.  If ripdata
         * is NULL this is a spmp which is unique and is always allocated.
         *
         * If the device is mounted in local mode all PFSs are considered
         * independent and not part of any cluster (for debugging only).
         */
        if (ripdata) {
                TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                        if (force_local != pmp->force_local)
                                continue;
                        if (force_local == NULL &&
                            bcmp(&pmp->pfs_clid, &ripdata->meta.pfs_clid,
                                 sizeof(pmp->pfs_clid)) == 0) {
                                        break;
                        } else if (force_local && pmp->pfs_names[0] &&
                            strcmp(pmp->pfs_names[0], ripdata->filename) == 0) {
                                        break;
                        }
                }
        }

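        /*
         * No match was found (or this is a spmp); allocate and
         * initialize a fresh pmp.
         */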
        if (pmp == NULL) {
                pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
                pmp->force_local = force_local;
                hammer2_trans_manage_init(pmp);
                kmalloc_create(&pmp->minode, "HAMMER2-inodes");
                kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
                lockinit(&pmp->lock, "pfslk", 0, 0);
                lockinit(&pmp->lock_nlink, "h2nlink", 0, 0);
                spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
                spin_init(&pmp->xop_spin, "h2xop");
                spin_init(&pmp->lru_spin, "h2lru");
                RB_INIT(&pmp->inum_tree);
                TAILQ_INIT(&pmp->sideq);
                TAILQ_INIT(&pmp->syncq);
                TAILQ_INIT(&pmp->lru_list);
                spin_init(&pmp->list_spin, "hm2pfsalloc_list");

                /*
                 * Distribute backend operations to threads
                 */
                for (i = 0; i < HAMMER2_XOPGROUPS; ++i)
                        hammer2_xop_group_init(pmp, &pmp->xop_groups[i]);

                /*
                 * Save the PFS cluster id and enter the pmp onto the
                 * appropriate global list (PFS list or spmp list).
                 */
                if (ripdata) {
                        pmp->pfs_clid = ripdata->meta.pfs_clid;
                        TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);
                } else {
                        pmp->flags |= HAMMER2_PMPF_SPMP;
                        TAILQ_INSERT_TAIL(&hammer2_spmplist, pmp, mntentry);
                }

                /*
                 * The synchronization thread may start too early; make
                 * sure it stays frozen until we are ready to let it go.
                 * XXX
                 */
                /*
                pmp->primary_thr.flags = HAMMER2_THREAD_FROZEN |
                                         HAMMER2_THREAD_REMASTER;
                */
        }

        /*
         * Create the PFS's root inode and any missing XOP helper threads.
         */
        if ((iroot = pmp->iroot) == NULL) {
                iroot = hammer2_inode_get(pmp, NULL, 1, -1);
                if (ripdata)
                        iroot->meta = ripdata->meta;
                pmp->iroot = iroot;
                hammer2_inode_ref(iroot);
                hammer2_inode_unlock(iroot);
        }

        /*
         * Stop here if no chain is passed in.
         */
        if (chain == NULL)
                goto done;

        /*
         * When a chain is passed in we must add it to the PFS's root
         * inode, update pmp->pfs_types[], and update the synchronization
         * threads.
         *
         * When forcing local mode, mark the PFS as a MASTER regardless.
         *
         * At the moment empty spots can develop due to removals or failures.
         * Ultimately we want to re-fill these spots but doing so might
         * confuse running code. XXX
         */
        hammer2_inode_ref(iroot);
        hammer2_mtx_ex(&iroot->lock);
        j = iroot->cluster.nchains;

        if (j == HAMMER2_MAXCLUSTER) {
                kprintf("hammer2_mount: cluster full!\n");
                /* XXX fatal error? */
        } else {
                KKASSERT(chain->pmp == NULL);
                chain->pmp = pmp;
                hammer2_chain_ref(chain);
                iroot->cluster.array[j].chain = chain;
                if (force_local)
                        pmp->pfs_types[j] = HAMMER2_PFSTYPE_MASTER;
                else
                        pmp->pfs_types[j] = ripdata->meta.pfs_type;
                pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2);
                pmp->pfs_hmps[j] = chain->hmp;
                hammer2_spin_ex(&pmp->inum_spin);
                pmp->pfs_iroot_blocksets[j] = chain->data->ipdata.u.blockset;
                hammer2_spin_unex(&pmp->inum_spin);

                /*
                 * If the PFS is already mounted we must account
                 * for the mount_count here.
                 */
                if (pmp->mp)
                        ++chain->hmp->mount_count;

                /*
                 * May have to fixup dirty chain tracking.  Previous
                 * pmp was NULL so nothing to undo.
                 */
                if (chain->flags & HAMMER2_CHAIN_MODIFIED)
                        hammer2_pfs_memory_inc(pmp);
                ++j;
        }
        iroot->cluster.nchains = j;

        /*
         * Update nmasters from any PFS inode which is part of the cluster.
         * It is possible that this will result in a value which is too
         * high.  MASTER PFSs are authoritative for pfs_nmasters and will
         * override this value later on.
         *
         * (This informs us of masters that might not currently be
         *  discoverable by this mount).
         */
        if (ripdata && pmp->pfs_nmasters < ripdata->meta.pfs_nmasters) {
                pmp->pfs_nmasters = ripdata->meta.pfs_nmasters;
        }

        /*
         * Count visible masters.  Masters are usually added with
         * ripdata->meta.pfs_nmasters set to 1.  This detects when there
         * are more (XXX and must update the master inodes).
         */
        count = 0;
        for (i = 0; i < iroot->cluster.nchains; ++i) {
                if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)
                        ++count;
        }
        if (pmp->pfs_nmasters < count)
                pmp->pfs_nmasters = count;

        /*
         * Create missing synchronization and support threads.
         *
         * Single-node masters (including snapshots) have nothing to
         * synchronize and do not require this thread.
         *
         * Multi-node masters or any number of soft masters, slaves, copy,
         * or other PFS types need the thread.
         *
         * Each thread is responsible for its particular cluster index.
         * We use independent threads so stalls or mismatches related to
         * any given target do not affect other targets.
         */
        for (i = 0; i < iroot->cluster.nchains; ++i) {
                /*
                 * Single-node masters (including snapshots) have nothing
                 * to synchronize and will make direct xops support calls,
                 * thus they do not require this thread.
                 *
                 * Note that there can be thousands of snapshots.  We do not
                 * want to create thousands of threads.
                 */
                if (pmp->pfs_nmasters <= 1 &&
                    pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER) {
                        continue;
                }

                /*
                 * Sync support thread
                 */
                if (pmp->sync_thrs[i].td == NULL) {
                        hammer2_thr_create(&pmp->sync_thrs[i], pmp, NULL,
                                           "h2nod", i, -1,
                                           hammer2_primary_sync_thread);
                }
        }

        /*
         * Create missing Xop threads
         *
         * NOTE: We create helper threads for all mounted PFSs or any
         *       PFSs with 2+ nodes (so the sync thread can update them,
         *       even if not mounted).
         */
        if (pmp->mp || iroot->cluster.nchains >= 2)
                hammer2_xop_helper_create(pmp);

        hammer2_mtx_unlock(&iroot->lock);
        hammer2_inode_drop(iroot);
done:
        return pmp;
}

/*
 * Deallocate an element of a probed PFS.  If destroying and this is a
 * MASTER, adjust nmasters.
 *
 * This function does not physically destroy the PFS element in its device
 * under the super-root (see hammer2_ioctl_pfs_delete()).
 */
void
hammer2_pfsdealloc(hammer2_pfs_t *pmp, int clindex, int destroying)
{
        hammer2_inode_t *iroot;
        hammer2_chain_t *chain;
        int j;

        /*
         * Cleanup our reference on iroot.  iroot is not (and should not be)
         * needed by the flush code.
         */
        iroot = pmp->iroot;
        if (iroot) {
                /*
                 * Stop synchronizing
                 *
                 * XXX flush after acquiring the iroot lock.
                 * XXX clean out the cluster index from all inode structures.
                 */
                hammer2_thr_delete(&pmp->sync_thrs[clindex]);

                /*
                 * Remove the cluster index from the group.  If destroying
                 * the PFS and this is a master, adjust pfs_nmasters.
                 */
                hammer2_mtx_ex(&iroot->lock);
                chain = iroot->cluster.array[clindex].chain;
                iroot->cluster.array[clindex].chain = NULL;

                switch(pmp->pfs_types[clindex]) {
                case HAMMER2_PFSTYPE_MASTER:
                        if (destroying && pmp->pfs_nmasters > 0)
                                --pmp->pfs_nmasters;
                        /* XXX adjust ripdata->meta.pfs_nmasters */
                        break;
                default:
                        break;
                }
                pmp->pfs_types[clindex] = HAMMER2_PFSTYPE_NONE;

                hammer2_mtx_unlock(&iroot->lock);

                /*
                 * Release the chain.
                 */
                if (chain) {
                        atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
                        hammer2_chain_drop(chain);
                }

                /*
                 * Terminate all XOP threads for the cluster index.
                 */
                for (j = 0; j < HAMMER2_XOPGROUPS; ++j)
                        hammer2_thr_delete(&pmp->xop_groups[j].thrs[clindex]);
        }
}

/*
 * Destroy a PFS, typically only occurs after the last mount on a device
 * has gone away.
 */
static void
hammer2_pfsfree(hammer2_pfs_t *pmp)
{
        hammer2_inode_t *iroot;
        hammer2_chain_t *chain;
        int chains_still_present = 0;
        int i;
        int j;

        /*
         * Cleanup our reference on iroot.  iroot is not (and should not be)
         * needed by the flush code.
         */
        if (pmp->flags & HAMMER2_PMPF_SPMP)
                TAILQ_REMOVE(&hammer2_spmplist, pmp, mntentry);
        else
                TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);

        /*
         * Cleanup chains remaining on LRU list.
         */
        hammer2_spin_ex(&pmp->lru_spin);
        while ((chain = TAILQ_FIRST(&pmp->lru_list)) != NULL) {
                KKASSERT(chain->flags & HAMMER2_CHAIN_ONLRU);
                atomic_add_int(&pmp->lru_count, -1);
                atomic_clear_int(&chain->flags, HAMMER2_CHAIN_ONLRU);
                TAILQ_REMOVE(&pmp->lru_list, chain, lru_node);
                hammer2_chain_ref(chain);
                hammer2_spin_unex(&pmp->lru_spin);
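                /*
                 * The spinlock is dropped across the chain drop since
                 * releasing the last reference on a chain can block.
                 */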
                atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
                hammer2_chain_drop(chain);
                hammer2_spin_ex(&pmp->lru_spin);
        }
        hammer2_spin_unex(&pmp->lru_spin);

        /*
         * Clean up iroot
         */
        iroot = pmp->iroot;
        if (iroot) {
                for (i = 0; i < iroot->cluster.nchains; ++i) {
                        hammer2_thr_delete(&pmp->sync_thrs[i]);
                        for (j = 0; j < HAMMER2_XOPGROUPS; ++j)
                                hammer2_thr_delete(&pmp->xop_groups[j].thrs[i]);
                        chain = iroot->cluster.array[i].chain;
                        if (chain && !RB_EMPTY(&chain->core.rbtree)) {
                                kprintf("hammer2: Warning pmp %p still "
                                        "has active chains\n", pmp);
                                chains_still_present = 1;
                        }
                }
#if REPORT_REFS_ERRORS
                if (iroot->refs != 1)
                        kprintf("PMP->IROOT %p REFS WRONG %d\n",
                                iroot, iroot->refs);
#else
                KKASSERT(iroot->refs == 1);
#endif
                /* ref for iroot */
                hammer2_inode_drop(iroot);
                pmp->iroot = NULL;
        }

        /*
         * Free remaining pmp resources
         */
        if (chains_still_present) {
                kprintf("hammer2: cannot free pmp %p, still in use\n", pmp);
        } else {
                kmalloc_destroy(&pmp->mmsg);
                kmalloc_destroy(&pmp->minode);
                kfree(pmp, M_HAMMER2);
        }
}

/*
 * Remove all references to hmp from the pfs list.  Any PFS which becomes
 * empty is terminated and freed.
 *
 * XXX inefficient.
 */
static void
hammer2_pfsfree_scan(hammer2_dev_t *hmp, int which)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *iroot;
        hammer2_chain_t *rchain;
        int i;
        int j;
        struct hammer2_pfslist *wlist;

        if (which == 0)
                wlist = &hammer2_pfslist;
        else
                wlist = &hammer2_spmplist;
again:
        TAILQ_FOREACH(pmp, wlist, mntentry) {
                if ((iroot = pmp->iroot) == NULL)
                        continue;

                /*
                 * Determine if this PFS is affected.  If it is we must
                 * freeze all management threads and lock its iroot.
                 *
                 * Freezing a management thread forces it idle; in-progress
                 * operations are aborted and must start over when the
                 * thread is unfrozen (or the thread exits if told to).
                 */
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] == hmp)
                                break;
                }
                if (i == HAMMER2_MAXCLUSTER)
                        continue;

                hammer2_vfs_sync_pmp(pmp, MNT_WAIT);

                /*
                 * Make sure all synchronization threads are locked
                 * down.
                 */
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] == NULL)
                                continue;
                        hammer2_thr_freeze_async(&pmp->sync_thrs[i]);
                        for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
                                hammer2_thr_freeze_async(
                                        &pmp->xop_groups[j].thrs[i]);
                        }
                }
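                /*
                 * The async pass above initiates the freezes in parallel;
                 * this pass waits for each thread to actually comply.
                 */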
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] == NULL)
                                continue;
                        hammer2_thr_freeze(&pmp->sync_thrs[i]);
                        for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
                                hammer2_thr_freeze(
                                        &pmp->xop_groups[j].thrs[i]);
                        }
                }

                /*
                 * Lock the inode and clean out matching chains.
                 * Note that we cannot use hammer2_inode_lock_*()
                 * here because that would attempt to validate the
                 * cluster that we are in the middle of ripping
                 * apart.
                 *
                 * WARNING! We are working directly on the inode's
                 *          embedded cluster.
                 */
                hammer2_mtx_ex(&iroot->lock);

                /*
                 * Remove the chain from matching elements of the PFS.
                 */
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] != hmp)
                                continue;
                        hammer2_thr_delete(&pmp->sync_thrs[i]);
                        for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
                                hammer2_thr_delete(
                                        &pmp->xop_groups[j].thrs[i]);
                        }
                        rchain = iroot->cluster.array[i].chain;
                        iroot->cluster.array[i].chain = NULL;
                        pmp->pfs_types[i] = 0;
                        if (pmp->pfs_names[i]) {
                                kfree(pmp->pfs_names[i], M_HAMMER2);
                                pmp->pfs_names[i] = NULL;
                        }
                        if (rchain) {
                                hammer2_chain_drop(rchain);
                                /* focus hint */
                                if (iroot->cluster.focus == rchain)
                                        iroot->cluster.focus = NULL;
                        }
                        pmp->pfs_hmps[i] = NULL;
                }
                hammer2_mtx_unlock(&iroot->lock);

                /*
                 * Cleanup trailing chains.  Gaps may remain.
                 */
                for (i = HAMMER2_MAXCLUSTER - 1; i >= 0; --i) {
                        if (pmp->pfs_hmps[i])
                                break;
                }
                iroot->cluster.nchains = i + 1;

                /*
                 * If the PMP has no elements remaining we can destroy it.
                 * (this will transition management threads from frozen->exit).
                 */
                if (iroot->cluster.nchains == 0) {
                        /*
                         * If this was the hmp's spmp, we need to clean
                         * a little more stuff out.
                         */
                        if (hmp->spmp == pmp) {
                                hmp->spmp = NULL;
                                hmp->vchain.pmp = NULL;
                                hmp->fchain.pmp = NULL;
                        }

                        /*
                         * Free the pmp and restart the loop
                         */
                        KKASSERT(TAILQ_EMPTY(&pmp->sideq));
                        KKASSERT(TAILQ_EMPTY(&pmp->syncq));
                        hammer2_pfsfree(pmp);
                        goto again;
                }

                /*
                 * If elements still remain we need to set the REMASTER
                 * flag and unfreeze it.
                 */
                for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                        if (pmp->pfs_hmps[i] == NULL)
                                continue;
                        hammer2_thr_remaster(&pmp->sync_thrs[i]);
                        hammer2_thr_unfreeze(&pmp->sync_thrs[i]);
                        for (j = 0; j < HAMMER2_XOPGROUPS; ++j) {
                                hammer2_thr_remaster(
                                        &pmp->xop_groups[j].thrs[i]);
                                hammer2_thr_unfreeze(
                                        &pmp->xop_groups[j].thrs[i]);
                        }
                }
        }
}

/*
 * Mount or remount HAMMER2 filesystem from physical media
 *
 *      mountroot
 *              mp              mount point structure
 *              path            NULL
 *              data            <unused>
 *              cred            <unused>
 *
 *      mount
 *              mp              mount point structure
 *              path            path to mount point
 *              data            pointer to argument structure in user space
 *                      volume  volume path (device@LABEL form)
 *                      hflags  user mount flags
 *              cred            user credentials
 *
 * RETURNS:     0       Success
 *              !0      error number
 */
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  struct ucred *cred)
{
        struct hammer2_mount_info info;
        hammer2_pfs_t *pmp;
        hammer2_pfs_t *spmp;
        hammer2_dev_t *hmp;
        hammer2_dev_t *force_local;
        hammer2_key_t key_next;
        hammer2_key_t key_dummy;
        hammer2_key_t lhc;
        struct vnode *devvp;
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_chain_t *chain;
        const hammer2_inode_data_t *ripdata;
        hammer2_blockref_t bref;
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
        size_t done;
        char *dev;
        char *label;
        int ronly = 1;
        int error;
        int i;

        hmp = NULL;
        pmp = NULL;
        dev = NULL;
        label = NULL;
        devvp = NULL;

        if (path == NULL) {
                /*
                 * Root mount
                 */
                bzero(&info, sizeof(info));
                info.cluster_fd = -1;
                ksnprintf(devstr, sizeof(devstr), "%s",
                          mp->mnt_stat.f_mntfromname);
                kprintf("hammer2_mount: root '%s'\n", devstr);
                done = strlen(devstr) + 1;
        } else {
                /*
                 * Non-root mount or updating a mount
                 */
                error = copyin(data, &info, sizeof(info));
                if (error)
                        return (error);

                error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
                if (error)
                        return (error);
                kprintf("hammer2_mount: '%s'\n", devstr);
        }

        /*
         * Extract device and label, automatically mount @BOOT, @ROOT, or @DATA
         * if no label specified, based on the partition id.  Error out if no
         * label or device (with partition id) is specified.  This is strictly
         * a convenience to match the default label created by newfs_hammer2;
         * our preference is that a label always be specified.
         *
         * NOTE: We allow 'mount @LABEL <blah>'... that is, a mount command
         *       that does not specify a device, as long as some H2 label
         *       has already been mounted from that device.  This makes
         *       mounting snapshots a lot easier.
         */
        dev = devstr;
        label = strchr(devstr, '@');
        if (label && ((label + 1) - dev) > done) {
                kprintf("hammer2: mount: bad label %s/%zd\n",
                        devstr, done);
                return (EINVAL);
        }
        if (label == NULL || label[1] == 0) {
                char slice;

                if (label == NULL)
                        label = devstr + strlen(devstr);
                else
                        *label = '\0';          /* clean up trailing @ */

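                /*
                 * The last character of the device name is the
                 * partition id; derive the default label from it.
                 */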
                slice = label[-1];
                switch(slice) {
                case 'a':
                        label = "BOOT";
                        break;
                case 'd':
                        label = "ROOT";
                        break;
                default:
                        label = "DATA";
                        break;
                }
        } else {
                *label = '\0';
                label++;
        }

        kprintf("hammer2_mount: dev=\"%s\" label=\"%s\" rdonly=%d\n",
                dev, label, (mp->mnt_flag & MNT_RDONLY));

        if (mp->mnt_flag & MNT_UPDATE) {
                /*
                 * Update mount.  Note that pmp->iroot->cluster is
                 * an inode-embedded cluster and thus cannot be
                 * directly locked.
                 *
                 * XXX HAMMER2 needs to implement NFS export via
                 *     mountctl.
                 */
                hammer2_cluster_t *cluster;

                pmp = MPTOPMP(mp);
                pmp->hflags = info.hflags;
                cluster = &pmp->iroot->cluster;
                for (i = 0; i < cluster->nchains; ++i) {
                        if (cluster->array[i].chain == NULL)
                                continue;
                        hmp = cluster->array[i].chain->hmp;
                        devvp = hmp->devvp;
                        error = hammer2_remount(hmp, mp, path,
                                                devvp, cred);
                        if (error)
                                break;
                }

                return error;
        }

        /*
         * HMP device mount
         *
         * If a path is specified and dev is not an empty string, lookup the
         * name and verify that it refers to a block device.
         *
         * If a path is specified and dev is an empty string we fall through
         * and locate the label in the hmp search.
         */
        if (path && *dev != 0) {
                error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
                if (error == 0)
                        error = nlookup(&nd);
                if (error == 0)
                        error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
                nlookup_done(&nd);
        } else if (path == NULL) {
                /* root mount */
                cdev_t cdev = kgetdiskbyname(dev);
                error = bdevvp(cdev, &devvp);
                if (error)
                        kprintf("hammer2: cannot find '%s'\n", dev);
        } else {
                /*
                 * We will locate the hmp using the label in the hmp loop.
                 */
                error = 0;
        }

        /*
         * Make sure it's a block device.  Do not check to see if it is
         * already mounted until we determine that it's a fresh H2 device.
         */
        if (error == 0 && devvp) {
                vn_isdisk(devvp, &error);
        }

        /*
         * Determine if the device has already been mounted.  After this
         * check hmp will be non-NULL if we are doing the second or more
         * hammer2 mounts from the same device.
         */
        lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
        if (devvp) {
                /*
                 * Match the device.  Due to the way devfs works,
                 * we may not be able to directly match the vnode pointer,
                 * so also check to see if the underlying device matches.
                 */
                TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                        if (hmp->devvp == devvp)
                                break;
                        if (devvp->v_rdev &&
                            hmp->devvp->v_rdev == devvp->v_rdev) {
                                break;
                        }
                }

                /*
                 * If no match this may be a fresh H2 mount, make sure
                 * the device is not mounted on anything else.
                 */
                if (hmp == NULL)
                        error = vfs_mountedon(devvp);
        } else if (error == 0) {
                /*
                 * Match the label to a pmp already probed.
                 */
                TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                        for (i = 0; i < HAMMER2_MAXCLUSTER; ++i) {
                                if (pmp->pfs_names[i] &&
                                    strcmp(pmp->pfs_names[i], label) == 0) {
                                        hmp = pmp->pfs_hmps[i];
                                        break;
                                }
                        }
                        if (hmp)
                                break;
                }
                if (hmp == NULL)
                        error = ENOENT;
        }

        /*
         * Open the device if this isn't a secondary mount and construct
         * the H2 device mount (hmp).
         */
        if (hmp == NULL) {
                hammer2_chain_t *schain;
                hammer2_xid_t xid;
                hammer2_xop_head_t xop;

                if (error == 0 && vcount(devvp) > 0) {
                        kprintf("Primary device already has references\n");
                        error = EBUSY;
                }

                /*
                 * Now open the device
                 */
                if (error == 0) {
                        ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
                        vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                        error = vinvalbuf(devvp, V_SAVE, 0, 0);
                        if (error == 0) {
                                error = VOP_OPEN(devvp,
                                             (ronly ? FREAD : FREAD | FWRITE),
                                             FSCRED, NULL);
                        }
                        vn_unlock(devvp);
                }
                if (error && devvp) {
                        vrele(devvp);
                        devvp = NULL;
                }
                if (error) {
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        return error;
                }
                hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
                ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev);
                hmp->ronly = ronly;
                hmp->devvp = devvp;
                hmp->hflags = info.hflags & HMNT2_DEVFLAGS;
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);
                spin_init(&hmp->io_spin, "hm2mount_io");
                spin_init(&hmp->list_spin, "hm2mount_list");

                lockinit(&hmp->vollk, "h2vol", 0, 0);
                lockinit(&hmp->bulklk, "h2bulk", 0, 0);
                lockinit(&hmp->bflock, "h2bflk", 0, 0);

                /*
                 * vchain setup. vchain.data is embedded.
                 * vchain.refs is initialized and will never drop to 0.
                 *
                 * NOTE! voldata is not yet loaded.
                 */
                hmp->vchain.hmp = hmp;
                hmp->vchain.refs = 1;
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;

                hammer2_chain_core_init(&hmp->vchain);
                /* hmp->vchain.u.xxx is left NULL */

                /*
                 * fchain setup.  fchain.data is embedded.
                 * fchain.refs is initialized and will never drop to 0.
                 *
                 * The data is not used but needs to be initialized to
                 * pass assertion muster.  We use this chain primarily
                 * as a placeholder for the freemap's top-level RBTREE
                 * so it does not interfere with the volume's topology
                 * RBTREE.
                 */
                hmp->fchain.hmp = hmp;
                hmp->fchain.refs = 1;
                hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
                hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
                hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

                hammer2_chain_core_init(&hmp->fchain);
                /* hmp->fchain.u.xxx is left NULL */

                /*
                 * Install the volume header and initialize fields from
                 * voldata.
                 */
                error = hammer2_install_volume_header(hmp);
                if (error) {
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return error;
                }

                /*
                 * Really important to get these right or flush will get
                 * confused.
                 */
                hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0, NULL);
                spmp = hmp->spmp;

                /*
                 * Dummy-up vchain and fchain's modify_tid.  mirror_tid
                 * is inherited from the volume header.
                 */
                xid = 0;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
                hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
                hmp->vchain.pmp = spmp;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
                hmp->fchain.pmp = spmp;

                /*
                 * First locate the super-root inode, which is key 0
                 * relative to the volume header's blockset.
                 *
                 * Then locate the root inode by scanning the directory keyspace
                 * represented by the label.
                 */
                parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
                schain = hammer2_chain_lookup(&parent, &key_dummy,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
                                      &error, 0);
                hammer2_chain_lookup_done(parent);
                if (schain == NULL) {
                        kprintf("hammer2_mount: invalid super-root\n");
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }
                if (schain->error) {
                        kprintf("hammer2_mount: error %s reading super-root\n",
                                hammer2_error_str(schain->error));
                        hammer2_chain_unlock(schain);
                        hammer2_chain_drop(schain);
                        schain = NULL;
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }

                /*
                 * The super-root always uses an inode_tid of 1 when
                 * creating PFSs.
                 */
                spmp->inode_tid = 1;
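                /*
                 * Front-end transactions continue from the last
                 * modify_tid recorded in the super-root.
                 */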
                spmp->modify_tid = schain->bref.modify_tid + 1;

                /*
                 * Sanity-check schain's pmp and finish initialization.
                 * Any chain belonging to the super-root topology should
                 * have a NULL pmp (not even set to spmp).
                 */
                ripdata = &hammer2_chain_rdata(schain)->ipdata;
                KKASSERT(schain->pmp == NULL);
                spmp->pfs_clid = ripdata->meta.pfs_clid;

                /*
                 * Replace the dummy spmp->iroot with a real one.  It's
                 * easier to just do a wholesale replacement than to try
                 * to update the chain and fixup the iroot fields.
                 *
                 * The returned inode is locked with the supplied cluster.
                 */
                hammer2_dummy_xop_from_chain(&xop, schain);
                hammer2_inode_drop(spmp->iroot);
                spmp->iroot = NULL;
                spmp->iroot = hammer2_inode_get(spmp, &xop, -1, -1);
                spmp->spmp_hmp = hmp;
                spmp->pfs_types[0] = ripdata->meta.pfs_type;
                spmp->pfs_hmps[0] = hmp;
                hammer2_inode_ref(spmp->iroot);
                hammer2_inode_unlock(spmp->iroot);
                hammer2_cluster_unlock(&xop.cluster);
                hammer2_chain_drop(schain);
                /* do not call hammer2_cluster_drop() on an embedded cluster */
                schain = NULL;  /* now invalid */
                /* leave spmp->iroot with one ref */

                if ((mp->mnt_flag & MNT_RDONLY) == 0) {
                        error = hammer2_recovery(hmp);
                        if (error == 0)
                                error |= hammer2_fixup_pfses(hmp);
                        /* XXX do something with error */
                }
                hammer2_update_pmps(hmp);
                hammer2_iocom_init(hmp);
                hammer2_bulkfree_init(hmp);

                /*
                 * Ref the cluster management messaging descriptor.  The mount
                 * program deals with the other end of the communications pipe.
                 *
                 * Root mounts typically do not supply one.
                 */
                if (info.cluster_fd >= 0) {
                        fp = holdfp(curthread, info.cluster_fd, -1);
                        if (fp) {
                                hammer2_cluster_reconnect(hmp, fp);
                        } else {
                                kprintf("hammer2_mount: bad cluster_fd!\n");
                        }
                }
        } else {
                spmp = hmp->spmp;
                if (info.hflags & HMNT2_DEVFLAGS) {
                        kprintf("hammer2: Warning: mount flags pertaining "
                                "to the whole device may only be specified "
                                "on the first mount of the device: %08x\n",
                                info.hflags & HMNT2_DEVFLAGS);
                }
        }

        /*
         * Force local mount (disassociate all PFSs from their clusters).
         * Used primarily for debugging.
         */
        force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;

        /*
         * Lookup the mount point under the media-localized super-root.
         * Scanning hammer2_pfslist doesn't help us because it represents
         * PFS cluster ids which can aggregate several named PFSs together.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        hammer2_inode_lock(spmp->iroot, 0);
        parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
        lhc = hammer2_dirhash(label, strlen(label));
        chain = hammer2_chain_lookup(&parent, &key_next,
                                     lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                     &error, 0);
        while (chain) {
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
                    strcmp(label, chain->data->ipdata.filename) == 0) {
                        break;
1383                 }
1384                 chain = hammer2_chain_next(&parent, chain, &key_next,
1385                                             key_next,
1386                                             lhc + HAMMER2_DIRHASH_LOMASK,
1387                                             &error, 0);
1388         }
1389         if (parent) {
1390                 hammer2_chain_unlock(parent);
1391                 hammer2_chain_drop(parent);
1392         }
1393         hammer2_inode_unlock(spmp->iroot);
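        /*
         * NOTE: hammer2_dirhash() only determines the upper bits of the
         * directory key, so several names can land in the same
         * [lhc, lhc + HAMMER2_DIRHASH_LOMASK] range; the scan above
         * compares the filename on each candidate rather than trusting
         * the hash alone.
         */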
1394
1395         /*
1396          * PFS could not be found?
1397          */
1398         if (chain == NULL) {
1399                 if (error)
1400                         kprintf("hammer2_mount: PFS label I/O error\n");
1401                 else
1402                         kprintf("hammer2_mount: PFS label not found\n");
1403                 hammer2_unmount_helper(mp, NULL, hmp);
1404                 lockmgr(&hammer2_mntlk, LK_RELEASE);
1405                 hammer2_vfs_unmount(mp, MNT_FORCE);
1406
1407                 return EINVAL;
1408         }
1409
1410         /*
1411          * Acquire the pmp structure (it should have already been allocated
1412          * via hammer2_update_pmps() so do not pass cluster in to add to
1413          * available chains).
1414          *
1415          * Check if the cluster has already been mounted.  A cluster can
1416          * only be mounted once; use null mounts to mount additional copies.
1417          */
1418         if (chain->error) {
1419                 kprintf("hammer2_mount: PFS label I/O error\n");
1420         } else {
1421                 ripdata = &chain->data->ipdata;
1422                 bref = chain->bref;
1423                 pmp = hammer2_pfsalloc(NULL, ripdata,
1424                                        bref.modify_tid, force_local);
1425         }
1426         hammer2_chain_unlock(chain);
1427         hammer2_chain_drop(chain);
1428
1429         /*
1430          * Finish the mount
1431          */
1432         kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp);
1433
1434         if (pmp->mp) {
1435                 kprintf("hammer2_mount: PFS already mounted!\n");
1436                 hammer2_unmount_helper(mp, NULL, hmp);
1437                 lockmgr(&hammer2_mntlk, LK_RELEASE);
1438                 hammer2_vfs_unmount(mp, MNT_FORCE);
1439
1440                 return EBUSY;
1441         }
1442
1443         pmp->hflags = info.hflags;
1444         mp->mnt_flag |= MNT_LOCAL;
1445         mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
1446         mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */
1447
1448         /*
1449          * required mount structure initializations
1450          */
1451         mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
1452         mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;
1453
1454         mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
1455         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1456  
1457         /*
1458          * Optional fields
1459          */
1460         mp->mnt_iosize_max = MAXPHYS;
1461
1462         /*
1463          * Connect up mount pointers.
1464          */
1465         hammer2_mount_helper(mp, pmp);
1466
1467         lockmgr(&hammer2_mntlk, LK_RELEASE);
1468
1469         /*
1470          * Finish setup
1471          */
1472         vfs_getnewfsid(mp);
1473         vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
1474         vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
1475         vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);
1476
1477         if (path) {
1478                 copyinstr(info.volume, mp->mnt_stat.f_mntfromname,
1479                           MNAMELEN - 1, &size);
1480                 bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
1481         } /* else root mount, already in there */
1482
1483         bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
1484         if (path) {
1485                 copyinstr(path, mp->mnt_stat.f_mntonname,
1486                           sizeof(mp->mnt_stat.f_mntonname) - 1,
1487                           &size);
1488         } else {
1489                 /* root mount */
1490                 mp->mnt_stat.f_mntonname[0] = '/';
1491         }
1492
1493         /*
1494          * Initial statfs to prime mnt_stat.
1495          */
1496         hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
1497
1498         return 0;
1499 }
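
/*
 * Illustrative usage (device and label names are hypothetical): the first
 * mount of a device performs the super-root scan above; later mounts of
 * other PFSs on the same device take the shorter spmp-reuse path, e.g.
 *
 *	mount_hammer2 /dev/ad0s1d@LOCAL /mnt
 *	mount_hammer2 /dev/ad0s1d@DATA  /mnt/data
 */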
1500
1501 /*
1502  * Scan PFSs under the super-root and create hammer2_pfs structures.
1503  */
1504 static
1505 void
1506 hammer2_update_pmps(hammer2_dev_t *hmp)
1507 {
1508         const hammer2_inode_data_t *ripdata;
1509         hammer2_chain_t *parent;
1510         hammer2_chain_t *chain;
1511         hammer2_blockref_t bref;
1512         hammer2_dev_t *force_local;
1513         hammer2_pfs_t *spmp;
1514         hammer2_pfs_t *pmp;
1515         hammer2_key_t key_next;
1516         int error;
1517
1518         /*
1519          * Force local mount (disassociate all PFSs from their clusters).
1520          * Used primarily for debugging.
1521          */
1522         force_local = (hmp->hflags & HMNT2_LOCAL) ? hmp : NULL;
1523
1524         /*
1525          * Lookup mount point under the media-localized super-root.
1526          *
1527          * cluster->pmp will incorrectly point to spmp and must be fixed
1528          * up later on.
1529          */
1530         spmp = hmp->spmp;
1531         hammer2_inode_lock(spmp->iroot, 0);
1532         parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
1533         chain = hammer2_chain_lookup(&parent, &key_next,
1534                                          HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
1535                                          &error, 0);
1536         while (chain) {
1537                 if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
1538                         /* skip; must still advance via chain_next below */
1539                 } else if (chain->error) {
1540                         kprintf("I/O error scanning PFS labels\n");
1541                 } else {
1542                         ripdata = &chain->data->ipdata;
1543                         bref = chain->bref;
1544
1545                         pmp = hammer2_pfsalloc(chain, ripdata,
1546                                                bref.modify_tid, force_local);
1547                 }
1548                 chain = hammer2_chain_next(&parent, chain, &key_next,
1549                                            key_next, HAMMER2_KEY_MAX,
1550                                            &error, 0);
1551         }
1552         if (parent) {
1553                 hammer2_chain_unlock(parent);
1554                 hammer2_chain_drop(parent);
1555         }
1556         hammer2_inode_unlock(spmp->iroot);
1557 }
1558
1559 static
1560 int
1561 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path __unused,
1562                 struct vnode *devvp, struct ucred *cred)
1563 {
1564         int error;
1565
1566         if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
1567                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1568                 VOP_OPEN(devvp, FREAD | FWRITE, FSCRED, NULL);
1569                 vn_unlock(devvp);
1570                 error = hammer2_recovery(hmp);
1571                 if (error == 0)
1572                         error |= hammer2_fixup_pfses(hmp);
1573                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1574                 if (error == 0) {
1575                         VOP_CLOSE(devvp, FREAD, NULL);
1576                         hmp->ronly = 0;
1577                 } else {
1578                         VOP_CLOSE(devvp, FREAD | FWRITE, NULL);
1579                 }
1580                 vn_unlock(devvp);
1581         } else {
1582                 error = 0;
1583         }
1584         return error;
1585 }
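
/*
 * Illustrative usage: the upgrade path above services a read-write
 * remount such as "mount -u -o rw <mountpt>".  The device is opened
 * FREAD|FWRITE first, then recovery and the PFSROOT fixup run; on
 * failure the read-write open is backed out and the mount stays
 * read-only.
 */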
1586
1587 static
1588 int
1589 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1590 {
1591         hammer2_pfs_t *pmp;
1592         int flags;
1593         int error = 0;
1594
1595         pmp = MPTOPMP(mp);
1596
1597         if (pmp == NULL)
1598                 return(0);
1599
1600         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1601
1602         /*
1603          * If mount initialization proceeded far enough we must flush
1604          * its vnodes and sync the underlying mount points.  Three syncs
1605          * are required to fully flush the filesystem: freemap updates lag
1606          * by one flush, and one extra sync is performed for safety.
1607          */
1608         if (mntflags & MNT_FORCE)
1609                 flags = FORCECLOSE;
1610         else
1611                 flags = 0;
1612         if (pmp->iroot) {
1613                 error = vflush(mp, 0, flags);
1614                 if (error)
1615                         goto failed;
1616                 hammer2_vfs_sync(mp, MNT_WAIT);
1617                 hammer2_vfs_sync(mp, MNT_WAIT);
1618                 hammer2_vfs_sync(mp, MNT_WAIT);
1619         }
1620
1621         /*
1622          * Cleanup the frontend support XOPS threads
1623          */
1624         hammer2_xop_helper_cleanup(pmp);
1625
1626         if (pmp->mp)
1627                 hammer2_unmount_helper(mp, pmp, NULL);
1628
1629         error = 0;
1630 failed:
1631         lockmgr(&hammer2_mntlk, LK_RELEASE);
1632
1633         return (error);
1634 }
1635
1636 /*
1637  * Mount helper, hook the system mount into our PFS.
1638  * The mount lock is held.
1639  *
1640  * We must bump the mount_count on related devices for any
1641  * mounted PFSs.
1642  */
1643 static
1644 void
1645 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
1646 {
1647         hammer2_cluster_t *cluster;
1648         hammer2_chain_t *rchain;
1649         int i;
1650
1651         mp->mnt_data = (qaddr_t)pmp;
1652         pmp->mp = mp;
1653
1654         /*
1655          * After pmp->mp is set we have to adjust hmp->mount_count.
1656          */
1657         cluster = &pmp->iroot->cluster;
1658         for (i = 0; i < cluster->nchains; ++i) {
1659                 rchain = cluster->array[i].chain;
1660                 if (rchain == NULL)
1661                         continue;
1662                 ++rchain->hmp->mount_count;
1663         }
1664
1665         /*
1666          * Create missing Xop threads
1667          */
1668         hammer2_xop_helper_create(pmp);
1669 }
1670
1671 /*
1672  * Mount helper, unhook the system mount from our PFS.
1673  * The mount lock is held.
1674  *
1675  * If hmp is supplied, the mount responsible for being the first to open
1676  * the block device failed, and the block device and all PFSs using the
1677  * block device must be cleaned up.
1678  *
1679  * If pmp is supplied, multiple devices might be backing the PFS and each
1680  * must be disconnected.  This might not be the last PFS using some of the
1681  * underlying devices.  Also, we have to adjust our hmp->mount_count
1682  * accounting for the devices backing the pmp which is now undergoing an
1683  * unmount.
1684  */
1685 static
1686 void
1687 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
1688 {
1689         hammer2_cluster_t *cluster;
1690         hammer2_chain_t *rchain;
1691         struct vnode *devvp;
1692         int dumpcnt;
1693         int ronly;
1694         int i;
1695
1696         /*
1697          * If no device is supplied this is a high-level unmount and we have
1698          * to disconnect the mount, adjust mount_count, and locate devices
1699          * that might now have no mounts.
1700          */
1701         if (pmp) {
1702                 KKASSERT(hmp == NULL);
1703                 KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
1704                 pmp->mp = NULL;
1705                 mp->mnt_data = NULL;
1706
1707                 /*
1708                  * After pmp->mp is cleared we have to account for
1709                  * mount_count.
1710                  */
1711                 cluster = &pmp->iroot->cluster;
1712                 for (i = 0; i < cluster->nchains; ++i) {
1713                         rchain = cluster->array[i].chain;
1714                         if (rchain == NULL)
1715                                 continue;
1716                         --rchain->hmp->mount_count;
1717                         /* scrapping hmp now may invalidate the pmp */
1718                 }
1719 again:
1720                 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
1721                         if (hmp->mount_count == 0) {
1722                                 hammer2_unmount_helper(NULL, NULL, hmp);
1723                                 goto again;
1724                         }
1725                 }
1726                 return;
1727         }
1728
1729         /*
1730          * Try to terminate the block device.  We can't terminate it if
1731          * there are still PFSs referencing it.
1732          */
1733         if (hmp->mount_count)
1734                 return;
1735
1736         /*
1737          * Decommission the network before we start messing with the
1738          * device and PFS.
1739          */
1740         hammer2_iocom_uninit(hmp);
1741
1742         hammer2_bulkfree_uninit(hmp);
1743         hammer2_pfsfree_scan(hmp, 0);
1744 #if 0
1745         hammer2_dev_exlock(hmp);        /* XXX order */
1746 #endif
1747
1748         /*
1749          * more).  To ensure everything is out we need to flush at least
1750          * three times: (1) running the sideq can dirty the filesystem,
1751          * (2) a normal flush can dirty the freemap, and (3) a final
1752          * flush ensures the freemap itself is fully synchronized.
1753          * (3) ensure that the freemap is fully synchronized.
1754          *
1755          * The next mount's recovery scan can clean everything up but we want
1756          * to leave the filesystem in a 100% clean state on a normal unmount.
1757          */
1758 #if 0
1759         hammer2_voldata_lock(hmp);
1760         hammer2_voldata_unlock(hmp);
1761 #endif
1762
1763         /*
1764          * Flush whatever is left.  Unmounted but modified PFS's might still
1765          * have some dirty chains on them.
1766          */
1767         hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1768         hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1769
1770         if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1771                 hammer2_voldata_modify(hmp);
1772                 hammer2_flush(&hmp->fchain, HAMMER2_FLUSH_TOP |
1773                                             HAMMER2_FLUSH_ALL);
1774         }
1775         hammer2_chain_unlock(&hmp->fchain);
1776
1777         if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
1778                 hammer2_flush(&hmp->vchain, HAMMER2_FLUSH_TOP |
1779                                             HAMMER2_FLUSH_ALL);
1780         }
1781         hammer2_chain_unlock(&hmp->vchain);
1782
1783         if ((hmp->vchain.flags | hmp->fchain.flags) &
1784             HAMMER2_CHAIN_FLUSH_MASK) {
1785                 kprintf("hammer2_unmount: chains left over "
1786                         "after final sync\n");
1787                 kprintf("    vchain %08x\n", hmp->vchain.flags);
1788                 kprintf("    fchain %08x\n", hmp->fchain.flags);
1789
1790                 if (hammer2_debug & 0x0010)
1791                         Debugger("entered debugger");
1792         }
1793
1794         hammer2_pfsfree_scan(hmp, 1);
1795
1796         KKASSERT(hmp->spmp == NULL);
1797
1798         /*
1799          * Finish up with the device vnode
1800          */
1801         if ((devvp = hmp->devvp) != NULL) {
1802                 ronly = hmp->ronly;
1803                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1804                 kprintf("hammer2_unmount(A): devvp %s rbdirty %p ronly=%d\n",
1805                         hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree),
1806                         ronly);
1807                 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
1808                 kprintf("hammer2_unmount(B): devvp %s rbdirty %p\n",
1809                         hmp->devrepname, RB_ROOT(&devvp->v_rbdirty_tree));
1810                 hmp->devvp = NULL;
1811                 VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
1812                 vn_unlock(devvp);
1813                 vrele(devvp);
1814                 devvp = NULL;
1815         }
1816
1817         /*
1818          * Clear vchain/fchain flags that might prevent final cleanup
1819          * of these chains.
1820          */
1821         if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
1822                 atomic_add_long(&hammer2_count_modified_chains, -1);
1823                 atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED);
1824                 hammer2_pfs_memory_wakeup(hmp->vchain.pmp);
1825         }
1826         if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
1827                 atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_UPDATE);
1828         }
1829
1830         if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
1831                 atomic_add_long(&hammer2_count_modified_chains, -1);
1832                 atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_MODIFIED);
1833                 hammer2_pfs_memory_wakeup(hmp->fchain.pmp);
1834         }
1835         if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
1836                 atomic_clear_int(&hmp->fchain.flags, HAMMER2_CHAIN_UPDATE);
1837         }
1838
1839         /*
1840          * Final drop of embedded freemap root chain to
1841          * clean up fchain.core (fchain structure is not
1842          * flagged ALLOCATED so it is cleaned out and then
1843          * left to rot).
1844          */
1845         hammer2_chain_drop(&hmp->fchain);
1846
1847         /*
1848          * Final drop of embedded volume root chain to clean
1849          * up vchain.core (vchain structure is not flagged
1850          * ALLOCATED so it is cleaned out and then left to
1851          * rot).
1852          */
1853         dumpcnt = 50;
1854         hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1);
1855         dumpcnt = 50;
1856         hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f', (u_int)-1);
1857 #if 0
1858         hammer2_dev_unlock(hmp);
1859 #endif
1860         hammer2_chain_drop(&hmp->vchain);
1861
1862         hammer2_io_cleanup(hmp, &hmp->iotree);
1863         if (hmp->iofree_count) {
1864                 kprintf("io_cleanup: %d I/O's left hanging\n",
1865                         hmp->iofree_count);
1866         }
1867
1868         TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
1869         kmalloc_destroy(&hmp->mchain);
1870         kfree(hmp, M_HAMMER2);
1871 }
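
/*
 * NOTE: The teardown order above is deliberate: iocom (network) first,
 * then bulkfree, the first PFS free scan, the final fchain/vchain
 * flushes, the second PFS free scan, the device close, and last the
 * drops of the embedded fchain/vchain and the I/O tree cleanup.
 */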
1872
1873 int
1874 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
1875                  ino_t ino, struct vnode **vpp)
1876 {
1877         hammer2_xop_lookup_t *xop;
1878         hammer2_pfs_t *pmp;
1879         hammer2_inode_t *ip;
1880         hammer2_tid_t inum;
1881         int error;
1882
1883         inum = (hammer2_tid_t)ino & HAMMER2_DIRHASH_USERMSK;
1884
1885         error = 0;
1886         pmp = MPTOPMP(mp);
1887
1888         /*
1889          * Easy if we already have it cached
1890          */
1891         ip = hammer2_inode_lookup(pmp, inum);
1892         if (ip) {
1893                 hammer2_inode_lock(ip, HAMMER2_RESOLVE_SHARED);
1894                 *vpp = hammer2_igetv(ip, &error);
1895                 hammer2_inode_unlock(ip);
1896                 hammer2_inode_drop(ip);         /* from lookup */
1897
1898                 return error;
1899         }
1900
1901         /*
1902          * Otherwise we have to find the inode
1903          */
1904         xop = hammer2_xop_alloc(pmp->iroot, 0);
1905         xop->lhc = inum;
1906         hammer2_xop_start(&xop->head, &hammer2_lookup_desc);
1907         error = hammer2_xop_collect(&xop->head, 0);
1908
1909         if (error == 0)
1910                 ip = hammer2_inode_get(pmp, &xop->head, -1, -1);
1911         hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1912
1913         if (ip) {
1914                 *vpp = hammer2_igetv(ip, &error);
1915                 hammer2_inode_unlock(ip);
1916         } else {
1917                 *vpp = NULL;
1918                 error = ENOENT;
1919         }
1920         return (error);
1921 }
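
/*
 * Illustrative caller sketch (an assumption for exposition, not part of
 * the tree): roughly how an inode number is resolved to a vnode, e.g.
 * when reconstituting an NFS file handle.
 */
#if 0
        struct vnode *vp;
        int error;

        error = hammer2_vfs_vget(mp, NULL, ino, &vp);
        if (error == 0) {
                /* vp is returned referenced and locked; release when done */
                vput(vp);
        }
#endif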
1922
1923 static
1924 int
1925 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
1926 {
1927         hammer2_pfs_t *pmp;
1928         struct vnode *vp;
1929         int error;
1930
1931         pmp = MPTOPMP(mp);
1932         if (pmp->iroot == NULL) {
1933                 kprintf("hammer2 (%s): no root inode\n",
1934                         mp->mnt_stat.f_mntfromname);
1935                 *vpp = NULL;
1936                 return EINVAL;
1937         }
1938
1939         error = 0;
1940         hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED);
1941
1942         while (pmp->inode_tid == 0) {
1943                 hammer2_xop_ipcluster_t *xop;
1944                 const hammer2_inode_meta_t *meta;
1945
1946                 xop = hammer2_xop_alloc(pmp->iroot, HAMMER2_XOP_MODIFYING);
1947                 hammer2_xop_start(&xop->head, &hammer2_ipcluster_desc);
1948                 error = hammer2_xop_collect(&xop->head, 0);
1949
1950                 if (error == 0) {
1951                         meta = &hammer2_xop_gdata(&xop->head)->ipdata.meta;
1952                         pmp->iroot->meta = *meta;
1953                         pmp->inode_tid = meta->pfs_inum + 1;
1954                         hammer2_xop_pdata(&xop->head);
1955                         /* meta invalid */
1956
1957                         if (pmp->inode_tid < HAMMER2_INODE_START)
1958                                 pmp->inode_tid = HAMMER2_INODE_START;
1959                         pmp->modify_tid =
1960                                 xop->head.cluster.focus->bref.modify_tid + 1;
1961 #if 0
1962                         kprintf("PFS: Starting inode %jd\n",
1963                                 (intmax_t)pmp->inode_tid);
1964                         kprintf("PMP focus good set nextino=%ld mod=%016jx\n",
1965                                 pmp->inode_tid, pmp->modify_tid);
1966 #endif
1967                         wakeup(&pmp->iroot);
1968
1969                         hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1970
1971                         /*
1972                          * Prime the mount info.
1973                          */
1974                         hammer2_vfs_statfs(mp, &mp->mnt_stat, NULL);
1975                         break;
1976                 }
1977
1978                 /*
1979                  * Loop, try again
1980                  */
1981                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
1982                 hammer2_inode_unlock(pmp->iroot);
1983                 error = tsleep(&pmp->iroot, PCATCH, "h2root", hz);
1984                 hammer2_inode_lock(pmp->iroot, HAMMER2_RESOLVE_SHARED);
1985                 if (error == EINTR)
1986                         break;
1987         }
1988
1989         if (error) {
1990                 hammer2_inode_unlock(pmp->iroot);
1991                 *vpp = NULL;
1992         } else {
1993                 vp = hammer2_igetv(pmp->iroot, &error);
1994                 hammer2_inode_unlock(pmp->iroot);
1995                 *vpp = vp;
1996         }
1997
1998         return (error);
1999 }
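
/*
 * NOTE: The retry loop above parks a mount whose cluster cannot yet
 * supply the root inode meta-data in a catchable 1-second tsleep
 * ("h2root") rather than failing hard; a signal aborts it with EINTR.
 */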
2000
2001 /*
2002  * Filesystem status
2003  *
2004  * XXX incorporate ipdata->meta.inode_quota and data_quota
2005  */
2006 static
2007 int
2008 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
2009 {
2010         hammer2_pfs_t *pmp;
2011         hammer2_dev_t *hmp;
2012         hammer2_blockref_t bref;
2013         struct statfs tmp;
2014         int i;
2015
2016         /*
2017          * NOTE: iroot might not have validated the cluster yet.
2018          */
2019         pmp = MPTOPMP(mp);
2020
2021         bzero(&tmp, sizeof(tmp));
2022
2023         for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
2024                 hmp = pmp->pfs_hmps[i];
2025                 if (hmp == NULL)
2026                         continue;
2027                 if (pmp->iroot->cluster.array[i].chain)
2028                         bref = pmp->iroot->cluster.array[i].chain->bref;
2029                 else
2030                         bzero(&bref, sizeof(bref));
2031
2032                 tmp.f_files = bref.embed.stats.inode_count;
2033                 tmp.f_ffree = 0;
2034                 tmp.f_blocks = hmp->voldata.allocator_size /
2035                                mp->mnt_vstat.f_bsize;
2036                 tmp.f_bfree = hmp->voldata.allocator_free /
2037                               mp->mnt_vstat.f_bsize;
2038                 tmp.f_bavail = tmp.f_bfree;
2039
2040                 if (cred && cred->cr_uid != 0) {
2041                         uint64_t adj;
2042
2043                         /* 5% */
2044                         adj = hmp->free_reserved / mp->mnt_vstat.f_bsize;
2045                         tmp.f_blocks -= adj;
2046                         tmp.f_bfree -= adj;
2047                         tmp.f_bavail -= adj;
2048                 }
2049
2050                 mp->mnt_stat.f_blocks = tmp.f_blocks;
2051                 mp->mnt_stat.f_bfree = tmp.f_bfree;
2052                 mp->mnt_stat.f_bavail = tmp.f_bavail;
2053                 mp->mnt_stat.f_files = tmp.f_files;
2054                 mp->mnt_stat.f_ffree = tmp.f_ffree;
2055
2056                 *sbp = mp->mnt_stat;
2057         }
2058         return (0);
2059 }
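
/*
 * Worked example (illustrative numbers): with f_bsize = HAMMER2_PBUFSIZE
 * (64KiB), a free_reserved of 4GiB withholds 4GiB / 64KiB = 65536 blocks
 * from f_blocks/f_bfree/f_bavail for non-root callers, implementing the
 * ~5% reserve noted in the loop above.
 */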
2060
2061 static
2062 int
2063 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
2064 {
2065         hammer2_pfs_t *pmp;
2066         hammer2_dev_t *hmp;
2067         hammer2_blockref_t bref;
2068         struct statvfs tmp;
2069         int i;
2070
2071         /*
2072          * NOTE: iroot might not have validated the cluster yet.
2073          */
2074         pmp = MPTOPMP(mp);
2075         bzero(&tmp, sizeof(tmp));
2076
2077         for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
2078                 hmp = pmp->pfs_hmps[i];
2079                 if (hmp == NULL)
2080                         continue;
2081                 if (pmp->iroot->cluster.array[i].chain)
2082                         bref = pmp->iroot->cluster.array[i].chain->bref;
2083                 else
2084                         bzero(&bref, sizeof(bref));
2085
2086                 tmp.f_files = bref.embed.stats.inode_count;
2087                 tmp.f_ffree = 0;
2088                 tmp.f_blocks = hmp->voldata.allocator_size /
2089                                mp->mnt_vstat.f_bsize;
2090                 tmp.f_bfree = hmp->voldata.allocator_free /
2091                               mp->mnt_vstat.f_bsize;
2092                 tmp.f_bavail = tmp.f_bfree;
2093
2094                 if (cred && cred->cr_uid != 0) {
2095                         uint64_t adj;
2096
2097                         /* 5% */
2098                         adj = hmp->free_reserved / mp->mnt_vstat.f_bsize;
2099                         tmp.f_blocks -= adj;
2100                         tmp.f_bfree -= adj;
2101                         tmp.f_bavail -= adj;
2102                 }
2103
2104                 mp->mnt_vstat.f_blocks = tmp.f_blocks;
2105                 mp->mnt_vstat.f_bfree = tmp.f_bfree;
2106                 mp->mnt_vstat.f_bavail = tmp.f_bavail;
2107                 mp->mnt_vstat.f_files = tmp.f_files;
2108                 mp->mnt_vstat.f_ffree = tmp.f_ffree;
2109
2110                 *sbp = mp->mnt_vstat;
2111         }
2112         return (0);
2113 }
2114
2115 /*
2116  * Mount-time recovery (RW mounts)
2117  *
2118  * Updates to the free block table are allowed to lag flushes by one
2119  * transaction.  After a crash, the next fresh mount must do an
2120  * incremental scan from the last committed transaction id and make sure
2121  * that all related blocks have been marked allocated.
2122  *
2123  * The super-root topology and each PFS has its own transaction id domain,
2124  * so we must track PFS boundary transitions.
2125  */
2126 struct hammer2_recovery_elm {
2127         TAILQ_ENTRY(hammer2_recovery_elm) entry;
2128         hammer2_chain_t *chain;
2129         hammer2_tid_t sync_tid;
2130 };
2131
2132 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
2133
2134 struct hammer2_recovery_info {
2135         struct hammer2_recovery_list list;
2136         hammer2_tid_t   mtid;
2137         int     depth;
2138 };
2139
2140 static int hammer2_recovery_scan(hammer2_dev_t *hmp,
2141                         hammer2_chain_t *parent,
2142                         struct hammer2_recovery_info *info,
2143                         hammer2_tid_t sync_tid);
2144
2145 #define HAMMER2_RECOVERY_MAXDEPTH       10
2146
2147 static
2148 int
2149 hammer2_recovery(hammer2_dev_t *hmp)
2150 {
2151         struct hammer2_recovery_info info;
2152         struct hammer2_recovery_elm *elm;
2153         hammer2_chain_t *parent;
2154         hammer2_tid_t sync_tid;
2155         hammer2_tid_t mirror_tid;
2156         int error;
2157
2158         hammer2_trans_init(hmp->spmp, 0);
2159
2160         sync_tid = hmp->voldata.freemap_tid;
2161         mirror_tid = hmp->voldata.mirror_tid;
2162
2163         kprintf("hammer2 mount \"%s\": ", hmp->devrepname);
2164         if (sync_tid >= mirror_tid) {
2165                 kprintf(" no recovery needed\n");
2166         } else {
2167                 kprintf(" freemap recovery %016jx-%016jx\n",
2168                         sync_tid + 1, mirror_tid);
2169         }
2170
2171         TAILQ_INIT(&info.list);
2172         info.depth = 0;
2173         parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
2174         error = hammer2_recovery_scan(hmp, parent, &info, sync_tid);
2175         hammer2_chain_lookup_done(parent);
2176
2177         while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
2178                 TAILQ_REMOVE(&info.list, elm, entry);
2179                 parent = elm->chain;
2180                 sync_tid = elm->sync_tid;
2181                 kfree(elm, M_HAMMER2);
2182
2183                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2184                 error |= hammer2_recovery_scan(hmp, parent, &info,
2185                                               hmp->voldata.freemap_tid);
2186                 hammer2_chain_unlock(parent);
2187                 hammer2_chain_drop(parent);     /* drop elm->chain ref */
2188         }
2189
2190         hammer2_trans_done(hmp->spmp, 0);
2191
2192         return error;
2193 }
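
/*
 * Worked example (hypothetical TIDs): if a crash left freemap_tid at
 * 0x0124 while mirror_tid had reached 0x0127, the scan revisits every
 * block whose bref.mirror_tid is greater than 0x0124 and re-marks it
 * allocated, bringing the freemap back in sync with the last committed
 * topology.
 */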
2194
2195 static
2196 int
2197 hammer2_recovery_scan(hammer2_dev_t *hmp, hammer2_chain_t *parent,
2198                       struct hammer2_recovery_info *info,
2199                       hammer2_tid_t sync_tid)
2200 {
2201         const hammer2_inode_data_t *ripdata;
2202         hammer2_chain_t *chain;
2203         hammer2_blockref_t bref;
2204         int tmp_error;
2205         int rup_error;
2206         int error;
2207         int first;
2208
2209         /*
2210          * Adjust freemap to ensure that the block(s) are marked allocated.
2211          */
2212         if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
2213                 hammer2_freemap_adjust(hmp, &parent->bref,
2214                                        HAMMER2_FREEMAP_DORECOVER);
2215         }
2216
2217         /*
2218          * Check type for recursive scan
2219          */
2220         switch(parent->bref.type) {
2221         case HAMMER2_BREF_TYPE_VOLUME:
2222                 /* data already instantiated */
2223                 break;
2224         case HAMMER2_BREF_TYPE_INODE:
2225                 /*
2226                  * Must instantiate data for DIRECTDATA test and also
2227                  * for recursion.
2228                  */
2229                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2230                 ripdata = &hammer2_chain_rdata(parent)->ipdata;
2231                 if (ripdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2232                         /* not applicable to recovery scan */
2233                         hammer2_chain_unlock(parent);
2234                         return 0;
2235                 }
2236                 hammer2_chain_unlock(parent);
2237                 break;
2238         case HAMMER2_BREF_TYPE_INDIRECT:
2239                 /*
2240                  * Must instantiate data for recursion
2241                  */
2242                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2243                 hammer2_chain_unlock(parent);
2244                 break;
2245         case HAMMER2_BREF_TYPE_DIRENT:
2246         case HAMMER2_BREF_TYPE_DATA:
2247         case HAMMER2_BREF_TYPE_FREEMAP:
2248         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2249         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2250                 /* not applicable to recovery scan */
2251                 return 0;
2252                 break;
2253         default:
2254                 return HAMMER2_ERROR_BADBREF;
2255         }
2256
2257         /*
2258          * Defer operation if depth limit reached or if we are crossing a
2259          * PFS boundary.
2260          */
2261         if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) {
2262                 struct hammer2_recovery_elm *elm;
2263
2264                 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
2265                 elm->chain = parent;
2266                 elm->sync_tid = sync_tid;
2267                 hammer2_chain_ref(parent);
2268                 TAILQ_INSERT_TAIL(&info->list, elm, entry);
2269                 /* unlocked by caller */
2270
2271                 return(0);
2272         }
2273
2275         /*
2276          * Recursive scan of the last flushed transaction only.  We are
2277          * doing this without pmp assignments so don't leave the chains
2278          * hanging around after we are done with them.
2279          *
2280          * error        Cumulative error this level only
2281          * rup_error    Cumulative error for recursion
2282          * tmp_error    Specific non-cumulative recursion error
2283          */
2284         chain = NULL;
2285         first = 1;
2286         rup_error = 0;
2287         error = 0;
2288
2289         for (;;) {
2290                 error |= hammer2_chain_scan(parent, &chain, &bref,
2291                                             &first,
2292                                             HAMMER2_LOOKUP_NODATA);
2293
2294                 /*
2295                  * Problem during scan or EOF
2296                  */
2297                 if (error)
2298                         break;
2299
2300                 /*
2301                  * If this is a leaf
2302                  */
2303                 if (chain == NULL) {
2304                         if (bref.mirror_tid > sync_tid) {
2305                                 hammer2_freemap_adjust(hmp, &bref,
2306                                                      HAMMER2_FREEMAP_DORECOVER);
2307                         }
2308                         continue;
2309                 }
2310
2311                 /*
2312                  * This may or may not be a recursive node.
2313                  */
2314                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
2315                 if (bref.mirror_tid > sync_tid) {
2316                         ++info->depth;
2317                         tmp_error = hammer2_recovery_scan(hmp, chain,
2318                                                            info, sync_tid);
2319                         --info->depth;
2320                 } else {
2321                         tmp_error = 0;
2322                 }
2323
2324                 /*
2325                  * Flush the recovery at the PFS boundary to stage it for
2326                  * the final flush of the super-root topology.
2327                  */
2328                 if (tmp_error == 0 &&
2329                     (bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
2330                     (chain->flags & HAMMER2_CHAIN_ONFLUSH)) {
2331                         hammer2_flush(chain, HAMMER2_FLUSH_TOP |
2332                                              HAMMER2_FLUSH_ALL);
2333                 }
2334                 rup_error |= tmp_error;
2335         }
2336         return ((error | rup_error) & ~HAMMER2_ERROR_EOF);
2337 }
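
/*
 * NOTE: The HAMMER2_RECOVERY_MAXDEPTH deferral above bounds kernel stack
 * use.  Subtrees encountered more than 10 levels deep are queued on
 * info->list with an extra chain ref and are restarted iteratively from
 * hammer2_recovery() instead of being recursed into directly.
 */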
2338
2339 /*
2340  * This fixes up an error introduced in earlier H2 implementations where
2341  * moving a PFS inode into an indirect block wound up causing the
2342  * HAMMER2_BREF_FLAG_PFSROOT flag in the bref to get cleared.
2343  */
2344 static
2345 int
2346 hammer2_fixup_pfses(hammer2_dev_t *hmp)
2347 {
2348         const hammer2_inode_data_t *ripdata;
2349         hammer2_chain_t *parent;
2350         hammer2_chain_t *chain;
2351         hammer2_key_t key_next;
2352         hammer2_pfs_t *spmp;
2353         int error;
2354
2355         error = 0;
2356
2357         /*
2358          * Lookup mount point under the media-localized super-root.
2359          *
2360          * cluster->pmp will incorrectly point to spmp and must be fixed
2361          * up later on.
2362          */
2363         spmp = hmp->spmp;
2364         hammer2_inode_lock(spmp->iroot, 0);
2365         parent = hammer2_inode_chain(spmp->iroot, 0, HAMMER2_RESOLVE_ALWAYS);
2366         chain = hammer2_chain_lookup(&parent, &key_next,
2367                                          HAMMER2_KEY_MIN, HAMMER2_KEY_MAX,
2368                                          &error, 0);
2369         while (chain) {
2370                 if (chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
2371                         /* skip; must still advance via chain_next below */
2372                 } else if (chain->error) {
2373                         kprintf("I/O error scanning PFS labels\n");
2374                         error |= chain->error;
2375                 } else if ((chain->bref.flags &
2376                             HAMMER2_BREF_FLAG_PFSROOT) == 0) {
2377                         int error2;
2378
2379                         ripdata = &chain->data->ipdata;
2380                         hammer2_trans_init(hmp->spmp, 0);
2381                         error2 = hammer2_chain_modify(chain,
2382                                                       chain->bref.modify_tid,
2383                                                       0, 0);
2384                         if (error2 == 0) {
2385                                 kprintf("hammer2: Correct mis-flagged PFS %s\n",
2386                                         ripdata->filename);
2387                                 chain->bref.flags |= HAMMER2_BREF_FLAG_PFSROOT;
2388                         } else {
2389                                 error |= error2;
2390                         }
2391                         hammer2_flush(chain, HAMMER2_FLUSH_TOP |
2392                                              HAMMER2_FLUSH_ALL);
2393                         hammer2_trans_done(hmp->spmp, 0);
2394                 }
2395                 chain = hammer2_chain_next(&parent, chain, &key_next,
2396                                            key_next, HAMMER2_KEY_MAX,
2397                                            &error, 0);
2398         }
2399         if (parent) {
2400                 hammer2_chain_unlock(parent);
2401                 hammer2_chain_drop(parent);
2402         }
2403         hammer2_inode_unlock(spmp->iroot);
2404
2405         return error;
2406 }
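
/*
 * NOTE: This pass rewrites media only when a mis-flagged PFS inode is
 * actually found; on healthy volumes the scan modifies nothing and the
 * fixup is effectively a no-op on each RW mount.
 */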
2407
2408 /*
2409  * Sync a mount point; this is called periodically on a per-mount basis from
2410  * the filesystem syncer, and whenever a user issues a sync.
2411  */
2412 int
2413 hammer2_vfs_sync(struct mount *mp, int waitfor)
2414 {
2415         int error;
2416
2417         error = hammer2_vfs_sync_pmp(MPTOPMP(mp), waitfor);
2418
2419         return error;
2420 }
2421
2422 /*
2423  * Because frontend operations lock vnodes before we get a chance to
2424  * lock the related inode, we can't just acquire a vnode lock without
2425  * risking a deadlock.  The frontend may be holding a vnode lock while
2426  * also blocked on our SYNCQ flag while trying to get the inode lock.
2427  *
2428  * To deal with this, we check the state of the vnode lock after
2429  * locking the inode and apply a work-around when needed.
2430  */
2431 int
2432 hammer2_vfs_sync_pmp(hammer2_pfs_t *pmp, int waitfor)
2433 {
2434         struct mount *mp;
2435         /*hammer2_xop_flush_t *xop;*/
2436         /*struct hammer2_sync_info info;*/
2437         hammer2_inode_t *ip;
2438         hammer2_inode_t *ipdrop;
2439         struct vnode *vp;
2440         uint32_t pass2;
2441         int error;
2442         int dorestart;
2443
2444         mp = pmp->mp;
2445
2446         /*
2447          * Move all inodes on sideq to syncq.  This will clear sideq.
2448          * This should represent all flushable inodes.  These inodes
2449          * will already have refs due to being on syncq or sideq.  We
2450          * must do this all at once to ensure that inode dependencies
2451          * are part of the same flush.
2452          *
2453          * We should be able to do this asynchronously from frontend
2454          * operations because we will be locking the inodes later on
2455          * to actually flush them, and that will partition any frontend
2456          * op using the same inode.  Either it has already locked the
2457          * inode and we will block, or it has not yet locked the inode
2458          * and it will block until we are finished flushing that inode.
2459          *
2460          * When restarting, only move the inodes flagged as PASS2.
2461          */
2462         hammer2_trans_init(pmp, HAMMER2_TRANS_ISFLUSH);
2463 #ifdef HAMMER2_DEBUG_SYNC
2464         kprintf("FILESYSTEM SYNC BOUNDARY\n");
2465 #endif
2466         dorestart = 0;
2467 restart:
2468 #ifdef HAMMER2_DEBUG_SYNC
2469         kprintf("FILESYSTEM SYNC RESTART (%d)\n", dorestart);
2470 #endif
2471         hammer2_trans_setflags(pmp, HAMMER2_TRANS_COPYQ);
2472         hammer2_trans_clearflags(pmp, HAMMER2_TRANS_RESCAN);
2473         hammer2_spin_ex(&pmp->list_spin);
2474
2475         ipdrop = TAILQ_FIRST(&pmp->sideq);
2476         while ((ip = ipdrop) != NULL) {
2477                 ipdrop = TAILQ_NEXT(ip, entry);
2478                 KKASSERT(ip->flags & HAMMER2_INODE_SIDEQ);
2479                 if (dorestart == 0 ||
2480                     (ip->flags & HAMMER2_INODE_SYNCQ_PASS2)) {
2481                         TAILQ_REMOVE(&pmp->sideq, ip, entry);
2482                         TAILQ_INSERT_TAIL(&pmp->syncq, ip, entry);
2483                         atomic_set_int(&ip->flags, HAMMER2_INODE_SYNCQ);
2484                         atomic_clear_int(&ip->flags,
2485                                          HAMMER2_INODE_SIDEQ);
2486                         --pmp->sideq_count;
2487                 }
2488         }
2489         hammer2_spin_unex(&pmp->list_spin);
2490         hammer2_trans_clearflags(pmp, HAMMER2_TRANS_COPYQ |
2491                                       HAMMER2_TRANS_WAITING);
2492         dorestart = 0;
2493
2494         /*
2495          * Now run through all inodes on syncq.
2496          *
2497          * Flush transactions only interlock with other flush transactions.
2498          * Any conflicting frontend operations will block on the inode, but
2499          * may hold a vnode lock while doing so.
2500          */
2501         ipdrop = NULL;
2502
2503         hammer2_spin_ex(&pmp->list_spin);
2504         while ((ip = TAILQ_FIRST(&pmp->syncq)) != NULL) {
2505                 /*
2506                  * Remove the inode from the SYNCQ, transfer the syncq ref
2507                  * to us.  We must clear SYNCQ to allow any potential
2508                  * front-end deadlock to proceed.
2509                  */
2510                 pass2 = ip->flags;
2511                 cpu_ccfence();
2512                 if (atomic_cmpset_int(&ip->flags,
2513                               pass2,
2514                               (pass2 & ~(HAMMER2_INODE_SYNCQ |
2515                                         HAMMER2_INODE_SYNCQ_WAKEUP)) |
2516                                         HAMMER2_INODE_SYNCQ_PASS2) == 0) {
2517                         continue;
2518                 }
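                /*
                 * NOTE: If the cmpset races a concurrent flags update we
                 * restart the loop, which re-samples TAILQ_FIRST() and
                 * ip->flags.
                 */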
2519                 if (pass2 & HAMMER2_INODE_SYNCQ_WAKEUP)
2520                         wakeup(&ip->flags);
2521                 TAILQ_REMOVE(&pmp->syncq, ip, entry);
2522                 hammer2_spin_unex(&pmp->list_spin);
2523                 if (ipdrop) {
2524                         hammer2_inode_drop(ipdrop);
2525                         ipdrop = NULL;
2526                 }
2527                 hammer2_mtx_ex(&ip->lock);
2528
2529                 /*
2530                  * We need the vp in order to vfsync() dirty buffers, so if
2531                  * one isn't attached we can skip it.
2532                  *
2533                  * Ordering the inode lock and then the vnode lock has the
2534                  * potential to deadlock.  If we had left SYNCQ set that could
2535                  * also deadlock us against the frontend even if we don't hold
2536                  * any locks, but the latter is not a problem now since we
2537                  * cleared it.  igetv will temporarily release the inode lock
2538                  * in a safe manner to work-around the deadlock.
2539                  *
2540                  * Unfortunately it is still possible to deadlock when the
2541                  * frontend obtains multiple inode locks, because all the
2542                  * related vnodes are already locked (nor can the vnode locks
2543                  * be released and reacquired without messing up RECLAIM and
2544                  * INACTIVE sequencing).
2545                  *
2546                  * The solution for now is to move the inode back onto SIDEQ
2547                  * and set dorestart, which will restart the flush after we
2548                  * exhaust the current SYNCQ.  Note that additional
2549                  * dependencies may build up, so we definitely need to move
2550                  * the whole SIDEQ back to SYNCQ when we restart.
2551                  */
2552                 vp = ip->vp;
2553                 if (vp) {
2554                         if (vget(vp, LK_EXCLUSIVE|LK_NOWAIT)) {
2555                                 /*
2556                                  * Failed, move to SIDEQ.  It may already be
2557                                  * on the SIDEQ if we lost a race.
2558                                  */
2559                                 vp = NULL;
2560                                 dorestart = 1;
2561 #ifdef HAMMER2_DEBUG_SYNC
2562                                 kprintf("inum %ld (sync delayed by vnode)\n",
2563                                         (long)ip->meta.inum);
2564 #endif
2565                                 hammer2_spin_ex(&pmp->list_spin);
2566                                 if ((ip->flags & (HAMMER2_INODE_SYNCQ |
2567                                                   HAMMER2_INODE_SIDEQ)) == 0) {
2568                                         /* XXX PASS2 redundant */
2569                                         atomic_set_int(&ip->flags,
2570                                                    HAMMER2_INODE_SIDEQ |
2571                                                    HAMMER2_INODE_SYNCQ_PASS2);
2572                                         TAILQ_INSERT_TAIL(&pmp->sideq, ip,
2573                                                           entry);
2574                                         hammer2_spin_unex(&pmp->list_spin);
2575                                         hammer2_mtx_unlock(&ip->lock);
2576                                 } else if (ip->flags & HAMMER2_INODE_SIDEQ) {
2577                                         /* XXX PASS2 redundant */
2578                                         atomic_set_int(&ip->flags,
2579                                                    HAMMER2_INODE_SYNCQ_PASS2);
2580                                         hammer2_spin_unex(&pmp->list_spin);
2581                                         hammer2_mtx_unlock(&ip->lock);
2582                                         hammer2_inode_drop(ip);
2583                                 } else {
2584                                         hammer2_spin_unex(&pmp->list_spin);
2585                                         hammer2_mtx_unlock(&ip->lock);
2586                                         hammer2_inode_drop(ip);
2587                                 }
2588                                 if (pass2 & HAMMER2_INODE_SYNCQ_PASS2) {
2589                                         tsleep(&dorestart, 0, "h2syndel", 2);
2590                                 }
2591                                 hammer2_spin_ex(&pmp->list_spin);
2592                                 continue;
2593                         }
2594                 } else {
2595                         vp = NULL;
2596                 }
2597
2598                 /*
2599                  * Ok we have the inode exclusively locked and if vp is
2600                  * not NULL that will also be exclusively locked.  Do the
2601                  * meat of the flush.
2602                  *
2603                  * The vp token is needed for the v_rbdirty_tree check /
2604                  * vclrisdirty sequencing, though since we hold the vnode
2605                  * exclusively we should not strictly need it here.
2606                  */
2607                 if (vp) {
2608                         vfsync(vp, MNT_WAIT, 1, NULL, NULL);
2609                         bio_track_wait(&vp->v_track_write, 0, 0); /* XXX */
2610                 }
2611
2612                 /*
2613                  * If the inode has not yet been inserted into the tree
2614                  * we must do so.  Then sync and flush it.  The flush should
2615                  * update the parent.
2616                  */
2617                 if (ip->flags & HAMMER2_INODE_DELETING) {
2618 #ifdef HAMMER2_DEBUG_SYNC
2619                         kprintf("inum %ld destroy\n", (long)ip->meta.inum);
2620 #endif
2621                         hammer2_inode_chain_des(ip);
2622                         atomic_add_long(&hammer2_iod_inode_deletes, 1);
2623                 } else if (ip->flags & HAMMER2_INODE_CREATING) {
2624 #ifdef HAMMER2_DEBUG_SYNC
2625                         kprintf("inum %ld insert\n", (long)ip->meta.inum);
2626 #endif
2627                         hammer2_inode_chain_ins(ip);
2628                         atomic_add_long(&hammer2_iod_inode_creates, 1);
2629                 }
2630 #ifdef HAMMER2_DEBUG_SYNC
2631                 kprintf("inum %ld chain-sync\n", (long)ip->meta.inum);
2632 #endif
2633                 hammer2_inode_chain_sync(ip);
2634                 hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
2635                                               HAMMER2_XOP_FSSYNC);
2636                 if (vp) {
2637                         lwkt_gettoken(&vp->v_token);
2638                         if ((ip->flags & (HAMMER2_INODE_MODIFIED |
2639                                           HAMMER2_INODE_RESIZED |
2640                                           HAMMER2_INODE_DIRTYDATA)) == 0 &&
2641                             RB_EMPTY(&vp->v_rbdirty_tree) &&
2642                             !bio_track_active(&vp->v_track_write)) {
2643                                 vclrisdirty(vp);
2644                         }
2645                         lwkt_reltoken(&vp->v_token);
2646                         vput(vp);
2647                 }
2648                 atomic_clear_int(&ip->flags, HAMMER2_INODE_SYNCQ_PASS2);
2649                 hammer2_inode_unlock(ip);       /* unlock+drop */
2650                 /* ip pointer invalid */
2651
2652                 /*
2653                  * If the inode got dirtied after we dropped our locks,
2654                  * it will have already been moved back to the SIDEQ.
2655                  */
2656                 hammer2_spin_ex(&pmp->list_spin);
2657         }
2658         hammer2_spin_unex(&pmp->list_spin);
2659         if (ipdrop) {
2660                 hammer2_inode_drop(ipdrop);
2661                 ipdrop = NULL;
2662         }
2663         if (dorestart || (pmp->trans.flags & HAMMER2_TRANS_RESCAN)) {
2664 #ifdef HAMMER2_DEBUG_SYNC
2665                 kprintf("FILESYSTEM SYNC STAGE 1 RESTART\n");
2666                 tsleep(&dorestart, 0, "h2STG1-R", hz*20);
2667 #endif
2668                 dorestart = 1;
2669                 goto restart;
2670         }
2671 #ifdef HAMMER2_DEBUG_SYNC
2672         kprintf("FILESYSTEM SYNC STAGE 2 BEGIN\n");
2673         tsleep(&dorestart, 0, "h2STG2", hz*20);
2674 #endif
2675
2676         /*
2677          * We have to flush the PFS root last, even if it does not appear to
2678          * be dirty, because all the inodes in the PFS are indexed under it.
2679          * The normal flushing of iroot above would only occur if directory
2680          * entries under the root were changed.
2681          *
2682          * Specifying VOLHDR will cause an additional flush of hmp->spmp
2683          * for the media making up the cluster.
2684          */
2685         if ((ip = pmp->iroot) != NULL) {
2686                 hammer2_inode_ref(ip);
2687                 hammer2_mtx_ex(&ip->lock);
2688                 hammer2_inode_chain_sync(ip);
2689                 hammer2_inode_chain_flush(ip, HAMMER2_XOP_INODE_STOP |
2690                                               HAMMER2_XOP_FSSYNC |
2691                                               HAMMER2_XOP_VOLHDR);
2692                 hammer2_inode_unlock(ip);       /* unlock+drop */
2693         }
2694 #ifdef HAMMER2_DEBUG_SYNC
2695         kprintf("FILESYSTEM SYNC STAGE 2 DONE\n");
2696 #endif
2697
2698         /*
2699          * device bioq sync
2700          */
2701         hammer2_bioq_sync(pmp);
2702
2703 #if 0
2704         info.pass = 1;
2705         info.waitfor = MNT_WAIT;
2706         vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2707
2708         info.pass = 2;
2709         info.waitfor = MNT_WAIT;
2710         vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2711 #endif
2712 #if 0
2713         /*
2714          * Generally speaking we now want to flush the media topology from
2715          * the iroot through to the inodes.  The flush stops at any inode
2716          * boundary, which allows the frontend to continue running concurrent
2717          * modifying operations on inodes (including kernel flushes of
2718          * buffers) without interfering with the main sync.
2719          *
2720          * Use the XOP interface to concurrently flush all nodes to
2721          * synchronize the PFSROOT subtopology to the media.  A standard
2722          * end-of-scan ENOENT error indicates cluster sufficiency.
2723          *
2724          * Note that this flush will not be visible on crash recovery until
2725          * we flush the super-root topology in the next loop.
2726          *
2727          * XXX For now wait for all flushes to complete.
2728          */
2729         if (mp && (ip = pmp->iroot) != NULL) {
2730                 /*
2731                  * If unmounting try to flush everything including any
2732                  * sub-trees under inodes, just in case there is dangling
2733                  * modified data, as a safety.  Otherwise just flush up to
2734                  * the inodes in this stage.
2735                  */
2736                 kprintf("MP & IROOT\n");
2737 #ifdef HAMMER2_DEBUG_SYNC
2738                 kprintf("FILESYSTEM SYNC STAGE 3 IROOT BEGIN\n");
2739 #endif
2740                 if (mp->mnt_kern_flag & MNTK_UNMOUNT) {
2741                         xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING |
2742                                                     HAMMER2_XOP_VOLHDR |
2743                                                     HAMMER2_XOP_FSSYNC |
2744                                                     HAMMER2_XOP_INODE_STOP);
2745                 } else {
2746                         xop = hammer2_xop_alloc(ip, HAMMER2_XOP_MODIFYING |
2748                                                     HAMMER2_XOP_VOLHDR |
2749                                                     HAMMER2_XOP_FSSYNC |
2750                                                     HAMMER2_XOP_INODE_STOP);
2751                 }
2752                 hammer2_xop_start(&xop->head, &hammer2_inode_flush_desc);
2753                 error = hammer2_xop_collect(&xop->head,
2754                                             HAMMER2_XOP_COLLECT_WAITALL);
2755                 hammer2_xop_retire(&xop->head, HAMMER2_XOPMASK_VOP);
2756 #ifdef HAMMER2_DEBUG_SYNC
2757                 kprintf("FILESYSTEM SYNC STAGE 3 IROOT END\n");
2758 #endif
2759                 if (error == HAMMER2_ERROR_ENOENT)
2760                         error = 0;
2761                 else
2762                         error = hammer2_error_to_errno(error);
2763         } else {
2764                 error = 0;
2765         }
2766 #endif
2767         error = 0;      /* XXX */
2768         hammer2_trans_done(pmp, HAMMER2_TRANS_ISFLUSH);
2769
2770         return (error);
2771 }
2772
2773 static
2774 int
2775 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
2776 {
2777         hammer2_inode_t *ip;
2778
2779         KKASSERT(MAXFIDSZ >= 16);
2780         ip = VTOI(vp);
2781         fhp->fid_len = offsetof(struct fid, fid_data[16]);
2782         fhp->fid_ext = 0;
2783         ((hammer2_tid_t *)fhp->fid_data)[0] = ip->meta.inum;
2784         ((hammer2_tid_t *)fhp->fid_data)[1] = 0;
2785
2786         return 0;
2787 }
2788
2789 static
2790 int
2791 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
2792                struct fid *fhp, struct vnode **vpp)
2793 {
2794         hammer2_pfs_t *pmp;
2795         hammer2_tid_t inum;
2796         int error;
2797
2798         pmp = MPTOPMP(mp);
2799         inum = ((hammer2_tid_t *)fhp->fid_data)[0] & HAMMER2_DIRHASH_USERMSK;
2800         if (vpp) {
2801                 if (inum == 1)
2802                         error = hammer2_vfs_root(mp, vpp);
2803                 else
2804                         error = hammer2_vfs_vget(mp, NULL, inum, vpp);
2805         } else {
2806                 error = 0;
2807         }
2808         if (error)
2809                 kprintf("fhtovp: %016jx -> %p, %d\n", inum, *vpp, error);
2810         return error;
2811 }
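/*
 * Round-trip sketch (hypothetical caller; vp/mp are assumed to be a
 * locked vnode and its mount): vptofh stores the inode number verbatim
 * in the first 64 bits of fid_data, and fhtovp masks it with
 * HAMMER2_DIRHASH_USERMSK on the way back in.
 */
#if 0
        struct fid fh;
        struct vnode *nvp;

        hammer2_vfs_vptofh(vp, &fh);                    /* encode vp -> fh */
        hammer2_vfs_fhtovp(mp, NULL, &fh, &nvp);        /* decode fh -> nvp */
#endif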
2812
2813 static
2814 int
2815 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
2816                  int *exflagsp, struct ucred **credanonp)
2817 {
2818         hammer2_pfs_t *pmp;
2819         struct netcred *np;
2820         int error;
2821
2822         pmp = MPTOPMP(mp);
2823         np = vfs_export_lookup(mp, &pmp->export, nam);
2824         if (np) {
2825                 *exflagsp = np->netc_exflags;
2826                 *credanonp = &np->netc_anon;
2827                 error = 0;
2828         } else {
2829                 error = EACCES;
2830         }
2831         return error;
2832 }
2833
2834 /*
2835  * Support code for hammer2_vfs_mount().  Read, verify, and install the volume
2836  * header into the HMP
2837  *
2838  * XXX read four volhdrs and use the one with the highest TID whose CRC
2839  *     matches.
2840  *
2841  * XXX check iCRCs.
2842  *
2843  * XXX For filesystems w/ less than 4 volhdrs, make sure not to write to
2844  *     nonexistent locations.
2845  *
2846  * XXX Record selected volhdr and ring updates to each of 4 volhdrs
2847  */
2848 static
2849 int
2850 hammer2_install_volume_header(hammer2_dev_t *hmp)
2851 {
2852         hammer2_volume_data_t *vd;
2853         struct buf *bp;
2854         hammer2_crc32_t crc0, crc, bcrc0, bcrc;
2855         int error_reported;
2856         int error;
2857         int valid;
2858         int i;
2859
2860         error_reported = 0;
2861         error = 0;
2862         valid = 0;
2863         bp = NULL;
2864
2865         /*
2866          * There are up to 4 copies of the volume header (syncs iterate
2867          * between them so there is no single master).  We don't trust the
2868          * volu_size field, so we don't know precisely how large the filesystem
2869          * is; instead we depend on the OS to return an error if we read beyond
2870          * the block device's EOF.
2871          */
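        /* copy #i resides at media byte offset i * HAMMER2_ZONE_BYTES64 */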
2872         for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
2873                 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2874                               HAMMER2_VOLUME_BYTES, &bp);
2875                 if (error) {
2876                         brelse(bp);
2877                         bp = NULL;
2878                         continue;
2879                 }
2880
2881                 vd = (struct hammer2_volume_data *) bp->b_data;
2882                 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
2883                     (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
2884                         brelse(bp);
2885                         bp = NULL;
2886                         continue;
2887                 }
2888
2889                 if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
2890                         /* XXX: Reversed-endianness filesystem */
2891                         kprintf("hammer2: reverse-endian filesystem detected\n");
2892                         brelse(bp);
2893                         bp = NULL;
2894                         continue;
2895                 }
2896
2897                 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
2898                 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
2899                                       HAMMER2_VOLUME_ICRC0_SIZE);
2900                 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
2901                 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
2902                                        HAMMER2_VOLUME_ICRC1_SIZE);
2903                 if ((crc0 != crc) || (bcrc0 != bcrc)) {
2904                         kprintf("hammer2 volume header crc "
2905                                 "mismatch copy #%d %08x/%08x %08x/%08x\n",
2906                                 i, crc0, crc, bcrc0, bcrc);
2907                         error_reported = 1;
2908                         brelse(bp);
2909                         bp = NULL;
2910                         continue;
2911                 }
2912                 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
2913                         valid = 1;
2914                         hmp->voldata = *vd;
2915                         hmp->volhdrno = i;
2916                 }
2917                 brelse(bp);
2918                 bp = NULL;
2919         }
2920         if (valid) {
2921                 hmp->volsync = hmp->voldata;
2922                 hmp->free_reserved = hmp->voldata.allocator_size / 20;
2923                 error = 0;
2924                 if (error_reported || bootverbose || 1) { /* 1/DEBUG */
2925                         kprintf("hammer2: using volume header #%d\n",
2926                                 hmp->volhdrno);
2927                 }
2928         } else {
2929                 error = EINVAL;
2930                 kprintf("hammer2: no valid volume headers found!\n");
2931         }
2932         return (error);
2933 }
2934
2935 /*
2936  * This handles hysteresis on regular file flushes.  Because the BIOs are
2937  * routed to a thread it is possible for an excessive number to build up
2938  * and cause lengthy front-end stalls well before the runningbuffspace limit
2939  * is hit, so we implement hammer2_flush_pipe to control the
2940  * hysteresis.
2941  *
2942  * This is a particular problem when compression is used.
2943  */
2944 void
2945 hammer2_lwinprog_ref(hammer2_pfs_t *pmp)
2946 {
2947         atomic_add_int(&pmp->count_lwinprog, 1);
2948 }
2949
2950 void
2951 hammer2_lwinprog_drop(hammer2_pfs_t *pmp)
2952 {
2953         int lwinprog;
2954
2955         lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
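        /*
         * Hysteresis: wake throttled writers once the pipe drains to
         * 2/3 of hammer2_flush_pipe, but wake zero-waiters only when
         * the pipe is fully drained.
         */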
2956         if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
2957             (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
2958                 atomic_clear_int(&pmp->count_lwinprog,
2959                                  HAMMER2_LWINPROG_WAITING);
2960                 wakeup(&pmp->count_lwinprog);
2961         }
2962         if ((lwinprog & HAMMER2_LWINPROG_WAITING0) &&
2963             (lwinprog & HAMMER2_LWINPROG_MASK) <= 0) {
2964                 atomic_clear_int(&pmp->count_lwinprog,
2965                                  HAMMER2_LWINPROG_WAITING0);
2966                 wakeup(&pmp->count_lwinprog);
2967         }
2968 }
2969
2970 void
2971 hammer2_lwinprog_wait(hammer2_pfs_t *pmp, int flush_pipe)
2972 {
2973         int lwinprog;
2974         int lwflag = (flush_pipe) ? HAMMER2_LWINPROG_WAITING :
2975                                     HAMMER2_LWINPROG_WAITING0;
2976
2977         for (;;) {
2978                 lwinprog = pmp->count_lwinprog;
2979                 cpu_ccfence();
2980                 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe)
2981                         break;
2982                 tsleep_interlock(&pmp->count_lwinprog, 0);
2983                 atomic_set_int(&pmp->count_lwinprog, lwflag);
2984                 lwinprog = pmp->count_lwinprog;
2985                 if ((lwinprog & HAMMER2_LWINPROG_MASK) <= flush_pipe)
2986                         break;
2987                 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
2988         }
2989 }
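/*
 * Usage sketch (hypothetical producer/consumer, not a verbatim call
 * site): the frontend refs the counter for each BIO it hands to the
 * flusher and throttles against hammer2_flush_pipe; the backend drops
 * the ref as each BIO is retired.
 */
#if 0
        /* frontend, per queued logical write */
        hammer2_lwinprog_ref(pmp);
        hammer2_lwinprog_wait(pmp, hammer2_flush_pipe);

        /* backend flusher, per completed BIO */
        hammer2_lwinprog_drop(pmp);
#endif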
2990
2991 /*
2992  * Attempt to proactively fsync dirty vnodes if we have too many.  This
2993  * solves an issue where the kernel syncer thread can get seriously behind
2994  * when multiple user processes/threads are furiously modifying inodes.
2995  * This situation can occur on slow storage and is only limited by
2996  * kern.maxvnodes without the moderation code below.  It is made worse
2997  * when the device buffers underlying the modified inodes (which are clean)
2998  * get evicted before the flush can occur, forcing a re-read.
2999  *
3000  * We do not want sysadmins to feel that they have to torpedo kern.maxvnodes
3001  * to solve this problem, so we implement vfs.hammer2.limit_dirty_inodes
3002  * (per-mount-basis) and default it to something reasonable.
3003  *
3004  * XXX we cannot safely block here because we might be holding locks that
3005  * the syncer needs.
3006  */
3007 static void
3008 hammer2_pfs_moderate(hammer2_inode_t *ip, int always_moderate)
3009 {
3010         hammer2_pfs_t *pmp = ip->pmp;
3011         struct mount *mp = pmp->mp;
3012
3013         if (mp && vn_syncer_count(mp) > hammer2_limit_dirty_inodes) {
3014                 speedup_syncer(mp);
3015                 /*vn_syncer_one(mp);*/
3016         }
3017 }
3018
3019 /*
3020  * Manage excessive memory resource use for chain and related
3021  * structures.
3022  *
3023  * Called without any inode locks or transaction locks.  Vnodes
3024  * might be locked by the kernel in the call stack.
3025  */
3026 void
3027 hammer2_pfs_memory_wait(hammer2_inode_t *ip, int always_moderate)
3028 {
3029         hammer2_pfs_t *pmp = ip->pmp;
3030         uint32_t waiting;
3031         uint32_t count;
3032         uint32_t limit;
3033 #if 0
3034         static int zzticks;
3035 #endif
3036
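        /*
         * NOTE: moderation is currently short-circuited by the early
         *       return below; the remainder of the function is retained
         *       for reference.
         */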
3037         return; /* XXX */
3038
3039         /*
3040          * Moderate the number of dirty inodes
3041          */
3042         hammer2_pfs_moderate(ip, always_moderate);
3043
3044         /*
3045          * Atomic check condition and wait.  Also do an early speedup of
3046          * the syncer to try to avoid hitting the wait.
3047          */
3048         for (;;) {
3049                 waiting = pmp->inmem_dirty_chains;
3050                 cpu_ccfence();
3051                 count = waiting & HAMMER2_DIRTYCHAIN_MASK;
3052
3053                 limit = pmp->mp->mnt_nvnodelistsize / 10;
3054                 if (limit < hammer2_limit_dirty_chains)
3055                         limit = hammer2_limit_dirty_chains;
3056                 if (limit < 1000)
3057                         limit = 1000;
3058
3059 #if 0
3060                 if ((int)(ticks - zzticks) > hz) {
3061                         zzticks = ticks;
3062                         kprintf("count %u %u\n", count, limit);
3063                 }
3064 #endif
3065
3066                 /*
3067                  * Block if there are too many dirty chains present, wait
3068                  * for the flush to clean some out.
3069                  */
3070                 if (count > limit) {
3071                         hammer2_pfs_moderate(ip, always_moderate);
3072                         tsleep_interlock(&pmp->inmem_dirty_chains, 0);
3073                         if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
3074                                                waiting,
3075                                        waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
3076                                 if (ticks != pmp->speedup_ticks) {
3077                                         pmp->speedup_ticks = ticks;
3078                                         speedup_syncer(pmp->mp);
3079                                 }
3080                                 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
3081                                        "chnmem", hz);
3082                         }
3083                         continue;       /* loop on success or fail */
3084                 }
3085
3086                 /*
3087                  * Try to start an early flush before we are forced to block.
3088                  */
3089                 if (count > limit * 5 / 10 &&
3090                     ticks != pmp->speedup_ticks) {
3091                         pmp->speedup_ticks = ticks;
3092                         speedup_syncer(pmp->mp);
3093                 }
3094                 break;
3095         }
3096 }
3097
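/*
 * hammer2_pfs_memory_inc() is evidently called as chains are dirtied
 * and hammer2_pfs_memory_wakeup() as they are cleaned, maintaining
 * inmem_dirty_chains as the population count tested by
 * hammer2_pfs_memory_wait() above.
 */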
3098 void
3099 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp)
3100 {
3101         if (pmp) {
3102                 atomic_add_int(&pmp->inmem_dirty_chains, 1);
3103         }
3104 }
3105
3106 void
3107 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp)
3108 {
3109         uint32_t waiting;
3110
3111         if (pmp) {
3112                 waiting = atomic_fetchadd_int(&pmp->inmem_dirty_chains, -1);
3113                 /* don't need --waiting to test flag */
3114                 if (waiting & HAMMER2_DIRTYCHAIN_WAITING) {
3115                         atomic_clear_int(&pmp->inmem_dirty_chains,
3116                                          HAMMER2_DIRTYCHAIN_WAITING);
3117                         wakeup(&pmp->inmem_dirty_chains);
3118                 }
3119         }
3120 }
3121
3122 /*
3123  * Returns 0 if the filesystem has tons of free space
3124  * Returns 1 if the filesystem has less than 10% remaining
3125  * Returns 2 if the filesystem has less than 5%/2.5% (user/root) remaining.
3126  */
3127 int
3128 hammer2_vfs_enospace(hammer2_inode_t *ip, off_t bytes, struct ucred *cred)
3129 {
3130         hammer2_pfs_t *pmp;
3131         hammer2_dev_t *hmp;
3132         hammer2_off_t free_reserved;
3133         hammer2_off_t free_nominal;
3134         int i;
3135
3136         pmp = ip->pmp;
3137
3138         if (pmp->free_ticks == 0 || pmp->free_ticks != ticks) {
3139                 free_reserved = HAMMER2_SEGSIZE;
3140                 free_nominal = 0x7FFFFFFFFFFFFFFFLLU;
3141                 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
3142                         hmp = pmp->pfs_hmps[i];
3143                         if (hmp == NULL)
3144                                 continue;
3145                         if (pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER &&
3146                             pmp->pfs_types[i] != HAMMER2_PFSTYPE_SOFT_MASTER)
3147                                 continue;
3148
3149                         if (free_nominal > hmp->voldata.allocator_free)
3150                                 free_nominal = hmp->voldata.allocator_free;
3151                         if (free_reserved < hmp->free_reserved)
3152                                 free_reserved = hmp->free_reserved;
3153                 }
3154
3155                 /*
3156                  * SMP races ok
3157                  */
3158                 pmp->free_reserved = free_reserved;
3159                 pmp->free_nominal = free_nominal;
3160                 pmp->free_ticks = ticks;
3161         } else {
3162                 free_reserved = pmp->free_reserved;
3163                 free_nominal = pmp->free_nominal;
3164         }
3165         if (cred && cred->cr_uid != 0) {
3166                 if ((int64_t)(free_nominal - bytes) <
3167                     (int64_t)free_reserved) {
3168                         return 2;
3169                 }
3170         } else {
3171                 if ((int64_t)(free_nominal - bytes) <
3172                     (int64_t)free_reserved / 2) {
3173                         return 2;
3174                 }
3175         }
3176         if ((int64_t)(free_nominal - bytes) < (int64_t)free_reserved * 2)
3177                 return 1;
3178         return 0;
3179 }
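/*
 * Caller sketch (hypothetical write path, not a verbatim call site):
 * a return of 2 is a hard failure for the requesting credential, while
 * 1 leaves room to throttle or warn before the hard limit is reached.
 */
#if 0
        switch (hammer2_vfs_enospace(ip, bytes, cred)) {
        case 2:
                error = ENOSPC;         /* hard limit for this cred */
                break;
        case 1:
                /* soft limit: e.g. defer non-essential writes */
                break;
        default:
                break;
        }
#endif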
3180
3181 /*
3182  * Debugging
3183  */
3184 void
3185 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx,
3186                    u_int flags)
3187 {
3188         hammer2_chain_t *scan;
3189         hammer2_chain_t *parent;
3190
3191         --*countp;
3192         if (*countp == 0) {
3193                 kprintf("%*.*s...\n", tab, tab, "");
3194                 return;
3195         }
3196         if (*countp < 0)
3197                 return;
3198         kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
3199                 tab, tab, "", pfx,
3200                 chain, chain->bref.type,
3201                 chain->bref.key, chain->bref.keybits,
3202                 chain->bref.mirror_tid);
3203
3204         kprintf("%*.*s      [%08x] (%s) refs=%d",
3205                 tab, tab, "",
3206                 chain->flags,
3207                 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
3208                 chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
3209                 chain->refs);
3210
3211         parent = chain->parent;
3212         if (parent)
3213                 kprintf("\n%*.*s      p=%p [pflags %08x prefs %d]",
3214                         tab, tab, "",
3215                         parent, parent->flags, parent->refs);
3216         if (RB_EMPTY(&chain->core.rbtree)) {
3217                 kprintf("\n");
3218         } else {
3219                 kprintf(" {\n");
3220                 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree) {
3221                         if ((scan->flags & flags) || flags == (u_int)-1) {
3222                                 hammer2_dump_chain(scan, tab + 4, countp, 'a',
3223                                                    flags);
3224                         }
3225                 }
3226                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
3227                         kprintf("%*.*s}(%s)\n", tab, tab, "",
3228                                 chain->data->ipdata.filename);
3229                 else
3230                         kprintf("%*.*s}\n", tab, tab, "");
3231         }
3232 }
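/*
 * Typical invocation (sketch): dump the device's chain topology from
 * the volume root, limiting output to roughly 100 chains and matching
 * all chain flags.
 */
#if 0
        int dumpcnt = 100;

        hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v', (u_int)-1);
#endif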