dragonfly.git: sys/vfs/hammer2/hammer2_vfsops.c @ 52c2b26dd67f1c71714210fcc36b6df81217c264
/*
 * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/uuid.h>
#include <sys/vfsops.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/objcache.h>

#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>

#include <sys/mutex.h>
#include <sys/mutex2.h>

#include "hammer2.h"
#include "hammer2_disk.h"
#include "hammer2_mount.h"
#include "hammer2_lz4.h"

#include "zlib/hammer2_zlib.h"

#define REPORT_REFS_ERRORS 1    /* XXX remove me */

MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");

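/*
 * Per-sync state passed to the vnode scan callback
 * (hammer2_sync_scan2) during a filesystem sync.
 */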
struct hammer2_sync_info {
        hammer2_trans_t trans;
        int error;
        int waitfor;
};

TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
TAILQ_HEAD(hammer2_pfslist, hammer2_pfs);
static struct hammer2_mntlist hammer2_mntlist;
static struct hammer2_pfslist hammer2_pfslist;
static struct lock hammer2_mntlk;

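/*
 * Debugging tunables and per-type I/O statistics, exported via the
 * sysctl tree below.
 */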
int hammer2_debug;
int hammer2_cluster_enable = 1;
int hammer2_hardlink_enable = 1;
int hammer2_flush_pipe = 100;
int hammer2_synchronous_flush = 1;
int hammer2_dio_count;
long hammer2_limit_dirty_chains;
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;
long hammer2_iod_file_write;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;
long hammer2_ioa_file_read;
long hammer2_ioa_meta_read;
long hammer2_ioa_indr_read;
long hammer2_ioa_fmap_read;
long hammer2_ioa_volu_read;
long hammer2_ioa_fmap_write;
long hammer2_ioa_file_write;
long hammer2_ioa_meta_write;
long hammer2_ioa_indr_write;
long hammer2_ioa_volu_write;

MALLOC_DECLARE(C_BUFFER);
MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");

MALLOC_DECLARE(D_BUFFER);
MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");

SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");

SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
           &hammer2_debug, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
           &hammer2_cluster_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
           &hammer2_hardlink_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW,
           &hammer2_synchronous_flush, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
           &hammer2_limit_dirty_chains, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
           &hammer2_dio_count, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
           &hammer2_iod_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
           &hammer2_iod_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
           &hammer2_iod_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
           &hammer2_iod_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
           &hammer2_iod_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
           &hammer2_iod_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
           &hammer2_iod_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
           &hammer2_iod_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
           &hammer2_iod_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
           &hammer2_ioa_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
           &hammer2_ioa_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
           &hammer2_ioa_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
           &hammer2_ioa_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
           &hammer2_ioa_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
           &hammer2_ioa_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
           &hammer2_ioa_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
           &hammer2_ioa_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
           &hammer2_ioa_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
           &hammer2_ioa_volu_write, 0, "");

static int hammer2_vfs_init(struct vfsconf *conf);
static int hammer2_vfs_uninit(struct vfsconf *vfsp);
static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                struct ucred *cred);
static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
                                struct vnode *, struct ucred *);
static int hammer2_recovery(hammer2_dev_t *hmp);
static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
                                ino_t ino, struct vnode **vpp);
static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                                struct fid *fhp, struct vnode **vpp);
static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);

static int hammer2_install_volume_header(hammer2_dev_t *hmp);
static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

static void hammer2_update_pmps(hammer2_dev_t *hmp);
static void hammer2_write_thread(void *arg);

static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
                                hammer2_dev_t *hmp);

/*
 * Functions for compression in threads,
 * from hammer2_vnops.c
 */
static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                int *errorp);
static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag,
                                int pblksize, int *errorp,
                                int comp_algo, int check_algo);
static void hammer2_zero_check_and_write(struct buf *bp,
                                hammer2_trans_t *trans, hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int ioflag, int pblksize, int *errorp,
                                int check_algo);
static int test_block_zeros(const char *buf, size_t bytes);
static void zero_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int *errorp);
static void hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp,
                                int ioflag, int pblksize, int *errorp,
                                int check_algo);

/*
 * HAMMER2 vfs operations.
 */
static struct vfsops hammer2_vfsops = {
        .vfs_init       = hammer2_vfs_init,
        .vfs_uninit     = hammer2_vfs_uninit,
        .vfs_sync       = hammer2_vfs_sync,
        .vfs_mount      = hammer2_vfs_mount,
        .vfs_unmount    = hammer2_vfs_unmount,
        .vfs_root       = hammer2_vfs_root,
        .vfs_statfs     = hammer2_vfs_statfs,
        .vfs_statvfs    = hammer2_vfs_statvfs,
        .vfs_vget       = hammer2_vfs_vget,
        .vfs_vptofh     = hammer2_vfs_vptofh,
        .vfs_fhtovp     = hammer2_vfs_fhtovp,
        .vfs_checkexp   = hammer2_vfs_checkexp
};

MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");

VFS_SET(hammer2_vfsops, hammer2, 0);
MODULE_VERSION(hammer2, 1);

static
int
hammer2_vfs_init(struct vfsconf *conf)
{
        static struct objcache_malloc_args margs_read;
        static struct objcache_malloc_args margs_write;

        int error;

        error = 0;

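        /*
         * Sanity-check that the on-media structure sizes compiled into
         * this kernel match the sizes defined by the HAMMER2 format.
         */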
        if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
                error = EINVAL;
        if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
                error = EINVAL;
        if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
                error = EINVAL;

        if (error)
                kprintf("HAMMER2 structure size mismatch; cannot continue.\n");

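        /*
         * Create malloc-backed object caches for the 64KB decompression
         * buffers and the 32KB compression buffers.
         */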
        margs_read.objsize = 65536;
        margs_read.mtype = D_BUFFER;

        margs_write.objsize = 32768;
        margs_write.mtype = C_BUFFER;

        cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
                                objcache_malloc_free, &margs_read);
        cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
                                objcache_malloc_free, &margs_write);

        lockinit(&hammer2_mntlk, "mntlk", 0, 0);
        TAILQ_INIT(&hammer2_mntlist);
        TAILQ_INIT(&hammer2_pfslist);

        hammer2_limit_dirty_chains = desiredvnodes / 10;

        hammer2_trans_manage_init();

        return (error);
}

static
int
hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
{
        objcache_destroy(cache_buffer_read);
        objcache_destroy(cache_buffer_write);
        return 0;
}

/*
 * Core PFS allocator.  Used to allocate the pmp structure for PFS cluster
 * mounts and the spmp structure for media (hmp) structures.
 *
 * XXX check locking
 */
hammer2_pfs_t *
hammer2_pfsalloc(hammer2_cluster_t *cluster,
                 const hammer2_inode_data_t *ripdata,
                 hammer2_tid_t alloc_tid)
{
        hammer2_chain_t *rchain;
        hammer2_pfs_t *pmp;
        int i;
        int j;

        /*
         * Locate or create the PFS based on the cluster id.  If ripdata
         * is NULL this is a spmp which is unique and is always allocated.
         */
        if (ripdata) {
                TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                        if (bcmp(&pmp->pfs_clid, &ripdata->pfs_clid,
                                 sizeof(pmp->pfs_clid)) == 0) {
                                        break;
                        }
                }
        } else {
                pmp = NULL;
        }

        if (pmp == NULL) {
                pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
                kmalloc_create(&pmp->minode, "HAMMER2-inodes");
                kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
                lockinit(&pmp->lock, "pfslk", 0, 0);
                spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
                RB_INIT(&pmp->inum_tree);
                TAILQ_INIT(&pmp->unlinkq);
                spin_init(&pmp->list_spin, "hm2pfsalloc_list");

                /* our first media transaction id */
                pmp->alloc_tid = alloc_tid + 1;
                pmp->flush_tid = pmp->alloc_tid;
                if (ripdata) {
                        pmp->inode_tid = ripdata->pfs_inum + 1;
                        pmp->pfs_clid = ripdata->pfs_clid;
                }
                hammer2_mtx_init(&pmp->wthread_mtx, "h2wthr");
                bioq_init(&pmp->wthread_bioq);
                TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);
        }

        /*
         * Create the PFS's root inode.
         */
        if (pmp->iroot == NULL) {
                pmp->iroot = hammer2_inode_get(pmp, NULL, NULL);
                hammer2_inode_ref(pmp->iroot);
                hammer2_inode_unlock_ex(pmp->iroot, NULL);
        }

        /*
         * Create a primary synchronizer thread for the PFS if necessary.
         * Single-node masters (including snapshots) have nothing to
         * synchronize and do not require this thread.
         *
         * Multi-node masters or any number of soft masters, slaves, copy,
         * or other PFS types need the thread.
         */
        if (cluster && ripdata &&
            (ripdata->pfs_type != HAMMER2_PFSTYPE_MASTER ||
             ripdata->pfs_nmasters > 1) &&
            pmp->primary_thr.td == NULL) {
                hammer2_syncthr_create(&pmp->primary_thr, pmp,
                                       hammer2_syncthr_primary);
        }

        /*
         * Update nmasters from any PFS which is part of the cluster.
         * It is possible that this will result in a value which is too
         * high.  MASTER PFSs are authoritative for pfs_nmasters and will
         * override this value later on.
         */
        if (ripdata && pmp->pfs_nmasters < ripdata->pfs_nmasters) {
                pmp->pfs_nmasters = ripdata->pfs_nmasters;
        }

        /*
         * When a cluster is passed in we must add the cluster's chains
         * to the PFS's root inode and update pmp->pfs_types[].
         *
         * At the moment empty spots can develop due to removals or failures.
         * Ultimately we want to re-fill these spots. XXX
         */
        if (cluster) {
                hammer2_inode_ref(pmp->iroot);
                hammer2_mtx_ex(&pmp->iroot->lock);
                j = pmp->iroot->cluster.nchains;

                kprintf("add PFS to pmp %p[%d]\n", pmp, j);

                for (i = 0; i < cluster->nchains; ++i) {
                        if (j == HAMMER2_MAXCLUSTER)
                                break;
                        rchain = cluster->array[i].chain;
                        KKASSERT(rchain->pmp == NULL);
                        rchain->pmp = pmp;
                        hammer2_chain_ref(rchain);
                        pmp->iroot->cluster.array[j].chain = rchain;
                        pmp->pfs_types[j] = ripdata->pfs_type;

                        /*
                         * May have to fixup dirty chain tracking.  Previous
                         * pmp was NULL so nothing to undo.
                         */
                        if (rchain->flags & HAMMER2_CHAIN_MODIFIED)
                                hammer2_pfs_memory_inc(pmp);
                        ++j;
                }
                pmp->iroot->cluster.nchains = j;
                hammer2_mtx_unlock(&pmp->iroot->lock);
                hammer2_inode_drop(pmp->iroot);

                if (i != cluster->nchains) {
                        kprintf("hammer2_mount: cluster full!\n");
                        /* XXX fatal error? */
                }
        }

        return pmp;
}

/*
 * Destroy a PFS, typically only occurs after the last mount on a device
 * has gone away.
 */
static void
hammer2_pfsfree(hammer2_pfs_t *pmp)
{
        /*
         * Clean up our reference on iroot.  iroot is not (and should not
         * be) needed by the flush code.
         */
        TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);

        hammer2_syncthr_delete(&pmp->primary_thr);

        if (pmp->iroot) {
#if REPORT_REFS_ERRORS
                if (pmp->iroot->refs != 1)
                        kprintf("PMP->IROOT %p REFS WRONG %d\n",
                                pmp->iroot, pmp->iroot->refs);
#else
                KKASSERT(pmp->iroot->refs == 1);
#endif
                /* ref for pmp->iroot */
                hammer2_inode_drop(pmp->iroot);
                pmp->iroot = NULL;
        }

        kmalloc_destroy(&pmp->mmsg);
        kmalloc_destroy(&pmp->minode);

        kfree(pmp, M_HAMMER2);
}

/*
 * Remove all references to hmp from the pfs list.  Any PFS which becomes
 * empty is terminated and freed.
 *
 * XXX inefficient.
 */
static void
hammer2_pfsfree_scan(hammer2_dev_t *hmp)
{
        hammer2_pfs_t *pmp;
        hammer2_cluster_t *cluster;
        hammer2_chain_t *rchain;
        int didfreeze;
        int i;

again:
        TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                if (pmp->iroot == NULL)
                        continue;
                if (hmp->spmp == pmp) {
                        kprintf("unmount hmp %p remove spmp %p\n",
                                hmp, pmp);
                        hmp->spmp = NULL;
                }

                /*
                 * Determine if this PFS is affected.  If it is we must
                 * freeze all management threads and lock its iroot.
                 *
                 * Freezing a management thread forces it idle, operations
                 * in-progress will be aborted and it will have to start
                 * over again when unfrozen, or exit if told to exit.
                 */
                cluster = &pmp->iroot->cluster;
                for (i = 0; i < cluster->nchains; ++i) {
                        rchain = cluster->array[i].chain;
                        if (rchain == NULL || rchain->hmp != hmp)
                                continue;
                        break;
                }
                if (i != cluster->nchains) {
                        hammer2_syncthr_freeze(&pmp->primary_thr);

                        /*
                         * Lock the inode and clean out matching chains.
                         * Note that we cannot use hammer2_inode_lock_*()
                         * here because that would attempt to validate the
                         * cluster that we are in the middle of ripping
                         * apart.
                         *
                         * WARNING! We are working directly on the inode's
                         *          embedded cluster.
                         */
                        hammer2_mtx_ex(&pmp->iroot->lock);

                        /*
                         * Remove the chain from matching elements of the PFS.
                         */
                        for (i = 0; i < cluster->nchains; ++i) {
                                rchain = cluster->array[i].chain;
                                if (rchain == NULL || rchain->hmp != hmp)
                                        continue;

                                cluster->array[i].chain = NULL;
                                pmp->pfs_types[i] = 0;
                                hammer2_chain_drop(rchain);

                                /* focus hint */
                                if (cluster->focus == rchain)
                                        cluster->focus = NULL;
                        }
                        hammer2_mtx_unlock(&pmp->iroot->lock);
                        didfreeze = 1;  /* remaster, unfreeze down below */
                } else {
                        didfreeze = 0;
                }

                /*
                 * Cleanup trailing chains.  Do not reorder chains (for now).
                 * XXX might remove more than we intended.
                 */
                while (i > 0) {
                        if (cluster->array[i - 1].chain)
                                break;
                        --i;
                }
                cluster->nchains = i;

                /*
                 * If the PMP has no elements remaining we can destroy it.
                 * (this will transition management threads from frozen->exit).
                 */
                if (cluster->nchains == 0) {
                        kprintf("unmount hmp %p last ref to PMP=%p\n",
                                hmp, pmp);
                        hammer2_pfsfree(pmp);
                        goto again;
                }

                /*
                 * If elements still remain we need to set the REMASTER
                 * flag and unfreeze it.
                 */
                if (didfreeze) {
                        hammer2_syncthr_remaster(&pmp->primary_thr);
                        hammer2_syncthr_unfreeze(&pmp->primary_thr);
                }
        }
}

/*
 * Mount or remount HAMMER2 filesystem from physical media
 *
 *      mountroot
 *              mp              mount point structure
 *              path            NULL
 *              data            <unused>
 *              cred            <unused>
 *
 *      mount
 *              mp              mount point structure
 *              path            path to mount point
 *              data            pointer to argument structure in user space
 *                      volume  volume path (device@LABEL form)
 *                      hflags  user mount flags
 *              cred            user credentials
 *
 * RETURNS:     0       Success
 *              !0      error number
 */
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  struct ucred *cred)
{
        struct hammer2_mount_info info;
        hammer2_pfs_t *pmp;
        hammer2_pfs_t *spmp;
        hammer2_dev_t *hmp;
        hammer2_key_t key_next;
        hammer2_key_t key_dummy;
        hammer2_key_t lhc;
        struct vnode *devvp;
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *cparent;
        const hammer2_inode_data_t *ripdata;
        hammer2_blockref_t bref;
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
        size_t done;
        char *dev;
        char *label;
        int ronly = 1;
        int error;
        int cache_index;
        int i;

        hmp = NULL;
        pmp = NULL;
        dev = NULL;
        label = NULL;
        devvp = NULL;
        cache_index = -1;

        kprintf("hammer2_mount\n");

        if (path == NULL) {
                /*
                 * Root mount
                 */
                bzero(&info, sizeof(info));
                info.cluster_fd = -1;
                return (EOPNOTSUPP);
        } else {
                /*
                 * Non-root mount or updating a mount
                 */
                error = copyin(data, &info, sizeof(info));
                if (error)
                        return (error);

                error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
                if (error)
                        return (error);

                /* Extract device and label */
                dev = devstr;
                label = strchr(devstr, '@');
                if (label == NULL ||
                    ((label + 1) - dev) > done) {
                        return (EINVAL);
                }
                *label = '\0';
                label++;
                if (*label == '\0')
                        return (EINVAL);

                if (mp->mnt_flag & MNT_UPDATE) {
                        /*
                         * Update mount.  Note that pmp->iroot->cluster is
                         * an inode-embedded cluster and thus cannot be
                         * directly locked.
                         *
                         * XXX HAMMER2 needs to implement NFS export via
                         *     mountctl.
                         */
                        pmp = MPTOPMP(mp);
                        cluster = &pmp->iroot->cluster;
                        for (i = 0; i < cluster->nchains; ++i) {
                                hmp = cluster->array[i].chain->hmp;
                                devvp = hmp->devvp;
                                error = hammer2_remount(hmp, mp, path,
                                                        devvp, cred);
                                if (error)
                                        break;
                        }
                        /*hammer2_inode_install_hidden(pmp);*/

                        return error;
                }
        }

        /*
         * HMP device mount
         *
         * Lookup name and verify it refers to a block device.
         */
        error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
        if (error == 0)
                error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
        nlookup_done(&nd);

        if (error == 0) {
                if (vn_isdisk(devvp, &error))
                        error = vfs_mountedon(devvp);
        }

        /*
         * Determine if the device has already been mounted.  After this
         * check hmp will be non-NULL if we are doing the second or more
         * hammer2 mounts from the same device.
         */
        lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
        TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                if (hmp->devvp == devvp)
                        break;
        }

        /*
         * Open the device if this isn't a secondary mount and construct
         * the H2 device mount (hmp).
         */
        if (hmp == NULL) {
                hammer2_chain_t *schain;
                hammer2_xid_t xid;

                if (error == 0 && vcount(devvp) > 0)
                        error = EBUSY;

                /*
                 * Now open the device
                 */
                if (error == 0) {
                        ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
                        vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                        error = vinvalbuf(devvp, V_SAVE, 0, 0);
                        if (error == 0) {
                                error = VOP_OPEN(devvp,
                                                 ronly ? FREAD : FREAD | FWRITE,
                                                 FSCRED, NULL);
                        }
                        vn_unlock(devvp);
                }
                if (error && devvp) {
                        vrele(devvp);
                        devvp = NULL;
                }
                if (error) {
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        return error;
                }
                hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
                hmp->ronly = ronly;
                hmp->devvp = devvp;
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);
                spin_init(&hmp->io_spin, "hm2mount_io");
                spin_init(&hmp->list_spin, "hm2mount_list");
                TAILQ_INIT(&hmp->flushq);

                lockinit(&hmp->vollk, "h2vol", 0, 0);

                /*
                 * vchain setup. vchain.data is embedded.
                 * vchain.refs is initialized and will never drop to 0.
                 *
                 * NOTE! voldata is not yet loaded.
                 */
                hmp->vchain.hmp = hmp;
                hmp->vchain.refs = 1;
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;

                hammer2_chain_core_init(&hmp->vchain);
                /* hmp->vchain.u.xxx is left NULL */

                /*
                 * fchain setup.  fchain.data is embedded.
                 * fchain.refs is initialized and will never drop to 0.
                 *
                 * The data is not used but needs to be initialized to
                 * pass assertion muster.  We use this chain primarily
                 * as a placeholder for the freemap's top-level RBTREE
                 * so it does not interfere with the volume's topology
                 * RBTREE.
                 */
                hmp->fchain.hmp = hmp;
                hmp->fchain.refs = 1;
                hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
                hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
                hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

                hammer2_chain_core_init(&hmp->fchain);
                /* hmp->fchain.u.xxx is left NULL */

                /*
                 * Install the volume header and initialize fields from
                 * voldata.
                 */
                error = hammer2_install_volume_header(hmp);
                if (error) {
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return error;
                }

                /*
                 * Really important to get these right or flush will get
                 * confused.
                 */
                hmp->spmp = hammer2_pfsalloc(NULL, NULL,
                                             hmp->voldata.mirror_tid);
                kprintf("alloc spmp %p tid %016jx\n",
                        hmp->spmp, hmp->voldata.mirror_tid);
                spmp = hmp->spmp;
                spmp->inode_tid = 1;

                xid = 0;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
                hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
                hmp->vchain.pmp = spmp;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
                hmp->fchain.pmp = spmp;

                /*
                 * First locate the super-root inode, which is key 0
                 * relative to the volume header's blockset.
                 *
                 * Then locate the root inode by scanning the directory keyspace
                 * represented by the label.
                 */
                parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
                schain = hammer2_chain_lookup(&parent, &key_dummy,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
                                      &cache_index, 0);
                hammer2_chain_lookup_done(parent);
                if (schain == NULL) {
                        kprintf("hammer2_mount: invalid super-root\n");
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }

                /*
                 * Sanity-check schain's pmp and finish initialization.
                 * Any chain belonging to the super-root topology should
                 * have a NULL pmp (not even set to spmp).
                 */
                ripdata = &hammer2_chain_rdata(schain)->ipdata;
                KKASSERT(schain->pmp == NULL);
                spmp->pfs_clid = ripdata->pfs_clid;

                /*
                 * Replace the dummy spmp->iroot with a real one.  It's
                 * easier to just do a wholesale replacement than to try
                 * to update the chain and fixup the iroot fields.
                 *
                 * The returned inode is locked with the supplied cluster.
                 */
                cluster = hammer2_cluster_from_chain(schain);
                hammer2_inode_drop(spmp->iroot);
                spmp->iroot = NULL;
                spmp->iroot = hammer2_inode_get(spmp, NULL, cluster);
                spmp->spmp_hmp = hmp;
                spmp->pfs_types[0] = ripdata->pfs_type;
                hammer2_inode_ref(spmp->iroot);
                hammer2_inode_unlock_ex(spmp->iroot, cluster);
                schain = NULL;
                /* leave spmp->iroot with one ref */

                if ((mp->mnt_flag & MNT_RDONLY) == 0) {
                        error = hammer2_recovery(hmp);
                        /* XXX do something with error */
                }
                hammer2_update_pmps(hmp);
                hammer2_iocom_init(hmp);

                /*
                 * Ref the cluster management messaging descriptor.  The mount
                 * program deals with the other end of the communications pipe.
                 */
                fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
                if (fp) {
                        hammer2_cluster_reconnect(hmp, fp);
                } else {
                        kprintf("hammer2_mount: bad cluster_fd!\n");
                }
        } else {
                spmp = hmp->spmp;
        }

        /*
         * Lookup the mount point under the media-localized super-root.
         * Scanning hammer2_pfslist doesn't help us because it represents
         * PFS cluster ids which can aggregate several named PFSs together.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        cparent = hammer2_inode_lock_ex(spmp->iroot);
        lhc = hammer2_dirhash(label, strlen(label));
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                      0);
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE &&
                    strcmp(label,
                       hammer2_cluster_rdata(cluster)->ipdata.filename) == 0) {
                        break;
                }
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                            key_next,
                                            lhc + HAMMER2_DIRHASH_LOMASK, 0);
        }
        hammer2_inode_unlock_ex(spmp->iroot, cparent);

        /*
         * PFS could not be found?
         */
        if (cluster == NULL) {
                kprintf("hammer2_mount: PFS label not found\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EINVAL;
        }

        /*
         * Acquire the pmp structure (it should have already been allocated
         * via hammer2_update_pmps() so do not pass cluster in to add to
         * available chains).
         *
         * Check if the cluster has already been mounted.  A cluster can
         * only be mounted once, use null mounts to mount additional copies.
         */
        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        hammer2_cluster_bref(cluster, &bref);
        pmp = hammer2_pfsalloc(NULL, ripdata, bref.mirror_tid);
        hammer2_cluster_unlock(cluster);

        if (pmp->mp) {
                kprintf("hammer2_mount: PFS already mounted!\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EBUSY;
        }

        /*
         * Finish the mount
         */
        kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp);

        mp->mnt_flag = MNT_LOCAL;
        mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
        mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */

        /*
         * required mount structure initializations
         */
        mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
        mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;

        mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
        mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;

        /*
         * Optional fields
         */
        mp->mnt_iosize_max = MAXPHYS;

        /*
         * Connect up mount pointers.
         */
        hammer2_mount_helper(mp, pmp);

        lockmgr(&hammer2_mntlk, LK_RELEASE);

        /*
         * A mounted PFS needs a write thread for logical buffers and
         * a hidden directory for deletions of open files.  These features
         * are not used by unmounted PFSs.
         *
         * The logical file buffer bio write thread handles things like
         * physical block assignment and compression.
         */
        pmp->wthread_destroy = 0;
        lwkt_create(hammer2_write_thread, pmp,
                    &pmp->wthread_td, NULL, 0, -1, "hwrite-%s", label);

        /*
         * With the cluster operational install ihidden.
         * (only applicable to pfs mounts, not applicable to spmp)
         */
        hammer2_inode_install_hidden(pmp);

        /*
         * Finish setup
         */
        vfs_getnewfsid(mp);
        vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
        vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
        vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);

        copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
        bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
        copyinstr(path, mp->mnt_stat.f_mntonname,
                  sizeof(mp->mnt_stat.f_mntonname) - 1,
                  &size);

        /*
         * Initial statfs to prime mnt_stat.
         */
        hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);

        return 0;
}

/*
 * Scan PFSs under the super-root and create hammer2_pfs structures.
 */
static
void
hammer2_update_pmps(hammer2_dev_t *hmp)
{
        const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_blockref_t bref;
        hammer2_pfs_t *spmp;
        hammer2_pfs_t *pmp;
        hammer2_key_t key_next;

        /*
         * Lookup mount point under the media-localized super-root.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        spmp = hmp->spmp;
        cparent = hammer2_inode_lock_ex(spmp->iroot);
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         HAMMER2_KEY_MIN,
                                         HAMMER2_KEY_MAX,
                                         0);
        while (cluster) {
                /*
                 * Only inodes under the super-root represent PFSs; skip
                 * anything else, but always advance the iterator (a bare
                 * continue here would spin forever on a non-inode chain).
                 */
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
                        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        hammer2_cluster_bref(cluster, &bref);
                        kprintf("ADD LOCAL PFS: %s\n", ripdata->filename);

                        pmp = hammer2_pfsalloc(cluster, ripdata,
                                               bref.mirror_tid);
                }
                cluster = hammer2_cluster_next(cparent, cluster,
                                               &key_next,
                                               key_next,
                                               HAMMER2_KEY_MAX,
                                               0);
        }
        hammer2_inode_unlock_ex(spmp->iroot, cparent);
}

/*
 * Handle bioq for strategy write
 */
static
void
hammer2_write_thread(void *arg)
{
        hammer2_pfs_t *pmp;
        struct bio *bio;
        struct buf *bp;
        hammer2_trans_t trans;
        struct vnode *vp;
        hammer2_inode_t *ip;
        hammer2_cluster_t *cparent;
        const hammer2_inode_data_t *ripdata;
        hammer2_key_t lbase;
        int lblksize;
        int pblksize;
        int error;

        pmp = arg;

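        /*
         * Loop, pulling bios off the PFS's queue under wthread_mtx and
         * running each logical buffer write within a BUFCACHE
         * transaction.
         */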
        hammer2_mtx_ex(&pmp->wthread_mtx);
        while (pmp->wthread_destroy == 0) {
                if (bioq_first(&pmp->wthread_bioq) == NULL) {
                        mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx,
                                 0, "h2bioqw", 0);
                }
                cparent = NULL;

                hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE);

                while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
                        /*
                         * dummy bio for synchronization.  The transaction
                         * must be reinitialized.
                         */
                        if (bio->bio_buf == NULL) {
                                bio->bio_flags |= BIO_DONE;
                                wakeup(bio);
                                hammer2_trans_done(&trans);
                                hammer2_trans_init(&trans, pmp,
                                                   HAMMER2_TRANS_BUFCACHE);
                                continue;
                        }

                        /*
                         * else normal bio processing
                         */
                        hammer2_mtx_unlock(&pmp->wthread_mtx);

                        hammer2_lwinprog_drop(pmp);

                        error = 0;
                        bp = bio->bio_buf;
                        vp = bp->b_vp;
                        ip = VTOI(vp);

                        /*
                         * Inode is modified, flush size and mtime changes
                         * to ensure that the file size remains consistent
                         * with the buffers being flushed.
                         *
                         * NOTE: The inode_fsync() call only flushes the
                         *       inode's meta-data state, it doesn't try
                         *       to flush underlying buffers or chains.
                         *
                         * NOTE: hammer2_write_file_core() may indirectly
                         *       modify and modsync the inode.
                         */
                        cparent = hammer2_inode_lock_ex(ip);
                        if (ip->flags & (HAMMER2_INODE_RESIZED |
                                         HAMMER2_INODE_MTIME)) {
                                hammer2_inode_fsync(&trans, ip, cparent);
                        }
                        ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
                        lblksize = hammer2_calc_logical(ip, bio->bio_offset,
                                                        &lbase, NULL);
                        pblksize = hammer2_calc_physical(ip, ripdata, lbase);
                        hammer2_write_file_core(bp, &trans, ip, ripdata,
                                                cparent,
                                                lbase, IO_ASYNC,
                                                pblksize, &error);
                        /* ripdata can be invalid after call */
                        hammer2_inode_unlock_ex(ip, cparent);
                        if (error) {
                                kprintf("hammer2: error in buffer write\n");
                                bp->b_flags |= B_ERROR;
                                bp->b_error = EIO;
                        }
                        biodone(bio);
                        hammer2_mtx_ex(&pmp->wthread_mtx);
                }
                hammer2_trans_done(&trans);
        }
        pmp->wthread_destroy = -1;
        wakeup(&pmp->wthread_destroy);

        hammer2_mtx_unlock(&pmp->wthread_mtx);
}

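/*
 * Wait for pending I/O queued to the PFS's write thread to complete.
 * A dummy bio with no bio_buf is queued; the write thread wakes us
 * up when it reaches the dummy by setting BIO_DONE.
 */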
void
hammer2_bioq_sync(hammer2_pfs_t *pmp)
{
        struct bio sync_bio;

        bzero(&sync_bio, sizeof(sync_bio));     /* dummy with no bio_buf */
        hammer2_mtx_ex(&pmp->wthread_mtx);
        if (pmp->wthread_destroy == 0 &&
            TAILQ_FIRST(&pmp->wthread_bioq.queue)) {
                bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
                while ((sync_bio.bio_flags & BIO_DONE) == 0)
                        mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0);
        }
        hammer2_mtx_unlock(&pmp->wthread_mtx);
}

/*
 * Return a chain suitable for I/O, creating the chain if necessary
 * and assigning its physical block.
 *
 * cparent can wind up being anything.
 */
static
hammer2_cluster_t *
hammer2_assign_physical(hammer2_trans_t *trans,
                        hammer2_inode_t *ip, hammer2_cluster_t *cparent,
                        hammer2_key_t lbase, int pblksize, int *errorp)
{
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *dparent;
        hammer2_key_t key_dummy;
        int pradix = hammer2_getradix(pblksize);

        /*
         * Locate the chain associated with lbase, return a locked chain.
         * However, do not instantiate any data reference (which utilizes a
         * device buffer) because we will be using direct IO via the
         * logical buffer cache buffer.
         */
        *errorp = 0;
        KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
retry:
        dparent = hammer2_cluster_lookup_init(cparent, 0);
        cluster = hammer2_cluster_lookup(dparent, &key_dummy,
                                     lbase, lbase,
                                     HAMMER2_LOOKUP_NODATA);

        if (cluster == NULL) {
                /*
                 * We found a hole, create a new chain entry.
                 *
                 * NOTE: DATA chains are created without device backing
                 *       store (nor do we want any).
                 */
                *errorp = hammer2_cluster_create(trans, dparent, &cluster,
                                               lbase, HAMMER2_PBUFRADIX,
                                               HAMMER2_BREF_TYPE_DATA,
                                               pblksize, 0);
                if (cluster == NULL) {
                        hammer2_cluster_lookup_done(dparent);
                        panic("hammer2_cluster_create: par=%p error=%d\n",
                                dparent->focus, *errorp);
                        goto retry;
                }
                /*ip->delta_dcount += pblksize;*/
        } else {
                switch (hammer2_cluster_type(cluster)) {
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * The data is embedded in the inode.  The
                         * caller is responsible for marking the inode
                         * modified and copying the data to the embedded
                         * area.
                         */
                        break;
                case HAMMER2_BREF_TYPE_DATA:
                        if (hammer2_cluster_need_resize(cluster, pblksize)) {
                                hammer2_cluster_resize(trans, ip,
                                                     dparent, cluster,
                                                     pradix,
                                                     HAMMER2_MODIFY_OPTDATA);
                        }

                        /*
                         * DATA buffers must be marked modified whether the
                         * data is in a logical buffer or not.  We also have
                         * to make this call to fixup the chain data pointers
                         * after resizing in case this is an encrypted or
                         * compressed buffer.
                         */
                        hammer2_cluster_modify(trans, cluster,
                                               HAMMER2_MODIFY_OPTDATA);
                        break;
                default:
                        panic("hammer2_assign_physical: bad type");
                        /* NOT REACHED */
                        break;
                }
        }

        /*
         * Cleanup.  If cluster wound up being the inode itself, i.e.
         * the DIRECTDATA case for offset 0, then we need to update cparent.
         * The caller expects cparent to not become stale.
         */
        hammer2_cluster_lookup_done(dparent);
        /* dparent = NULL; safety */
        return (cluster);
}

/*
 * bio queued from hammer2_vnops.c.
 *
 * The core write function which determines which path to take
 * depending on compression settings.  We also have to locate the
 * related clusters so we can calculate and set the check data for
 * the blockref.
 */
static
void
hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                        hammer2_inode_t *ip,
                        const hammer2_inode_data_t *ripdata,
                        hammer2_cluster_t *cparent,
                        hammer2_key_t lbase, int ioflag, int pblksize,
                        int *errorp)
{
        hammer2_cluster_t *cluster;

        switch(HAMMER2_DEC_ALGO(ripdata->comp_algo)) {
        case HAMMER2_COMP_NONE:
                /*
                 * We have to assign physical storage to the buffer
                 * we intend to dirty or write now to avoid deadlocks
                 * in the strategy code later.
                 *
                 * This can return NOOFFSET for inode-embedded data.
                 * The strategy code will take care of it in that case.
                 */
                cluster = hammer2_assign_physical(trans, ip, cparent,
                                                lbase, pblksize,
                                                errorp);
                hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp,
                                 ripdata->check_algo);
                /* ripdata can become invalid */
                if (cluster)
                        hammer2_cluster_unlock(cluster);
                break;
        case HAMMER2_COMP_AUTOZERO:
                /*
                 * Check for zero-fill only
                 */
                hammer2_zero_check_and_write(bp, trans, ip,
                                    ripdata, cparent, lbase,
                                    ioflag, pblksize, errorp,
                                    ripdata->check_algo);
                break;
        case HAMMER2_COMP_LZ4:
        case HAMMER2_COMP_ZLIB:
        default:
                /*
                 * Check for zero-fill and attempt compression.
                 */
                hammer2_compress_and_write(bp, trans, ip,
                                           ripdata, cparent,
                                           lbase, ioflag,
                                           pblksize, errorp,
                                           ripdata->comp_algo,
                                           ripdata->check_algo);
                break;
        }
}

1370 /*
1371  * Generic function that performs the compression in the compressed
1372  * write path.  The compression algorithm is determined by the settings
1373  * obtained from the inode.
1374  */
1375 static
1376 void
1377 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
1378         hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1379         hammer2_cluster_t *cparent,
1380         hammer2_key_t lbase, int ioflag, int pblksize,
1381         int *errorp, int comp_algo, int check_algo)
1382 {
1383         hammer2_cluster_t *cluster;
1384         hammer2_chain_t *chain;
1385         int comp_size;
1386         int comp_block_size;
1387         int i;
1388         char *comp_buffer;
1389
1390         if (test_block_zeros(bp->b_data, pblksize)) {
1391                 zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
1392                 return;
1393         }
1394
1395         comp_size = 0;
1396         comp_buffer = NULL;
1397
1398         KKASSERT(pblksize / 2 <= 32768);
1399
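        /*
         * Heuristic sketch (descriptive note, not from the original
         * source): comp_heuristic counts consecutive blocks for which
         * compression failed or was skipped.  The first 8 failures still
         * attempt compression each time; past that, only every 8th block
         * is retried ((comp_heuristic & 7) == 0), and any success resets
         * the counter to 0, fully re-enabling compression.
         */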
1400         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
1401                 z_stream strm_compress;
1402                 int comp_level;
1403                 int ret;
1404
1405                 switch(HAMMER2_DEC_ALGO(comp_algo)) {
1406                 case HAMMER2_COMP_LZ4:
1407                         comp_buffer = objcache_get(cache_buffer_write,
1408                                                    M_INTWAIT);
1409                         comp_size = LZ4_compress_limitedOutput(
1410                                         bp->b_data,
1411                                         &comp_buffer[sizeof(int)],
1412                                         pblksize,
1413                                         pblksize / 2 - sizeof(int));
1414                         /*
1415                          * We need to prefix the data with its size;
1416                          * LZ4 doesn't do it for us.  Account for the
1417                          * related overhead.
1418                          */
1419                         *(int *)comp_buffer = comp_size;
1420                         if (comp_size)
1421                                 comp_size += sizeof(int);
1422                         break;
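                        /*
                         * On-media layout sketch (illustrative note, not
                         * part of the original source):
                         *
                         *      [int lz4_size][lz4 compressed bytes ...]
                         *
                         * A reader can recover the payload with something
                         * like:
                         *
                         *      int sz = *(const int *)bdata;
                         *      LZ4_decompress_safe(bdata + sizeof(int),
                         *                          obuf, sz, pblksize);
                         */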
1423                 case HAMMER2_COMP_ZLIB:
1424                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
1425                         if (comp_level == 0)
1426                                 comp_level = 6; /* default zlib compression */
1427                         else if (comp_level < 6)
1428                                 comp_level = 6;
1429                         else if (comp_level > 9)
1430                                 comp_level = 9;
1431                         ret = deflateInit(&strm_compress, comp_level);
1432                         if (ret != Z_OK) {
1433                                 kprintf("HAMMER2 ZLIB: fatal error "
1434                                         "on deflateInit.\n");
1435                         }
1436
1437                         comp_buffer = objcache_get(cache_buffer_write,
1438                                                    M_INTWAIT);
1439                         strm_compress.next_in = bp->b_data;
1440                         strm_compress.avail_in = pblksize;
1441                         strm_compress.next_out = comp_buffer;
1442                         strm_compress.avail_out = pblksize / 2;
1443                         ret = deflate(&strm_compress, Z_FINISH);
1444                         if (ret == Z_STREAM_END) {
1445                                 comp_size = pblksize / 2 -
1446                                             strm_compress.avail_out;
1447                         } else {
1448                                 comp_size = 0;
1449                         }
1450                         ret = deflateEnd(&strm_compress);
1451                         break;
1452                 default:
1453                         kprintf("Error: Unknown compression method.\n");
1454                         kprintf("Comp_method = %d.\n", comp_algo);
1455                         break;
1456                 }
1457         }
1458
1459         if (comp_size == 0) {
1460                 /*
1461                  * compression failed or turned off
1462                  */
1463                 comp_block_size = pblksize;     /* safety */
1464                 if (++ip->comp_heuristic > 128)
1465                         ip->comp_heuristic = 8;
1466         } else {
1467                 /*
1468                  * compression succeeded
1469                  */
1470                 ip->comp_heuristic = 0;
1471                 if (comp_size <= 1024) {
1472                         comp_block_size = 1024;
1473                 } else if (comp_size <= 2048) {
1474                         comp_block_size = 2048;
1475                 } else if (comp_size <= 4096) {
1476                         comp_block_size = 4096;
1477                 } else if (comp_size <= 8192) {
1478                         comp_block_size = 8192;
1479                 } else if (comp_size <= 16384) {
1480                         comp_block_size = 16384;
1481                 } else if (comp_size <= 32768) {
1482                         comp_block_size = 32768;
1483                 } else {
1484                         panic("hammer2: WRITE PATH: "
1485                               "Weird comp_size value.");
1486                         /* NOT REACHED */
1487                         comp_block_size = pblksize;
1488                 }
1489         }
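#if 0
        /*
         * Equivalent formulation of the success ladder above (sketch only,
         * deliberately disabled): round comp_size up to the next power of
         * two, minimum 1024.  The KKASSERT mirrors the ladder's panic;
         * comp_size cannot exceed 32768 here because the compressors are
         * limited to pblksize / 2 bytes of output.
         */
        if (comp_size) {
                comp_block_size = 1024;
                while (comp_block_size < comp_size)
                        comp_block_size <<= 1;
                KKASSERT(comp_block_size <= 32768);
        }
#endif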
1490
1491         cluster = hammer2_assign_physical(trans, ip, cparent,
1492                                           lbase, comp_block_size,
1493                                           errorp);
1494         ripdata = NULL;
1495
1496         if (*errorp) {
1497                 kprintf("WRITE PATH: An error occurred while "
1498                         "assigning physical space.\n");
1499                 KKASSERT(cluster == NULL);
1500                 goto done;
1501         }
1502
1503         if (cluster->ddflag) {
1504                 hammer2_inode_data_t *wipdata;
1505
1506                 wipdata = hammer2_cluster_modify_ip(trans, ip, cluster, 0);
1507                 KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1508                 KKASSERT(bp->b_loffset == 0);
1509                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1510                 hammer2_cluster_modsync(cluster);
1511         } else
1512         for (i = 0; i < cluster->nchains; ++i) {
1513                 hammer2_io_t *dio;
1514                 char *bdata;
1515
1516                 /* XXX hack */
1517
1518                 chain = cluster->array[i].chain;        /* XXX */
1519                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1520
1521                 switch(chain->bref.type) {
1522                 case HAMMER2_BREF_TYPE_INODE:
1523                         panic("hammer2_compress_and_write: unexpected inode\n");
1524                         break;
1525                 case HAMMER2_BREF_TYPE_DATA:
1526                         /*
1527                          * Optimize out the read-before-write
1528                          * if possible.
1529                          */
1530                         *errorp = hammer2_io_newnz(chain->hmp,
1531                                                    chain->bref.data_off,
1532                                                    chain->bytes,
1533                                                    &dio);
1534                         if (*errorp) {
1535                                 hammer2_io_brelse(&dio);
1536                                 kprintf("hammer2: WRITE PATH: "
1537                                         "dbp bread error\n");
1538                                 break;
1539                         }
1540                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1541
1542                         /*
1543                          * When loading the block make sure we don't
1544                          * leave garbage after the compressed data.
1545                          */
1546                         if (comp_size) {
1547                                 chain->bref.methods =
1548                                         HAMMER2_ENC_COMP(comp_algo) +
1549                                         HAMMER2_ENC_CHECK(check_algo);
1550                                 bcopy(comp_buffer, bdata, comp_size);
1551                                 if (comp_size != comp_block_size) {
1552                                         bzero(bdata + comp_size,
1553                                               comp_block_size - comp_size);
1554                                 }
1555                         } else {
1556                                 chain->bref.methods =
1557                                         HAMMER2_ENC_COMP(
1558                                                 HAMMER2_COMP_NONE) +
1559                                         HAMMER2_ENC_CHECK(check_algo);
1560                                 bcopy(bp->b_data, bdata, pblksize);
1561                         }
1562
1563                         /*
1564                          * The flush code doesn't calculate check codes for
1565                          * file data (doing so can result in excessive I/O),
1566                          * so we do it here.
1567                          */
1568                         hammer2_chain_setcheck(chain, bdata);
1569
1570                         /*
1571                          * Device buffer is now valid, chain is no longer in
1572                          * the initial state.
1573                          *
1574                          * (No blockref table worries with file data)
1575                          */
1576                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1577
1578                         /* Now write the related bdp. */
1579                         if (ioflag & IO_SYNC) {
1580                                 /*
1581                                  * Synchronous I/O requested.
1582                                  */
1583                                 hammer2_io_bwrite(&dio);
1584                         /*
1585                         } else if ((ioflag & IO_DIRECT) &&
1586                                    loff + n == pblksize) {
1587                                 hammer2_io_bdwrite(&dio);
1588                         */
1589                         } else if (ioflag & IO_ASYNC) {
1590                                 hammer2_io_bawrite(&dio);
1591                         } else {
1592                                 hammer2_io_bdwrite(&dio);
1593                         }
1594                         break;
1595                 default:
1596                         panic("hammer2_write_bp: bad chain type %d\n",
1597                         panic("hammer2_compress_and_write: bad chain type %d\n",
1598                         /* NOT REACHED */
1599                         break;
1600                 }
1601         }
1602 done:
1603         if (cluster)
1604                 hammer2_cluster_unlock(cluster);
1605         if (comp_buffer)
1606                 objcache_put(cache_buffer_write, comp_buffer);
1607 }
1608
1609 /*
1610  * Function that performs zero-checking and writing without compression,
1611  * it corresponds to default zero-checking path.
1612  */
1613 static
1614 void
1615 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1616         hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1617         hammer2_cluster_t *cparent,
1618         hammer2_key_t lbase, int ioflag, int pblksize, int *errorp,
1619         int check_algo)
1620 {
1621         hammer2_cluster_t *cluster;
1622
1623         if (test_block_zeros(bp->b_data, pblksize)) {
1624                 zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
1625                 /* ripdata can become invalid */
1626         } else {
1627                 cluster = hammer2_assign_physical(trans, ip, cparent,
1628                                                   lbase, pblksize, errorp);
1629                 hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp,
1630                                  check_algo);
1631                 /* ripdata can become invalid */
1632                 if (cluster)
1633                         hammer2_cluster_unlock(cluster);
1634         }
1635 }
1636
1637 /*
1638  * Test whether a block of data contains only zeros.  Returns TRUE
1639  * (non-zero) if the block is all zeros.
1640  */
1641 static
1642 int
1643 test_block_zeros(const char *buf, size_t bytes)
1644 {
1645         size_t i;
1646
1647         for (i = 0; i < bytes; i += sizeof(long)) {
1648                 if (*(const long *)(buf + i) != 0)
1649                         return (0);
1650         }
1651         return (1);
1652 }
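#if 0
/*
 * Byte-at-a-time reference version (illustrative sketch only; the name
 * test_block_zeros_slow is hypothetical).  The word-at-a-time loop above
 * assumes buf is long-aligned and bytes is a multiple of sizeof(long),
 * which holds for the power-of-2 block sizes used by these write paths.
 */
static int
test_block_zeros_slow(const char *buf, size_t bytes)
{
        size_t i;

        for (i = 0; i < bytes; ++i) {
                if (buf[i] != 0)
                        return (0);
        }
        return (1);
}
#endif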
1653
1654 /*
1655  * Function to "write" a block that contains only zeros.
1656  */
1657 static
1658 void
1659 zero_write(struct buf *bp, hammer2_trans_t *trans,
1660            hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1661            hammer2_cluster_t *cparent,
1662            hammer2_key_t lbase, int *errorp __unused)
1663 {
1664         hammer2_cluster_t *cluster;
1665         hammer2_key_t key_dummy;
1666
1667         cparent = hammer2_cluster_lookup_init(cparent, 0);
1668         cluster = hammer2_cluster_lookup(cparent, &key_dummy, lbase, lbase,
1669                                      HAMMER2_LOOKUP_NODATA);
1670         if (cluster) {
1671                 if (cluster->ddflag) {
1672                         hammer2_inode_data_t *wipdata;
1673
1674                         wipdata = hammer2_cluster_modify_ip(trans, ip,
1675                                                             cluster, 0);
1676                         KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1677                         KKASSERT(bp->b_loffset == 0);
1678                         bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1679                         hammer2_cluster_modsync(cluster);
1680                 } else {
1681                         hammer2_cluster_delete(trans, cparent, cluster,
1682                                                HAMMER2_DELETE_PERMANENT);
1683                 }
1684                 hammer2_cluster_unlock(cluster);
1685         }
1686         hammer2_cluster_lookup_done(cparent);
1687 }
1688
1689 /*
1690  * Write the data as-is, without performing any sort of compression.
1691  * This function is used in the no-compression path and in the default
1692  * zero-checking path.
1693  */
1694 static
1695 void
1696 hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
1697                                 int pblksize, int *errorp, int check_algo)
1698 {
1699         hammer2_chain_t *chain;
1700         hammer2_inode_data_t *wipdata;
1701         hammer2_io_t *dio;
1702         char *bdata;
1703         int error;
1704         int i;
1705
1706         error = 0;      /* XXX TODO below */
1707
1708         for (i = 0; i < cluster->nchains; ++i) {
1709                 chain = cluster->array[i].chain;        /* XXX */
1710                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1711
1712                 switch(chain->bref.type) {
1713                 case HAMMER2_BREF_TYPE_INODE:
1714                         wipdata = &hammer2_chain_wdata(chain)->ipdata;
1715                         KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1716                         KKASSERT(bp->b_loffset == 0);
1717                         bcopy(bp->b_data, wipdata->u.data,
1718                               HAMMER2_EMBEDDED_BYTES);
1719                         error = 0;
1720                         break;
1721                 case HAMMER2_BREF_TYPE_DATA:
1722                         error = hammer2_io_newnz(chain->hmp,
1723                                                  chain->bref.data_off,
1724                                                  chain->bytes, &dio);
1725                         if (error) {
1726                                 hammer2_io_bqrelse(&dio);
1727                                 kprintf("hammer2: WRITE PATH: "
1728                                         "dbp bread error\n");
1729                                 break;
1730                         }
1731                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1732
1733                         chain->bref.methods = HAMMER2_ENC_COMP(
1734                                                         HAMMER2_COMP_NONE) +
1735                                               HAMMER2_ENC_CHECK(check_algo);
1736                         bcopy(bp->b_data, bdata, chain->bytes);
1737
1738                         /*
1739                          * The flush code doesn't calculate check codes for
1740                          * file data (doing so can result in excessive I/O),
1741                          * so we do it here.
1742                          */
1743                         hammer2_chain_setcheck(chain, bdata);
1744
1745                         /*
1746                          * Device buffer is now valid, chain is no longer in
1747                          * the initial state.
1748                          *
1749                          * (No blockref table worries with file data)
1750                          */
1751                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1752
1753                         if (ioflag & IO_SYNC) {
1754                                 /*
1755                                  * Synchronous I/O requested.
1756                                  */
1757                                 hammer2_io_bwrite(&dio);
1758                         /*
1759                         } else if ((ioflag & IO_DIRECT) &&
1760                                    loff + n == pblksize) {
1761                                 hammer2_io_bdwrite(&dio);
1762                         */
1763                         } else if (ioflag & IO_ASYNC) {
1764                                 hammer2_io_bawrite(&dio);
1765                         } else {
1766                                 hammer2_io_bdwrite(&dio);
1767                         }
1768                         break;
1769                 default:
1770                         panic("hammer2_write_bp: bad chain type %d\n",
1771                               chain->bref.type);
1772                         /* NOT REACHED */
1773                         error = 0;
1774                         break;
1775                 }
1776                 KKASSERT(error == 0);   /* XXX TODO */
1777         }
1778         *errorp = error;
1779 }
1780
1781 static
1782 int
1783 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path,
1784                 struct vnode *devvp, struct ucred *cred)
1785 {
1786         int error;
1787
1788         if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
1789                 error = hammer2_recovery(hmp);
1790         } else {
1791                 error = 0;
1792         }
1793         return error;
1794 }
1795
1796 static
1797 int
1798 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1799 {
1800         hammer2_pfs_t *pmp;
1801         int flags;
1802         int error = 0;
1803
1804         pmp = MPTOPMP(mp);
1805
1806         if (pmp == NULL)
1807                 return(0);
1808
1809         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1810
1811         /*
1812          * If mount initialization proceeded far enough we must flush
1813          * its vnodes and sync the underlying mount points.  Three syncs
1814          * are required to fully flush the filesystem (freemap updates lag
1815          * by one flush, plus one extra flush for safety).
1816          */
1817         if (mntflags & MNT_FORCE)
1818                 flags = FORCECLOSE;
1819         else
1820                 flags = 0;
1821         if (pmp->iroot) {
1822                 error = vflush(mp, 0, flags);
1823                 if (error)
1824                         goto failed;
1825                 hammer2_vfs_sync(mp, MNT_WAIT);
1826                 hammer2_vfs_sync(mp, MNT_WAIT);
1827                 hammer2_vfs_sync(mp, MNT_WAIT);
1828         }
1829
1830         if (pmp->wthread_td) {
1831                 hammer2_mtx_ex(&pmp->wthread_mtx);
1832                 pmp->wthread_destroy = 1;
1833                 wakeup(&pmp->wthread_bioq);
1834                 while (pmp->wthread_destroy != -1) {
1835                         mtxsleep(&pmp->wthread_destroy,
1836                                 &pmp->wthread_mtx, 0,
1837                                 "umount-sleep", 0);
1838                 }
1839                 hammer2_mtx_unlock(&pmp->wthread_mtx);
1840                 pmp->wthread_td = NULL;
1841         }
1842
1843         /*
1844          * Cleanup our reference on ihidden.
1845          */
1846         if (pmp->ihidden) {
1847                 hammer2_inode_drop(pmp->ihidden);
1848                 pmp->ihidden = NULL;
1849         }
1850         if (pmp->mp)
1851                 hammer2_unmount_helper(mp, pmp, NULL);
1852
1853         error = 0;
1854 failed:
1855         lockmgr(&hammer2_mntlk, LK_RELEASE);
1856
1857         return (error);
1858 }
1859
1860 /*
1861  * Mount helper, hook the system mount into our PFS.
1862  * The mount lock is held.
1863  *
1864  * We must bump the pmp_count on related devices for any
1865  * mounted PFSs.
1866  */
1867 static
1868 void
1869 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
1870 {
1871         hammer2_cluster_t *cluster;
1872         hammer2_chain_t *rchain;
1873         int i;
1874
1875         mp->mnt_data = (qaddr_t)pmp;
1876         pmp->mp = mp;
1877
1878         cluster = &pmp->iroot->cluster;
1879         for (i = 0; i < cluster->nchains; ++i) {
1880                 rchain = cluster->array[i].chain;
1881                 if (rchain == NULL)
1882                         continue;
1883                 ++rchain->hmp->pmp_count;
1884                 kprintf("hammer2_mount hmp=%p ++pmp_count=%d\n",
1885                         rchain->hmp, rchain->hmp->pmp_count);
1886         }
1887 }
1888
1889 /*
1890  * Mount helper, unhook the system mount from our PFS.
1891  * The mount lock is held.
1892  *
1893  * If hmp is supplied, the mount responsible for the first open of the
1894  * block device failed, and the block device and all PFSs using that
1895  * device must be cleaned up.
1896  *
1897  * If pmp is supplied, multiple devices might be backing the PFS and each
1898  * must be disconnected.  This might not be the last PFS using some of the
1899  * underlying devices.  Also, we have to adjust our hmp->pmp_count accounting
1900  * for the devices backing the pmp which is now undergoing an unmount.
1901  */
1902 static
1903 void
1904 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
1905 {
1906         hammer2_cluster_t *cluster;
1907         hammer2_chain_t *rchain;
1908         struct vnode *devvp;
1909         int dumpcnt;
1910         int ronly = 0;
1911         int i;
1912
1913         /*
1914          * If no device is supplied this is a high-level unmount and we have
1915          * to disconnect the mount, adjust pmp_count, and locate devices that
1916          * might now have no mounts.
1917          */
1918         if (pmp) {
1919                 KKASSERT(hmp == NULL);
1920                 KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
1921                 pmp->mp = NULL;
1922                 mp->mnt_data = NULL;
1923
1924                 cluster = &pmp->iroot->cluster;
1925                 for (i = 0; i < cluster->nchains; ++i) {
1926                         rchain = cluster->array[i].chain;
1927                         if (rchain == NULL)
1928                                 continue;
1929                         --rchain->hmp->pmp_count;
1930                         kprintf("hammer2_unmount hmp=%p --pmp_count=%d\n",
1931                                 rchain->hmp, rchain->hmp->pmp_count);
1932                         /* scrapping hmp now may invalidate the pmp */
1933                 }
1934 again:
1935                 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
1936                         if (hmp->pmp_count == 0) {
1937                                 hammer2_unmount_helper(NULL, NULL, hmp);
1938                                 goto again;
1939                         }
1940                 }
1941                 return;
1942         }
1943
1944         /*
1945          * Try to terminate the block device.  We can't terminate it if
1946          * there are still PFSs referencing it.
1947          */
1948         kprintf("hammer2_unmount hmp=%p pmp_count=%d\n", hmp, hmp->pmp_count);
1949         if (hmp->pmp_count)
1950                 return;
1951
1952         hammer2_pfsfree_scan(hmp);
1953         hammer2_dev_exlock(hmp);        /* XXX order */
1954
1955         /*
1956          * Cycle the volume data lock as a safety (probably not needed any
1957          * more).  To ensure everything is out we need to flush at least
1958          * three times: (1) the running of the unlinkq can dirty the
1959          * filesystem, (2) a normal flush can dirty the freemap, and
1960          * (3) a final flush ensures the freemap itself is fully synchronized.
1961          *
1962          * The next mount's recovery scan can clean everything up but we want
1963          * to leave the filesystem in a 100% clean state on a normal unmount.
1964          */
1965 #if 0
1966         hammer2_voldata_lock(hmp);
1967         hammer2_voldata_unlock(hmp);
1968 #endif
1969         hammer2_iocom_uninit(hmp);
1970
1971         if ((hmp->vchain.flags | hmp->fchain.flags) &
1972             HAMMER2_CHAIN_FLUSH_MASK) {
1973                 kprintf("hammer2_unmount: chains left over "
1974                         "after final sync\n");
1975                 kprintf("    vchain %08x\n", hmp->vchain.flags);
1976                 kprintf("    fchain %08x\n", hmp->fchain.flags);
1977
1978                 if (hammer2_debug & 0x0010)
1979                         Debugger("entered debugger");
1980         }
1981
1982         KKASSERT(hmp->spmp == NULL);
1983
1984         /*
1985          * Finish up with the device vnode
1986          */
1987         if ((devvp = hmp->devvp) != NULL) {
1988                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1989                 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
1990                 hmp->devvp = NULL;
1991                 VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
1992                 vn_unlock(devvp);
1993                 vrele(devvp);
1994                 devvp = NULL;
1995         }
1996
1997         /*
1998          * Clear vchain/fchain flags that might prevent final cleanup
1999          * of these chains.
2000          */
2001         if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
2002                 atomic_clear_int(&hmp->vchain.flags,
2003                                  HAMMER2_CHAIN_MODIFIED);
2004                 hammer2_pfs_memory_wakeup(hmp->vchain.pmp);
2005                 hammer2_chain_drop(&hmp->vchain);
2006         }
2007         if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
2008                 atomic_clear_int(&hmp->vchain.flags,
2009                                  HAMMER2_CHAIN_UPDATE);
2010                 hammer2_chain_drop(&hmp->vchain);
2011         }
2012
2013         if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
2014                 atomic_clear_int(&hmp->fchain.flags,
2015                                  HAMMER2_CHAIN_MODIFIED);
2016                 hammer2_pfs_memory_wakeup(hmp->fchain.pmp);
2017                 hammer2_chain_drop(&hmp->fchain);
2018         }
2019         if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
2020                 atomic_clear_int(&hmp->fchain.flags,
2021                                  HAMMER2_CHAIN_UPDATE);
2022                 hammer2_chain_drop(&hmp->fchain);
2023         }
2024
2025         /*
2026          * Final drop of embedded freemap root chain to
2027          * clean up fchain.core (fchain structure is not
2028          * flagged ALLOCATED so it is cleaned out and then
2029          * left to rot).
2030          */
2031         hammer2_chain_drop(&hmp->fchain);
2032
2033         /*
2034          * Final drop of embedded volume root chain to clean
2035          * up vchain.core (vchain structure is not flagged
2036          * ALLOCATED so it is cleaned out and then left to
2037          * rot).
2038          */
2039         dumpcnt = 50;
2040         hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
2041         dumpcnt = 50;
2042         hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f');
2043         hammer2_dev_unlock(hmp);
2044         hammer2_chain_drop(&hmp->vchain);
2045
2046         hammer2_io_cleanup(hmp, &hmp->iotree);
2047         if (hmp->iofree_count) {
2048                 kprintf("io_cleanup: %d I/O's left hanging\n",
2049                         hmp->iofree_count);
2050         }
2051
2052         TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
2053         kmalloc_destroy(&hmp->mchain);
2054         kfree(hmp, M_HAMMER2);
2055 }
2056
2057 static
2058 int
2059 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
2060              ino_t ino, struct vnode **vpp)
2061 {
2062         kprintf("hammer2_vget\n");
2063         return (EOPNOTSUPP);
2064 }
2065
2066 static
2067 int
2068 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
2069 {
2070         hammer2_pfs_t *pmp;
2071         hammer2_cluster_t *cparent;
2072         int error;
2073         struct vnode *vp;
2074
2075         pmp = MPTOPMP(mp);
2076         if (pmp->iroot == NULL) {
2077                 *vpp = NULL;
2078                 error = EINVAL;
2079         } else {
2080                 cparent = hammer2_inode_lock_sh(pmp->iroot);
2081                 vp = hammer2_igetv(pmp->iroot, cparent, &error);
2082                 hammer2_inode_unlock_sh(pmp->iroot, cparent);
2083                 *vpp = vp;
2084                 if (vp == NULL)
2085                         kprintf("vnodefail\n");
2086         }
2087
2088         return (error);
2089 }
2090
2091 /*
2092  * Filesystem status
2093  *
2094  * XXX incorporate ipdata->inode_quota and data_quota
2095  */
2096 static
2097 int
2098 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
2099 {
2100         hammer2_pfs_t *pmp;
2101         hammer2_dev_t *hmp;
2102
2103         pmp = MPTOPMP(mp);
2104         KKASSERT(pmp->iroot->cluster.nchains >= 1);
2105         hmp = pmp->iroot->cluster.focus->hmp;   /* XXX */
2106
2107         mp->mnt_stat.f_files = pmp->inode_count;
2108         mp->mnt_stat.f_ffree = 0;
2109         mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
2110         mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
2111         mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
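        /*
         * Worked example (illustrative): with HAMMER2_PBUFSIZE at 64KiB,
         * a 1TiB allocator_size reports f_blocks = 2^40 / 2^16 = 16777216.
         */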
2112
2113         *sbp = mp->mnt_stat;
2114         return (0);
2115 }
2116
2117 static
2118 int
2119 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
2120 {
2121         hammer2_pfs_t *pmp;
2122         hammer2_dev_t *hmp;
2123
2124         pmp = MPTOPMP(mp);
2125         KKASSERT(pmp->iroot->cluster.nchains >= 1);
2126         hmp = pmp->iroot->cluster.focus->hmp;   /* XXX */
2127
2128         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
2129         mp->mnt_vstat.f_files = pmp->inode_count;
2130         mp->mnt_vstat.f_ffree = 0;
2131         mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
2132         mp->mnt_vstat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
2133         mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
2134
2135         *sbp = mp->mnt_vstat;
2136         return (0);
2137 }
2138
2139 /*
2140  * Mount-time recovery (RW mounts)
2141  *
2142  * Updates to the free block table are allowed to lag flushes by one
2143  * transaction.  In case of a crash, then on a fresh mount we must do an
2144  * incremental scan of the last committed transaction id and make sure that
2145  * all related blocks have been marked allocated.
2146  *
2147  * The super-root topology and each PFS have their own transaction id
2148  * domains, so we must track PFS boundary transitions.
2149  */
2150 struct hammer2_recovery_elm {
2151         TAILQ_ENTRY(hammer2_recovery_elm) entry;
2152         hammer2_chain_t *chain;
2153         hammer2_tid_t sync_tid;
2154 };
2155
2156 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
2157
2158 struct hammer2_recovery_info {
2159         struct hammer2_recovery_list list;
2160         int     depth;
2161 };
2162
2163 static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2164                         hammer2_chain_t *parent,
2165                         struct hammer2_recovery_info *info,
2166                         hammer2_tid_t sync_tid);
2167
2168 #define HAMMER2_RECOVERY_MAXDEPTH       10
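/*
 * Descriptive note (not from the original source): when the recursion
 * depth reaches HAMMER2_RECOVERY_MAXDEPTH, or a PFS boundary is crossed,
 * hammer2_recovery_scan() queues the chain on info->list instead of
 * recursing further.  hammer2_recovery() then drains the list iteratively,
 * bounding kernel stack use while still covering the whole topology.
 */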
2169
2170 static
2171 int
2172 hammer2_recovery(hammer2_dev_t *hmp)
2173 {
2174         hammer2_trans_t trans;
2175         struct hammer2_recovery_info info;
2176         struct hammer2_recovery_elm *elm;
2177         hammer2_chain_t *parent;
2178         hammer2_tid_t sync_tid;
2179         int error;
2180         int cumulative_error = 0;
2181
2182         hammer2_trans_init(&trans, hmp->spmp, 0);
2183
2184         sync_tid = 0;
2185         TAILQ_INIT(&info.list);
2186         info.depth = 0;
2187         parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
2188         cumulative_error = hammer2_recovery_scan(&trans, hmp, parent,
2189                                                  &info, sync_tid);
2190         hammer2_chain_lookup_done(parent);
2191
2192         while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
2193                 TAILQ_REMOVE(&info.list, elm, entry);
2194                 parent = elm->chain;
2195                 sync_tid = elm->sync_tid;
2196                 kfree(elm, M_HAMMER2);
2197
2198                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
2199                                            HAMMER2_RESOLVE_NOREF);
2200                 error = hammer2_recovery_scan(&trans, hmp, parent,
2201                                               &info, sync_tid);
2202                 hammer2_chain_unlock(parent);
2203                 if (error)
2204                         cumulative_error = error;
2205         }
2206         hammer2_trans_done(&trans);
2207
2208         return cumulative_error;
2209 }
2210
2211 static
2212 int
2213 hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2214                       hammer2_chain_t *parent,
2215                       struct hammer2_recovery_info *info,
2216                       hammer2_tid_t sync_tid)
2217 {
2218         const hammer2_inode_data_t *ripdata;
2219         hammer2_chain_t *chain;
2220         int cache_index;
2221         int cumulative_error = 0;
2222         int pfs_boundary = 0;
2223         int error;
2224
2225         /*
2226          * Adjust freemap to ensure that the block(s) are marked allocated.
2227          */
2228         if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
2229                 hammer2_freemap_adjust(trans, hmp, &parent->bref,
2230                                        HAMMER2_FREEMAP_DORECOVER);
2231         }
2232
2233         /*
2234          * Check type for recursive scan
2235          */
2236         switch(parent->bref.type) {
2237         case HAMMER2_BREF_TYPE_VOLUME:
2238                 /* data already instantiated */
2239                 break;
2240         case HAMMER2_BREF_TYPE_INODE:
2241                 /*
2242                  * Must instantiate data for DIRECTDATA test and also
2243                  * for recursion.
2244                  */
2245                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2246                 ripdata = &hammer2_chain_rdata(parent)->ipdata;
2247                 if (ripdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2248                         /* not applicable to recovery scan */
2249                         hammer2_chain_unlock(parent);
2250                         return 0;
2251                 }
2252                 if ((ripdata->op_flags & HAMMER2_OPFLAG_PFSROOT) &&
2253                     info->depth != 0) {
2254                         pfs_boundary = 1;
2255                         sync_tid = parent->bref.mirror_tid - 1;
2256                         kprintf("recovery scan PFS synctid %016jx \"%s\"\n",
2257                                 sync_tid, ripdata->filename);
2258                 }
2259 #if 0
2260                 if ((ripdata->op_flags & HAMMER2_OPFLAG_PFSROOT) == 0) {
2261                         kprintf("%*.*s\"%s\"\n", info->depth, info->depth, "", ripdata->filename);
2262                 }
2263 #endif
2264                 hammer2_chain_unlock(parent);
2265                 break;
2266         case HAMMER2_BREF_TYPE_INDIRECT:
2267                 /*
2268                  * Must instantiate data for recursion
2269                  */
2270                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2271                 hammer2_chain_unlock(parent);
2272                 break;
2273         case HAMMER2_BREF_TYPE_DATA:
2274         case HAMMER2_BREF_TYPE_FREEMAP:
2275         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2276         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2277                 /* not applicable to recovery scan */
2278                 return 0;
2279                 break;
2280         default:
2281                 return EDOM;
2282         }
2283
2284         /*
2285          * Defer operation if depth limit reached or if we are crossing a
2286          * PFS boundary.
2287          */
2288         if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH || pfs_boundary) {
2289                 struct hammer2_recovery_elm *elm;
2290
2291                 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
2292                 elm->chain = parent;
2293                 elm->sync_tid = sync_tid;
2294                 hammer2_chain_ref(parent);
2295                 TAILQ_INSERT_TAIL(&info->list, elm, entry);
2296                 /* unlocked by caller */
2297
2298                 return(0);
2299         }
2300
2301
2302         /*
2303          * Recursive scan of the last flushed transaction only.  We are
2304          * doing this without pmp assignments so don't leave the chains
2305          * hanging around after we are done with them.
2306          */
2307         cache_index = 0;
2308         chain = hammer2_chain_scan(parent, NULL, &cache_index,
2309                                    HAMMER2_LOOKUP_NODATA);
2310         while (chain) {
2311                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
2312                 if (chain->bref.mirror_tid >= sync_tid) {
2313                         ++info->depth;
2314                         error = hammer2_recovery_scan(trans, hmp, chain,
2315                                                       info, sync_tid);
2316                         --info->depth;
2317                         if (error)
2318                                 cumulative_error = error;
2319                 }
2320                 chain = hammer2_chain_scan(parent, chain, &cache_index,
2321                                            HAMMER2_LOOKUP_NODATA);
2322         }
2323
2324         return cumulative_error;
2325 }
2326
2327 /*
2328  * Sync the entire filesystem; this is called from the filesystem syncer
2329  * process periodically and whenever a user calls sync(1) on the hammer
2330  * mountpoint.
2331  *
2332  * This is currently actually called from the syncer! \o/
2333  *
2334  * This task will have to snapshot the state of the dirty inode chain.
2335  * From that, it must make sure all of the inodes on the dirty
2336  * chain have I/O initiated.  We make sure that I/O is initiated for the
2337  * root block.
2338  *
2339  * If waitfor is set, we wait for media to acknowledge the new rootblock.
2340  *
2341  * THINKS: side A vs side B, to have sync not stall all I/O?
2342  */
2343 int
2344 hammer2_vfs_sync(struct mount *mp, int waitfor)
2345 {
2346         struct hammer2_sync_info info;
2347         hammer2_inode_t *iroot;
2348         hammer2_chain_t *chain;
2349         hammer2_chain_t *parent;
2350         hammer2_pfs_t *pmp;
2351         hammer2_dev_t *hmp;
2352         int flags;
2353         int error;
2354         int total_error;
2355         int force_fchain;
2356         int i;
2357         int j;
2358
2359         pmp = MPTOPMP(mp);
2360         iroot = pmp->iroot;
2361         KKASSERT(iroot);
2362         KKASSERT(iroot->pmp == pmp);
2363
2364         /*
2365          * We can't acquire locks on existing vnodes while in a transaction
2366          * without risking a deadlock.  This assumes that vfsync() can be
2367          * called without the vnode locked (which it can in DragonFly).
2368          * Otherwise we'd have to implement a multi-pass or flag the lock
2369          * failures and retry.
2370          *
2371          * The reclamation code interlocks with the sync list's token
2372          * (by removing the vnode from the scan list) before unlocking
2373          * the inode, giving us time to ref the inode.
2374          */
2375         /*flags = VMSC_GETVP;*/
2376         flags = 0;
2377         if (waitfor & MNT_LAZY)
2378                 flags |= VMSC_ONEPASS;
2379
2380         /*
2381          * Start our flush transaction.  This does not return until all
2382          * concurrent transactions have completed and will prevent any
2383          * new transactions from running concurrently, except for the
2384          * buffer cache transactions.
2385          *
2386          * For efficiency do an async pass before making sure with a
2387          * synchronous pass on all related buffer cache buffers.  It
2388          * should theoretically not be possible for any new file buffers
2389          * to be instantiated during this sequence.
2390          */
2391         hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH |
2392                                              HAMMER2_TRANS_PREFLUSH);
2393         hammer2_run_unlinkq(&info.trans, pmp);
2394
2395         info.error = 0;
2396         info.waitfor = MNT_NOWAIT;
2397         vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
2398         info.waitfor = MNT_WAIT;
2399         vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2400
2401         /*
2402          * Clear PREFLUSH.  This prevents (or asserts on) any new logical
2403          * buffer cache flushes which occur during the flush.  Device buffers
2404          * are not affected.
2405          */
2406
2407 #if 0
2408         if (info.error == 0 && (waitfor & MNT_WAIT)) {
2409                 info.waitfor = waitfor;
2410                     vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2411
2412         }
2413 #endif
2414         hammer2_bioq_sync(info.trans.pmp);
2415         atomic_clear_int(&info.trans.flags, HAMMER2_TRANS_PREFLUSH);
2416
2417         total_error = 0;
2418
2419         /*
2420          * Flush all storage elements making up the cluster
2421          *
2422          * We must also flush any deleted siblings because the super-root
2423          * flush won't do it for us.  They all must be staged or the
2424          * super-root flush will not be able to update its block table
2425          * properly.
2426          *
2427          * XXX currently done serially instead of concurrently
2428          */
2429         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2430                 chain = iroot->cluster.array[i].chain;
2431                 if (chain) {
2432                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
2433                         hammer2_flush(&info.trans, chain);
2434                         hammer2_chain_unlock(chain);
2435                 }
2436         }
2437 #if 0
2438         hammer2_trans_done(&info.trans);
2439 #endif
2440
2441         /*
2442          * Flush all volume roots to synchronize PFS flushes with the
2443          * storage media.  Use a super-root transaction for each one.
2444          *
2445          * The flush code will detect super-root -> pfs-root chain
2446          * transitions using the last pfs-root flush.
2447          */
2448         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2449                 hammer2_chain_t *tmp;
2450
2451                 chain = iroot->cluster.array[i].chain;
2452                 if (chain == NULL)
2453                         continue;
2454
2455                 hmp = chain->hmp;
2456
2457                 /*
2458                  * We only have to flush each hmp once
2459                  */
2460                 for (j = i - 1; j >= 0; --j) {
2461                         if ((tmp = iroot->cluster.array[j].chain) != NULL) {
2462                                 if (tmp->hmp == hmp)
2463                                         break;
2464                         }
2465                 }
2466                 if (j >= 0)
2467                         continue;
2468                 hammer2_trans_spmp(&info.trans, hmp->spmp);
2469
2470                 /*
2471                  * Force an update of the XID from the PFS root to the
2472                  * topology root.  We couldn't do this from the PFS
2473                  * transaction because a SPMP transaction is needed.
2474                  * This does not modify blocks, instead what it does is
2475                  * allow the flush code to find the transition point and
2476                  * then update on the way back up.
2477                  */
2478                 parent = chain->parent;
2479                 KKASSERT(chain->pmp != parent->pmp);
2480                 hammer2_chain_setflush(&info.trans, parent);
2481
2482                 /*
2483                  * Media mounts have two 'roots', vchain for the topology
2484                  * and fchain for the free block table.  Flush both.
2485                  *
2486                  * Note that the topology and free block table are handled
2487                  * independently, so the free block table can wind up being
2488                  * ahead of the topology.  We depend on the bulk free scan
2489                  * code to deal with any loose ends.
2490                  */
2491                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2492                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2493                 if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
2494                         /*
2495                          * This will also modify vchain as a side effect,
2496                          * mark vchain as modified now.
2497                          */
2498                         hammer2_voldata_modify(hmp);
2499                         chain = &hmp->fchain;
2500                         hammer2_flush(&info.trans, chain);
2501                         KKASSERT(chain == &hmp->fchain);
2502                 }
2503                 hammer2_chain_unlock(&hmp->fchain);
2504                 hammer2_chain_unlock(&hmp->vchain);
2505
2506                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2507                 if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
2508                         chain = &hmp->vchain;
2509                         hammer2_flush(&info.trans, chain);
2510                         KKASSERT(chain == &hmp->vchain);
2511                         force_fchain = 1;
2512                 } else {
2513                         force_fchain = 0;
2514                 }
2515                 hammer2_chain_unlock(&hmp->vchain);
2516
2517 #if 0
2518                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2519                 if ((hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) ||
2520                     force_fchain) {
2521                         /* this will also modify vchain as a side effect */
2522                         chain = &hmp->fchain;
2523                         hammer2_flush(&info.trans, chain);
2524                         KKASSERT(chain == &hmp->fchain);
2525                 }
2526                 hammer2_chain_unlock(&hmp->fchain);
2527 #endif
2528
2529                 error = 0;
2530
2531                 /*
2532                  * We can't safely flush the volume header until we have
2533                  * flushed any device buffers which have built up.
2534                  *
2535                  * XXX this isn't being incremental
2536                  */
2537                 vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
2538                 error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
2539                 vn_unlock(hmp->devvp);
2540
2541                 /*
2542                  * The flush code sets CHAIN_VOLUMESYNC to indicate that the
2543                  * volume header needs synchronization via hmp->volsync.
2544                  *
2545                  * XXX synchronize the flag & data with only this flush XXX
2546                  */
2547                 if (error == 0 &&
2548                     (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
2549                         struct buf *bp;
2550
2551                         /*
2552                          * Synchronize the disk before flushing the volume
2553                          * header.
2554                          */
2555                         bp = getpbuf(NULL);
2556                         bp->b_bio1.bio_offset = 0;
2557                         bp->b_bufsize = 0;
2558                         bp->b_bcount = 0;
2559                         bp->b_cmd = BUF_CMD_FLUSH;
2560                         bp->b_bio1.bio_done = biodone_sync;
2561                         bp->b_bio1.bio_flags |= BIO_SYNC;
2562                         vn_strategy(hmp->devvp, &bp->b_bio1);
2563                         biowait(&bp->b_bio1, "h2vol");
2564                         relpbuf(bp, NULL);
2565
2566                         /*
2567                          * Then we can safely flush the version of the
2568                          * volume header synchronized by the flush code.
2569                          */
2570                         i = hmp->volhdrno + 1;
2571                         if (i >= HAMMER2_NUM_VOLHDRS)
2572                                 i = 0;
2573                         if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
2574                             hmp->volsync.volu_size) {
2575                                 i = 0;
2576                         }
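                        /*
                         * e.g. with 4 volume headers the write target
                         * rotates 0 -> 1 -> 2 -> 3 -> 0, wrapping early
                         * when a copy would land beyond the end of a
                         * small volume.
                         */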
2577                         kprintf("sync volhdr %d %jd\n",
2578                                 i, (intmax_t)hmp->volsync.volu_size);
2579                         bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2580                                     HAMMER2_PBUFSIZE, 0, 0);
2581                         atomic_clear_int(&hmp->vchain.flags,
2582                                          HAMMER2_CHAIN_VOLUMESYNC);
2583                         bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
2584                         bawrite(bp);
2585                         hmp->volhdrno = i;
2586                 }
2587                 if (error)
2588                         total_error = error;
2589
2590 #if 0
2591                 hammer2_trans_done(&info.trans);
2592 #endif
2593         }
2594         hammer2_trans_done(&info.trans);
2595
2596         return (total_error);
2597 }
2598
2599 /*
2600  * Sync passes.
2601  */
2602 static int
2603 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2604 {
2605         struct hammer2_sync_info *info = data;
2606         hammer2_inode_t *ip;
2607         int error;
2608
2612         ip = VTOI(vp);
2613         if (ip == NULL)
2614                 return(0);
2615         if (vp->v_type == VNON || vp->v_type == VBAD) {
2616                 vclrisdirty(vp);
2617                 return(0);
2618         }
2619         if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
2620             RB_EMPTY(&vp->v_rbdirty_tree)) {
2621                 vclrisdirty(vp);
2622                 return(0);
2623         }
2624
2625         /*
2626          * VOP_FSYNC will start a new transaction so replicate some code
2627          * here to do it inline (see hammer2_vop_fsync()).
2628          *
2629          * WARNING: The vfsync interacts with the buffer cache and might
2630          *          block, we can't hold the inode lock at that time.
2631          *          However, we MUST ref ip before blocking to ensure that
2632          *          it isn't ripped out from under us (since we do not
2633          *          hold a lock on the vnode).
2634          */
2635         hammer2_inode_ref(ip);
2636         atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
2637         if (vp)
2638                 vfsync(vp, MNT_NOWAIT, 1, NULL, NULL);
2639
2640         hammer2_inode_drop(ip);
2641 #if 1
2642         error = 0;
2643         if (error)
2644                 info->error = error;
2645 #endif
2646         return(0);
2647 }
2648
2649 static
2650 int
2651 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
2652 {
2653         return (0);
2654 }
2655
2656 static
2657 int
2658 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
2659                struct fid *fhp, struct vnode **vpp)
2660 {
2661         return (0);
2662 }
2663
2664 static
2665 int
2666 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
2667                  int *exflagsp, struct ucred **credanonp)
2668 {
2669         return (0);
2670 }
2671
2672 /*
2673  * Support code for hammer2_vfs_mount().  Read, verify, and install the volume
2674  * header into the HMP
2675  *
2676  * XXX read four volhdrs and use the one with the highest TID whose CRC
2677  *     matches.
2678  *
2679  * XXX check iCRCs.
2680  *
2681  * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to
2682  *     nonexistent locations.
2683  *
2684  * XXX Record selected volhdr and ring updates to each of 4 volhdrs
2685  */
2686 static
2687 int
2688 hammer2_install_volume_header(hammer2_dev_t *hmp)
2689 {
2690         hammer2_volume_data_t *vd;
2691         struct buf *bp;
2692         hammer2_crc32_t crc0, crc, bcrc0, bcrc;
2693         int error_reported;
2694         int error;
2695         int valid;
2696         int i;
2697
2698         error_reported = 0;
2699         error = 0;
2700         valid = 0;
2701         bp = NULL;
2702
2703         /*
2704          * There are up to 4 copies of the volume header (syncs iterate
2705          * between them so there is no single master).  We don't trust the
2706          * volu_size field, so we don't know precisely how large the
2707          * filesystem is; instead we depend on the OS to return an error
2708          * if we read beyond the block device's EOF.
2709          */
2710         for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
2711                 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2712                               HAMMER2_VOLUME_BYTES, &bp);
2713                 if (error) {
2714                         brelse(bp);
2715                         bp = NULL;
2716                         continue;
2717                 }
2718
2719                 vd = (struct hammer2_volume_data *) bp->b_data;
2720                 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
2721                     (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
2722                         brelse(bp);
2723                         bp = NULL;
2724                         continue;
2725                 }
2726
2727                 if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
2728                         /* XXX: Reversed-endianness filesystem */
2729                         kprintf("hammer2: reverse-endian filesystem detected\n");
2730                         brelse(bp);
2731                         bp = NULL;
2732                         continue;
2733                 }
2734
2735                 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
2736                 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
2737                                       HAMMER2_VOLUME_ICRC0_SIZE);
2738                 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
2739                 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
2740                                        HAMMER2_VOLUME_ICRC1_SIZE);
2741                 if ((crc0 != crc) || (bcrc0 != bcrc)) {
2742                         kprintf("hammer2 volume header crc "
2743                                 "mismatch copy #%d %08x/%08x\n",
2744                                 i, crc0, crc);
2745                         error_reported = 1;
2746                         brelse(bp);
2747                         bp = NULL;
2748                         continue;
2749                 }
2750                 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
2751                         valid = 1;
2752                         hmp->voldata = *vd;
2753                         hmp->volhdrno = i;
2754                 }
2755                 brelse(bp);
2756                 bp = NULL;
2757         }
2758         if (valid) {
2759                 hmp->volsync = hmp->voldata;
2760                 error = 0;
2761                 if (error_reported || bootverbose || 1) { /* 1/DEBUG */
2762                         kprintf("hammer2: using volume header #%d\n",
2763                                 hmp->volhdrno);
2764                 }
2765         } else {
2766                 error = EINVAL;
2767                 kprintf("hammer2: no valid volume headers found!\n");
2768         }
2769         return (error);
2770 }
2771
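/*
 * Illustrative sketch (not compiled): how a mount path might consume
 * the installer above.  example_mount_begin() is a hypothetical name,
 * not a function in this file.
 */
#if 0
static int
example_mount_begin(hammer2_dev_t *hmp)
{
	int error;

	/* pick the newest volume header whose CRCs check out */
	error = hammer2_install_volume_header(hmp);
	if (error) {
		/* no usable volume header, the mount must be refused */
		return (error);
	}
	/* hmp->voldata, hmp->volsync and hmp->volhdrno are now valid */
	return (0);
}
#endif
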
2772 /*
2773  * This handles hysteresis on regular file flushes.  Because the BIOs are
2774  * routed to a thread it is possible for an excessive number to build up
2775  * and cause severe front-end stalls long before the runningbuffspace
2776  * limit is hit, so we implement hammer2_flush_pipe to cap the number
2777  * of writes in flight and control the hysteresis.
2778  *
2779  * This is a particular problem when compression is used.
2780  */
2781 void
2782 hammer2_lwinprog_ref(hammer2_pfs_t *pmp)
2783 {
2784         atomic_add_int(&pmp->count_lwinprog, 1);
2785 }
2786
2787 void
2788 hammer2_lwinprog_drop(hammer2_pfs_t *pmp)
2789 {
2790         int lwinprog;
2791
2792         lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
2793         if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
2794             (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
2795                 atomic_clear_int(&pmp->count_lwinprog,
2796                                  HAMMER2_LWINPROG_WAITING);
2797                 wakeup(&pmp->count_lwinprog);
2798         }
2799 }
2800
2801 void
2802 hammer2_lwinprog_wait(hammer2_pfs_t *pmp)
2803 {
2804         int lwinprog;
2805
2806         for (;;) {
2807                 lwinprog = pmp->count_lwinprog;
2808                 cpu_ccfence();
2809                 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2810                         break;
2811                 tsleep_interlock(&pmp->count_lwinprog, 0);
2812                 atomic_set_int(&pmp->count_lwinprog, HAMMER2_LWINPROG_WAITING);
2813                 lwinprog = pmp->count_lwinprog;
2814                 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2815                         break;
2816                 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
2817         }
2818 }
2819
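/*
 * Illustrative sketch (not compiled): the expected pairing for the
 * hysteresis primitives above.  The front-end throttles itself with
 * hammer2_lwinprog_wait(), takes one ref per logical write handed to
 * the BIO thread, and the thread drops the ref on completion (waking
 * waiters once the pipe drains to 2/3 of hammer2_flush_pipe).  The
 * example_* helpers are hypothetical.
 */
#if 0
static void
example_queue_logical_write(hammer2_pfs_t *pmp, struct bio *bio)
{
	hammer2_lwinprog_wait(pmp);	/* block while the pipe is full */
	hammer2_lwinprog_ref(pmp);	/* one more write in progress */
	example_hand_off_to_bio_thread(bio);
}

static void
example_bio_thread_completion(hammer2_pfs_t *pmp)
{
	hammer2_lwinprog_drop(pmp);	/* write retired, maybe wake waiters */
}
#endif
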
2820 /*
2821  * Manage excessive memory resource use for chain and related
2822  * structures.
2823  */
2824 void
2825 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
2826 {
2827         uint32_t waiting;
2828         uint32_t count;
2829         uint32_t limit;
2830 #if 0
2831         static int zzticks;
2832 #endif
2833
2834         /*
2835          * Atomically check the condition and wait.  Also speed up the
2836          * syncer early to try to avoid hitting the wait at all.
2837          */
2838         for (;;) {
2839                 waiting = pmp->inmem_dirty_chains;
2840                 cpu_ccfence();
2841                 count = waiting & HAMMER2_DIRTYCHAIN_MASK;
2842
2843                 limit = pmp->mp->mnt_nvnodelistsize / 10;
2844                 if (limit < hammer2_limit_dirty_chains)
2845                         limit = hammer2_limit_dirty_chains;
2846                 if (limit < 1000)
2847                         limit = 1000;
2848
2849 #if 0
2850                 if ((int)(ticks - zzticks) > hz) {
2851                         zzticks = ticks;
2852                         kprintf("count %u %u\n", count, limit);
2853                 }
2854 #endif
2855
2856                 /*
2857                  * Block if there are too many dirty chains present and wait
2858                  * for the flush to clean some out.
2859                  */
2860                 if (count > limit) {
2861                         tsleep_interlock(&pmp->inmem_dirty_chains, 0);
2862                         if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
2863                                                waiting,
2864                                        waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
2865                                 speedup_syncer(pmp->mp);
2866                                 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
2867                                        "chnmem", hz);
2868                         }
2869                         continue;       /* loop on success or fail */
2870                 }
2871
2872                 /*
2873                  * Try to start an early flush before we are forced to block.
2874                  */
2875                 if (count > limit * 7 / 10)
2876                         speedup_syncer(pmp->mp);
2877                 break;
2878         }
2879 }
2880
2881 void
2882 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp)
2883 {
2884         if (pmp) {
2885                 atomic_add_int(&pmp->inmem_dirty_chains, 1);
2886         }
2887 }
2888
2889 void
2890 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp)
2891 {
2892         uint32_t waiting;
2893
2894         if (pmp == NULL)
2895                 return;
2896
2897         for (;;) {
2898                 waiting = pmp->inmem_dirty_chains;
2899                 cpu_ccfence();
2900                 if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
2901                                        waiting,
2902                                        (waiting - 1) &
2903                                         ~HAMMER2_DIRTYCHAIN_WAITING)) {
2904                         break;
2905                 }
2906         }
2907
2908         if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
2909                 wakeup(&pmp->inmem_dirty_chains);
2910 }
2911
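/*
 * Illustrative sketch (not compiled): the dirty-chain accounting above
 * is meant to bracket chain modifications.  A modifying path throttles
 * and then bumps the count; the flusher decrements it as chains are
 * cleaned.  The example_* helpers are hypothetical.
 */
#if 0
static void
example_modify_chain(hammer2_pfs_t *pmp)
{
	hammer2_pfs_memory_wait(pmp);	/* block on excessive dirty chains */
	hammer2_pfs_memory_inc(pmp);	/* a chain is about to go dirty */
	/* ... mark the chain modified ... */
}

static void
example_flush_chain(hammer2_pfs_t *pmp)
{
	/* ... write the chain to media ... */
	hammer2_pfs_memory_wakeup(pmp);	/* the chain is clean again */
}
#endif
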
2912 /*
2913  * Debugging
2914  */
2915 void
2916 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
2917 {
2918         hammer2_chain_t *scan;
2919         hammer2_chain_t *parent;
2920
2921         --*countp;
2922         if (*countp == 0) {
2923                 kprintf("%*.*s...\n", tab, tab, "");
2924                 return;
2925         }
2926         if (*countp < 0)
2927                 return;
2928         kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
2929                 tab, tab, "", pfx,
2930                 chain, chain->bref.type,
2931                 chain->bref.key, chain->bref.keybits,
2932                 chain->bref.mirror_tid);
2933
2934         kprintf("%*.*s      [%08x] (%s) refs=%d\n",
2935                 tab, tab, "",
2936                 chain->flags,
2937                 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
2938                 chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
2939                 chain->refs);
2940
2941         kprintf("%*.*s      core [%08x]",
2942                 tab, tab, "",
2943                 chain->core.flags);
2944
2945         parent = chain->parent;
2946         if (parent)
2947                 kprintf("\n%*.*s      p=%p [pflags %08x prefs %d]",
2948                         tab, tab, "",
2949                         parent, parent->flags, parent->refs);
2950         if (RB_EMPTY(&chain->core.rbtree)) {
2951                 kprintf("\n");
2952         } else {
2953                 kprintf(" {\n");
2954                 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree)
2955                         hammer2_dump_chain(scan, tab + 4, countp, 'a');
2956                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
2957                         kprintf("%*.*s}(%s)\n", tab, tab, "",
2958                                 chain->data->ipdata.filename);
2959                 else
2960                         kprintf("%*.*s}\n", tab, tab, "");
2961         }
2962 }
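
/*
 * Illustrative sketch (not compiled): a typical way to invoke the debug
 * dump above from ad-hoc instrumentation.  The count cap keeps a large
 * topology from flooding the console; the 'v' prefix and the cap value
 * are illustrative choices.
 */
#if 0
static void
example_dump_device_topology(hammer2_dev_t *hmp)
{
	int count = 1000;	/* print at most ~1000 chains */

	hammer2_dump_chain(&hmp->vchain, 0, &count, 'v');
}
#endif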