/* sys/vfs/hammer2/hammer2_vfsops.c */
/*
 * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/uuid.h>
#include <sys/vfsops.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/objcache.h>

#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>

#include <sys/mutex.h>
#include <sys/mutex2.h>

#include "hammer2.h"
#include "hammer2_disk.h"
#include "hammer2_mount.h"
#include "hammer2_lz4.h"

#include "zlib/hammer2_zlib.h"

#define REPORT_REFS_ERRORS 1    /* XXX remove me */

MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");

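/*
 * Per-filesystem-sync state.  Passed as the opaque data argument to
 * hammer2_sync_scan2() when scanning vnodes during a sync.
 */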
struct hammer2_sync_info {
        hammer2_trans_t trans;
        int error;
        int waitfor;
};

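/*
 * Global lists of HAMMER2 device (hmp) and PFS (pmp) structures.  The
 * mount and unmount paths below manipulate both lists while holding
 * hammer2_mntlk.
 */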
TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
TAILQ_HEAD(hammer2_pfslist, hammer2_pfs);
static struct hammer2_mntlist hammer2_mntlist;
static struct hammer2_pfslist hammer2_pfslist;
static struct lock hammer2_mntlk;

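/*
 * Tunables and I/O statistics, exported below via the vfs.hammer2.*
 * sysctl tree.
 */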
int hammer2_debug;
int hammer2_cluster_enable = 1;
int hammer2_hardlink_enable = 1;
int hammer2_flush_pipe = 100;
int hammer2_synchronous_flush = 1;
int hammer2_dio_count;
long hammer2_limit_dirty_chains;
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;
long hammer2_iod_file_write;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;
long hammer2_ioa_file_read;
long hammer2_ioa_meta_read;
long hammer2_ioa_indr_read;
long hammer2_ioa_fmap_read;
long hammer2_ioa_volu_read;
long hammer2_ioa_fmap_write;
long hammer2_ioa_file_write;
long hammer2_ioa_meta_write;
long hammer2_ioa_indr_write;
long hammer2_ioa_volu_write;

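/*
 * Malloc types backing the compression (C_BUFFER) and decompression
 * (D_BUFFER) object caches created in hammer2_vfs_init() below.
 */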
MALLOC_DECLARE(C_BUFFER);
MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");

MALLOC_DECLARE(D_BUFFER);
MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");

SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");

SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
           &hammer2_debug, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
           &hammer2_cluster_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
           &hammer2_hardlink_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW,
           &hammer2_synchronous_flush, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
           &hammer2_limit_dirty_chains, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
           &hammer2_dio_count, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
           &hammer2_iod_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
           &hammer2_iod_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
           &hammer2_iod_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
           &hammer2_iod_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
           &hammer2_iod_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
           &hammer2_iod_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
           &hammer2_iod_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
           &hammer2_iod_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
           &hammer2_iod_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
           &hammer2_ioa_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
           &hammer2_ioa_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
           &hammer2_ioa_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
           &hammer2_ioa_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
           &hammer2_ioa_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
           &hammer2_ioa_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
           &hammer2_ioa_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
           &hammer2_ioa_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
           &hammer2_ioa_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
           &hammer2_ioa_volu_write, 0, "");

static int hammer2_vfs_init(struct vfsconf *conf);
static int hammer2_vfs_uninit(struct vfsconf *vfsp);
static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                struct ucred *cred);
static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
                                struct vnode *, struct ucred *);
static int hammer2_recovery(hammer2_dev_t *hmp);
static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
                                ino_t ino, struct vnode **vpp);
static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                                struct fid *fhp, struct vnode **vpp);
static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);

static int hammer2_install_volume_header(hammer2_dev_t *hmp);
static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

static void hammer2_update_pmps(hammer2_dev_t *hmp);
static void hammer2_write_thread(void *arg);

static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
                                hammer2_dev_t *hmp);

/*
 * Functions for compression in threads,
 * from hammer2_vnops.c
 */
static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                int *errorp);
static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag,
                                int pblksize, int *errorp,
                                int comp_algo, int check_algo);
static void hammer2_zero_check_and_write(struct buf *bp,
                                hammer2_trans_t *trans, hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int ioflag, int pblksize, int *errorp,
                                int check_algo);
static int test_block_zeros(const char *buf, size_t bytes);
static void zero_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int *errorp);
static void hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp,
                                int ioflag, int pblksize, int *errorp,
                                int check_algo);

/*
 * HAMMER2 vfs operations.
 */
static struct vfsops hammer2_vfsops = {
        .vfs_init       = hammer2_vfs_init,
        .vfs_uninit     = hammer2_vfs_uninit,
        .vfs_sync       = hammer2_vfs_sync,
        .vfs_mount      = hammer2_vfs_mount,
        .vfs_unmount    = hammer2_vfs_unmount,
        .vfs_root       = hammer2_vfs_root,
        .vfs_statfs     = hammer2_vfs_statfs,
        .vfs_statvfs    = hammer2_vfs_statvfs,
        .vfs_vget       = hammer2_vfs_vget,
        .vfs_vptofh     = hammer2_vfs_vptofh,
        .vfs_fhtovp     = hammer2_vfs_fhtovp,
        .vfs_checkexp   = hammer2_vfs_checkexp
};

MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");

VFS_SET(hammer2_vfsops, hammer2, 0);
MODULE_VERSION(hammer2, 1);

static
int
hammer2_vfs_init(struct vfsconf *conf)
{
        static struct objcache_malloc_args margs_read;
        static struct objcache_malloc_args margs_write;

        int error;

        error = 0;

        if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
                error = EINVAL;
        if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
                error = EINVAL;
        if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
                error = EINVAL;

        if (error)
                kprintf("HAMMER2 structure size mismatch; cannot continue.\n");

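        /*
         * Object caches for the strategy read (decompression) and write
         * (compression) paths.  The sizes below suggest a read buffer
         * large enough for a full 64KB physical block and a 32KB buffer
         * for compressed output (an observation from the constants used
         * here, not a specification).
         */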
        margs_read.objsize = 65536;
        margs_read.mtype = D_BUFFER;

        margs_write.objsize = 32768;
        margs_write.mtype = C_BUFFER;

        cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
                                objcache_malloc_free, &margs_read);
        cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
                                objcache_malloc_free, &margs_write);

        lockinit(&hammer2_mntlk, "mntlk", 0, 0);
        TAILQ_INIT(&hammer2_mntlist);
        TAILQ_INIT(&hammer2_pfslist);

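        /*
         * Scale the dirty chain limit with system size; desiredvnodes
         * is itself sized roughly to available memory.
         */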
        hammer2_limit_dirty_chains = desiredvnodes / 10;

        hammer2_trans_manage_init();

        return (error);
}

static
int
hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
{
        objcache_destroy(cache_buffer_read);
        objcache_destroy(cache_buffer_write);
        return 0;
}

/*
 * Core PFS allocator.  Used to allocate the pmp structure for PFS cluster
 * mounts and the spmp structure for media (hmp) structures.
 *
 * pmp->modify_tid tracks new modify_tid transaction ids for front-end
 * transactions.  Note that synchronization does not use this field
 * (typically front-end operations and synchronization cannot run on the
 * same PFS node at the same time).
 *
 * XXX check locking
 */
hammer2_pfs_t *
hammer2_pfsalloc(hammer2_cluster_t *cluster,
                 const hammer2_inode_data_t *ripdata,
                 hammer2_tid_t modify_tid)
{
        hammer2_chain_t *rchain;
        hammer2_pfs_t *pmp;
        int i;
        int j;

        /*
         * Locate or create the PFS based on the cluster id.  If ripdata
         * is NULL this is a spmp which is unique and is always allocated.
         */
        if (ripdata) {
                TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                        if (bcmp(&pmp->pfs_clid, &ripdata->pfs_clid,
                                 sizeof(pmp->pfs_clid)) == 0) {
                                        break;
                        }
                }
        } else {
                pmp = NULL;
        }

        if (pmp == NULL) {
                pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
                kmalloc_create(&pmp->minode, "HAMMER2-inodes");
                kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
                lockinit(&pmp->lock, "pfslk", 0, 0);
                spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
                RB_INIT(&pmp->inum_tree);
                TAILQ_INIT(&pmp->unlinkq);
                spin_init(&pmp->list_spin, "hm2pfsalloc_list");

                /*
                 * Save last media transaction id for flusher.
                 */
                pmp->modify_tid = modify_tid;
                if (ripdata) {
                        pmp->inode_tid = ripdata->pfs_inum + 1;
                        pmp->pfs_clid = ripdata->pfs_clid;
                }
                hammer2_mtx_init(&pmp->wthread_mtx, "h2wthr");
                bioq_init(&pmp->wthread_bioq);
                TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);

                /*
                 * The synchronization thread may start too early; make
                 * sure it stays frozen until we are ready to let it go.
                 * XXX
                 */
                /*
                pmp->primary_thr.flags = HAMMER2_SYNCTHR_FROZEN |
                                         HAMMER2_SYNCTHR_REMASTER;
                */
        }

        /*
         * Create the PFS's root inode.
         */
        if (pmp->iroot == NULL) {
                pmp->iroot = hammer2_inode_get(pmp, NULL, NULL);
                hammer2_inode_ref(pmp->iroot);
                hammer2_inode_unlock(pmp->iroot, NULL);
        }

        /*
         * Create a primary synchronizer thread for the PFS if necessary.
         * Single-node masters (including snapshots) have nothing to
         * synchronize and do not require this thread.
         *
         * Multi-node masters or any number of soft masters, slaves, copies,
         * or other PFS types need the thread.
         */
        if (cluster && ripdata &&
            (ripdata->pfs_type != HAMMER2_PFSTYPE_MASTER ||
             ripdata->pfs_nmasters > 1) &&
            pmp->primary_thr.td == NULL) {
                hammer2_syncthr_create(&pmp->primary_thr, pmp,
                                       hammer2_syncthr_primary);
        }

        /*
         * Update nmasters from any PFS which is part of the cluster.
         * It is possible that this will result in a value which is too
         * high.  MASTER PFSs are authoritative for pfs_nmasters and will
         * override this value later on.
         */
        if (ripdata && pmp->pfs_nmasters < ripdata->pfs_nmasters) {
                pmp->pfs_nmasters = ripdata->pfs_nmasters;
        }

        /*
         * When a cluster is passed in we must add the cluster's chains
         * to the PFS's root inode and update pmp->pfs_types[].
         *
         * At the moment empty spots can develop due to removals or failures.
         * Ultimately we want to re-fill these spots. XXX
         */
        if (cluster) {
                hammer2_inode_ref(pmp->iroot);
                hammer2_mtx_ex(&pmp->iroot->lock);
                j = pmp->iroot->cluster.nchains;

                kprintf("add PFS to pmp %p[%d]\n", pmp, j);

                for (i = 0; i < cluster->nchains; ++i) {
                        if (j == HAMMER2_MAXCLUSTER)
                                break;
                        rchain = cluster->array[i].chain;
                        KKASSERT(rchain->pmp == NULL);
                        rchain->pmp = pmp;
                        hammer2_chain_ref(rchain);
                        pmp->iroot->cluster.array[j].chain = rchain;
                        pmp->pfs_types[j] = ripdata->pfs_type;

                        /*
                         * If the PFS is already mounted we must account
                         * for the mount_count here.
                         */
                        if (pmp->mp)
                                ++rchain->hmp->mount_count;

                        /*
                         * May have to fix up dirty chain tracking.  Previous
                         * pmp was NULL so nothing to undo.
                         */
                        if (rchain->flags & HAMMER2_CHAIN_MODIFIED)
                                hammer2_pfs_memory_inc(pmp);
                        ++j;
                }
                pmp->iroot->cluster.nchains = j;
                hammer2_mtx_unlock(&pmp->iroot->lock);
                hammer2_inode_drop(pmp->iroot);

                if (i != cluster->nchains) {
                        kprintf("hammer2_mount: cluster full!\n");
                        /* XXX fatal error? */
                }
        }

        return pmp;
}

/*
 * Destroy a PFS.  Typically this only occurs after the last mount on a
 * device has gone away.
 */
static void
hammer2_pfsfree(hammer2_pfs_t *pmp)
{
        /*
         * Clean up our reference on iroot.  iroot is not (and should not
         * be) needed by the flush code.
         */
        TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);

        hammer2_syncthr_delete(&pmp->primary_thr);

        if (pmp->iroot) {
#if REPORT_REFS_ERRORS
                if (pmp->iroot->refs != 1)
                        kprintf("PMP->IROOT %p REFS WRONG %d\n",
                                pmp->iroot, pmp->iroot->refs);
#else
                KKASSERT(pmp->iroot->refs == 1);
#endif
                /* ref for pmp->iroot */
                hammer2_inode_drop(pmp->iroot);
                pmp->iroot = NULL;
        }

        kmalloc_destroy(&pmp->mmsg);
        kmalloc_destroy(&pmp->minode);

        kfree(pmp, M_HAMMER2);
}

/*
 * Remove all references to hmp from the pfs list.  Any PFS which becomes
 * empty is terminated and freed.
 *
 * XXX inefficient.
 */
static void
hammer2_pfsfree_scan(hammer2_dev_t *hmp)
{
        hammer2_pfs_t *pmp;
        hammer2_cluster_t *cluster;
        hammer2_chain_t *rchain;
        int didfreeze;
        int i;

again:
        TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                if (pmp->iroot == NULL)
                        continue;
                if (hmp->spmp == pmp) {
                        kprintf("unmount hmp %p remove spmp %p\n",
                                hmp, pmp);
                        hmp->spmp = NULL;
                }

                /*
                 * Determine if this PFS is affected.  If it is we must
                 * freeze all management threads and lock its iroot.
                 *
                 * Freezing a management thread forces it idle; operations
                 * in progress will be aborted and it will have to start
                 * over again when unfrozen, or exit if told to exit.
                 */
                cluster = &pmp->iroot->cluster;
                for (i = 0; i < cluster->nchains; ++i) {
                        rchain = cluster->array[i].chain;
                        if (rchain == NULL || rchain->hmp != hmp)
                                continue;
                        break;
                }
                if (i != cluster->nchains) {
                        hammer2_syncthr_freeze(&pmp->primary_thr);

                        /*
                         * Lock the inode and clean out matching chains.
                         * Note that we cannot use hammer2_inode_lock_*()
                         * here because that would attempt to validate the
                         * cluster that we are in the middle of ripping
                         * apart.
                         *
                         * WARNING! We are working directly on the inode's
                         *          embedded cluster.
                         */
                        hammer2_mtx_ex(&pmp->iroot->lock);

                        /*
                         * Remove the chain from matching elements of the PFS.
                         */
                        for (i = 0; i < cluster->nchains; ++i) {
                                rchain = cluster->array[i].chain;
                                if (rchain == NULL || rchain->hmp != hmp)
                                        continue;

                                cluster->array[i].chain = NULL;
                                pmp->pfs_types[i] = 0;
                                hammer2_chain_drop(rchain);

                                /* focus hint */
                                if (cluster->focus == rchain)
                                        cluster->focus = NULL;
                        }
                        hammer2_mtx_unlock(&pmp->iroot->lock);
                        didfreeze = 1;  /* remaster, unfreeze down below */
                } else {
                        didfreeze = 0;
                }

                /*
                 * Cleanup trailing chains.  Do not reorder chains (for now).
                 * XXX might remove more than we intended.
                 */
                while (i > 0) {
                        if (cluster->array[i - 1].chain)
                                break;
                        --i;
                }
                cluster->nchains = i;

                /*
                 * If the PMP has no elements remaining we can destroy it.
                 * (this will transition management threads from frozen->exit).
                 */
                if (cluster->nchains == 0) {
                        kprintf("unmount hmp %p last ref to PMP=%p\n",
                                hmp, pmp);
                        hammer2_pfsfree(pmp);
                        goto again;
                }

                /*
                 * If elements still remain we need to set the REMASTER
                 * flag and unfreeze it.
                 */
                if (didfreeze) {
                        hammer2_syncthr_remaster(&pmp->primary_thr);
                        hammer2_syncthr_unfreeze(&pmp->primary_thr);
                }
        }
}

/*
 * Mount or remount HAMMER2 filesystem from physical media
 *
 *      mountroot
 *              mp              mount point structure
 *              path            NULL
 *              data            <unused>
 *              cred            <unused>
 *
 *      mount
 *              mp              mount point structure
 *              path            path to mount point
 *              data            pointer to argument structure in user space
 *                      volume  volume path (device@LABEL form)
 *                      hflags  user mount flags
 *              cred            user credentials
 *
 * RETURNS:     0       Success
 *              !0      error number
 */
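/*
 * For example (hypothetical device name), a volume spec of
 * "/dev/serno/XYZ.s1a@ROOT" is split below into dev = "/dev/serno/XYZ.s1a"
 * and label = "ROOT"; the label then selects the PFS under the super-root.
 */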
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  struct ucred *cred)
{
        struct hammer2_mount_info info;
        hammer2_pfs_t *pmp;
        hammer2_pfs_t *spmp;
        hammer2_dev_t *hmp;
        hammer2_key_t key_next;
        hammer2_key_t key_dummy;
        hammer2_key_t lhc;
        struct vnode *devvp;
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *cparent;
        const hammer2_inode_data_t *ripdata;
        hammer2_blockref_t bref;
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
        size_t done;
        char *dev;
        char *label;
        int ronly = 1;
        int error;
        int cache_index;
        int i;

        hmp = NULL;
        pmp = NULL;
        dev = NULL;
        label = NULL;
        devvp = NULL;
        cache_index = -1;

        kprintf("hammer2_mount\n");

        if (path == NULL) {
                /*
                 * Root mount
                 */
                bzero(&info, sizeof(info));
                info.cluster_fd = -1;
                return (EOPNOTSUPP);
        } else {
                /*
                 * Non-root mount or updating a mount
                 */
                error = copyin(data, &info, sizeof(info));
                if (error)
                        return (error);

                error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
                if (error)
                        return (error);

                /* Extract device and label */
                dev = devstr;
                label = strchr(devstr, '@');
                if (label == NULL ||
                    ((label + 1) - dev) > done) {
                        return (EINVAL);
                }
                *label = '\0';
                label++;
                if (*label == '\0')
                        return (EINVAL);

                if (mp->mnt_flag & MNT_UPDATE) {
                        /*
                         * Update mount.  Note that pmp->iroot->cluster is
                         * an inode-embedded cluster and thus cannot be
                         * directly locked.
                         *
                         * XXX HAMMER2 needs to implement NFS export via
                         *     mountctl.
                         */
                        pmp = MPTOPMP(mp);
                        cluster = &pmp->iroot->cluster;
                        for (i = 0; i < cluster->nchains; ++i) {
                                if (cluster->array[i].chain == NULL)
                                        continue;
                                hmp = cluster->array[i].chain->hmp;
                                devvp = hmp->devvp;
                                error = hammer2_remount(hmp, mp, path,
                                                        devvp, cred);
                                if (error)
                                        break;
                        }
                        /*hammer2_inode_install_hidden(pmp);*/

                        return error;
                }
        }

        /*
         * HMP device mount
         *
         * Lookup name and verify it refers to a block device.
         */
        error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
        if (error == 0)
                error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
        nlookup_done(&nd);

        if (error == 0) {
                if (vn_isdisk(devvp, &error))
                        error = vfs_mountedon(devvp);
        }

        /*
         * Determine if the device has already been mounted.  After this
         * check hmp will be non-NULL if we are doing a second or subsequent
         * hammer2 mount from the same device.
         */
        lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
        TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                if (hmp->devvp == devvp)
                        break;
        }

        /*
         * Open the device if this isn't a secondary mount and construct
         * the H2 device mount (hmp).
         */
        if (hmp == NULL) {
                hammer2_chain_t *schain;
                hammer2_xid_t xid;

                if (error == 0 && vcount(devvp) > 0)
                        error = EBUSY;

                /*
                 * Now open the device
                 */
                if (error == 0) {
                        ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
                        vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                        error = vinvalbuf(devvp, V_SAVE, 0, 0);
                        if (error == 0) {
                                error = VOP_OPEN(devvp,
                                                 ronly ? FREAD : FREAD | FWRITE,
                                                 FSCRED, NULL);
                        }
                        vn_unlock(devvp);
                }
                if (error && devvp) {
                        vrele(devvp);
                        devvp = NULL;
                }
                if (error) {
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        return error;
                }
                hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
                ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev);
                hmp->ronly = ronly;
                hmp->devvp = devvp;
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);
                spin_init(&hmp->io_spin, "hm2mount_io");
                spin_init(&hmp->list_spin, "hm2mount_list");
                TAILQ_INIT(&hmp->flushq);

                lockinit(&hmp->vollk, "h2vol", 0, 0);

                /*
                 * vchain setup. vchain.data is embedded.
                 * vchain.refs is initialized and will never drop to 0.
                 *
                 * NOTE! voldata is not yet loaded.
                 */
                hmp->vchain.hmp = hmp;
                hmp->vchain.refs = 1;
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;

                hammer2_chain_core_init(&hmp->vchain);
                /* hmp->vchain.u.xxx is left NULL */

                /*
                 * fchain setup.  fchain.data is embedded.
                 * fchain.refs is initialized and will never drop to 0.
                 *
                 * The data is not used but needs to be initialized to
                 * pass assertion muster.  We use this chain primarily
                 * as a placeholder for the freemap's top-level RBTREE
                 * so it does not interfere with the volume's topology
                 * RBTREE.
                 */
                hmp->fchain.hmp = hmp;
                hmp->fchain.refs = 1;
                hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
                hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
                hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

                hammer2_chain_core_init(&hmp->fchain);
                /* hmp->fchain.u.xxx is left NULL */

                /*
                 * Install the volume header and initialize fields from
                 * voldata.
                 */
                error = hammer2_install_volume_header(hmp);
                if (error) {
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return error;
                }

                /*
                 * Really important to get these right or flush will get
                 * confused.
                 */
                hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0);
                kprintf("alloc spmp %p tid %016jx\n",
                        hmp->spmp, hmp->voldata.mirror_tid);
                spmp = hmp->spmp;
                spmp->inode_tid = 1;

                /*
                 * Dummy-up vchain and fchain's modify_tid.  mirror_tid
                 * is inherited from the volume header.
                 */
                xid = 0;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
                hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
                hmp->vchain.pmp = spmp;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
                hmp->fchain.pmp = spmp;

                /*
                 * First locate the super-root inode, which is key 0
                 * relative to the volume header's blockset.
                 *
                 * Then locate the root inode by scanning the directory keyspace
                 * represented by the label.
                 */
                parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
                schain = hammer2_chain_lookup(&parent, &key_dummy,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
                                      &cache_index, 0);
                hammer2_chain_lookup_done(parent);
                if (schain == NULL) {
                        kprintf("hammer2_mount: invalid super-root\n");
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }
                if (schain->error) {
                        kprintf("hammer2_mount: error %s reading super-root\n",
                                hammer2_error_str(schain->error));
                        hammer2_chain_unlock(schain);
                        hammer2_chain_drop(schain);
                        schain = NULL;
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }
                spmp->modify_tid = schain->bref.modify_tid;

                /*
                 * Sanity-check schain's pmp and finish initialization.
                 * Any chain belonging to the super-root topology should
                 * have a NULL pmp (not even set to spmp).
                 */
                ripdata = &hammer2_chain_rdata(schain)->ipdata;
                KKASSERT(schain->pmp == NULL);
                spmp->pfs_clid = ripdata->pfs_clid;

                /*
                 * Replace the dummy spmp->iroot with a real one.  It's
                 * easier to just do a wholesale replacement than to try
                 * to update the chain and fixup the iroot fields.
                 *
                 * The returned inode is locked with the supplied cluster.
                 */
                cluster = hammer2_cluster_from_chain(schain);
                hammer2_inode_drop(spmp->iroot);
                spmp->iroot = NULL;
                spmp->iroot = hammer2_inode_get(spmp, NULL, cluster);
                spmp->spmp_hmp = hmp;
                spmp->pfs_types[0] = ripdata->pfs_type;
                hammer2_inode_ref(spmp->iroot);
                hammer2_inode_unlock(spmp->iroot, cluster);
                schain = NULL;
                /* leave spmp->iroot with one ref */

                if ((mp->mnt_flag & MNT_RDONLY) == 0) {
                        error = hammer2_recovery(hmp);
                        /* XXX do something with error */
                }
                hammer2_update_pmps(hmp);
                hammer2_iocom_init(hmp);

                /*
                 * Ref the cluster management messaging descriptor.  The mount
                 * program deals with the other end of the communications pipe.
                 */
                fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
                if (fp) {
                        hammer2_cluster_reconnect(hmp, fp);
                } else {
                        kprintf("hammer2_mount: bad cluster_fd!\n");
                }
        } else {
                spmp = hmp->spmp;
        }

        /*
         * Lookup the mount point under the media-localized super-root.
         * Scanning hammer2_pfslist doesn't help us because it represents
         * PFS cluster ids which can aggregate several named PFSs together.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS);
        lhc = hammer2_dirhash(label, strlen(label));
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                      0);
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE &&
                    strcmp(label,
                       hammer2_cluster_rdata(cluster)->ipdata.filename) == 0) {
                        break;
                }
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                            key_next,
                                            lhc + HAMMER2_DIRHASH_LOMASK, 0);
        }
        hammer2_inode_unlock(spmp->iroot, cparent);

        /*
         * PFS could not be found?
         */
        if (cluster == NULL) {
                kprintf("hammer2_mount: PFS label not found\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EINVAL;
        }

        /*
         * Acquire the pmp structure (it should have already been allocated
         * via hammer2_update_pmps() so do not pass cluster in to add to
         * available chains).
         *
         * Check if the cluster has already been mounted.  A cluster can
         * only be mounted once; use null mounts to mount additional copies.
         */
        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        hammer2_cluster_bref(cluster, &bref);
        pmp = hammer2_pfsalloc(NULL, ripdata, bref.modify_tid);
        hammer2_cluster_unlock(cluster);
        hammer2_cluster_drop(cluster);

        if (pmp->mp) {
                kprintf("hammer2_mount: PFS already mounted!\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EBUSY;
        }

        /*
         * Finish the mount
         */
        kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp);

        mp->mnt_flag = MNT_LOCAL;
        mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
        mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */

        /*
         * required mount structure initializations
         */
        mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
        mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;

        mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
        mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;

        /*
         * Optional fields
         */
        mp->mnt_iosize_max = MAXPHYS;

        /*
         * Connect up mount pointers.
         */
        hammer2_mount_helper(mp, pmp);

        lockmgr(&hammer2_mntlk, LK_RELEASE);

        /*
         * A mounted PFS needs a write thread for logical buffers and
         * a hidden directory for deletions of open files.  These features
         * are not used by unmounted PFSs.
         *
         * The logical file buffer bio write thread handles things like
         * physical block assignment and compression.
         */
        pmp->wthread_destroy = 0;
        lwkt_create(hammer2_write_thread, pmp,
                    &pmp->wthread_td, NULL, 0, -1, "hwrite-%s", label);

        /*
         * With the cluster operational install ihidden.
         * (only applicable to pfs mounts, not applicable to spmp)
         */
        hammer2_inode_install_hidden(pmp);

        /*
         * Finish setup
         */
        vfs_getnewfsid(mp);
        vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
        vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
        vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);

        copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
        bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
        copyinstr(path, mp->mnt_stat.f_mntonname,
                  sizeof(mp->mnt_stat.f_mntonname) - 1,
                  &size);

        /*
         * Initial statfs to prime mnt_stat.
         */
        hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);

        return 0;
}

/*
 * Scan PFSs under the super-root and create hammer2_pfs structures.
 */
static
void
hammer2_update_pmps(hammer2_dev_t *hmp)
{
        const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_blockref_t bref;
        hammer2_pfs_t *spmp;
        hammer2_pfs_t *pmp;
        hammer2_key_t key_next;

        /*
         * Lookup mount point under the media-localized super-root.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        spmp = hmp->spmp;
        cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS);
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         HAMMER2_KEY_MIN,
                                         HAMMER2_KEY_MAX,
                                         0);
        while (cluster) {
                /*
                 * Only inodes qualify.  A bare 'continue' here would spin
                 * forever since the iterator would never advance, so fall
                 * through to hammer2_cluster_next() in all cases.
                 */
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
                        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        hammer2_cluster_bref(cluster, &bref);
                        kprintf("ADD LOCAL PFS: %s\n", ripdata->filename);

                        pmp = hammer2_pfsalloc(cluster, ripdata,
                                               bref.modify_tid);
                }
                cluster = hammer2_cluster_next(cparent, cluster,
                                               &key_next,
                                               key_next,
                                               HAMMER2_KEY_MAX,
                                               0);
        }
        hammer2_inode_unlock(spmp->iroot, cparent);
}

/*
 * Handle the bioq for strategy writes: logical file buffers queued by
 * the strategy code are assigned physical storage and written out here.
 */
static
void
hammer2_write_thread(void *arg)
{
        hammer2_pfs_t *pmp;
        struct bio *bio;
        struct buf *bp;
        hammer2_trans_t trans;
        struct vnode *vp;
        hammer2_inode_t *ip;
        hammer2_cluster_t *cparent;
        const hammer2_inode_data_t *ripdata;
        hammer2_key_t lbase;
        int lblksize;
        int pblksize;
        int error;

        pmp = arg;

        hammer2_mtx_ex(&pmp->wthread_mtx);
        while (pmp->wthread_destroy == 0) {
                if (bioq_first(&pmp->wthread_bioq) == NULL) {
                        mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx,
                                 0, "h2bioqw", 0);
                }
                cparent = NULL;

                hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE);

                while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
                        /*
                         * dummy bio for synchronization.  The transaction
                         * must be reinitialized.
                         */
                        if (bio->bio_buf == NULL) {
                                bio->bio_flags |= BIO_DONE;
                                wakeup(bio);
                                hammer2_trans_done(&trans);
                                hammer2_trans_init(&trans, pmp,
                                                   HAMMER2_TRANS_BUFCACHE);
                                continue;
                        }

                        /*
                         * else normal bio processing
                         */
                        hammer2_mtx_unlock(&pmp->wthread_mtx);

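                        /*
                         * Release one unit of the logical write-in-progress
                         * throttle acquired when this bio was queued (the
                         * hammer2_flush_pipe tunable appears to bound this
                         * pipeline).
                         */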
                        hammer2_lwinprog_drop(pmp);

                        error = 0;
                        bp = bio->bio_buf;
                        vp = bp->b_vp;
                        ip = VTOI(vp);

                        /*
                         * Inode is modified; flush size and mtime changes
                         * to ensure that the file size remains consistent
                         * with the buffers being flushed.
                         *
                         * NOTE: The inode_fsync() call only flushes the
                         *       inode's meta-data state; it doesn't try
                         *       to flush underlying buffers or chains.
                         *
                         * NOTE: hammer2_write_file_core() may indirectly
                         *       modify and modsync the inode.
                         */
                        cparent = hammer2_inode_lock(ip,
                                                     HAMMER2_RESOLVE_ALWAYS);
                        if (ip->flags & (HAMMER2_INODE_RESIZED |
                                         HAMMER2_INODE_MTIME)) {
                                hammer2_inode_fsync(&trans, ip, cparent);
                        }
                        ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
                        lblksize = hammer2_calc_logical(ip, bio->bio_offset,
                                                        &lbase, NULL);
                        pblksize = hammer2_calc_physical(ip, ripdata, lbase);
                        hammer2_write_file_core(bp, &trans, ip, ripdata,
                                                cparent,
                                                lbase, IO_ASYNC,
                                                pblksize, &error);
                        /* ripdata can be invalid after call */
                        hammer2_inode_unlock(ip, cparent);
                        if (error) {
                                kprintf("hammer2: error in buffer write\n");
                                bp->b_flags |= B_ERROR;
                                bp->b_error = EIO;
                        }
                        biodone(bio);
                        hammer2_mtx_ex(&pmp->wthread_mtx);
                }
                hammer2_trans_done(&trans);
        }
        pmp->wthread_destroy = -1;
        wakeup(&pmp->wthread_destroy);

        hammer2_mtx_unlock(&pmp->wthread_mtx);
}

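/*
 * Flush the write thread's bioq: queue a dummy bio (bio_buf == NULL)
 * behind any pending writes and sleep until the write thread marks it
 * BIO_DONE.
 */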
void
hammer2_bioq_sync(hammer2_pfs_t *pmp)
{
        struct bio sync_bio;

        bzero(&sync_bio, sizeof(sync_bio));     /* dummy with no bio_buf */
        hammer2_mtx_ex(&pmp->wthread_mtx);
        if (pmp->wthread_destroy == 0 &&
            TAILQ_FIRST(&pmp->wthread_bioq.queue)) {
                bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
                while ((sync_bio.bio_flags & BIO_DONE) == 0)
                        mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0);
        }
        hammer2_mtx_unlock(&pmp->wthread_mtx);
}

/*
 * Return a chain suitable for I/O, creating the chain if necessary
 * and assigning its physical block.
 *
 * cparent can wind up being anything.
 */
static
hammer2_cluster_t *
hammer2_assign_physical(hammer2_trans_t *trans,
                        hammer2_inode_t *ip, hammer2_cluster_t *cparent,
                        hammer2_key_t lbase, int pblksize, int *errorp)
{
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *dparent;
        hammer2_key_t key_dummy;
        int pradix = hammer2_getradix(pblksize);

        /*
         * Locate the chain associated with lbase, return a locked chain.
         * However, do not instantiate any data reference (which utilizes a
         * device buffer) because we will be using direct IO via the
         * logical buffer cache buffer.
         */
        *errorp = 0;
        KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
retry:
        dparent = hammer2_cluster_lookup_init(cparent, 0);
        cluster = hammer2_cluster_lookup(dparent, &key_dummy,
                                     lbase, lbase,
                                     HAMMER2_LOOKUP_NODATA);

        if (cluster == NULL) {
                /*
                 * We found a hole; create a new chain entry.
                 *
                 * NOTE: DATA chains are created without device backing
                 *       store (nor do we want any).
                 */
                *errorp = hammer2_cluster_create(trans, dparent, &cluster,
                                               lbase, HAMMER2_PBUFRADIX,
                                               HAMMER2_BREF_TYPE_DATA,
                                               pblksize, 0);
                if (cluster == NULL) {
                        hammer2_cluster_lookup_done(dparent);
                        panic("hammer2_cluster_create: par=%p error=%d\n",
                                dparent->focus, *errorp);
                        goto retry;
                }
                /*ip->delta_dcount += pblksize;*/
        } else {
                switch (hammer2_cluster_type(cluster)) {
                case HAMMER2_BREF_TYPE_INODE:
                        /*
                         * The data is embedded in the inode.  The
                         * caller is responsible for marking the inode
                         * modified and copying the data to the embedded
                         * area.
                         */
                        break;
                case HAMMER2_BREF_TYPE_DATA:
                        if (hammer2_cluster_need_resize(cluster, pblksize)) {
                                hammer2_cluster_resize(trans, ip,
                                                     dparent, cluster,
                                                     pradix,
                                                     HAMMER2_MODIFY_OPTDATA);
                        }

                        /*
                         * DATA buffers must be marked modified whether the
                         * data is in a logical buffer or not.  We also have
                         * to make this call to fix up the chain data pointers
                         * after resizing in case this is an encrypted or
                         * compressed buffer.
                         */
                        hammer2_cluster_modify(trans, cluster,
                                               HAMMER2_MODIFY_OPTDATA);
                        break;
                default:
                        panic("hammer2_assign_physical: bad type");
                        /* NOT REACHED */
                        break;
                }
        }

        /*
         * Cleanup.  If cluster wound up being the inode itself, i.e.
         * the DIRECTDATA case for offset 0, then we need to update cparent.
         * The caller expects cparent to not become stale.
         */
        hammer2_cluster_lookup_done(dparent);
        /* dparent = NULL; safety */
        return (cluster);
}

/*
 * bio queued from hammer2_vnops.c.
 *
 * The core write function which determines which path to take
 * depending on compression settings.  We also have to locate the
 * related clusters so we can calculate and set the check data for
 * the blockref.
 */
1358 static
1359 void
1360 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
1361                         hammer2_inode_t *ip,
1362                         const hammer2_inode_data_t *ripdata,
1363                         hammer2_cluster_t *cparent,
1364                         hammer2_key_t lbase, int ioflag, int pblksize,
1365                         int *errorp)
1366 {
1367         hammer2_cluster_t *cluster;
1368
1369         switch(HAMMER2_DEC_ALGO(ripdata->comp_algo)) {
1370         case HAMMER2_COMP_NONE:
1371                 /*
1372                  * We have to assign physical storage to the buffer
1373                  * we intend to dirty or write now to avoid deadlocks
1374                  * in the strategy code later.
1375                  *
1376                  * This can return NOOFFSET for inode-embedded data.
1377                  * The strategy code will take care of it in that case.
1378                  */
1379                 cluster = hammer2_assign_physical(trans, ip, cparent,
1380                                                 lbase, pblksize,
1381                                                 errorp);
1382                 hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp,
1383                                  ripdata->check_algo);
1384                 /* ripdata can become invalid */
1385                 if (cluster) {
1386                         hammer2_cluster_unlock(cluster);
1387                         hammer2_cluster_drop(cluster);
1388                 }
1389                 break;
1390         case HAMMER2_COMP_AUTOZERO:
1391                 /*
1392                  * Check for zero-fill only
1393                  */
1394                 hammer2_zero_check_and_write(bp, trans, ip,
1395                                     ripdata, cparent, lbase,
1396                                     ioflag, pblksize, errorp,
1397                                     ripdata->check_algo);
1398                 break;
1399         case HAMMER2_COMP_LZ4:
1400         case HAMMER2_COMP_ZLIB:
1401         default:
1402                 /*
1403                  * Check for zero-fill and attempt compression.
1404                  */
1405                 hammer2_compress_and_write(bp, trans, ip,
1406                                            ripdata, cparent,
1407                                            lbase, ioflag,
1408                                            pblksize, errorp,
1409                                            ripdata->comp_algo,
1410                                            ripdata->check_algo);
1411                 break;
1412         }
1413 }
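
/*
 * EXAMPLE (editor's sketch): the dispatch above switches on
 * HAMMER2_DEC_ALGO(comp_algo).  The comp_algo field is assumed to pack
 * the compression method in its low 4 bits and a level in the next 4
 * bits (see hammer2_disk.h for the authoritative ENC/DEC macros); the
 * helpers below only illustrate that packing.
 */
#if 0
static int
example_dec_algo(int comp_algo)
{
        return (comp_algo & 15);        /* method, e.g. HAMMER2_COMP_LZ4 */
}

static int
example_dec_level(int comp_algo)
{
        return ((comp_algo >> 4) & 15); /* level, e.g. 6-9 for zlib */
}
#endif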
1414
1415 /*
1416  * Generic function that performs the compression in the compressed
1417  * write path.  The compression algorithm is determined by the settings
1418  * obtained from the inode.
1419  */
1420 static
1421 void
1422 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
1423         hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1424         hammer2_cluster_t *cparent,
1425         hammer2_key_t lbase, int ioflag, int pblksize,
1426         int *errorp, int comp_algo, int check_algo)
1427 {
1428         hammer2_cluster_t *cluster;
1429         hammer2_chain_t *chain;
1430         int comp_size;
1431         int comp_block_size;
1432         int i;
1433         char *comp_buffer;
1434
1435         if (test_block_zeros(bp->b_data, pblksize)) {
1436                 zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
1437                 return;
1438         }
1439
1440         comp_size = 0;
1441         comp_buffer = NULL;
1442
1443         KKASSERT(pblksize / 2 <= 32768);
1444
1445         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
1446                 z_stream strm_compress;
1447                 int comp_level;
1448                 int ret;
1449
1450                 switch(HAMMER2_DEC_ALGO(comp_algo)) {
1451                 case HAMMER2_COMP_LZ4:
1452                         comp_buffer = objcache_get(cache_buffer_write,
1453                                                    M_INTWAIT);
1454                         comp_size = LZ4_compress_limitedOutput(
1455                                         bp->b_data,
1456                                         &comp_buffer[sizeof(int)],
1457                                         pblksize,
1458                                         pblksize / 2 - sizeof(int));
1459                         /*
1460                          * We need to prefix with the size, LZ4
1461                          * doesn't do it for us.  Add the related
1462                          * overhead.
1463                          */
1464                         *(int *)comp_buffer = comp_size;
1465                         if (comp_size)
1466                                 comp_size += sizeof(int);
1467                         break;
1468                 case HAMMER2_COMP_ZLIB:
1469                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
1470                         if (comp_level == 0)
1471                                 comp_level = 6; /* default zlib compression */
1472                         else if (comp_level < 6)
1473                                 comp_level = 6; /* minimum supported */
1474                         else if (comp_level > 9)
1475                                 comp_level = 9; /* maximum zlib level */
1476                         ret = deflateInit(&strm_compress, comp_level);
1477                         if (ret != Z_OK) {
1478                                 kprintf("HAMMER2 ZLIB: fatal error "
1479                                         "on deflateInit.\n");
                                break;  /* leave comp_size 0, store uncompressed */
1480                         }
1481
1482                         comp_buffer = objcache_get(cache_buffer_write,
1483                                                    M_INTWAIT);
1484                         strm_compress.next_in = bp->b_data;
1485                         strm_compress.avail_in = pblksize;
1486                         strm_compress.next_out = comp_buffer;
1487                         strm_compress.avail_out = pblksize / 2;
1488                         ret = deflate(&strm_compress, Z_FINISH);
1489                         if (ret == Z_STREAM_END) {
1490                                 comp_size = pblksize / 2 -
1491                                             strm_compress.avail_out;
1492                         } else {
1493                                 comp_size = 0;
1494                         }
1495                         ret = deflateEnd(&strm_compress);
1496                         break;
1497                 default:
1498                         kprintf("hammer2: unknown compression method "
1499                                 "%d\n", comp_algo);
1500                         break;
1501                 }
1502         }
1503
1504         if (comp_size == 0) {
1505                 /*
1506                  * compression failed or turned off
1507                  */
1508                 comp_block_size = pblksize;     /* safety */
1509                 if (++ip->comp_heuristic > 128)
1510                         ip->comp_heuristic = 8;
1511         } else {
1512                 /*
1513                  * compression succeeded
1514                  */
1515                 ip->comp_heuristic = 0;
1516                 if (comp_size <= 1024) {
1517                         comp_block_size = 1024;
1518                 } else if (comp_size <= 2048) {
1519                         comp_block_size = 2048;
1520                 } else if (comp_size <= 4096) {
1521                         comp_block_size = 4096;
1522                 } else if (comp_size <= 8192) {
1523                         comp_block_size = 8192;
1524                 } else if (comp_size <= 16384) {
1525                         comp_block_size = 16384;
1526                 } else if (comp_size <= 32768) {
1527                         comp_block_size = 32768;
1528                 } else {
1529                         panic("hammer2: WRITE PATH: "
1530                               "Weird comp_size value.");
1531                         /* NOT REACHED */
1532                         comp_block_size = pblksize;
1533                 }
1534         }
1535
1536         cluster = hammer2_assign_physical(trans, ip, cparent,
1537                                           lbase, comp_block_size,
1538                                           errorp);
1539         ripdata = NULL;
1540
1541         if (*errorp) {
1542                 kprintf("hammer2: WRITE PATH: An error occurred while "
1543                         "assigning physical space.\n");
1544                 KKASSERT(cluster == NULL);
1545                 goto done;
1546         }
1547
1548         if (cluster->ddflag) {
1549                 hammer2_inode_data_t *wipdata;
1550
1551                 wipdata = hammer2_cluster_modify_ip(trans, ip, cluster, 0);
1552                 KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1553                 KKASSERT(bp->b_loffset == 0);
1554                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1555                 hammer2_cluster_modsync(cluster);
1556         } else
1557         for (i = 0; i < cluster->nchains; ++i) {
1558                 hammer2_io_t *dio;
1559                 char *bdata;
1560
1561                 /* XXX hack */
1562
1563                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1564                         continue;
1565                 chain = cluster->array[i].chain;        /* XXX */
1566                 if (chain == NULL)
1567                         continue;
1568                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1569
1570                 switch(chain->bref.type) {
1571                 case HAMMER2_BREF_TYPE_INODE:
1572                         panic("hammer2_compress_and_write: unexpected inode\n");
1573                         break;
1574                 case HAMMER2_BREF_TYPE_DATA:
1575                         /*
1576                          * Optimize out the read-before-write
1577                          * if possible.
1578                          */
1579                         *errorp = hammer2_io_newnz(chain->hmp,
1580                                                    chain->bref.data_off,
1581                                                    chain->bytes,
1582                                                    &dio);
1583                         if (*errorp) {
1584                                 hammer2_io_brelse(&dio);
1585                                 kprintf("hammer2: WRITE PATH: "
1586                                         "dbp bread error\n");
1587                                 break;
1588                         }
1589                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1590
1591                         /*
1592                          * When loading the block make sure we don't
1593                          * leave garbage after the compressed data.
1594                          */
1595                         if (comp_size) {
1596                                 chain->bref.methods =
1597                                         HAMMER2_ENC_COMP(comp_algo) +
1598                                         HAMMER2_ENC_CHECK(check_algo);
1599                                 bcopy(comp_buffer, bdata, comp_size);
1600                                 if (comp_size != comp_block_size) {
1601                                         bzero(bdata + comp_size,
1602                                               comp_block_size - comp_size);
1603                                 }
1604                         } else {
1605                                 chain->bref.methods =
1606                                         HAMMER2_ENC_COMP(
1607                                                 HAMMER2_COMP_NONE) +
1608                                         HAMMER2_ENC_CHECK(check_algo);
1609                                 bcopy(bp->b_data, bdata, pblksize);
1610                         }
1611
1612                         /*
1613                          * The flush code doesn't calculate check codes for
1614                          * file data (doing so can result in excessive I/O),
1615                          * so we do it here.
1616                          */
1617                         hammer2_chain_setcheck(chain, bdata);
1618
1619                         /*
1620                          * Device buffer is now valid, chain is no longer in
1621                          * the initial state.
1622                          *
1623                          * (No blockref table worries with file data)
1624                          */
1625                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1626
1627                         /* Now write the related dbp. */
1628                         if (ioflag & IO_SYNC) {
1629                                 /*
1630                                  * Synchronous I/O requested.
1631                                  */
1632                                 hammer2_io_bwrite(&dio);
1633                         /*
1634                         } else if ((ioflag & IO_DIRECT) &&
1635                                    loff + n == pblksize) {
1636                                 hammer2_io_bdwrite(&dio);
1637                         */
1638                         } else if (ioflag & IO_ASYNC) {
1639                                 hammer2_io_bawrite(&dio);
1640                         } else {
1641                                 hammer2_io_bdwrite(&dio);
1642                         }
1643                         break;
1644                 default:
1645                         panic("hammer2_compress_and_write: bad chain type %d\n",
1646                                 chain->bref.type);
1647                         /* NOT REACHED */
1648                         break;
1649                 }
1650         }
1651 done:
1652         if (cluster) {
1653                 hammer2_cluster_unlock(cluster);
1654                 hammer2_cluster_drop(cluster);
1655         }
1656         if (comp_buffer)
1657                 objcache_put(cache_buffer_write, comp_buffer);
1658 }
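
/*
 * EXAMPLE (editor's sketch): two details of hammer2_compress_and_write()
 * reduced to standalone form.  (1) LZ4 does not record the compressed
 * length, so the write path stores it in a leading int.  (2) A successful
 * result is rounded up to the next power-of-2 physical block size in the
 * 1KB-32KB range; on failure the full pblksize is used.
 */
#if 0
#include <string.h>

/* Round a compressed size up to the storage bucket selected above. */
static int
example_comp_block_size(int comp_size, int pblksize)
{
        int bsize;

        if (comp_size == 0)             /* compression failed or disabled */
                return (pblksize);
        for (bsize = 1024; bsize < comp_size; bsize <<= 1)
                ;
        return (bsize);                 /* 1024, 2048, ... 32768 */
}

/* Prefix an LZ4 payload with its length, as the LZ4 case above does. */
static int
example_lz4_frame(char *dst, const char *payload, int payload_len)
{
        memcpy(dst, &payload_len, sizeof(int));
        memcpy(dst + sizeof(int), payload, payload_len);
        return (payload_len + (int)sizeof(int));
}
#endif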
1659
1660 /*
1661  * Function that performs zero-checking and writing without compression;
1662  * it corresponds to the default zero-checking path.
1663  */
1664 static
1665 void
1666 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1667         hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1668         hammer2_cluster_t *cparent,
1669         hammer2_key_t lbase, int ioflag, int pblksize, int *errorp,
1670         int check_algo)
1671 {
1672         hammer2_cluster_t *cluster;
1673
1674         if (test_block_zeros(bp->b_data, pblksize)) {
1675                 zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
1676                 /* ripdata can become invalid */
1677         } else {
1678                 cluster = hammer2_assign_physical(trans, ip, cparent,
1679                                                   lbase, pblksize, errorp);
1680                 hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp,
1681                                  check_algo);
1682                 /* ripdata can become invalid */
1683                 if (cluster) {
1684                         hammer2_cluster_unlock(cluster);
1685                         hammer2_cluster_drop(cluster);
1686                 }
1687         }
1688 }
1689
1690 /*
1691  * A function to test whether a block of data contains only zeros.
1692  * Returns TRUE (non-zero) if the block is all zeros.  Assumes that
 * bytes is a multiple of sizeof(long) and that buf is long-aligned.
1693  */
1694 static
1695 int
1696 test_block_zeros(const char *buf, size_t bytes)
1697 {
1698         size_t i;
1699
1700         for (i = 0; i < bytes; i += sizeof(long)) {
1701                 if (*(const long *)(buf + i) != 0)
1702                         return (0);
1703         }
1704         return (1);
1705 }
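
/*
 * EXAMPLE (editor's sketch): hypothetical usage of test_block_zeros().
 * The power-of-2 logical block sizes used in this filesystem always
 * satisfy the alignment/size assumptions noted above.
 */
#if 0
static int
example_zero_scan(void)
{
        static char block[4096];        /* static: long-aligned, zeroed */

        if (test_block_zeros(block, sizeof(block)) == 0)
                return (-1);            /* unexpected: block is all zeros */
        block[100] = 1;
        if (test_block_zeros(block, sizeof(block)) != 0)
                return (-1);            /* unexpected: block is not zero */
        return (0);
}
#endif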
1706
1707 /*
1708  * Function to "write" a block that contains only zeros.
1709  */
1710 static
1711 void
1712 zero_write(struct buf *bp, hammer2_trans_t *trans,
1713            hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1714            hammer2_cluster_t *cparent,
1715            hammer2_key_t lbase, int *errorp __unused)
1716 {
1717         hammer2_cluster_t *cluster;
1718         hammer2_key_t key_dummy;
1719
1720         cparent = hammer2_cluster_lookup_init(cparent, 0);
1721         cluster = hammer2_cluster_lookup(cparent, &key_dummy, lbase, lbase,
1722                                      HAMMER2_LOOKUP_NODATA);
1723         if (cluster) {
1724                 if (cluster->ddflag) {
1725                         hammer2_inode_data_t *wipdata;
1726
1727                         wipdata = hammer2_cluster_modify_ip(trans, ip,
1728                                                             cluster, 0);
1729                         KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1730                         KKASSERT(bp->b_loffset == 0);
1731                         bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1732                         hammer2_cluster_modsync(cluster);
1733                 } else {
1734                         hammer2_cluster_delete(trans, cparent, cluster,
1735                                                HAMMER2_DELETE_PERMANENT);
1736                 }
1737                 hammer2_cluster_unlock(cluster);
1738                 hammer2_cluster_drop(cluster);
1739         }
1740         hammer2_cluster_lookup_done(cparent);
1741 }
1742
1743 /*
1744  * Function to write the data as-is, without performing any sort of
1745  * compression.  This function is used in the no-compression path and
1746  * in the default zero-checking path.
1747  */
1748 static
1749 void
1750 hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
1751                                 int pblksize, int *errorp, int check_algo)
1752 {
1753         hammer2_chain_t *chain;
1754         hammer2_inode_data_t *wipdata;
1755         hammer2_io_t *dio;
1756         char *bdata;
1757         int error;
1758         int i;
1759
1760         error = 0;      /* XXX TODO below */
1761
1762         for (i = 0; i < cluster->nchains; ++i) {
1763                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1764                         continue;
1765                 chain = cluster->array[i].chain;        /* XXX */
1766                 if (chain == NULL)
1767                         continue;
1768                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1769
1770                 switch(chain->bref.type) {
1771                 case HAMMER2_BREF_TYPE_INODE:
1772                         wipdata = &hammer2_chain_wdata(chain)->ipdata;
1773                         KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1774                         KKASSERT(bp->b_loffset == 0);
1775                         bcopy(bp->b_data, wipdata->u.data,
1776                               HAMMER2_EMBEDDED_BYTES);
1777                         error = 0;
1778                         break;
1779                 case HAMMER2_BREF_TYPE_DATA:
1780                         error = hammer2_io_newnz(chain->hmp,
1781                                                  chain->bref.data_off,
1782                                                  chain->bytes, &dio);
1783                         if (error) {
1784                                 hammer2_io_bqrelse(&dio);
1785                                 kprintf("hammer2: WRITE PATH: "
1786                                         "dbp bread error\n");
1787                                 break;
1788                         }
1789                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1790
1791                         chain->bref.methods = HAMMER2_ENC_COMP(
1792                                                         HAMMER2_COMP_NONE) +
1793                                               HAMMER2_ENC_CHECK(check_algo);
1794                         bcopy(bp->b_data, bdata, chain->bytes);
1795
1796                         /*
1797                          * The flush code doesn't calculate check codes for
1798                          * file data (doing so can result in excessive I/O),
1799                          * so we do it here.
1800                          */
1801                         hammer2_chain_setcheck(chain, bdata);
1802
1803                         /*
1804                          * Device buffer is now valid, chain is no longer in
1805                          * the initial state.
1806                          *
1807                          * (No blockref table worries with file data)
1808                          */
1809                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1810
1811                         if (ioflag & IO_SYNC) {
1812                                 /*
1813                                  * Synchronous I/O requested.
1814                                  */
1815                                 hammer2_io_bwrite(&dio);
1816                         /*
1817                         } else if ((ioflag & IO_DIRECT) &&
1818                                    loff + n == pblksize) {
1819                                 hammer2_io_bdwrite(&dio);
1820                         */
1821                         } else if (ioflag & IO_ASYNC) {
1822                                 hammer2_io_bawrite(&dio);
1823                         } else {
1824                                 hammer2_io_bdwrite(&dio);
1825                         }
1826                         break;
1827                 default:
1828                         panic("hammer2_write_bp: bad chain type %d\n",
1829                               chain->bref.type);
1830                         /* NOT REACHED */
1831                         error = 0;
1832                         break;
1833                 }
1834                 KKASSERT(error == 0);   /* XXX TODO */
1835         }
1836         *errorp = error;
1837 }
1838
1839 static
1840 int
1841 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path,
1842                 struct vnode *devvp, struct ucred *cred)
1843 {
1844         int error;
1845
1846         if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
1847                 error = hammer2_recovery(hmp);
1848         } else {
1849                 error = 0;
1850         }
1851         return error;
1852 }
1853
1854 static
1855 int
1856 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1857 {
1858         hammer2_pfs_t *pmp;
1859         int flags;
1860         int error = 0;
1861
1862         pmp = MPTOPMP(mp);
1863
1864         if (pmp == NULL)
1865                 return(0);
1866
1867         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1868
1869         /*
1870          * If mount initialization proceeded far enough we must flush
1871          * its vnodes and sync the underlying mount points.  Three syncs
1872          * are required to fully flush the filesystem (freemap updates lag
1873          * by one flush, and one extra for safety).
1874          */
1875         if (mntflags & MNT_FORCE)
1876                 flags = FORCECLOSE;
1877         else
1878                 flags = 0;
1879         if (pmp->iroot) {
1880                 error = vflush(mp, 0, flags);
1881                 if (error)
1882                         goto failed;
1883                 hammer2_vfs_sync(mp, MNT_WAIT);
1884                 hammer2_vfs_sync(mp, MNT_WAIT);
1885                 hammer2_vfs_sync(mp, MNT_WAIT);
1886         }
1887
1888         if (pmp->wthread_td) {
1889                 hammer2_mtx_ex(&pmp->wthread_mtx);
1890                 pmp->wthread_destroy = 1;
1891                 wakeup(&pmp->wthread_bioq);
1892                 while (pmp->wthread_destroy != -1) {
1893                         mtxsleep(&pmp->wthread_destroy,
1894                                 &pmp->wthread_mtx, 0,
1895                                 "umount-sleep", 0);
1896                 }
1897                 hammer2_mtx_unlock(&pmp->wthread_mtx);
1898                 pmp->wthread_td = NULL;
1899         }
1900
1901         /*
1902          * Cleanup our reference on ihidden.
1903          */
1904         if (pmp->ihidden) {
1905                 hammer2_inode_drop(pmp->ihidden);
1906                 pmp->ihidden = NULL;
1907         }
1908         if (pmp->mp)
1909                 hammer2_unmount_helper(mp, pmp, NULL);
1910
1911         error = 0;
1912 failed:
1913         lockmgr(&hammer2_mntlk, LK_RELEASE);
1914
1915         return (error);
1916 }
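
/*
 * EXAMPLE (editor's sketch): the wthread shutdown handshake above.
 * The unmount path sets wthread_destroy to 1, wakes the thread, and
 * sleeps until the thread acknowledges by setting it to -1.  The same
 * protocol in standalone pthreads form; all names are hypothetical.
 */
#if 0
#include <pthread.h>

static pthread_mutex_t ex_mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  ex_cv  = PTHREAD_COND_INITIALIZER;
static int ex_destroy;          /* 0=running, 1=stop requested, -1=done */

static void
example_request_shutdown(void)
{
        pthread_mutex_lock(&ex_mtx);
        ex_destroy = 1;                 /* analog of wthread_destroy = 1 */
        pthread_cond_broadcast(&ex_cv); /* analog of wakeup() */
        while (ex_destroy != -1)
                pthread_cond_wait(&ex_cv, &ex_mtx);
        pthread_mutex_unlock(&ex_mtx);
}

static void
example_worker_exit(void)               /* last thing the worker does */
{
        pthread_mutex_lock(&ex_mtx);
        ex_destroy = -1;                /* acknowledge the request */
        pthread_cond_broadcast(&ex_cv);
        pthread_mutex_unlock(&ex_mtx);
}
#endif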
1917
1918 /*
1919  * Mount helper, hook the system mount into our PFS.
1920  * The mount lock is held.
1921  *
1922  * We must bump the mount_count on related devices for any
1923  * mounted PFSs.
1924  */
1925 static
1926 void
1927 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
1928 {
1929         hammer2_cluster_t *cluster;
1930         hammer2_chain_t *rchain;
1931         int i;
1932
1933         mp->mnt_data = (qaddr_t)pmp;
1934         pmp->mp = mp;
1935
1936         /*
1937          * After pmp->mp is set we have to adjust hmp->mount_count.
1938          */
1939         cluster = &pmp->iroot->cluster;
1940         for (i = 0; i < cluster->nchains; ++i) {
1941                 rchain = cluster->array[i].chain;
1942                 if (rchain == NULL)
1943                         continue;
1944                 ++rchain->hmp->mount_count;
1945                 kprintf("hammer2_mount hmp=%p ++mount_count=%d\n",
1946                         rchain->hmp, rchain->hmp->mount_count);
1947         }
1948 }
1949
1950 /*
1951  * Mount helper, unhook the system mount from our PFS.
1952  * The mount lock is held.
1953  *
1954  * If hmp is supplied, the mount responsible for being the first to open
1955  * the block device failed and the block device and all PFSs using the
1956  * block device must be cleaned up.
1957  *
1958  * If pmp is supplied, multiple devices might be backing the PFS and each
1959  * must be disconnected.  This might not be the last PFS using some of
1960  * the underlying devices.  Also, we have to adjust our hmp->mount_count
1961  * accounting for the devices backing the pmp, which is now undergoing
1962  * an unmount.
1963  */
1964 static
1965 void
1966 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
1967 {
1968         hammer2_cluster_t *cluster;
1969         hammer2_chain_t *rchain;
1970         struct vnode *devvp;
1971         int dumpcnt;
1972         int ronly = 0;
1973         int i;
1974
1975         /*
1976          * If no device is supplied this is a high-level unmount and we have
1977          * to disconnect the mount, adjust mount_count, and locate devices
1978          * that might now have no mounts.
1979          */
1980         if (pmp) {
1981                 KKASSERT(hmp == NULL);
1982                 KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
1983                 pmp->mp = NULL;
1984                 mp->mnt_data = NULL;
1985
1986                 /*
1987                  * After pmp->mp is cleared we have to account for
1988                  * mount_count.
1989                  */
1990                 cluster = &pmp->iroot->cluster;
1991                 for (i = 0; i < cluster->nchains; ++i) {
1992                         rchain = cluster->array[i].chain;
1993                         if (rchain == NULL)
1994                                 continue;
1995                         --rchain->hmp->mount_count;
1996                         kprintf("hammer2_unmount hmp=%p --mount_count=%d\n",
1997                                 rchain->hmp, rchain->hmp->mount_count);
1998                         /* scrapping hmp now may invalidate the pmp */
1999                 }
2000 again:
2001                 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
2002                         if (hmp->mount_count == 0) {
2003                                 hammer2_unmount_helper(NULL, NULL, hmp);
2004                                 goto again;
2005                         }
2006                 }
2007                 return;
2008         }
2009
2010         /*
2011          * Try to terminate the block device.  We can't terminate it if
2012          * there are still PFSs referencing it.
2013          */
2014         kprintf("hammer2_unmount hmp=%p mount_count=%d\n",
2015                 hmp, hmp->mount_count);
2016         if (hmp->mount_count)
2017                 return;
2018
2019         hammer2_pfsfree_scan(hmp);
2020         hammer2_dev_exlock(hmp);        /* XXX order */
2021
2022         /*
2023          * Cycle the volume data lock as a safety (probably not needed any
2024          * more).  To ensure everything is out we need to flush at least
2025          * three times: (1) the running of the unlinkq can dirty the
2026          * filesystem, (2) a normal flush can dirty the freemap, and
2027          * (3) a final flush ensures the freemap itself is fully synchronized.
2028          *
2029          * The next mount's recovery scan can clean everything up but we want
2030          * to leave the filesystem in a 100% clean state on a normal unmount.
2031          */
2032 #if 0
2033         hammer2_voldata_lock(hmp);
2034         hammer2_voldata_unlock(hmp);
2035 #endif
2036         hammer2_iocom_uninit(hmp);
2037
2038         if ((hmp->vchain.flags | hmp->fchain.flags) &
2039             HAMMER2_CHAIN_FLUSH_MASK) {
2040                 kprintf("hammer2_unmount: chains left over "
2041                         "after final sync\n");
2042                 kprintf("    vchain %08x\n", hmp->vchain.flags);
2043                 kprintf("    fchain %08x\n", hmp->fchain.flags);
2044
2045                 if (hammer2_debug & 0x0010)
2046                         Debugger("entered debugger");
2047         }
2048
2049         KKASSERT(hmp->spmp == NULL);
2050
2051         /*
2052          * Finish up with the device vnode
2053          */
2054         if ((devvp = hmp->devvp) != NULL) {
2055                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2056                 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
2057                 hmp->devvp = NULL;
2058                 VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
2059                 vn_unlock(devvp);
2060                 vrele(devvp);
2061                 devvp = NULL;
2062         }
2063
2064         /*
2065          * Clear vchain/fchain flags that might prevent final cleanup
2066          * of these chains.
2067          */
2068         if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
2069                 atomic_clear_int(&hmp->vchain.flags,
2070                                  HAMMER2_CHAIN_MODIFIED);
2071                 hammer2_pfs_memory_wakeup(hmp->vchain.pmp);
2072                 hammer2_chain_drop(&hmp->vchain);
2073         }
2074         if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
2075                 atomic_clear_int(&hmp->vchain.flags,
2076                                  HAMMER2_CHAIN_UPDATE);
2077                 hammer2_chain_drop(&hmp->vchain);
2078         }
2079
2080         if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
2081                 atomic_clear_int(&hmp->fchain.flags,
2082                                  HAMMER2_CHAIN_MODIFIED);
2083                 hammer2_pfs_memory_wakeup(hmp->fchain.pmp);
2084                 hammer2_chain_drop(&hmp->fchain);
2085         }
2086         if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
2087                 atomic_clear_int(&hmp->fchain.flags,
2088                                  HAMMER2_CHAIN_UPDATE);
2089                 hammer2_chain_drop(&hmp->fchain);
2090         }
2091
2092         /*
2093          * Final drop of embedded freemap root chain to
2094          * clean up fchain.core (fchain structure is not
2095          * flagged ALLOCATED so it is cleaned out and then
2096          * left to rot).
2097          */
2098         hammer2_chain_drop(&hmp->fchain);
2099
2100         /*
2101          * Final drop of embedded volume root chain to clean
2102          * up vchain.core (vchain structure is not flagged
2103          * ALLOCATED so it is cleaned out and then left to
2104          * rot).
2105          */
2106         dumpcnt = 50;
2107         hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
2108         dumpcnt = 50;
2109         hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f');
2110         hammer2_dev_unlock(hmp);
2111         hammer2_chain_drop(&hmp->vchain);
2112
2113         hammer2_io_cleanup(hmp, &hmp->iotree);
2114         if (hmp->iofree_count) {
2115                 kprintf("io_cleanup: %d I/O's left hanging\n",
2116                         hmp->iofree_count);
2117         }
2118
2119         TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
2120         kmalloc_destroy(&hmp->mchain);
2121         kfree(hmp, M_HAMMER2);
2122 }
2123
2124 static
2125 int
2126 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
2127              ino_t ino, struct vnode **vpp)
2128 {
2129         kprintf("hammer2_vget\n");
2130         return (EOPNOTSUPP);
2131 }
2132
2133 static
2134 int
2135 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
2136 {
2137         hammer2_pfs_t *pmp;
2138         hammer2_cluster_t *cparent;
2139         int error;
2140         struct vnode *vp;
2141
2142         pmp = MPTOPMP(mp);
2143         if (pmp->iroot == NULL) {
2144                 *vpp = NULL;
2145                 error = EINVAL;
2146         } else {
2147                 cparent = hammer2_inode_lock(pmp->iroot,
2148                                                 HAMMER2_RESOLVE_ALWAYS |
2149                                                 HAMMER2_RESOLVE_SHARED);
2150                 vp = hammer2_igetv(pmp->iroot, cparent, &error);
2151                 hammer2_inode_unlock(pmp->iroot, cparent);
2152                 *vpp = vp;
2153                 if (vp == NULL)
2154                         kprintf("vnodefail\n");
2155         }
2156
2157         return (error);
2158 }
2159
2160 /*
2161  * Filesystem status
2162  *
2163  * XXX incorporate ipdata->inode_quota and data_quota
2164  */
2165 static
2166 int
2167 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
2168 {
2169         hammer2_pfs_t *pmp;
2170         hammer2_dev_t *hmp;
2171
2172         pmp = MPTOPMP(mp);
2173         KKASSERT(pmp->iroot->cluster.nchains >= 1);
2174         hmp = pmp->iroot->cluster.focus->hmp;   /* XXX */
2175
2176         mp->mnt_stat.f_files = pmp->inode_count;
2177         mp->mnt_stat.f_ffree = 0;
2178         mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
2179         mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
2180         mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
2181
2182         *sbp = mp->mnt_stat;
2183         return (0);
2184 }
2185
2186 static
2187 int
2188 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
2189 {
2190         hammer2_pfs_t *pmp;
2191         hammer2_dev_t *hmp;
2192
2193         pmp = MPTOPMP(mp);
2194         KKASSERT(pmp->iroot->cluster.nchains >= 1);
2195         hmp = pmp->iroot->cluster.focus->hmp;   /* XXX */
2196
2197         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
2198         mp->mnt_vstat.f_files = pmp->inode_count;
2199         mp->mnt_vstat.f_ffree = 0;
2200         mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
2201         mp->mnt_vstat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
2202         mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
2203
2204         *sbp = mp->mnt_vstat;
2205         return (0);
2206 }
2207
2208 /*
2209  * Mount-time recovery (RW mounts)
2210  *
2211  * Updates to the free block table are allowed to lag flushes by one
2212  * transaction.  In case of a crash, then on a fresh mount we must do an
2213  * incremental scan of the last committed transaction id and make sure that
2214  * all related blocks have been marked allocated.
2215  *
2216  * The super-root topology and each PFS has its own transaction id domain,
2217  * so we must track PFS boundary transitions.
2218  */
2219 struct hammer2_recovery_elm {
2220         TAILQ_ENTRY(hammer2_recovery_elm) entry;
2221         hammer2_chain_t *chain;
2222         hammer2_tid_t sync_tid;
2223 };
2224
2225 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
2226
2227 struct hammer2_recovery_info {
2228         struct hammer2_recovery_list list;
2229         int     depth;
2230 };
2231
2232 static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2233                         hammer2_chain_t *parent,
2234                         struct hammer2_recovery_info *info,
2235                         hammer2_tid_t sync_tid);
2236
2237 #define HAMMER2_RECOVERY_MAXDEPTH       10
2238
2239 static
2240 int
2241 hammer2_recovery(hammer2_dev_t *hmp)
2242 {
2243         hammer2_trans_t trans;
2244         struct hammer2_recovery_info info;
2245         struct hammer2_recovery_elm *elm;
2246         hammer2_chain_t *parent;
2247         hammer2_tid_t sync_tid;
2248         hammer2_tid_t mirror_tid;
2249         int error;
2250         int cumulative_error = 0;
2251
2252         hammer2_trans_init(&trans, hmp->spmp, 0);
2253
2254         sync_tid = hmp->voldata.freemap_tid;
2255         mirror_tid = hmp->voldata.mirror_tid;
2256
2257         kprintf("hammer2 mount \"%s\": ", hmp->devrepname);
2258         if (sync_tid >= mirror_tid) {
2259                 kprintf(" no recovery needed\n");
2260         } else {
2261                 kprintf(" freemap recovery %016jx-%016jx\n",
2262                         sync_tid + 1, mirror_tid);
2263         }
2264
2265         TAILQ_INIT(&info.list);
2266         info.depth = 0;
2267         parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
2268         cumulative_error = hammer2_recovery_scan(&trans, hmp, parent,
2269                                                  &info, sync_tid);
2270         hammer2_chain_lookup_done(parent);
2271
2272         while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
2273                 TAILQ_REMOVE(&info.list, elm, entry);
2274                 parent = elm->chain;
2275                 sync_tid = elm->sync_tid;
2276                 kfree(elm, M_HAMMER2);
2277
2278                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2279                 error = hammer2_recovery_scan(&trans, hmp, parent,
2280                                               &info,
2281                                               hmp->voldata.freemap_tid);
2282                 hammer2_chain_unlock(parent);
2283                 hammer2_chain_drop(parent);     /* drop elm->chain ref */
2284                 if (error)
2285                         cumulative_error = error;
2286         }
2287         hammer2_trans_done(&trans);
2288
2289         return cumulative_error;
2290 }
2291
2292 static
2293 int
2294 hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2295                       hammer2_chain_t *parent,
2296                       struct hammer2_recovery_info *info,
2297                       hammer2_tid_t sync_tid)
2298 {
2299         const hammer2_inode_data_t *ripdata;
2300         hammer2_chain_t *chain;
2301         int cache_index;
2302         int cumulative_error = 0;
2303         int error;
2304
2305         /*
2306          * Adjust freemap to ensure that the block(s) are marked allocated.
2307          */
2308         if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
2309                 hammer2_freemap_adjust(trans, hmp, &parent->bref,
2310                                        HAMMER2_FREEMAP_DORECOVER);
2311         }
2312
2313         /*
2314          * Check type for recursive scan
2315          */
2316         switch(parent->bref.type) {
2317         case HAMMER2_BREF_TYPE_VOLUME:
2318                 /* data already instantiated */
2319                 break;
2320         case HAMMER2_BREF_TYPE_INODE:
2321                 /*
2322                  * Must instantiate data for DIRECTDATA test and also
2323                  * for recursion.
2324                  */
2325                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2326                 ripdata = &hammer2_chain_rdata(parent)->ipdata;
2327                 if (ripdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2328                         /* not applicable to recovery scan */
2329                         hammer2_chain_unlock(parent);
2330                         return 0;
2331                 }
2332                 hammer2_chain_unlock(parent);
2333                 break;
2334         case HAMMER2_BREF_TYPE_INDIRECT:
2335                 /*
2336                  * Must instantiate data for recursion
2337                  */
2338                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2339                 hammer2_chain_unlock(parent);
2340                 break;
2341         case HAMMER2_BREF_TYPE_DATA:
2342         case HAMMER2_BREF_TYPE_FREEMAP:
2343         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2344         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2345                 /* not applicable to recovery scan */
2346                 return 0;
2348         default:
2349                 return EDOM;
2350         }
2351
2352         /*
2353          * Defer operation if depth limit reached or if we are crossing a
2354          * PFS boundary.
2355          */
2356         if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) {
2357                 struct hammer2_recovery_elm *elm;
2358
2359                 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
2360                 elm->chain = parent;
2361                 elm->sync_tid = sync_tid;
2362                 hammer2_chain_ref(parent);
2363                 TAILQ_INSERT_TAIL(&info->list, elm, entry);
2364                 /* unlocked by caller */
2365
2366                 return(0);
2367         }
2368
2370         /*
2371          * Recursive scan of the last flushed transaction only.  We are
2372          * doing this without pmp assignments so don't leave the chains
2373          * hanging around after we are done with them.
2374          */
2375         cache_index = 0;
2376         chain = hammer2_chain_scan(parent, NULL, &cache_index,
2377                                    HAMMER2_LOOKUP_NODATA);
2378         while (chain) {
2379                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
2380                 if (chain->bref.mirror_tid > sync_tid) {
2381                         ++info->depth;
2382                         error = hammer2_recovery_scan(trans, hmp, chain,
2383                                                       info, sync_tid);
2384                         --info->depth;
2385                         if (error)
2386                                 cumulative_error = error;
2387                 }
2388                 chain = hammer2_chain_scan(parent, chain, &cache_index,
2389                                            HAMMER2_LOOKUP_NODATA);
2390         }
2391
2392         return cumulative_error;
2393 }
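
/*
 * EXAMPLE (editor's sketch): the deferral pattern used by
 * hammer2_recovery_scan(), reduced to a generic tree walk.  Recursion
 * depth is capped; subtrees at the cap are queued and the caller drains
 * the queue, restarting each queued subtree at depth 0.  This bounds
 * kernel stack use on arbitrarily deep trees.  node_t is hypothetical.
 */
#if 0
#include <sys/queue.h>

#define EX_MAXDEPTH     10

typedef struct node {
        struct node *child;             /* first child */
        struct node *next;              /* next sibling */
        TAILQ_ENTRY(node) entry;
} node_t;

TAILQ_HEAD(ex_list, node);

static void
example_scan(node_t *n, int depth, struct ex_list *deferq)
{
        node_t *c;

        if (depth >= EX_MAXDEPTH) {
                TAILQ_INSERT_TAIL(deferq, n, entry);    /* defer subtree */
                return;
        }
        for (c = n->child; c; c = c->next)
                example_scan(c, depth + 1, deferq);
}

static void
example_walk(node_t *root)
{
        struct ex_list deferq = TAILQ_HEAD_INITIALIZER(deferq);
        node_t *n;

        example_scan(root, 0, &deferq);
        while ((n = TAILQ_FIRST(&deferq)) != NULL) {
                TAILQ_REMOVE(&deferq, n, entry);
                example_scan(n, 0, &deferq);            /* restart at depth 0 */
        }
}
#endif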
2394
2395 /*
2396  * Sync the entire filesystem; this is called from the filesystem syncer
2397  * process periodically and whenever a user calls sync(1) on the hammer2
2398  * mountpoint.
2399  *
2400  * Currently this is actually called from the syncer! \o/
2401  *
2402  * This task will have to snapshot the state of the dirty inode chain.
2403  * From that, it will have to make sure all of the inodes on the dirty
2404  * chain have I/O initiated.  We also make sure that I/O is initiated
2405  * for the root block.
2406  *
2407  * If waitfor is set, we wait for media to acknowledge the new rootblock.
2408  *
2409  * THINKS: side A vs side B, to have sync not stall all I/O?
2410  */
2411 int
2412 hammer2_vfs_sync(struct mount *mp, int waitfor)
2413 {
2414         struct hammer2_sync_info info;
2415         hammer2_inode_t *iroot;
2416         hammer2_chain_t *chain;
2417         hammer2_chain_t *parent;
2418         hammer2_pfs_t *pmp;
2419         hammer2_dev_t *hmp;
2420         int flags;
2421         int error;
2422         int total_error;
2423         int force_fchain;
2424         int i;
2425         int j;
2426
2427         pmp = MPTOPMP(mp);
2428         iroot = pmp->iroot;
2429         KKASSERT(iroot);
2430         KKASSERT(iroot->pmp == pmp);
2431
2432         /*
2433          * We can't acquire locks on existing vnodes while in a transaction
2434          * without risking a deadlock.  This assumes that vfsync() can be
2435          * called without the vnode locked (which it can in DragonFly).
2436          * Otherwise we'd have to implement a multi-pass or flag the lock
2437          * failures and retry.
2438          *
2439          * The reclamation code interlocks with the sync list's token
2440          * (by removing the vnode from the scan list) before unlocking
2441          * the inode, giving us time to ref the inode.
2442          */
2443         /*flags = VMSC_GETVP;*/
2444         flags = 0;
2445         if (waitfor & MNT_LAZY)
2446                 flags |= VMSC_ONEPASS;
2447
2448         /*
2449          * Start our flush transaction.  This does not return until all
2450          * concurrent transactions have completed and will prevent any
2451          * new transactions from running concurrently, except for the
2452          * buffer cache transactions.
2453          *
2454          * For efficiency do an async pass before making sure with a
2455          * synchronous pass on all related buffer cache buffers.  It
2456          * should theoretically not be possible for any new file buffers
2457          * to be instantiated during this sequence.
2458          */
2459         hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH |
2460                                              HAMMER2_TRANS_PREFLUSH);
2461         hammer2_run_unlinkq(&info.trans, pmp);
2462
2463         info.error = 0;
2464         info.waitfor = MNT_NOWAIT;
2465         vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
2466         info.waitfor = MNT_WAIT;
2467         vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2468
2469         /*
2470          * Clear PREFLUSH.  This prevents (or asserts on) any new logical
2471          * buffer cache flushes which occur during the flush.  Device buffers
2472          * are not affected.
2473          */
2474
2475 #if 0
2476         if (info.error == 0 && (waitfor & MNT_WAIT)) {
2477                 info.waitfor = waitfor;
2478                 vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2480         }
2481 #endif
2482         hammer2_bioq_sync(info.trans.pmp);
2483         atomic_clear_int(&info.trans.flags, HAMMER2_TRANS_PREFLUSH);
2484
2485         total_error = 0;
2486
2487 #if 0
2488         /*
2489          * Flush all nodes making up the cluster
2490          *
2491          * We must also flush any deleted siblings because the super-root
2492          * flush won't do it for us.  They all must be staged or the
2493          * super-root flush will not be able to update its block table
2494          * properly.
2495          *
2496          * XXX currently done serially instead of concurrently
2497          */
2498         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2499                 chain = iroot->cluster.array[i].chain;
2500                 if (chain) {
2501                         hmp = chain->hmp;
2502                         hammer2_chain_ref(chain);    /* prevent destruction */
2503                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
2504                         hammer2_flush(&info.trans, chain);
2505                         hammer2_chain_unlock(chain);
2506                         hammer2_chain_drop(chain);
2507                 }
2508         }
2509 #endif
2510 #if 0
2511         hammer2_trans_done(&info.trans);
2512 #endif
2513
2514         /*
2515          * Flush all volume roots to synchronize PFS flushes with the
2516          * storage media.  Use a super-root transaction for each one.
2517          *
2518          * The flush code will detect super-root -> pfs-root chain
2519          * transitions using the last pfs-root flush.
2520          */
2521         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2522                 hammer2_chain_t *tmp;
2523
2524                 chain = iroot->cluster.array[i].chain;
2525                 if (chain == NULL)
2526                         continue;
2527
2528                 hmp = chain->hmp;
2529
2530                 /*
2531                  * We only have to flush each hmp once
2532                  */
2533                 for (j = i - 1; j >= 0; --j) {
2534                         if ((tmp = iroot->cluster.array[j].chain) != NULL) {
2535                                 if (tmp->hmp == hmp)
2536                                         break;
2537                         }
2538                 }
2539                 if (j >= 0)
2540                         continue;
2541 #if 0
2542                 hammer2_trans_spmp(&info.trans, hmp->spmp);
2543 #endif
2544
2545                 /*
2546                  * Force an update of the XID from the PFS root to the
2547                  * topology root.  We couldn't do this from the PFS
2548                  * transaction because a SPMP transaction is needed.
2549                  * This does not modify blocks, instead what it does is
2550                  * allow the flush code to find the transition point and
2551                  * then update on the way back up.
2552                  */
2553                 parent = chain->parent;
2554                 KKASSERT(chain->pmp != parent->pmp);
2555                 hammer2_chain_setflush(&info.trans, parent);
2556
2557                 /*
2558                  * Media mounts have two 'roots', vchain for the topology
2559                  * and fchain for the free block table.  Flush both.
2560                  *
2561                  * Note that the topology and free block table are handled
2562                  * independently, so the free block table can wind up being
2563                  * ahead of the topology.  We depend on the bulk free scan
2564                  * code to deal with any loose ends.
2565                  */
2566                 hammer2_chain_ref(&hmp->vchain);
2567                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2568                 hammer2_chain_ref(&hmp->fchain);
2569                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2570                 if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
2571                         /*
2572                          * This will also modify vchain as a side effect,
2573                          * mark vchain as modified now.
2574                          */
2575                         hammer2_voldata_modify(hmp);
2576                         chain = &hmp->fchain;
2577                         hammer2_flush(&info.trans, chain);
2578                         KKASSERT(chain == &hmp->fchain);
2579                 }
2580                 hammer2_chain_unlock(&hmp->fchain);
2581                 hammer2_chain_unlock(&hmp->vchain);
2582                 hammer2_chain_drop(&hmp->fchain);
2583                 /* vchain dropped down below */
2584
2585                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2586                 if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
2587                         chain = &hmp->vchain;
2588                         hammer2_flush(&info.trans, chain);
2589                         KKASSERT(chain == &hmp->vchain);
2590                         force_fchain = 1;
2591                 } else {
2592                         force_fchain = 0;
2593                 }
2594                 hammer2_chain_unlock(&hmp->vchain);
2595                 hammer2_chain_drop(&hmp->vchain);
2596
2597 #if 0
2598                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2599                 if ((hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) ||
2600                     force_fchain) {
2601                         /* this will also modify vchain as a side effect */
2602                         chain = &hmp->fchain;
2603                         hammer2_flush(&info.trans, chain);
2604                         KKASSERT(chain == &hmp->fchain);
2605                 }
2606                 hammer2_chain_unlock(&hmp->fchain);
2607 #endif
2608
2609                 error = 0;
2610
2611                 /*
2612                  * We can't safely flush the volume header until we have
2613                  * flushed any device buffers which have built up.
2614                  *
2615                  * XXX this isn't being incremental
2616                  */
2617                 vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
2618                 error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
2619                 vn_unlock(hmp->devvp);
2620
2621                 /*
2622                  * The flush code sets CHAIN_VOLUMESYNC to indicate that the
2623                  * volume header needs synchronization via hmp->volsync.
2624                  *
2625                  * XXX synchronize the flag & data with only this flush XXX
2626                  */
2627                 if (error == 0 &&
2628                     (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
2629                         struct buf *bp;
2630
2631                         /*
2632                          * Synchronize the disk before flushing the volume
2633                          * header.
2634                          */
2635                         bp = getpbuf(NULL);
2636                         bp->b_bio1.bio_offset = 0;
2637                         bp->b_bufsize = 0;
2638                         bp->b_bcount = 0;
2639                         bp->b_cmd = BUF_CMD_FLUSH;
2640                         bp->b_bio1.bio_done = biodone_sync;
2641                         bp->b_bio1.bio_flags |= BIO_SYNC;
2642                         vn_strategy(hmp->devvp, &bp->b_bio1);
2643                         biowait(&bp->b_bio1, "h2vol");
2644                         relpbuf(bp, NULL);
2645
2646                         /*
2647                          * Then we can safely flush the version of the
2648                          * volume header synchronized by the flush code.
2649                          */
2650                         i = hmp->volhdrno + 1;
2651                         if (i >= HAMMER2_NUM_VOLHDRS)
2652                                 i = 0;
2653                         if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
2654                             hmp->volsync.volu_size) {
2655                                 i = 0;
2656                         }
2657                         kprintf("sync volhdr %d %jd\n",
2658                                 i, (intmax_t)hmp->volsync.volu_size);
2659                         bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2660                                     HAMMER2_PBUFSIZE, 0, 0);
2661                         atomic_clear_int(&hmp->vchain.flags,
2662                                          HAMMER2_CHAIN_VOLUMESYNC);
2663                         bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
2664                         bawrite(bp);
2665                         hmp->volhdrno = i;
2666                 }
2667                 if (error)
2668                         total_error = error;
2669
2670 #if 0
2671                 hammer2_trans_done(&info.trans);
2672 #endif
2673         }
2674         hammer2_trans_done(&info.trans);
2675
2676         return (total_error);
2677 }
2678
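/*
 * Illustrative sketch (disabled): the synchronous flush barrier used by
 * hammer2_vfs_sync() above, factored into a helper.  hammer2 does not
 * currently define this function; the name is hypothetical.
 */
#if 0
static void
hammer2_dev_flush_barrier(struct vnode *devvp)
{
        struct buf *bp;

        /*
         * Issue a zero-length BUF_CMD_FLUSH BIO and wait synchronously
         * for the device to acknowledge it, ensuring that previously
         * issued writes have reached the media.
         */
        bp = getpbuf(NULL);
        bp->b_bio1.bio_offset = 0;
        bp->b_bufsize = 0;
        bp->b_bcount = 0;
        bp->b_cmd = BUF_CMD_FLUSH;
        bp->b_bio1.bio_done = biodone_sync;
        bp->b_bio1.bio_flags |= BIO_SYNC;
        vn_strategy(devvp, &bp->b_bio1);
        biowait(&bp->b_bio1, "h2vol");
        relpbuf(bp, NULL);
}
#endif
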
2679 /*
2680  * Sync passes.
2681  */
2682 static int
2683 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2684 {
2685         struct hammer2_sync_info *info = data;
2686         hammer2_inode_t *ip;
2687         int error;
2688
        /*
         * Ignore vnodes with no hammer2 inode attached, degenerate
         * vnodes, and vnodes which are already clean.
         */
2692         ip = VTOI(vp);
2693         if (ip == NULL)
2694                 return(0);
2695         if (vp->v_type == VNON || vp->v_type == VBAD) {
2696                 vclrisdirty(vp);
2697                 return(0);
2698         }
2699         if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
2700             RB_EMPTY(&vp->v_rbdirty_tree)) {
2701                 vclrisdirty(vp);
2702                 return(0);
2703         }
2704
2705         /*
2706          * VOP_FSYNC will start a new transaction so replicate some code
2707          * here to do it inline (see hammer2_vop_fsync()).
2708          *
2709          * WARNING: The vfsync interacts with the buffer cache and might
2710          *          block, we can't hold the inode lock at that time.
2711          *          However, we MUST ref ip before blocking to ensure that
2712          *          it isn't ripped out from under us (since we do not
2713          *          hold a lock on the vnode).
2714          */
        hammer2_inode_ref(ip);
        atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        vfsync(vp, MNT_NOWAIT, 1, NULL, NULL); /* vp is non-NULL here */

        hammer2_inode_drop(ip);
#if 1
        error = 0;
        if (error)
                info->error = error;
#endif
        return(0);
2727 }
2728
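/*
 * NFS export support.  These are required VFS entry points but are not
 * implemented yet; note that hammer2_vfs_fhtovp() returns success without
 * setting *vpp, so exporting a hammer2 filesystem is not functional.
 */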
2729 static
2730 int
2731 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
2732 {
2733         return (0);
2734 }
2735
2736 static
2737 int
2738 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
2739                struct fid *fhp, struct vnode **vpp)
2740 {
2741         return (0);
2742 }
2743
2744 static
2745 int
2746 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
2747                  int *exflagsp, struct ucred **credanonp)
2748 {
2749         return (0);
2750 }
2751
/*
 * Support code for hammer2_vfs_mount().  Read, verify, and install the
 * volume header into the HMP.
 *
 * XXX read four volhdrs and use the one with the highest TID whose CRC
 *     matches.
 *
 * XXX check iCRCs.
 *
 * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to
 *     nonexistent locations.
 *
 * XXX Record selected volhdr and ring updates to each of 4 volhdrs.
 */
2766 static
2767 int
2768 hammer2_install_volume_header(hammer2_dev_t *hmp)
2769 {
2770         hammer2_volume_data_t *vd;
2771         struct buf *bp;
2772         hammer2_crc32_t crc0, crc, bcrc0, bcrc;
2773         int error_reported;
2774         int error;
2775         int valid;
2776         int i;
2777
2778         error_reported = 0;
2779         error = 0;
2780         valid = 0;
2781         bp = NULL;
2782
        /*
         * There are up to 4 copies of the volume header (syncs iterate
         * between them so there is no single master).  We don't trust the
         * volu_size field, so we don't know precisely how large the
         * filesystem is; instead we depend on the OS to return an error
         * if we read beyond the block device's EOF.
         */
2790         for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
2791                 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2792                               HAMMER2_VOLUME_BYTES, &bp);
2793                 if (error) {
2794                         brelse(bp);
2795                         bp = NULL;
2796                         continue;
2797                 }
2798
2799                 vd = (struct hammer2_volume_data *) bp->b_data;
2800                 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
2801                     (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
2802                         brelse(bp);
2803                         bp = NULL;
2804                         continue;
2805                 }
2806
2807                 if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
2808                         /* XXX: Reversed-endianness filesystem */
                        kprintf("hammer2: reverse-endian filesystem "
                                "detected\n");
2810                         brelse(bp);
2811                         bp = NULL;
2812                         continue;
2813                 }
2814
2815                 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
2816                 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
2817                                       HAMMER2_VOLUME_ICRC0_SIZE);
2818                 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
2819                 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
2820                                        HAMMER2_VOLUME_ICRC1_SIZE);
                if ((crc0 != crc) || (bcrc0 != bcrc)) {
                        kprintf("hammer2 volume header crc "
                                "mismatch copy #%d %08x/%08x %08x/%08x\n",
                                i, crc0, crc, bcrc0, bcrc);
2825                         error_reported = 1;
2826                         brelse(bp);
2827                         bp = NULL;
2828                         continue;
2829                 }
2830                 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
2831                         valid = 1;
2832                         hmp->voldata = *vd;
2833                         hmp->volhdrno = i;
2834                 }
2835                 brelse(bp);
2836                 bp = NULL;
2837         }
2838         if (valid) {
2839                 hmp->volsync = hmp->voldata;
2840                 error = 0;
                if (error_reported || bootverbose || 1) { /* XXX forced: DEBUG */
2842                         kprintf("hammer2: using volume header #%d\n",
2843                                 hmp->volhdrno);
2844                 }
2845         } else {
2846                 error = EINVAL;
2847                 kprintf("hammer2: no valid volume headers found!\n");
2848         }
2849         return (error);
2850 }
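
/*
 * Illustrative sketch (disabled): the per-copy validation performed by
 * the loop in hammer2_install_volume_header(), expressed as a predicate.
 * The helper name is hypothetical; hammer2 does not define it.
 */
#if 0
static int
hammer2_volhdr_ok(hammer2_volume_data_t *vd, char *base)
{
        /* only native-endian images are accepted */
        if (vd->magic != HAMMER2_VOLUME_ID_HBO)
                return (0);
        /* both volume header iCRC sections must match */
        if (vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0] !=
            hammer2_icrc32(base + HAMMER2_VOLUME_ICRC0_OFF,
                           HAMMER2_VOLUME_ICRC0_SIZE))
                return (0);
        if (vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1] !=
            hammer2_icrc32(base + HAMMER2_VOLUME_ICRC1_OFF,
                           HAMMER2_VOLUME_ICRC1_SIZE))
                return (0);
        return (1);
}
#endif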
2851
/*
 * This handles hysteresis on regular file flushes.  Because the BIOs are
 * routed to a thread it is possible for an excessive number to build up
 * and cause lengthy front-end stalls long before the runningbuffspace
 * limit is hit, so we implement hammer2_flush_pipe to control the
 * hysteresis.
 *
 * This is a particular problem when compression is used.
 */
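/*
 * Typical usage (sketch only; the real call sites are in the frontend
 * write path, not in this file):
 *
 *      hammer2_lwinprog_ref(pmp)   - before queueing a logical BIO
 *      hammer2_lwinprog_wait(pmp)  - frontend throttles on the pipe
 *      hammer2_lwinprog_drop(pmp)  - when the BIO completes
 */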
2861 void
2862 hammer2_lwinprog_ref(hammer2_pfs_t *pmp)
2863 {
2864         atomic_add_int(&pmp->count_lwinprog, 1);
2865 }
2866
2867 void
2868 hammer2_lwinprog_drop(hammer2_pfs_t *pmp)
2869 {
2870         int lwinprog;
2871
2872         lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
2873         if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
2874             (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
2875                 atomic_clear_int(&pmp->count_lwinprog,
2876                                  HAMMER2_LWINPROG_WAITING);
2877                 wakeup(&pmp->count_lwinprog);
2878         }
2879 }
2880
2881 void
2882 hammer2_lwinprog_wait(hammer2_pfs_t *pmp)
2883 {
2884         int lwinprog;
2885
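        /*
         * Interlocked sleep: set LWINPROG_WAITING after
         * tsleep_interlock() and re-test the count, so a wakeup racing
         * the test cannot be lost.
         */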
2886         for (;;) {
2887                 lwinprog = pmp->count_lwinprog;
2888                 cpu_ccfence();
2889                 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2890                         break;
2891                 tsleep_interlock(&pmp->count_lwinprog, 0);
2892                 atomic_set_int(&pmp->count_lwinprog, HAMMER2_LWINPROG_WAITING);
2893                 lwinprog = pmp->count_lwinprog;
2894                 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2895                         break;
2896                 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
2897         }
2898 }
2899
2900 /*
2901  * Manage excessive memory resource use for chain and related
2902  * structures.
2903  */
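/*
 * Usage sketch (illustrative, not an exhaustive call list): code which
 * dirties a chain calls hammer2_pfs_memory_inc(), front-end operations
 * call hammer2_pfs_memory_wait() to throttle against the dirty-chain
 * count, and hammer2_pfs_memory_wakeup() is called as dirty chains are
 * disposed of.
 */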
2904 void
2905 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
2906 {
2907         uint32_t waiting;
2908         uint32_t count;
2909         uint32_t limit;
2910 #if 0
2911         static int zzticks;
2912 #endif
2913
2914         /*
2915          * Atomic check condition and wait.  Also do an early speedup of
2916          * the syncer to try to avoid hitting the wait.
2917          */
2918         for (;;) {
2919                 waiting = pmp->inmem_dirty_chains;
2920                 cpu_ccfence();
2921                 count = waiting & HAMMER2_DIRTYCHAIN_MASK;
2922
2923                 limit = pmp->mp->mnt_nvnodelistsize / 10;
2924                 if (limit < hammer2_limit_dirty_chains)
2925                         limit = hammer2_limit_dirty_chains;
2926                 if (limit < 1000)
2927                         limit = 1000;
2928
#if 0
                if ((int)(ticks - zzticks) > hz) {
                        zzticks = ticks;
                        kprintf("count %u limit %u\n", count, limit);
                }
#endif
2935
                /*
                 * Block if there are too many dirty chains present;
                 * wait for the flush to clean some out.
                 */
2940                 if (count > limit) {
2941                         tsleep_interlock(&pmp->inmem_dirty_chains, 0);
2942                         if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
2943                                                waiting,
2944                                        waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
2945                                 speedup_syncer(pmp->mp);
2946                                 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
2947                                        "chnmem", hz);
2948                         }
2949                         continue;       /* loop on success or fail */
2950                 }
2951
2952                 /*
2953                  * Try to start an early flush before we are forced to block.
2954                  */
2955                 if (count > limit * 7 / 10)
2956                         speedup_syncer(pmp->mp);
2957                 break;
2958         }
2959 }
2960
2961 void
2962 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp)
2963 {
2964         if (pmp) {
2965                 atomic_add_int(&pmp->inmem_dirty_chains, 1);
2966         }
2967 }
2968
2969 void
2970 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp)
2971 {
2972         uint32_t waiting;
2973
2974         if (pmp == NULL)
2975                 return;
2976
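        /*
         * Atomically decrement the dirty-chain count and clear the
         * WAITING flag in a single cmpset so a racing waiter cannot be
         * missed.
         */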
2977         for (;;) {
2978                 waiting = pmp->inmem_dirty_chains;
2979                 cpu_ccfence();
2980                 if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
2981                                        waiting,
2982                                        (waiting - 1) &
2983                                         ~HAMMER2_DIRTYCHAIN_WAITING)) {
2984                         break;
2985                 }
2986         }
2987
2988         if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
2989                 wakeup(&pmp->inmem_dirty_chains);
2990 }
2991
/*
 * Debugging: dump a chain and its children recursively to the console,
 * limiting output to roughly *countp lines.
 */
2995 void
2996 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
2997 {
2998         hammer2_chain_t *scan;
2999         hammer2_chain_t *parent;
3000
3001         --*countp;
3002         if (*countp == 0) {
3003                 kprintf("%*.*s...\n", tab, tab, "");
3004                 return;
3005         }
3006         if (*countp < 0)
3007                 return;
3008         kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
3009                 tab, tab, "", pfx,
3010                 chain, chain->bref.type,
3011                 chain->bref.key, chain->bref.keybits,
3012                 chain->bref.mirror_tid);
3013
3014         kprintf("%*.*s      [%08x] (%s) refs=%d\n",
3015                 tab, tab, "",
3016                 chain->flags,
3017                 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
3018                 chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
3019                 chain->refs);
3020
3021         kprintf("%*.*s      core [%08x]",
3022                 tab, tab, "",
3023                 chain->core.flags);
3024
        parent = chain->parent;
        if (parent)
                kprintf("\n%*.*s      p=%p [pflags %08x prefs %d]",
                        tab, tab, "",
                        parent, parent->flags, parent->refs);
3030         if (RB_EMPTY(&chain->core.rbtree)) {
3031                 kprintf("\n");
3032         } else {
3033                 kprintf(" {\n");
3034                 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree)
3035                         hammer2_dump_chain(scan, tab + 4, countp, 'a');
3036                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
3037                         kprintf("%*.*s}(%s)\n", tab, tab, "",
3038                                 chain->data->ipdata.filename);
3039                 else
3040                         kprintf("%*.*s}\n", tab, tab, "");
3041         }
3042 }