hammer2 - Refactor frontend part 4/many
[dragonfly.git] sys/vfs/hammer2/hammer2_vfsops.c
/*
 * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/uuid.h>
#include <sys/vfsops.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/objcache.h>

#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>

#include <sys/mutex.h>
#include <sys/mutex2.h>

#include "hammer2.h"
#include "hammer2_disk.h"
#include "hammer2_mount.h"
#include "hammer2_lz4.h"

#include "zlib/hammer2_zlib.h"

#define REPORT_REFS_ERRORS 1    /* XXX remove me */

MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");

struct hammer2_sync_info {
        hammer2_trans_t trans;
        int error;
        int waitfor;
};

TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
TAILQ_HEAD(hammer2_pfslist, hammer2_pfs);
static struct hammer2_mntlist hammer2_mntlist;
static struct hammer2_pfslist hammer2_pfslist;
static struct lock hammer2_mntlk;

int hammer2_debug;
int hammer2_cluster_enable = 1;
int hammer2_hardlink_enable = 1;
int hammer2_flush_pipe = 100;
int hammer2_synchronous_flush = 1;
int hammer2_dio_count;
long hammer2_limit_dirty_chains;
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;
long hammer2_iod_file_write;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;
long hammer2_ioa_file_read;
long hammer2_ioa_meta_read;
long hammer2_ioa_indr_read;
long hammer2_ioa_fmap_read;
long hammer2_ioa_volu_read;
long hammer2_ioa_fmap_write;
long hammer2_ioa_file_write;
long hammer2_ioa_meta_write;
long hammer2_ioa_indr_write;
long hammer2_ioa_volu_write;

MALLOC_DECLARE(M_HAMMER2_CBUFFER);
MALLOC_DEFINE(M_HAMMER2_CBUFFER, "HAMMER2-compbuffer",
                "Buffer used for compression.");

MALLOC_DECLARE(M_HAMMER2_DEBUFFER);
MALLOC_DEFINE(M_HAMMER2_DEBUFFER, "HAMMER2-decompbuffer",
                "Buffer used for decompression.");

SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");

SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
           &hammer2_debug, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
           &hammer2_cluster_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
           &hammer2_hardlink_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW,
           &hammer2_synchronous_flush, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
           &hammer2_limit_dirty_chains, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
           &hammer2_dio_count, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
           &hammer2_iod_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
           &hammer2_iod_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
           &hammer2_iod_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
           &hammer2_iod_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
           &hammer2_iod_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
           &hammer2_iod_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
           &hammer2_iod_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
           &hammer2_iod_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
           &hammer2_iod_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
           &hammer2_ioa_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
           &hammer2_ioa_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
           &hammer2_ioa_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
           &hammer2_ioa_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
           &hammer2_ioa_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
           &hammer2_ioa_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
           &hammer2_ioa_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
           &hammer2_ioa_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
           &hammer2_ioa_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
           &hammer2_ioa_volu_write, 0, "");

static int hammer2_vfs_init(struct vfsconf *conf);
static int hammer2_vfs_uninit(struct vfsconf *vfsp);
static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                struct ucred *cred);
static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
                                struct vnode *, struct ucred *);
static int hammer2_recovery(hammer2_dev_t *hmp);
static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
                                ino_t ino, struct vnode **vpp);
static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                                struct fid *fhp, struct vnode **vpp);
static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);

static int hammer2_install_volume_header(hammer2_dev_t *hmp);
static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

static void hammer2_update_pmps(hammer2_dev_t *hmp);
static void hammer2_write_thread(void *arg);

static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
                                hammer2_dev_t *hmp);

/*
 * Functions for compression in threads,
 * from hammer2_vnops.c
 */
static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                int *errorp);
static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag,
                                int pblksize, int *errorp,
                                int comp_algo, int check_algo);
static void hammer2_zero_check_and_write(struct buf *bp,
                                hammer2_trans_t *trans, hammer2_inode_t *ip,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int ioflag, int pblksize, int *errorp,
                                int check_algo);
static int test_block_zeros(const char *buf, size_t bytes);
static void zero_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int *errorp);
static void hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp,
                                int ioflag, int pblksize, int *errorp,
                                int check_algo);

/*
 * HAMMER2 vfs operations.
 */
static struct vfsops hammer2_vfsops = {
        .vfs_init       = hammer2_vfs_init,
        .vfs_uninit     = hammer2_vfs_uninit,
        .vfs_sync       = hammer2_vfs_sync,
        .vfs_mount      = hammer2_vfs_mount,
        .vfs_unmount    = hammer2_vfs_unmount,
        .vfs_root       = hammer2_vfs_root,
        .vfs_statfs     = hammer2_vfs_statfs,
        .vfs_statvfs    = hammer2_vfs_statvfs,
        .vfs_vget       = hammer2_vfs_vget,
        .vfs_vptofh     = hammer2_vfs_vptofh,
        .vfs_fhtovp     = hammer2_vfs_fhtovp,
        .vfs_checkexp   = hammer2_vfs_checkexp
};

MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");

VFS_SET(hammer2_vfsops, hammer2, 0);
MODULE_VERSION(hammer2, 1);

static
int
hammer2_vfs_init(struct vfsconf *conf)
{
        static struct objcache_malloc_args margs_read;
        static struct objcache_malloc_args margs_write;
        static struct objcache_malloc_args margs_vop;

        int error;

        error = 0;

        if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
                error = EINVAL;
        if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
                error = EINVAL;
        if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
                error = EINVAL;

        if (error)
                kprintf("HAMMER2 structure size mismatch; cannot continue.\n");

        margs_read.objsize = 65536;
        margs_read.mtype = M_HAMMER2_DEBUFFER;

        margs_write.objsize = 32768;
        margs_write.mtype = M_HAMMER2_CBUFFER;

        margs_vop.objsize = sizeof(hammer2_vop_info_t);
        margs_vop.mtype = M_HAMMER2;

        cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
                                objcache_malloc_free, &margs_read);
        cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
                                objcache_malloc_free, &margs_write);
        cache_vop_info = objcache_create(margs_vop.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
                                objcache_malloc_free, &margs_vop);

        lockinit(&hammer2_mntlk, "mntlk", 0, 0);
        TAILQ_INIT(&hammer2_mntlist);
        TAILQ_INIT(&hammer2_pfslist);

        hammer2_limit_dirty_chains = desiredvnodes / 10;

        return (error);
}
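
/*
 * Editorial sketch (not part of the original source): the three
 * structure-size checks above could also be enforced at compile time
 * with CTASSERT(), turning the runtime EINVAL into a build failure:
 *
 *      CTASSERT(HAMMER2_BLOCKREF_BYTES == sizeof(struct hammer2_blockref));
 *      CTASSERT(HAMMER2_INODE_BYTES == sizeof(struct hammer2_inode_data));
 *      CTASSERT(HAMMER2_VOLUME_BYTES == sizeof(struct hammer2_volume_data));
 */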

static
int
hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
{
        objcache_destroy(cache_buffer_read);
        objcache_destroy(cache_buffer_write);
        objcache_destroy(cache_vop_info);
        return 0;
}

/*
 * Core PFS allocator.  Used to allocate the pmp structure for PFS cluster
 * mounts and the spmp structure for media (hmp) structures.
 *
 * pmp->modify_tid tracks new modify_tid transaction ids for front-end
 * transactions.  Note that synchronization does not use this field.
 * (typically frontend operations and synchronization cannot run on the
 * same PFS node at the same time).
 *
 * XXX check locking
 */
hammer2_pfs_t *
hammer2_pfsalloc(hammer2_cluster_t *cluster,
                 const hammer2_inode_data_t *ripdata,
                 hammer2_tid_t modify_tid)
{
        hammer2_chain_t *rchain;
        hammer2_inode_t *iroot;
        hammer2_pfs_t *pmp;
        int count;
        int i;
        int j;

        /*
         * Locate or create the PFS based on the cluster id.  If ripdata
         * is NULL this is a spmp which is unique and is always allocated.
         */
        if (ripdata) {
                TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                        if (bcmp(&pmp->pfs_clid, &ripdata->meta.pfs_clid,
                                 sizeof(pmp->pfs_clid)) == 0) {
                                break;
                        }
                }
        } else {
                pmp = NULL;
        }

        if (pmp == NULL) {
                pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
                hammer2_trans_manage_init(&pmp->tmanage);
                kmalloc_create(&pmp->minode, "HAMMER2-inodes");
                kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
                lockinit(&pmp->lock, "pfslk", 0, 0);
                spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
                RB_INIT(&pmp->inum_tree);
                TAILQ_INIT(&pmp->unlinkq);
                spin_init(&pmp->list_spin, "hm2pfsalloc_list");

                /*
                 * Save the last media transaction id for the flusher.  Set
                 * initial
                 */
                if (ripdata)
                        pmp->pfs_clid = ripdata->meta.pfs_clid;
                hammer2_mtx_init(&pmp->wthread_mtx, "h2wthr");
                bioq_init(&pmp->wthread_bioq);
                TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);

                /*
                 * The synchronization thread may start too early, make
                 * sure it stays frozen until we are ready to let it go.
                 * XXX
                 */
                /*
                pmp->primary_thr.flags = HAMMER2_SYNCTHR_FROZEN |
                                         HAMMER2_SYNCTHR_REMASTER;
                */
        }

        /*
         * Create the PFS's root inode.
         */
        if ((iroot = pmp->iroot) == NULL) {
                iroot = hammer2_inode_get(pmp, NULL, NULL);
                pmp->iroot = iroot;
                hammer2_inode_ref(iroot);
                hammer2_inode_unlock(iroot, NULL);
        }

        /*
         * Stop here if no cluster is passed in.
         */
        if (cluster == NULL)
                goto done;

        /*
         * When a cluster is passed in we must add the cluster's chains
         * to the PFS's root inode, update pmp->pfs_types[], and update
         * the synchronization threads.
         *
         * At the moment empty spots can develop due to removals or failures.
         * Ultimately we want to re-fill these spots but doing so might
         * confuse running code. XXX
         */
        hammer2_inode_ref(iroot);
        hammer2_mtx_ex(&iroot->lock);
        j = iroot->cluster.nchains;

        kprintf("add PFS to pmp %p[%d]\n", pmp, j);

        for (i = 0; i < cluster->nchains; ++i) {
                if (j == HAMMER2_MAXCLUSTER)
                        break;
                rchain = cluster->array[i].chain;
                KKASSERT(rchain->pmp == NULL);
                rchain->pmp = pmp;
                hammer2_chain_ref(rchain);
                iroot->cluster.array[j].chain = rchain;
                pmp->pfs_types[j] = ripdata->meta.pfs_type;
                pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2);

                /*
                 * If the PFS is already mounted we must account
                 * for the mount_count here.
                 */
                if (pmp->mp)
                        ++rchain->hmp->mount_count;

                /*
                 * May have to fixup dirty chain tracking.  Previous
                 * pmp was NULL so nothing to undo.
                 */
                if (rchain->flags & HAMMER2_CHAIN_MODIFIED)
                        hammer2_pfs_memory_inc(pmp);
                ++j;
        }
        iroot->cluster.nchains = j;

        if (i != cluster->nchains) {
                kprintf("hammer2_mount: cluster full!\n");
                /* XXX fatal error? */
        }

        /*
         * Update nmasters from any PFS inode which is part of the cluster.
         * It is possible that this will result in a value which is too
         * high.  MASTER PFSs are authoritative for pfs_nmasters and will
         * override this value later on.
         *
         * (This informs us of masters that might not currently be
         *  discoverable by this mount).
         */
        if (ripdata && pmp->pfs_nmasters < ripdata->meta.pfs_nmasters) {
                pmp->pfs_nmasters = ripdata->meta.pfs_nmasters;
        }

        /*
         * Count visible masters.  Masters are usually added with
         * ripdata->meta.pfs_nmasters set to 1.  This detects when there
         * are more (XXX and must update the master inodes).
         */
        count = 0;
        for (i = 0; i < iroot->cluster.nchains; ++i) {
                if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)
                        ++count;
        }
        if (pmp->pfs_nmasters < count)
                pmp->pfs_nmasters = count;

        /*
         * Create missing synchronization threads.
         *
         * Single-node masters (including snapshots) have nothing to
         * synchronize and do not require this thread.
         *
         * Multi-node masters or any number of soft masters, slaves, copy,
         * or other PFS types need the thread.
         *
         * Each thread is responsible for its particular cluster index.
         * We use independent threads so stalls or mismatches related to
         * any given target do not affect other targets.
         */
        for (i = 0; i < iroot->cluster.nchains; ++i) {
                if (pmp->sync_thrs[i].td)
                        continue;
                if ((pmp->pfs_nmasters > 1 &&
                     (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)) ||
                    pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER) {
                        hammer2_syncthr_create(&pmp->sync_thrs[i], pmp, i,
                                               hammer2_syncthr_primary);
                }
        }

        hammer2_mtx_unlock(&iroot->lock);
        hammer2_inode_drop(iroot);
done:
        return pmp;
}
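
/*
 * Usage note (editorial): within this file hammer2_pfsalloc() is called
 * three ways: as (NULL, NULL, 0) to create the super-root spmp during
 * the initial device mount, as (NULL, ripdata, modify_tid) to locate or
 * create a pmp without attaching chains, and with a real cluster from
 * hammer2_update_pmps() to attach a PFS's chains to pmp->iroot.
 */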

/*
 * Destroy a PFS, typically only occurs after the last mount on a device
 * has gone away.
 */
static void
hammer2_pfsfree(hammer2_pfs_t *pmp)
{
        hammer2_inode_t *iroot;
        int i;

        /*
         * Cleanup our reference on iroot.  iroot is (should) not be needed
         * by the flush code.
         */
        TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);

        iroot = pmp->iroot;
        if (iroot) {
                for (i = 0; i < iroot->cluster.nchains; ++i)
                        hammer2_syncthr_delete(&pmp->sync_thrs[i]);
#if REPORT_REFS_ERRORS
                if (pmp->iroot->refs != 1)
                        kprintf("PMP->IROOT %p REFS WRONG %d\n",
                                pmp->iroot, pmp->iroot->refs);
#else
                KKASSERT(pmp->iroot->refs == 1);
#endif
                /* ref for pmp->iroot */
                hammer2_inode_drop(pmp->iroot);
                pmp->iroot = NULL;
        }

        kmalloc_destroy(&pmp->mmsg);
        kmalloc_destroy(&pmp->minode);

        kfree(pmp, M_HAMMER2);
}

/*
 * Remove all references to hmp from the pfs list.  Any PFS which becomes
 * empty is terminated and freed.
 *
 * XXX inefficient.
 */
static void
hammer2_pfsfree_scan(hammer2_dev_t *hmp)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *iroot;
        hammer2_cluster_t *cluster;
        hammer2_chain_t *rchain;
        int didfreeze;
        int i;

again:
        TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                if ((iroot = pmp->iroot) == NULL)
                        continue;
                if (hmp->spmp == pmp) {
                        kprintf("unmount hmp %p remove spmp %p\n",
                                hmp, pmp);
                        hmp->spmp = NULL;
                }

                /*
                 * Determine if this PFS is affected.  If it is we must
                 * freeze all management threads and lock its iroot.
                 *
                 * Freezing a management thread forces it idle, operations
                 * in-progress will be aborted and it will have to start
                 * over again when unfrozen, or exit if told to exit.
                 */
                cluster = &iroot->cluster;
                for (i = 0; i < cluster->nchains; ++i) {
                        rchain = cluster->array[i].chain;
                        if (rchain == NULL || rchain->hmp != hmp)
                                continue;
                        break;
                }
                if (i != cluster->nchains) {
                        /*
                         * Make sure all synchronization threads are locked
                         * down.
                         */
                        for (i = 0; i < iroot->cluster.nchains; ++i)
                                hammer2_syncthr_freeze(&pmp->sync_thrs[i]);

                        /*
                         * Lock the inode and clean out matching chains.
                         * Note that we cannot use hammer2_inode_lock_*()
                         * here because that would attempt to validate the
                         * cluster that we are in the middle of ripping
                         * apart.
                         *
                         * WARNING! We are working directly on the inode's
                         *          embedded cluster.
                         */
                        hammer2_mtx_ex(&iroot->lock);

                        /*
                         * Remove the chain from matching elements of the PFS.
                         */
                        for (i = 0; i < cluster->nchains; ++i) {
                                rchain = cluster->array[i].chain;
                                if (rchain == NULL || rchain->hmp != hmp)
                                        continue;
                                hammer2_syncthr_delete(&pmp->sync_thrs[i]);
                                rchain = cluster->array[i].chain;
                                cluster->array[i].chain = NULL;
                                pmp->pfs_types[i] = 0;
                                if (pmp->pfs_names[i]) {
                                        kfree(pmp->pfs_names[i], M_HAMMER2);
                                        pmp->pfs_names[i] = NULL;
                                }
                                hammer2_chain_drop(rchain);

                                /* focus hint */
                                if (cluster->focus == rchain)
                                        cluster->focus = NULL;
                        }
                        hammer2_mtx_unlock(&iroot->lock);
                        didfreeze = 1;  /* remaster, unfreeze down below */
                } else {
                        didfreeze = 0;
                }

                /*
                 * Cleanup trailing chains.  Do not reorder chains (for now).
                 * XXX might remove more than we intended.
                 */
                while (i > 0) {
                        if (cluster->array[i - 1].chain)
                                break;
                        --i;
                }
                cluster->nchains = i;

                /*
                 * If the PMP has no elements remaining we can destroy it.
                 * (this will transition management threads from frozen->exit).
                 */
                if (cluster->nchains == 0) {
                        kprintf("unmount hmp %p last ref to PMP=%p\n",
                                hmp, pmp);
                        hammer2_pfsfree(pmp);
                        goto again;
                }
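                /*
                 * (Editorial note: hammer2_pfsfree() unlinks the pmp from
                 *  hammer2_pfslist, invalidating the TAILQ_FOREACH iterator,
                 *  so the scan restarts from the top via "goto again" above
                 *  rather than continuing the loop.)
                 */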

                /*
                 * If elements still remain we need to set the REMASTER
                 * flag and unfreeze it.
                 */
                if (didfreeze) {
                        for (i = 0; i < iroot->cluster.nchains; ++i) {
                                hammer2_syncthr_remaster(&pmp->sync_thrs[i]);
                                hammer2_syncthr_unfreeze(&pmp->sync_thrs[i]);
                        }
                }
        }
}

/*
 * Mount or remount HAMMER2 filesystem from physical media
 *
 *      mountroot
 *              mp              mount point structure
 *              path            NULL
 *              data            <unused>
 *              cred            <unused>
 *
 *      mount
 *              mp              mount point structure
 *              path            path to mount point
 *              data            pointer to argument structure in user space
 *                      volume  volume path (device@LABEL form)
 *                      hflags  user mount flags
 *              cred            user credentials
 *
 * RETURNS:     0       Success
 *              !0      error number
 */
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  struct ucred *cred)
{
        struct hammer2_mount_info info;
        hammer2_pfs_t *pmp;
        hammer2_pfs_t *spmp;
        hammer2_dev_t *hmp;
        hammer2_key_t key_next;
        hammer2_key_t key_dummy;
        hammer2_key_t lhc;
        struct vnode *devvp;
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *cparent;
        const hammer2_inode_data_t *ripdata;
        hammer2_blockref_t bref;
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
        size_t done;
        char *dev;
        char *label;
        int ronly = 1;
        int error;
        int cache_index;
        int i;

        hmp = NULL;
        pmp = NULL;
        dev = NULL;
        label = NULL;
        devvp = NULL;
        cache_index = -1;

        kprintf("hammer2_mount\n");

        if (path == NULL) {
                /*
                 * Root mount
                 */
                bzero(&info, sizeof(info));
                info.cluster_fd = -1;
                return (EOPNOTSUPP);
        } else {
                /*
                 * Non-root mount or updating a mount
                 */
                error = copyin(data, &info, sizeof(info));
                if (error)
                        return (error);

                error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
                if (error)
                        return (error);

                /* Extract device and label */
                dev = devstr;
                label = strchr(devstr, '@');
                if (label == NULL ||
                    ((label + 1) - dev) > done) {
                        return (EINVAL);
                }
                *label = '\0';
                label++;
                if (*label == '\0')
                        return (EINVAL);
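                /*
                 * Illustrative example (editorial): a volume string such as
                 * "/dev/serno/XYZ.s1d@LOCAL" parses to dev =
                 * "/dev/serno/XYZ.s1d" and label = "LOCAL"; the '@' is
                 * overwritten with a NUL so both halves can be used as
                 * separate C strings.
                 */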

                if (mp->mnt_flag & MNT_UPDATE) {
                        /*
                         * Update mount.  Note that pmp->iroot->cluster is
                         * an inode-embedded cluster and thus cannot be
                         * directly locked.
                         *
                         * XXX HAMMER2 needs to implement NFS export via
                         *     mountctl.
                         */
                        pmp = MPTOPMP(mp);
                        cluster = &pmp->iroot->cluster;
                        for (i = 0; i < cluster->nchains; ++i) {
                                if (cluster->array[i].chain == NULL)
                                        continue;
                                hmp = cluster->array[i].chain->hmp;
                                devvp = hmp->devvp;
                                error = hammer2_remount(hmp, mp, path,
                                                        devvp, cred);
                                if (error)
                                        break;
                        }
                        /*hammer2_inode_install_hidden(pmp);*/

                        return error;
                }
        }

        /*
         * HMP device mount
         *
         * Lookup name and verify it refers to a block device.
         */
        error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
        if (error == 0)
                error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
        nlookup_done(&nd);

        if (error == 0) {
                if (vn_isdisk(devvp, &error))
                        error = vfs_mountedon(devvp);
        }

        /*
         * Determine if the device has already been mounted.  After this
         * check hmp will be non-NULL if we are doing the second or more
         * hammer2 mounts from the same device.
         */
        lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
        TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                if (hmp->devvp == devvp)
                        break;
        }

        /*
         * Open the device if this isn't a secondary mount and construct
         * the H2 device mount (hmp).
         */
        if (hmp == NULL) {
                hammer2_chain_t *schain;
                hammer2_xid_t xid;

                if (error == 0 && vcount(devvp) > 0)
                        error = EBUSY;

                /*
                 * Now open the device
                 */
                if (error == 0) {
                        ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
                        vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                        error = vinvalbuf(devvp, V_SAVE, 0, 0);
                        if (error == 0) {
                                error = VOP_OPEN(devvp,
                                                 ronly ? FREAD : FREAD | FWRITE,
                                                 FSCRED, NULL);
                        }
                        vn_unlock(devvp);
                }
                if (error && devvp) {
                        vrele(devvp);
                        devvp = NULL;
                }
                if (error) {
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        return error;
                }
                hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
                ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev);
                hmp->ronly = ronly;
                hmp->devvp = devvp;
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);
                spin_init(&hmp->io_spin, "hm2mount_io");
                spin_init(&hmp->list_spin, "hm2mount_list");
                TAILQ_INIT(&hmp->flushq);

                lockinit(&hmp->vollk, "h2vol", 0, 0);

                /*
                 * vchain setup. vchain.data is embedded.
                 * vchain.refs is initialized and will never drop to 0.
                 *
                 * NOTE! voldata is not yet loaded.
                 */
                hmp->vchain.hmp = hmp;
                hmp->vchain.refs = 1;
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;

                hammer2_chain_core_init(&hmp->vchain);
                /* hmp->vchain.u.xxx is left NULL */

                /*
                 * fchain setup.  fchain.data is embedded.
                 * fchain.refs is initialized and will never drop to 0.
                 *
                 * The data is not used but needs to be initialized to
                 * pass assertion muster.  We use this chain primarily
                 * as a placeholder for the freemap's top-level RBTREE
                 * so it does not interfere with the volume's topology
                 * RBTREE.
                 */
                hmp->fchain.hmp = hmp;
                hmp->fchain.refs = 1;
                hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
                hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
                hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

                hammer2_chain_core_init(&hmp->fchain);
                /* hmp->fchain.u.xxx is left NULL */

                /*
                 * Install the volume header and initialize fields from
                 * voldata.
                 */
                error = hammer2_install_volume_header(hmp);
                if (error) {
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return error;
                }

                /*
                 * Really important to get these right or flush will get
                 * confused.
                 */
                hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0);
                kprintf("alloc spmp %p tid %016jx\n",
                        hmp->spmp, hmp->voldata.mirror_tid);
                spmp = hmp->spmp;

                /*
                 * Dummy-up vchain and fchain's modify_tid.  mirror_tid
                 * is inherited from the volume header.
                 */
                xid = 0;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
                hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
                hmp->vchain.pmp = spmp;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
                hmp->fchain.pmp = spmp;

                /*
                 * First locate the super-root inode, which is key 0
                 * relative to the volume header's blockset.
                 *
                 * Then locate the root inode by scanning the directory keyspace
                 * represented by the label.
                 */
                parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
                schain = hammer2_chain_lookup(&parent, &key_dummy,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
                                      &cache_index, 0);
                hammer2_chain_lookup_done(parent);
                if (schain == NULL) {
                        kprintf("hammer2_mount: invalid super-root\n");
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }
                if (schain->error) {
                        kprintf("hammer2_mount: error %s reading super-root\n",
                                hammer2_error_str(schain->error));
                        hammer2_chain_unlock(schain);
                        hammer2_chain_drop(schain);
                        schain = NULL;
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }

                /*
                 * The super-root always uses an inode_tid of 1 when
                 * creating PFSs.
                 */
                spmp->inode_tid = 1;
                spmp->modify_tid = schain->bref.modify_tid;

                /*
                 * Sanity-check schain's pmp and finish initialization.
                 * Any chain belonging to the super-root topology should
                 * have a NULL pmp (not even set to spmp).
                 */
                ripdata = &hammer2_chain_rdata(schain)->ipdata;
                KKASSERT(schain->pmp == NULL);
                spmp->pfs_clid = ripdata->meta.pfs_clid;

                /*
                 * Replace the dummy spmp->iroot with a real one.  It's
                 * easier to just do a wholesale replacement than to try
                 * to update the chain and fixup the iroot fields.
                 *
                 * The returned inode is locked with the supplied cluster.
                 */
                cluster = hammer2_cluster_from_chain(schain);
                hammer2_inode_drop(spmp->iroot);
                spmp->iroot = NULL;
                spmp->iroot = hammer2_inode_get(spmp, NULL, cluster);
                spmp->spmp_hmp = hmp;
                spmp->pfs_types[0] = ripdata->meta.pfs_type;
                hammer2_inode_ref(spmp->iroot);
                hammer2_inode_unlock(spmp->iroot, cluster);
                schain = NULL;
                /* leave spmp->iroot with one ref */

                if ((mp->mnt_flag & MNT_RDONLY) == 0) {
                        error = hammer2_recovery(hmp);
                        /* XXX do something with error */
                }
                hammer2_update_pmps(hmp);
                hammer2_iocom_init(hmp);

                /*
                 * Ref the cluster management messaging descriptor.  The mount
                 * program deals with the other end of the communications pipe.
                 */
                fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
                if (fp) {
                        hammer2_cluster_reconnect(hmp, fp);
                } else {
                        kprintf("hammer2_mount: bad cluster_fd!\n");
                }
        } else {
                spmp = hmp->spmp;
        }

        /*
         * Lookup the mount point under the media-localized super-root.
         * Scanning hammer2_pfslist doesn't help us because it represents
         * PFS cluster ids which can aggregate several named PFSs together.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS);
        lhc = hammer2_dirhash(label, strlen(label));
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                      0);
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE &&
                    strcmp(label,
                       hammer2_cluster_rdata(cluster)->ipdata.filename) == 0) {
                        break;
                }
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                            key_next,
                                            lhc + HAMMER2_DIRHASH_LOMASK, 0);
        }
        hammer2_inode_unlock(spmp->iroot, cparent);
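
        /*
         * Note (editorial): hammer2_dirhash() encodes only a hash of the
         * name in the key, so several names may fall within the
         * [lhc, lhc + HAMMER2_DIRHASH_LOMASK] range scanned above; the
         * strcmp() against the on-media filename is what resolves hash
         * collisions to the actual match.
         */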

        /*
         * PFS could not be found?
         */
        if (cluster == NULL) {
                kprintf("hammer2_mount: PFS label not found\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EINVAL;
        }

        /*
         * Acquire the pmp structure (it should have already been allocated
         * via hammer2_update_pmps() so do not pass cluster in to add to
         * available chains).
         *
         * Check if the cluster has already been mounted.  A cluster can
         * only be mounted once, use null mounts to mount additional copies.
         */
        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        hammer2_cluster_bref(cluster, &bref);
        pmp = hammer2_pfsalloc(NULL, ripdata, bref.modify_tid);
        hammer2_cluster_unlock(cluster);
        hammer2_cluster_drop(cluster);

        if (pmp->mp) {
                kprintf("hammer2_mount: PFS already mounted!\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EBUSY;
        }

        /*
         * Finish the mount
         */
        kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp);

        mp->mnt_flag = MNT_LOCAL;
        mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
        mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */

        /*
         * required mount structure initializations
         */
        mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
        mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;

        mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
        mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;

        /*
         * Optional fields
         */
        mp->mnt_iosize_max = MAXPHYS;

        /*
         * Connect up mount pointers.
         */
        hammer2_mount_helper(mp, pmp);

        lockmgr(&hammer2_mntlk, LK_RELEASE);

        /*
         * A mounted PFS needs a write thread for logical buffers and
         * a hidden directory for deletions of open files.  These features
         * are not used by unmounted PFSs.
         *
         * The logical file buffer bio write thread handles things like
         * physical block assignment and compression.
         */
        pmp->wthread_destroy = 0;
        lwkt_create(hammer2_write_thread, pmp,
                    &pmp->wthread_td, NULL, 0, -1, "h2pfs-%s", label);

        /*
         * With the cluster operational install ihidden.
         * (only applicable to pfs mounts, not applicable to spmp)
         */
        hammer2_inode_install_hidden(pmp);

        /*
         * Finish setup
         */
        vfs_getnewfsid(mp);
        vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
        vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
        vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);

        copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
        bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
        copyinstr(path, mp->mnt_stat.f_mntonname,
                  sizeof(mp->mnt_stat.f_mntonname) - 1,
                  &size);

        /*
         * Initial statfs to prime mnt_stat.
         */
        hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);

        return 0;
}
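
/*
 * Usage sketch (editorial, an assumption rather than part of this file):
 * a typical mount of the PFS labeled LOCAL on a given device would be
 * issued from userland as:
 *
 *      mount_hammer2 /dev/serno/XYZ.s1d@LOCAL /mnt
 *
 * where mount_hammer2(8) packs the volume string, mount flags, and
 * cluster_fd into the hammer2_mount_info structure that this function
 * copies in above.
 */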

/*
 * Scan PFSs under the super-root and create hammer2_pfs structures.
 */
static
void
hammer2_update_pmps(hammer2_dev_t *hmp)
{
        const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_blockref_t bref;
        hammer2_pfs_t *spmp;
        hammer2_pfs_t *pmp;
        hammer2_key_t key_next;

        /*
         * Lookup mount point under the media-localized super-root.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        spmp = hmp->spmp;
        cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS);
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         HAMMER2_KEY_MIN,
                                         HAMMER2_KEY_MAX,
                                         0);
        while (cluster) {
                /*
                 * Only inodes under the super-root represent PFSs.  Skip
                 * anything else, but still advance the iterator below
                 * (a bare "continue" here would never advance the cluster
                 * and would spin forever).
                 */
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
                        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        hammer2_cluster_bref(cluster, &bref);
                        kprintf("ADD LOCAL PFS: %s\n", ripdata->filename);

                        pmp = hammer2_pfsalloc(cluster, ripdata,
                                               bref.modify_tid);
                }
                cluster = hammer2_cluster_next(cparent, cluster,
                                               &key_next,
                                               key_next,
                                               HAMMER2_KEY_MAX,
                                               0);
        }
        hammer2_inode_unlock(spmp->iroot, cparent);
}

/*
 * Handle bioq for strategy write
 */
static
void
hammer2_write_thread(void *arg)
{
        hammer2_pfs_t *pmp;
        struct bio *bio;
        struct buf *bp;
        hammer2_trans_t trans;
        struct vnode *vp;
        hammer2_inode_t *ip;
        hammer2_cluster_t *cparent;
        hammer2_key_t lbase;
        int lblksize;
        int pblksize;
        int error;

        pmp = arg;

        hammer2_mtx_ex(&pmp->wthread_mtx);
        for (;;) {
                /*
                 * Wait for work.  Break out and destroy the thread only if
                 * requested and no work remains.
                 */
                if (bioq_first(&pmp->wthread_bioq) == NULL) {
                        if (pmp->wthread_destroy)
                                break;
                        mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx,
                                 0, "h2bioqw", 0);
                        continue;
                }

                /*
                 * Special transaction for logical buffer cache writes.
                 */
                hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE);

                while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
                        /*
                         * dummy bio for synchronization.  The transaction
                         * must be terminated.
                         */
                        if (bio->bio_buf == NULL) {
                                bio->bio_flags |= BIO_DONE;
                                /* bio will become invalid after DONE set */
                                wakeup(bio);
                                break;
                        }

                        /*
                         * else normal bio processing
                         */
                        hammer2_mtx_unlock(&pmp->wthread_mtx);

                        hammer2_lwinprog_drop(pmp);

                        error = 0;
                        bp = bio->bio_buf;
                        vp = bp->b_vp;
                        ip = VTOI(vp);

                        /*
                         * Inode is modified, flush size and mtime changes
                         * to ensure that the file size remains consistent
                         * with the buffers being flushed.
                         *
                         * NOTE: The inode_fsync() call only flushes the
                         *       inode's meta-data state, it doesn't try
                         *       to flush underlying buffers or chains.
                         *
                         * NOTE: hammer2_write_file_core() may indirectly
                         *       modify and modsync the inode.
                         */
                        cparent = hammer2_inode_lock(ip,
                                                     HAMMER2_RESOLVE_ALWAYS);
                        if (ip->flags & (HAMMER2_INODE_RESIZED |
                                         HAMMER2_INODE_MTIME)) {
                                hammer2_inode_fsync(&trans, ip, cparent);
                        }
                        lblksize = hammer2_calc_logical(ip, bio->bio_offset,
                                                        &lbase, NULL);
                        pblksize = hammer2_calc_physical(ip, lbase);
                        hammer2_write_file_core(bp, &trans, ip,
                                                cparent,
                                                lbase, IO_ASYNC,
                                                pblksize, &error);
                        hammer2_inode_unlock(ip, cparent);
                        if (error) {
                                kprintf("hammer2: error in buffer write\n");
                                bp->b_flags |= B_ERROR;
                                bp->b_error = EIO;
                        }
                        biodone(bio);
                        hammer2_mtx_ex(&pmp->wthread_mtx);
                }
                hammer2_trans_done(&trans);
        }
        pmp->wthread_destroy = -1;
        wakeup(&pmp->wthread_destroy);

        hammer2_mtx_unlock(&pmp->wthread_mtx);
}
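
/*
 * Note (editorial): the hammer2_lwinprog_drop() call above is the
 * consumer side of a flow-control counter; the strategy-write path that
 * queues bios onto wthread_bioq is expected to take the matching
 * hammer2_lwinprog_ref(), bounded by the vfs.hammer2.flush_pipe sysctl,
 * so writers cannot run arbitrarily far ahead of this thread.
 */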

void
hammer2_bioq_sync(hammer2_pfs_t *pmp)
{
        struct bio sync_bio;

        bzero(&sync_bio, sizeof(sync_bio));     /* dummy with no bio_buf */
        hammer2_mtx_ex(&pmp->wthread_mtx);
        if (pmp->wthread_destroy == 0 &&
            TAILQ_FIRST(&pmp->wthread_bioq.queue)) {
                bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
                while ((sync_bio.bio_flags & BIO_DONE) == 0)
                        mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0);
        }
        hammer2_mtx_unlock(&pmp->wthread_mtx);
}
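
/*
 * Usage sketch (editorial): a caller that must not proceed until all
 * queued logical writes have been processed simply calls
 * hammer2_bioq_sync(pmp).  The dummy bio (NULL bio_buf) inserted above
 * is recognized by hammer2_write_thread(), which sets BIO_DONE and
 * issues the wakeup() that the mtxsleep() loop here waits on.
 */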
1320
1321 /* 
1322  * Return a chain suitable for I/O, creating the chain if necessary
1323  * and assigning its physical block.  The cluster will be in a modified
1324  * state.
1325  *
1326  * cparent can wind up being anything.
1327  *
1328  * NOTE: Special case for data embedded in inode.
1329  */
1330 static
1331 hammer2_cluster_t *
1332 hammer2_assign_physical(hammer2_trans_t *trans,
1333                         hammer2_inode_t *ip, hammer2_cluster_t *cparent,
1334                         hammer2_key_t lbase, int pblksize, int *errorp)
1335 {
1336         hammer2_cluster_t *cluster;
1337         hammer2_cluster_t *dparent;
1338         hammer2_key_t key_dummy;
1339         int pradix = hammer2_getradix(pblksize);
1340
1341         /*
1342          * Locate the cluster associated with lbase, return a locked cluster.
1343          * However, do not instantiate any data reference (which utilizes a
1344          * device buffer) because we will be using direct IO via the
1345          * logical buffer cache buffer.
1346          */
1347         *errorp = 0;
1348         KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
1349 retry:
1350         dparent = hammer2_cluster_lookup_init(cparent, 0);
1351         cluster = hammer2_cluster_lookup(dparent, &key_dummy,
1352                                      lbase, lbase,
1353                                      HAMMER2_LOOKUP_NODATA);
1354
1355         if (cluster == NULL) {
1356                 /*
1357                  * We found a hole, create a new chain entry.
1358                  *
1359                  * NOTE: DATA chains are created without device backing
1360                  *       store (nor do we want any).
1361                  */
1362                 *errorp = hammer2_cluster_create(trans, dparent, &cluster,
1363                                                lbase, HAMMER2_PBUFRADIX,
1364                                                HAMMER2_BREF_TYPE_DATA,
1365                                                pblksize, 0);
1366                 if (cluster == NULL) {
1367                         hammer2_cluster_lookup_done(dparent);
1368                         panic("hammer2_cluster_create: par=%p error=%d\n",
1369                                 dparent->focus, *errorp);
1370                         goto retry;
1371                 }
1372                 /*ip->delta_dcount += pblksize;*/
1373         } else {
1374                 switch (hammer2_cluster_type(cluster)) {
1375                 case HAMMER2_BREF_TYPE_INODE:
1376                         /*
1377                          * The data is embedded in the inode, which requires
1378                          * a bit more finesse.
1379                          */
1380                         hammer2_cluster_modify_ip(trans, ip, cluster, 0);
1381                         break;
1382                 case HAMMER2_BREF_TYPE_DATA:
1383                         if (hammer2_cluster_need_resize(cluster, pblksize)) {
1384                                 hammer2_cluster_resize(trans, ip,
1385                                                      dparent, cluster,
1386                                                      pradix,
1387                                                      HAMMER2_MODIFY_OPTDATA);
1388                         }
1389
1390                         /*
1391                          * DATA buffers must be marked modified whether the
1392                          * data is in a logical buffer or not.  We also have
1393                          * to make this call to fixup the chain data pointers
1394                          * after resizing in case this is an encrypted or
1395                          * compressed buffer.
1396                          */
1397                         hammer2_cluster_modify(trans, cluster,
1398                                                HAMMER2_MODIFY_OPTDATA);
1399                         break;
1400                 default:
1401                         panic("hammer2_assign_physical: bad type");
1402                         /* NOT REACHED */
1403                         break;
1404                 }
1405         }
1406
1407         /*
1408          * Cleanup.  If cluster wound up being the inode itself, i.e.
1409          * the DIRECTDATA case for offset 0, then we need to update cparent.
1410          * The caller expects cparent to not become stale.
1411          */
1412         hammer2_cluster_lookup_done(dparent);
1413         /* dparent = NULL; safety */
1414         return (cluster);
1415 }
1416
1417 /* 
1418  * Handles bios queued from hammer2_vnops.c.
1419  *
1420  * The core write function which determines which path to take
1421  * depending on compression settings.  We also have to locate the
1422  * related clusters so we can calculate and set the check data for
1423  * the blockref.
1424  */
1425 static
1426 void
1427 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
1428                         hammer2_inode_t *ip,
1429                         hammer2_cluster_t *cparent,
1430                         hammer2_key_t lbase, int ioflag, int pblksize,
1431                         int *errorp)
1432 {
1433         hammer2_cluster_t *cluster;
1434
1435         switch(HAMMER2_DEC_ALGO(ip->meta.comp_algo)) {
1436         case HAMMER2_COMP_NONE:
1437                 /*
1438                  * We have to assign physical storage to the buffer
1439                  * we intend to dirty or write now to avoid deadlocks
1440                  * in the strategy code later.
1441                  *
1442                  * This can return NOOFFSET for inode-embedded data.
1443                  * The strategy code will take care of it in that case.
1444                  */
1445                 cluster = hammer2_assign_physical(trans, ip, cparent,
1446                                                 lbase, pblksize,
1447                                                 errorp);
1448                 if (cluster->ddflag) {
1449                         hammer2_inode_data_t *wipdata;
1450
1451                         wipdata = hammer2_cluster_modify_ip(trans, ip,
1452                                                             cluster, 0);
1453                         KKASSERT(wipdata->meta.op_flags &
1454                                  HAMMER2_OPFLAG_DIRECTDATA);
1455                         KKASSERT(bp->b_loffset == 0);
1456                         bcopy(bp->b_data, wipdata->u.data,
1457                               HAMMER2_EMBEDDED_BYTES);
1458                         hammer2_cluster_modsync(cluster);
1459                 } else {
1460                         hammer2_write_bp(cluster, bp, ioflag, pblksize,
1461                                          errorp, ip->meta.check_algo);
1462                 }
1463                 if (cluster) {
1464                         hammer2_cluster_unlock(cluster);
1465                         hammer2_cluster_drop(cluster);
1466                 }
1467                 break;
1468         case HAMMER2_COMP_AUTOZERO:
1469                 /*
1470                  * Check for zero-fill only
1471                  */
1472                 hammer2_zero_check_and_write(bp, trans, ip,
1473                                     cparent, lbase,
1474                                     ioflag, pblksize, errorp,
1475                                     ip->meta.check_algo);
1476                 break;
1477         case HAMMER2_COMP_LZ4:
1478         case HAMMER2_COMP_ZLIB:
1479         default:
1480                 /*
1481                  * Check for zero-fill and attempt compression.
1482                  */
1483                 hammer2_compress_and_write(bp, trans, ip,
1484                                            cparent,
1485                                            lbase, ioflag,
1486                                            pblksize, errorp,
1487                                            ip->meta.comp_algo,
1488                                            ip->meta.check_algo);
1489                 break;
1490         }
1491 }
1492
1493 /*
1494  * Generic function that performs the compression in the compression
1495  * write path.  The compression algorithm is determined by the settings
1496  * obtained from the inode.
1497  */
1498 static
1499 void
1500 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
1501         hammer2_inode_t *ip,
1502         hammer2_cluster_t *cparent,
1503         hammer2_key_t lbase, int ioflag, int pblksize,
1504         int *errorp, int comp_algo, int check_algo)
1505 {
1506         hammer2_cluster_t *cluster;
1507         hammer2_chain_t *chain;
1508         int comp_size;
1509         int comp_block_size;
1510         int i;
1511         char *comp_buffer;
1512
1513         if (test_block_zeros(bp->b_data, pblksize)) {
1514                 zero_write(bp, trans, ip, cparent, lbase, errorp);
1515                 return;
1516         }
1517
1518         comp_size = 0;
1519         comp_buffer = NULL;
1520
1521         KKASSERT(pblksize / 2 <= 32768);
1522                 
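             /*
              * Compression heuristic: ip->comp_heuristic counts failed
              * attempts.  Below 8 every write tries to compress; past
              * that we only retry when the counter is a multiple of 8
              * (roughly every 8th write).  A success resets the counter
              * to 0 below.
              */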
1523         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
1524                 z_stream strm_compress;
1525                 int comp_level;
1526                 int ret;
1527
1528                 switch(HAMMER2_DEC_ALGO(comp_algo)) {
1529                 case HAMMER2_COMP_LZ4:
1530                         comp_buffer = objcache_get(cache_buffer_write,
1531                                                    M_INTWAIT);
1532                         comp_size = LZ4_compress_limitedOutput(
1533                                         bp->b_data,
1534                                         &comp_buffer[sizeof(int)],
1535                                         pblksize,
1536                                         pblksize / 2 - sizeof(int));
1537                         /*
1538                          * We need to prefix with the size; LZ4
1539                          * doesn't do it for us.  Add the related
1540                          * overhead.
1541                          */
1542                         *(int *)comp_buffer = comp_size;
1543                         if (comp_size)
1544                                 comp_size += sizeof(int);
1545                         break;
1546                 case HAMMER2_COMP_ZLIB:
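                             /*
                              * The requested zlib level is encoded in
                              * the low bits of comp_algo; 0 selects the
                              * default (6) and the result is clamped to
                              * the 6-9 range supported here.
                              */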
1547                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
1548                         if (comp_level == 0)
1549                                 comp_level = 6; /* default zlib compression */
1550                         else if (comp_level < 6)
1551                                 comp_level = 6;
1552                         else if (comp_level > 9)
1553                                 comp_level = 9;
1554                         ret = deflateInit(&strm_compress, comp_level);
1555                         if (ret != Z_OK) {
1556                                 kprintf("HAMMER2 ZLIB: fatal error "
1557                                         "on deflateInit.\n");
                                     /*
                                      * The stream was never initialized;
                                      * leave comp_size 0 so the block
                                      * falls back to an uncompressed
                                      * write.
                                      */
                                     break;
1558                         }
1559
1560                         comp_buffer = objcache_get(cache_buffer_write,
1561                                                    M_INTWAIT);
1562                         strm_compress.next_in = bp->b_data;
1563                         strm_compress.avail_in = pblksize;
1564                         strm_compress.next_out = comp_buffer;
1565                         strm_compress.avail_out = pblksize / 2;
1566                         ret = deflate(&strm_compress, Z_FINISH);
1567                         if (ret == Z_STREAM_END) {
1568                                 comp_size = pblksize / 2 -
1569                                             strm_compress.avail_out;
1570                         } else {
1571                                 comp_size = 0;
1572                         }
1573                         ret = deflateEnd(&strm_compress);
1574                         break;
1575                 default:
1576                         kprintf("Error: unknown compression "
1577                                 "method %d\n", comp_algo);
1578                         break;
1579                 }
1580         }
1581
1582         if (comp_size == 0) {
1583                 /*
1584                  * compression failed or turned off
1585                  */
1586                 comp_block_size = pblksize;     /* safety */
1587                 if (++ip->comp_heuristic > 128)
1588                         ip->comp_heuristic = 8;
1589         } else {
1590                 /*
1591                  * compression succeeded
1592                  */
1593                 ip->comp_heuristic = 0;
1594                 if (comp_size <= 1024) {
1595                         comp_block_size = 1024;
1596                 } else if (comp_size <= 2048) {
1597                         comp_block_size = 2048;
1598                 } else if (comp_size <= 4096) {
1599                         comp_block_size = 4096;
1600                 } else if (comp_size <= 8192) {
1601                         comp_block_size = 8192;
1602                 } else if (comp_size <= 16384) {
1603                         comp_block_size = 16384;
1604                 } else if (comp_size <= 32768) {
1605                         comp_block_size = 32768;
1606                 } else {
1607                         panic("hammer2: WRITE PATH: "
1608                               "Weird comp_size value.");
1609                         /* NOT REACHED */
1610                         comp_block_size = pblksize;
1611                 }
1612         }
1613
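             /*
              * Reserve physical storage sized for the (possibly
              * compressed) result; comp_block_size equals pblksize when
              * compression was not used.
              */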
1614         cluster = hammer2_assign_physical(trans, ip, cparent,
1615                                           lbase, comp_block_size,
1616                                           errorp);
1617         if (*errorp) {
1618                 kprintf("WRITE PATH: An error occurred while "
1619                         "assigning physical space.\n");
1620                 KKASSERT(cluster == NULL);
1621                 goto done;
1622         }
1623
1624         if (cluster->ddflag) {
1625                 hammer2_inode_data_t *wipdata;
1626
1627                 wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
1628                 KKASSERT(wipdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1629                 KKASSERT(bp->b_loffset == 0);
1630                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1631                 hammer2_cluster_modsync(cluster);
1632         } else
1633         for (i = 0; i < cluster->nchains; ++i) {
1634                 hammer2_io_t *dio;
1635                 char *bdata;
1636
1637                 /* XXX hack */
1638
1639                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1640                         continue;
1641                 chain = cluster->array[i].chain;        /* XXX */
1642                 if (chain == NULL)
1643                         continue;
1644                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1645
1646                 switch(chain->bref.type) {
1647                 case HAMMER2_BREF_TYPE_INODE:
1648                         panic("hammer2_compress_and_write: "
                                   "unexpected inode\n");
1649                         break;
1650                 case HAMMER2_BREF_TYPE_DATA:
1651                         /*
1652                          * Optimize out the read-before-write
1653                          * if possible.
1654                          */
1655                         *errorp = hammer2_io_newnz(chain->hmp,
1656                                                    chain->bref.data_off,
1657                                                    chain->bytes,
1658                                                    &dio);
1659                         if (*errorp) {
1660                                 hammer2_io_brelse(&dio);
1661                                 kprintf("hammer2: WRITE PATH: "
1662                                         "dbp bread error\n");
1663                                 break;
1664                         }
1665                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1666
1667                         /*
1668                          * When loading the block make sure we don't
1669                          * leave garbage after the compressed data.
1670                          */
1671                         if (comp_size) {
1672                                 chain->bref.methods =
1673                                         HAMMER2_ENC_COMP(comp_algo) +
1674                                         HAMMER2_ENC_CHECK(check_algo);
1675                                 bcopy(comp_buffer, bdata, comp_size);
1676                                 if (comp_size != comp_block_size) {
1677                                         bzero(bdata + comp_size,
1678                                               comp_block_size - comp_size);
1679                                 }
1680                         } else {
1681                                 chain->bref.methods =
1682                                         HAMMER2_ENC_COMP(
1683                                                 HAMMER2_COMP_NONE) +
1684                                         HAMMER2_ENC_CHECK(check_algo);
1685                                 bcopy(bp->b_data, bdata, pblksize);
1686                         }
1687
1688                         /*
1689                          * The flush code doesn't calculate check codes for
1690                          * file data (doing so can result in excessive I/O),
1691                          * so we do it here.
1692                          */
1693                         hammer2_chain_setcheck(chain, bdata);
1694
1695                         /*
1696                          * Device buffer is now valid, chain is no longer in
1697                          * the initial state.
1698                          *
1699                          * (No blockref table worries with file data)
1700                          */
1701                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1702
1703                         /* Now write out the related device buffer. */
1704                         if (ioflag & IO_SYNC) {
1705                                 /*
1706                                  * Synchronous I/O requested.
1707                                  */
1708                                 hammer2_io_bwrite(&dio);
1709                         /*
1710                         } else if ((ioflag & IO_DIRECT) &&
1711                                    loff + n == pblksize) {
1712                                 hammer2_io_bdwrite(&dio);
1713                         */
1714                         } else if (ioflag & IO_ASYNC) {
1715                                 hammer2_io_bawrite(&dio);
1716                         } else {
1717                                 hammer2_io_bdwrite(&dio);
1718                         }
1719                         break;
1720                 default:
1721                         panic("hammer2_compress_and_write: "
1722                               "bad chain type %d\n", chain->bref.type);
1723                         /* NOT REACHED */
1724                         break;
1725                 }
1726         }
1727 done:
1728         if (cluster) {
1729                 hammer2_cluster_unlock(cluster);
1730                 hammer2_cluster_drop(cluster);
1731         }
1732         if (comp_buffer)
1733                 objcache_put(cache_buffer_write, comp_buffer);
1734 }
1735
1736 /*
1737  * Function that performs zero-checking and writing without compression;
1738  * it corresponds to the default zero-checking path.
1739  */
1740 static
1741 void
1742 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1743         hammer2_inode_t *ip,
1744         hammer2_cluster_t *cparent,
1745         hammer2_key_t lbase, int ioflag, int pblksize, int *errorp,
1746         int check_algo)
1747 {
1748         hammer2_cluster_t *cluster;
1749
1750         if (test_block_zeros(bp->b_data, pblksize)) {
1751                 zero_write(bp, trans, ip, cparent, lbase, errorp);
1752         } else {
1753                 cluster = hammer2_assign_physical(trans, ip, cparent,
1754                                                   lbase, pblksize, errorp);
1755                 hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp,
1756                                  check_algo);
1757                 if (cluster) {
1758                         hammer2_cluster_unlock(cluster);
1759                         hammer2_cluster_drop(cluster);
1760                 }
1761         }
1762 }
1763
1764 /*
1765  * Test whether a block of data contains only zeros.  Returns TRUE
1766  * (non-zero) if the block is all zeros.
1767  */
1768 static
1769 int
1770 test_block_zeros(const char *buf, size_t bytes)
1771 {
1772         size_t i;
1773
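             /*
              * Scan one long at a time.  Block sizes passed in here are
              * powers of two of at least HAMMER2_ALLOC_MIN bytes, so
              * bytes is always a multiple of sizeof(long).
              */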
1774         for (i = 0; i < bytes; i += sizeof(long)) {
1775                 if (*(const long *)(buf + i) != 0)
1776                         return (0);
1777         }
1778         return (1);
1779 }
1780
1781 /*
1782  * "Write" a block that contains only zeros: inode-embedded data is
      * zeroed in place, otherwise any backing block is deleted so the
      * logical range reads back as zeros.
1783  */
1784 static
1785 void
1786 zero_write(struct buf *bp, hammer2_trans_t *trans,
1787            hammer2_inode_t *ip,
1788            hammer2_cluster_t *cparent,
1789            hammer2_key_t lbase, int *errorp __unused)
1790 {
1791         hammer2_cluster_t *cluster;
1792         hammer2_key_t key_dummy;
1793
1794         cparent = hammer2_cluster_lookup_init(cparent, 0);
1795         cluster = hammer2_cluster_lookup(cparent, &key_dummy, lbase, lbase,
1796                                      HAMMER2_LOOKUP_NODATA);
1797         if (cluster) {
1798                 if (cluster->ddflag) {
1799                         hammer2_inode_data_t *wipdata;
1800
1801                         wipdata = hammer2_cluster_modify_ip(trans, ip,
1802                                                             cluster, 0);
1803                         KKASSERT(wipdata->meta.op_flags &
1804                                  HAMMER2_OPFLAG_DIRECTDATA);
1805                         KKASSERT(bp->b_loffset == 0);
1806                         bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1807                         hammer2_cluster_modsync(cluster);
1808                 } else {
1809                         hammer2_cluster_delete(trans, cparent, cluster,
1810                                                HAMMER2_DELETE_PERMANENT);
1811                 }
1812                 hammer2_cluster_unlock(cluster);
1813                 hammer2_cluster_drop(cluster);
1814         }
1815         hammer2_cluster_lookup_done(cparent);
1816 }
1817
1818 /*
1819  * Write the data as-is, without performing any sort of compression.
1820  * This function is used in the no-compression path and in the default
1821  * zero-checking path.
1822  */
1823 static
1824 void
1825 hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
1826                                 int pblksize, int *errorp, int check_algo)
1827 {
1828         hammer2_chain_t *chain;
1829         hammer2_inode_data_t *wipdata;
1830         hammer2_io_t *dio;
1831         char *bdata;
1832         int error;
1833         int i;
1834
1835         error = 0;      /* XXX TODO below */
1836
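             /*
              * Write out each cluster element modified by the frontend
              * (CITEM_FEMOD).  Empty slots and non-FEMOD elements are
              * skipped; any chain written here must already be in the
              * MODIFIED state.
              */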
1837         for (i = 0; i < cluster->nchains; ++i) {
1838                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1839                         continue;
1840                 chain = cluster->array[i].chain;        /* XXX */
1841                 if (chain == NULL)
1842                         continue;
1843                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1844
1845                 switch(chain->bref.type) {
1846                 case HAMMER2_BREF_TYPE_INODE:
1847                         wipdata = &hammer2_chain_wdata(chain)->ipdata;
1848                         KKASSERT(wipdata->meta.op_flags &
1849                                  HAMMER2_OPFLAG_DIRECTDATA);
1850                         KKASSERT(bp->b_loffset == 0);
1851                         bcopy(bp->b_data, wipdata->u.data,
1852                               HAMMER2_EMBEDDED_BYTES);
1853                         error = 0;
1854                         break;
1855                 case HAMMER2_BREF_TYPE_DATA:
1856                         error = hammer2_io_newnz(chain->hmp,
1857                                                  chain->bref.data_off,
1858                                                  chain->bytes, &dio);
1859                         if (error) {
1860                                 hammer2_io_bqrelse(&dio);
1861                                 kprintf("hammer2: WRITE PATH: "
1862                                         "dbp bread error\n");
1863                                 break;
1864                         }
1865                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1866
1867                         chain->bref.methods = HAMMER2_ENC_COMP(
1868                                                         HAMMER2_COMP_NONE) +
1869                                               HAMMER2_ENC_CHECK(check_algo);
1870                         bcopy(bp->b_data, bdata, chain->bytes);
1871
1872                         /*
1873                          * The flush code doesn't calculate check codes for
1874                          * file data (doing so can result in excessive I/O),
1875                          * so we do it here.
1876                          */
1877                         hammer2_chain_setcheck(chain, bdata);
1878
1879                         /*
1880                          * Device buffer is now valid, chain is no longer in
1881                          * the initial state.
1882                          *
1883                          * (No blockref table worries with file data)
1884                          */
1885                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1886
1887                         if (ioflag & IO_SYNC) {
1888                                 /*
1889                                  * Synchronous I/O requested.
1890                                  */
1891                                 hammer2_io_bwrite(&dio);
1892                         /*
1893                         } else if ((ioflag & IO_DIRECT) &&
1894                                    loff + n == pblksize) {
1895                                 hammer2_io_bdwrite(&dio);
1896                         */
1897                         } else if (ioflag & IO_ASYNC) {
1898                                 hammer2_io_bawrite(&dio);
1899                         } else {
1900                                 hammer2_io_bdwrite(&dio);
1901                         }
1902                         break;
1903                 default:
1904                         panic("hammer2_write_bp: bad chain type %d\n",
1905                               chain->bref.type);
1906                         /* NOT REACHED */
1907                         error = 0;
1908                         break;
1909                 }
1910                 KKASSERT(error == 0);   /* XXX TODO */
1911         }
1912         *errorp = error;
1913 }
1914
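     /*
      * Remount support.  The only transition acted on here is read-only
      * to read-write (MNTK_WANTRDWR), which requires running the
      * mount-time recovery pass.
      */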
1915 static
1916 int
1917 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path,
1918                 struct vnode *devvp, struct ucred *cred)
1919 {
1920         int error;
1921
1922         if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
1923                 error = hammer2_recovery(hmp);
1924         } else {
1925                 error = 0;
1926         }
1927         return error;
1928 }
1929
1930 static
1931 int
1932 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1933 {
1934         hammer2_pfs_t *pmp;
1935         int flags;
1936         int error = 0;
1937
1938         pmp = MPTOPMP(mp);
1939
1940         if (pmp == NULL)
1941                 return(0);
1942
1943         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1944
1945         /*
1946          * If mount initialization proceeded far enough we must flush
1947          * its vnodes and sync the underlying mount points.  Three syncs
1948          * are required to fully flush the filesystem (freemap updates lag
1949          * by one flush, and one extra for safety).
1950          */
1951         if (mntflags & MNT_FORCE)
1952                 flags = FORCECLOSE;
1953         else
1954                 flags = 0;
1955         if (pmp->iroot) {
1956                 error = vflush(mp, 0, flags);
1957                 if (error)
1958                         goto failed;
1959                 hammer2_vfs_sync(mp, MNT_WAIT);
1960                 hammer2_vfs_sync(mp, MNT_WAIT);
1961                 hammer2_vfs_sync(mp, MNT_WAIT);
1962         }
1963
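             /*
              * Shut down the write thread: flag wthread_destroy, wake
              * the thread, and wait for it to acknowledge by setting
              * the flag to -1 on its way out.
              */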
1964         if (pmp->wthread_td) {
1965                 hammer2_mtx_ex(&pmp->wthread_mtx);
1966                 pmp->wthread_destroy = 1;
1967                 wakeup(&pmp->wthread_bioq);
1968                 while (pmp->wthread_destroy != -1) {
1969                         mtxsleep(&pmp->wthread_destroy,
1970                                 &pmp->wthread_mtx, 0,
1971                                 "umount-sleep", 0);
1972                 }
1973                 hammer2_mtx_unlock(&pmp->wthread_mtx);
1974                 pmp->wthread_td = NULL;
1975         }
1976
1977         /*
1978          * Cleanup our reference on ihidden.
1979          */
1980         if (pmp->ihidden) {
1981                 hammer2_inode_drop(pmp->ihidden);
1982                 pmp->ihidden = NULL;
1983         }
1984         if (pmp->mp)
1985                 hammer2_unmount_helper(mp, pmp, NULL);
1986
1987         error = 0;
1988 failed:
1989         lockmgr(&hammer2_mntlk, LK_RELEASE);
1990
1991         return (error);
1992 }
1993
1994 /*
1995  * Mount helper, hook the system mount into our PFS.
1996  * The mount lock is held.
1997  *
1998  * We must bump the mount_count on related devices for any
1999  * mounted PFSs.
2000  */
2001 static
2002 void
2003 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
2004 {
2005         hammer2_cluster_t *cluster;
2006         hammer2_chain_t *rchain;
2007         int i;
2008
2009         mp->mnt_data = (qaddr_t)pmp;
2010         pmp->mp = mp;
2011
2012         /*
2013          * After pmp->mp is set we have to adjust hmp->mount_count.
2014          */
2015         cluster = &pmp->iroot->cluster;
2016         for (i = 0; i < cluster->nchains; ++i) {
2017                 rchain = cluster->array[i].chain;
2018                 if (rchain == NULL)
2019                         continue;
2020                 ++rchain->hmp->mount_count;
2021                 kprintf("hammer2_mount hmp=%p ++mount_count=%d\n",
2022                         rchain->hmp, rchain->hmp->mount_count);
2023         }
2024 }
2025
2026 /*
2027  * Mount helper, unhook the system mount from our PFS.
2028  * The mount lock is held.
2029  *
2030  * If hmp is supplied, a mount responsible for being the first to open
2031  * the block device failed, and the block device and all PFSs using the
2032  * block device must be cleaned up.
2033  *
2034  * If pmp is supplied, multiple devices might be backing the PFS and each
2035  * must be disconnected.  This might not be the last PFS using some of the
2036  * underlying devices.  Also, we have to adjust our hmp->mount_count
2037  * accounting for the devices backing the pmp which is now undergoing an
2038  * unmount.
2039  */
2040 static
2041 void
2042 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
2043 {
2044         hammer2_cluster_t *cluster;
2045         hammer2_chain_t *rchain;
2046         struct vnode *devvp;
2047         int dumpcnt;
2048         int ronly = 0;
2049         int i;
2050
2051         /*
2052          * If no device is supplied this is a high-level unmount and we
2053          * have to disconnect the mount, adjust mount_count, and locate devices
2054          * that might now have no mounts.
2055          */
2056         if (pmp) {
2057                 KKASSERT(hmp == NULL);
2058                 KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
2059                 pmp->mp = NULL;
2060                 mp->mnt_data = NULL;
2061
2062                 /*
2063                  * After pmp->mp is cleared we have to account for
2064                  * mount_count.
2065                  */
2066                 cluster = &pmp->iroot->cluster;
2067                 for (i = 0; i < cluster->nchains; ++i) {
2068                         rchain = cluster->array[i].chain;
2069                         if (rchain == NULL)
2070                                 continue;
2071                         --rchain->hmp->mount_count;
2072                         kprintf("hammer2_unmount hmp=%p --mount_count=%d\n",
2073                                 rchain->hmp, rchain->hmp->mount_count);
2074                         /* scrapping hmp now may invalidate the pmp */
2075                 }
2076 again:
2077                 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
2078                         if (hmp->mount_count == 0) {
2079                                 hammer2_unmount_helper(NULL, NULL, hmp);
2080                                 goto again;
2081                         }
2082                 }
2083                 return;
2084         }
2085
2086         /*
2087          * Try to terminate the block device.  We can't terminate it if
2088          * there are still PFSs referencing it.
2089          */
2090         kprintf("hammer2_unmount hmp=%p mount_count=%d\n",
2091                 hmp, hmp->mount_count);
2092         if (hmp->mount_count)
2093                 return;
2094
2095         hammer2_pfsfree_scan(hmp);
2096         hammer2_dev_exlock(hmp);        /* XXX order */
2097
2098         /*
2099          * Cycle the volume data lock as a safety (probably not needed any
2100          * more).  To ensure everything is out we need to flush at least
2101          * three times: (1) running the unlinkq can dirty the
2102          * filesystem, (2) a normal flush can dirty the freemap, and
2103          * (3) one more flush ensures the freemap itself is fully
          * synchronized.
2104          *
2105          * The next mount's recovery scan can clean everything up but we want
2106          * to leave the filesystem in a 100% clean state on a normal unmount.
2107          */
2108 #if 0
2109         hammer2_voldata_lock(hmp);
2110         hammer2_voldata_unlock(hmp);
2111 #endif
2112         hammer2_iocom_uninit(hmp);
2113
2114         if ((hmp->vchain.flags | hmp->fchain.flags) &
2115             HAMMER2_CHAIN_FLUSH_MASK) {
2116                 kprintf("hammer2_unmount: chains left over "
2117                         "after final sync\n");
2118                 kprintf("    vchain %08x\n", hmp->vchain.flags);
2119                 kprintf("    fchain %08x\n", hmp->fchain.flags);
2120
2121                 if (hammer2_debug & 0x0010)
2122                         Debugger("entered debugger");
2123         }
2124
2125         KKASSERT(hmp->spmp == NULL);
2126
2127         /*
2128          * Finish up with the device vnode
2129          */
2130         if ((devvp = hmp->devvp) != NULL) {
2131                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2132                 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
2133                 hmp->devvp = NULL;
2134                 VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
2135                 vn_unlock(devvp);
2136                 vrele(devvp);
2137                 devvp = NULL;
2138         }
2139
2140         /*
2141          * Clear vchain/fchain flags that might prevent final cleanup
2142          * of these chains.
2143          */
2144         if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
2145                 atomic_clear_int(&hmp->vchain.flags,
2146                                  HAMMER2_CHAIN_MODIFIED);
2147                 hammer2_pfs_memory_wakeup(hmp->vchain.pmp);
2148                 hammer2_chain_drop(&hmp->vchain);
2149         }
2150         if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
2151                 atomic_clear_int(&hmp->vchain.flags,
2152                                  HAMMER2_CHAIN_UPDATE);
2153                 hammer2_chain_drop(&hmp->vchain);
2154         }
2155
2156         if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
2157                 atomic_clear_int(&hmp->fchain.flags,
2158                                  HAMMER2_CHAIN_MODIFIED);
2159                 hammer2_pfs_memory_wakeup(hmp->fchain.pmp);
2160                 hammer2_chain_drop(&hmp->fchain);
2161         }
2162         if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
2163                 atomic_clear_int(&hmp->fchain.flags,
2164                                  HAMMER2_CHAIN_UPDATE);
2165                 hammer2_chain_drop(&hmp->fchain);
2166         }
2167
2168         /*
2169          * Final drop of embedded freemap root chain to
2170          * clean up fchain.core (fchain structure is not
2171          * flagged ALLOCATED so it is cleaned out and then
2172          * left to rot).
2173          */
2174         hammer2_chain_drop(&hmp->fchain);
2175
2176         /*
2177          * Final drop of embedded volume root chain to clean
2178          * up vchain.core (vchain structure is not flagged
2179          * ALLOCATED so it is cleaned out and then left to
2180          * rot).
2181          */
2182         dumpcnt = 50;
2183         hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
2184         dumpcnt = 50;
2185         hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f');
2186         hammer2_dev_unlock(hmp);
2187         hammer2_chain_drop(&hmp->vchain);
2188
2189         hammer2_io_cleanup(hmp, &hmp->iotree);
2190         if (hmp->iofree_count) {
2191                 kprintf("io_cleanup: %d I/Os left hanging\n",
2192                         hmp->iofree_count);
2193         }
2194
2195         TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
2196         kmalloc_destroy(&hmp->mchain);
2197         kfree(hmp, M_HAMMER2);
2198 }
2199
2200 static
2201 int
2202 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
2203              ino_t ino, struct vnode **vpp)
2204 {
2205         kprintf("hammer2_vget\n");
2206         return (EOPNOTSUPP);
2207 }
2208
2209 static
2210 int
2211 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
2212 {
2213         hammer2_pfs_t *pmp;
2214         hammer2_cluster_t *cparent;
2215         int error;
2216         struct vnode *vp;
2217
2218         pmp = MPTOPMP(mp);
2219         if (pmp->iroot == NULL) {
2220                 *vpp = NULL;
2221                 error = EINVAL;
2222         } else {
2223                 cparent = hammer2_inode_lock(pmp->iroot,
2224                                                 HAMMER2_RESOLVE_ALWAYS |
2225                                                 HAMMER2_RESOLVE_SHARED);
2226                 vp = hammer2_igetv(pmp->iroot, cparent, &error);
2227                 hammer2_inode_unlock(pmp->iroot, cparent);
2228                 *vpp = vp;
2229                 if (vp == NULL)
2230                         kprintf("vnodefail\n");
2231         }
2232
2233         return (error);
2234 }
2235
2236 /*
2237  * Filesystem status
2238  *
2239  * XXX incorporate ipdata->meta.inode_quota and data_quota
2240  */
2241 static
2242 int
2243 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
2244 {
2245         hammer2_pfs_t *pmp;
2246         hammer2_dev_t *hmp;
2247         hammer2_blockref_t bref;
2248
2249         pmp = MPTOPMP(mp);
2250         KKASSERT(pmp->iroot->cluster.nchains >= 1);
2251         hmp = pmp->iroot->cluster.focus->hmp;   /* iroot retains focus */
2252         bref = pmp->iroot->cluster.focus->bref; /* no lock */
2253
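             /*
              * Capacity is reported as bytes accounted for under the
              * PFS root (bref.data_count) plus bytes still free in the
              * allocator, converted to block units.
              */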
2254         mp->mnt_stat.f_files = bref.inode_count;
2255         mp->mnt_stat.f_ffree = 0;
2256         mp->mnt_stat.f_blocks = (bref.data_count +
2257                                  hmp->voldata.allocator_free) /
2258                                 mp->mnt_vstat.f_bsize;
2259         mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free /
2260                                 mp->mnt_vstat.f_bsize;
2261         mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
2262
2263         *sbp = mp->mnt_stat;
2264         return (0);
2265 }
2266
2267 static
2268 int
2269 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
2270 {
2271         hammer2_pfs_t *pmp;
2272         hammer2_dev_t *hmp;
2273         hammer2_blockref_t bref;
2274
2275         pmp = MPTOPMP(mp);
2276         KKASSERT(pmp->iroot->cluster.nchains >= 1);
2277         hmp = pmp->iroot->cluster.focus->hmp;   /* iroot retains focus */
2278         bref = pmp->iroot->cluster.focus->bref; /* no lock */
2279
2280         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
2281         mp->mnt_vstat.f_files = bref.inode_count;
2282         mp->mnt_vstat.f_ffree = 0;
2283         mp->mnt_vstat.f_blocks = (bref.data_count +
2284                                  hmp->voldata.allocator_free) /
2285                                 mp->mnt_vstat.f_bsize;
2286         mp->mnt_vstat.f_bfree = hmp->voldata.allocator_free /
2287                                 mp->mnt_vstat.f_bsize;
2288         mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
2289
2290         *sbp = mp->mnt_vstat;
2291         return (0);
2292 }
2293
2294 /*
2295  * Mount-time recovery (RW mounts)
2296  *
2297  * Updates to the free block table are allowed to lag flushes by one
2298  * transaction.  In case of a crash, then on a fresh mount we must do an
2299  * incremental scan of the last committed transaction id and make sure that
2300  * all related blocks have been marked allocated.
2301  *
2302  * The super-root topology and each PFS has its own transaction id domain,
2303  * so we must track PFS boundary transitions.
2304  */
2305 struct hammer2_recovery_elm {
2306         TAILQ_ENTRY(hammer2_recovery_elm) entry;
2307         hammer2_chain_t *chain;
2308         hammer2_tid_t sync_tid;
2309 };
2310
2311 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
2312
2313 struct hammer2_recovery_info {
2314         struct hammer2_recovery_list list;
2315         int     depth;
2316 };
2317
2318 static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2319                         hammer2_chain_t *parent,
2320                         struct hammer2_recovery_info *info,
2321                         hammer2_tid_t sync_tid);
2322
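     /*
      * Bound the recursion of the recovery scan.  Subtrees deeper than
      * this are deferred onto the recovery list and rescanned
      * iteratively by hammer2_recovery().
      */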
2323 #define HAMMER2_RECOVERY_MAXDEPTH       10
2324
2325 static
2326 int
2327 hammer2_recovery(hammer2_dev_t *hmp)
2328 {
2329         hammer2_trans_t trans;
2330         struct hammer2_recovery_info info;
2331         struct hammer2_recovery_elm *elm;
2332         hammer2_chain_t *parent;
2333         hammer2_tid_t sync_tid;
2334         hammer2_tid_t mirror_tid;
2335         int error;
2336         int cumulative_error = 0;
2337
2338         hammer2_trans_init(&trans, hmp->spmp, 0);
2339
2340         sync_tid = hmp->voldata.freemap_tid;
2341         mirror_tid = hmp->voldata.mirror_tid;
2342
2343         kprintf("hammer2 mount \"%s\": ", hmp->devrepname);
2344         if (sync_tid >= mirror_tid) {
2345                 kprintf(" no recovery needed\n");
2346         } else {
2347                 kprintf(" freemap recovery %016jx-%016jx\n",
2348                         sync_tid + 1, mirror_tid);
2349         }
2350
2351         TAILQ_INIT(&info.list);
2352         info.depth = 0;
2353         parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
2354         cumulative_error = hammer2_recovery_scan(&trans, hmp, parent,
2355                                                  &info, sync_tid);
2356         hammer2_chain_lookup_done(parent);
2357
2358         while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
2359                 TAILQ_REMOVE(&info.list, elm, entry);
2360                 parent = elm->chain;
2361                 sync_tid = elm->sync_tid;
2362                 kfree(elm, M_HAMMER2);
2363
2364                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2365                 error = hammer2_recovery_scan(&trans, hmp, parent,
2366                                               &info,
2367                                               hmp->voldata.freemap_tid);
2368                 hammer2_chain_unlock(parent);
2369                 hammer2_chain_drop(parent);     /* drop elm->chain ref */
2370                 if (error)
2371                         cumulative_error = error;
2372         }
2373         hammer2_trans_done(&trans);
2374
2375         return cumulative_error;
2376 }
2377
2378 static
2379 int
2380 hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2381                       hammer2_chain_t *parent,
2382                       struct hammer2_recovery_info *info,
2383                       hammer2_tid_t sync_tid)
2384 {
2385         const hammer2_inode_data_t *ripdata;
2386         hammer2_chain_t *chain;
2387         int cache_index;
2388         int cumulative_error = 0;
2389         int error;
2390
2391         /*
2392          * Adjust freemap to ensure that the block(s) are marked allocated.
2393          */
2394         if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
2395                 hammer2_freemap_adjust(trans, hmp, &parent->bref,
2396                                        HAMMER2_FREEMAP_DORECOVER);
2397         }
2398
2399         /*
2400          * Check type for recursive scan
2401          */
2402         switch(parent->bref.type) {
2403         case HAMMER2_BREF_TYPE_VOLUME:
2404                 /* data already instantiated */
2405                 break;
2406         case HAMMER2_BREF_TYPE_INODE:
2407                 /*
2408                  * Must instantiate data for DIRECTDATA test and also
2409                  * for recursion.
2410                  */
2411                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2412                 ripdata = &hammer2_chain_rdata(parent)->ipdata;
2413                 if (ripdata->meta.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2414                         /* not applicable to recovery scan */
2415                         hammer2_chain_unlock(parent);
2416                         return 0;
2417                 }
2418                 hammer2_chain_unlock(parent);
2419                 break;
2420         case HAMMER2_BREF_TYPE_INDIRECT:
2421                 /*
2422                  * Must instantiate data for recursion
2423                  */
2424                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2425                 hammer2_chain_unlock(parent);
2426                 break;
2427         case HAMMER2_BREF_TYPE_DATA:
2428         case HAMMER2_BREF_TYPE_FREEMAP:
2429         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2430         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2431                 /* not applicable to recovery scan */
2432                 return 0;
2434         default:
2435                 return EDOM;
2436         }
2437
2438         /*
2439          * Defer operation if depth limit reached or if we are crossing a
2440          * PFS boundary.
2441          */
2442         if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) {
2443                 struct hammer2_recovery_elm *elm;
2444
2445                 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
2446                 elm->chain = parent;
2447                 elm->sync_tid = sync_tid;
2448                 hammer2_chain_ref(parent);
2449                 TAILQ_INSERT_TAIL(&info->list, elm, entry);
2450                 /* unlocked by caller */
2451
2452                 return(0);
2453         }
2454
2455
2456         /*
2457          * Recursive scan of the last flushed transaction only.  We are
2458          * doing this without pmp assignments so don't leave the chains
2459          * hanging around after we are done with them.
2460          */
2461         cache_index = 0;
2462         chain = hammer2_chain_scan(parent, NULL, &cache_index,
2463                                    HAMMER2_LOOKUP_NODATA);
2464         while (chain) {
2465                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
2466                 if (chain->bref.mirror_tid > sync_tid) {
2467                         ++info->depth;
2468                         error = hammer2_recovery_scan(trans, hmp, chain,
2469                                                       info, sync_tid);
2470                         --info->depth;
2471                         if (error)
2472                                 cumulative_error = error;
2473                 }
2474
2475                 /*
2476                  * Flush the recovery at the PFS boundary to stage it for
2477                  * the final flush of the super-root topology.
2478                  */
2479                 if ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
2480                     (chain->flags & HAMMER2_CHAIN_ONFLUSH)) {
2481                         hammer2_flush(trans, chain, 1);
2482                 }
2483                 chain = hammer2_chain_scan(parent, chain, &cache_index,
2484                                            HAMMER2_LOOKUP_NODATA);
2485         }
2486
2487         return cumulative_error;
2488 }
2489
2490 /*
2491  * Sync a mount point; this is called on a per-mount basis from the
2492  * filesystem syncer process periodically and whenever a user issues
2493  * a sync.
2494  */
2495 int
2496 hammer2_vfs_sync(struct mount *mp, int waitfor)
2497 {
2498         struct hammer2_sync_info info;
2499         hammer2_inode_t *iroot;
2500         hammer2_chain_t *chain;
2501         hammer2_chain_t *parent;
2502         hammer2_pfs_t *pmp;
2503         hammer2_dev_t *hmp;
2504         int flags;
2505         int error;
2506         int total_error;
2507         int i;
2508         int j;
2509
2510         pmp = MPTOPMP(mp);
2511         iroot = pmp->iroot;
2512         KKASSERT(iroot);
2513         KKASSERT(iroot->pmp == pmp);
2514
2515         /*
2516          * We can't acquire locks on existing vnodes while in a transaction
2517          * without risking a deadlock.  This assumes that vfsync() can be
2518          * called without the vnode locked (which it can in DragonFly).
2519          * Otherwise we'd have to implement a multi-pass or flag the lock
2520          * failures and retry.
2521          *
2522          * The reclamation code interlocks with the sync list's token
2523          * (by removing the vnode from the scan list) before unlocking
2524          * the inode, giving us time to ref the inode.
2525          */
2526         /*flags = VMSC_GETVP;*/
2527         flags = 0;
2528         if (waitfor & MNT_LAZY)
2529                 flags |= VMSC_ONEPASS;
2530
2531 #if 0
2532         /*
2533          * Preflush the vnodes using a normal transaction before interlocking
2534          * with a flush transaction.
2535          */
2536         hammer2_trans_init(&info.trans, pmp, 0);
2537         info.error = 0;
2538         info.waitfor = MNT_NOWAIT;
2539         vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
2540         hammer2_trans_done(&info.trans);
2541 #endif
2542
2543         /*
2544          * Start our flush transaction.  This does not return until all
2545          * concurrent transactions have completed and will prevent any
2546          * new transactions from running concurrently, except for the
2547          * buffer cache transactions.
2548          *
2549          * For efficiency do an async pass before making sure with a
2550          * synchronous pass on all related buffer cache buffers.  It
2551          * should theoretically not be possible for any new file buffers
2552          * to be instantiated during this sequence.
2553          */
2554         hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH |
2555                                              HAMMER2_TRANS_PREFLUSH);
2556         hammer2_run_unlinkq(&info.trans, pmp);
2557
2558         info.error = 0;
2559         info.waitfor = MNT_NOWAIT;
2560         vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
2561         info.waitfor = MNT_WAIT;
2562         vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2563
2564         /*
2565          * Clear PREFLUSH.  This prevents (or asserts on) any new logical
2566          * buffer cache flushes which occur during the flush.  Device buffers
2567          * are not affected.
2568          */
2569         hammer2_bioq_sync(info.trans.pmp);
2570         atomic_clear_int(&info.trans.flags, HAMMER2_TRANS_PREFLUSH);
2571
2572         total_error = 0;
2573
2574         /*
2575          * Flush all nodes to synchronize the PFSROOT subtopology to the media.
2576          *
2577          * Note that this flush will not be visible on crash recovery until
2578          * we flush the super-root topology in the next loop.
2579          */
2580         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2581                 chain = iroot->cluster.array[i].chain;
2582                 if (chain == NULL)
2583                         continue;
2584
2585                 hammer2_chain_ref(chain);
2586                 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
2587                 if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
2588                         hammer2_flush(&info.trans, chain, 1);
2589                         parent = chain->parent;
2590                         KKASSERT(chain->pmp != parent->pmp);
2591                         hammer2_chain_setflush(&info.trans, parent);
2592                 }
2593                 hammer2_chain_unlock(chain);
2594                 hammer2_chain_drop(chain);
2595         }
2596         hammer2_trans_done(&info.trans);
2597
2598         /*
2599          * Flush all volume roots to synchronize PFS flushes with the
2600          * storage media volume header.  This will flush the freemap and
2601          * the superroot topology but stops when it reaches a PFSROOT
2602          * (which we already flushed above).
2603          *
2604          * This is the last step which connects the volume root to the
2605          * PFSROOT dirs flushed above.
2606          *
2607          * Each spmp (representing the hmp's super-root) requires its own
2608          * transaction.
2609          */
2610         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2611                 hammer2_chain_t *tmp;
2612
2613                 chain = iroot->cluster.array[i].chain;
2614                 if (chain == NULL)
2615                         continue;
2616
2617                 hmp = chain->hmp;
2618
2619                 /*
2620                  * We only have to flush each hmp once
2621                  */
2622                 for (j = i - 1; j >= 0; --j) {
2623                         if ((tmp = iroot->cluster.array[j].chain) != NULL) {
2624                                 if (tmp->hmp == hmp)
2625                                         break;
2626                         }
2627                 }
2628                 if (j >= 0)
2629                         continue;
2630
2631                 /*
2632                  * spmp transaction.  The super-root is never directly
2633                  * mounted so there shouldn't be any vnodes, let alone any
2634                  * dirty vnodes associated with it.
2635                  */
2636                 hammer2_trans_init(&info.trans, hmp->spmp,
2637                                    HAMMER2_TRANS_ISFLUSH);
2638
2639                 /*
2640                  * Media mounts have two 'roots', vchain for the topology
2641                  * and fchain for the free block table.  Flush both.
2642                  *
2643                  * Note that the topology and free block table are handled
2644                  * independently, so the free block table can wind up being
2645                  * ahead of the topology.  We depend on the bulk free scan
2646                  * code to deal with any loose ends.
2647                  */
2648                 hammer2_chain_ref(&hmp->vchain);
2649                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2650                 hammer2_chain_ref(&hmp->fchain);
2651                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2652                 if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
2653                         /*
2654                          * This will also modify vchain as a side effect,
2655                          * mark vchain as modified now.
2656                          */
2657                         hammer2_voldata_modify(hmp);
2658                         chain = &hmp->fchain;
2659                         hammer2_flush(&info.trans, chain, 1);
2660                         KKASSERT(chain == &hmp->fchain);
2661                 }
2662                 hammer2_chain_unlock(&hmp->fchain);
2663                 hammer2_chain_unlock(&hmp->vchain);
2664                 hammer2_chain_drop(&hmp->fchain);
2665                 /* vchain dropped down below */
2666
2667                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2668                 if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
2669                         chain = &hmp->vchain;
2670                         hammer2_flush(&info.trans, chain, 1);
2671                         KKASSERT(chain == &hmp->vchain);
2672                 }
2673                 hammer2_chain_unlock(&hmp->vchain);
2674                 hammer2_chain_drop(&hmp->vchain);
2675
2677
2678                 /*
2679                  * We can't safely flush the volume header until we have
2680                  * flushed any device buffers which have built up.
2681                  *
2682                  * XXX this is not incremental
2683                  */
2684                 vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
2685                 error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
2686                 vn_unlock(hmp->devvp);
2687
2688                 /*
2689                  * The flush code sets CHAIN_VOLUMESYNC to indicate that the
2690                  * volume header needs synchronization via hmp->volsync.
2691                  *
2692                  * XXX synchronize the flag & data with only this flush XXX
2693                  */
2694                 if (error == 0 &&
2695                     (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
2696                         struct buf *bp;
2697
2698                         /*
2699                          * Synchronize the disk before flushing the volume
2700                          * header.
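                         * A zero-length BUF_CMD_FLUSH bio acts as a
                         * device-level cache flush, and BIO_SYNC lets us
                         * biowait() for its completion.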
2701                          */
2702                         bp = getpbuf(NULL);
2703                         bp->b_bio1.bio_offset = 0;
2704                         bp->b_bufsize = 0;
2705                         bp->b_bcount = 0;
2706                         bp->b_cmd = BUF_CMD_FLUSH;
2707                         bp->b_bio1.bio_done = biodone_sync;
2708                         bp->b_bio1.bio_flags |= BIO_SYNC;
2709                         vn_strategy(hmp->devvp, &bp->b_bio1);
2710                         biowait(&bp->b_bio1, "h2vol");
2711                         relpbuf(bp, NULL);
2712
2713                         /*
2714                          * Then we can safely flush the version of the
2715                          * volume header synchronized by the flush code.
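                         * Volume headers rotate round-robin among up to
                         * HAMMER2_NUM_VOLHDRS slots; a slot that would fall
                         * beyond the end of a small volume wraps back to
                         * copy #0.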
2716                          */
2717                         i = hmp->volhdrno + 1;
2718                         if (i >= HAMMER2_NUM_VOLHDRS)
2719                                 i = 0;
2720                         if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
2721                             hmp->volsync.volu_size) {
2722                                 i = 0;
2723                         }
2724                         kprintf("sync volhdr %d %jd\n",
2725                                 i, (intmax_t)hmp->volsync.volu_size);
2726                         bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2727                                     HAMMER2_PBUFSIZE, 0, 0);
2728                         atomic_clear_int(&hmp->vchain.flags,
2729                                          HAMMER2_CHAIN_VOLUMESYNC);
2730                         bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
2731                         bawrite(bp);
2732                         hmp->volhdrno = i;
2733                 }
2734                 if (error)
2735                         total_error = error;
2736
2737                 hammer2_trans_done(&info.trans);        /* spmp trans */
2738         }
2739         return (total_error);
2740 }
2741
2742 /*
2743  * Sync pass - per-vnode callback for the filesystem sync scan.
2744  */
2745 static int
2746 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2747 {
2748         struct hammer2_sync_info *info = data;
2749         hammer2_inode_t *ip;
2751
2752         /*
2753          * Degenerate cases.  Note that ip == NULL typically means the
2754          * syncer vnode itself and we don't want to vclrisdirty() in that
2755          * situation.
2756          */
2757         ip = VTOI(vp);
2758         if (ip == NULL) {
2759                 return(0);
2760         }
2761         if (vp->v_type == VNON || vp->v_type == VBAD) {
2762                 vclrisdirty(vp);
2763                 return(0);
2764         }
2765
2766         /*
2767          * VOP_FSYNC will start a new transaction so replicate some code
2768          * here to do it inline (see hammer2_vop_fsync()).
2769          *
2770          * WARNING: The vfsync interacts with the buffer cache and might
2771          *          block, we can't hold the inode lock at that time.
2772          *          However, we MUST ref ip before blocking to ensure that
2773          *          it isn't ripped out from under us (since we do not
2774          *          hold a lock on the vnode).
2775          */
2776         hammer2_inode_ref(ip);
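        /*
         * Clear MODIFIED up front; if a racing write sets it again,
         * or dirty buffers remain, run the vfsync below.
         */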
2777         atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
2778         if ((ip->flags & HAMMER2_INODE_MODIFIED) ||
2779             !RB_EMPTY(&vp->v_rbdirty_tree)) {
2780                 vfsync(vp, info->waitfor, 1, NULL, NULL);
2781         }
2782         if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
2783             RB_EMPTY(&vp->v_rbdirty_tree)) {
2784                 vclrisdirty(vp);
2785         }
2786
2787         hammer2_inode_drop(ip);
2793         return(0);
2794 }
2795
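/*
 * NFS export support is not implemented; these entry points are
 * currently stubs.
 */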
2796 static
2797 int
2798 hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
2799 {
2800         return (0);
2801 }
2802
2803 static
2804 int
2805 hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
2806                struct fid *fhp, struct vnode **vpp)
2807 {
2808         return (0);
2809 }
2810
2811 static
2812 int
2813 hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
2814                  int *exflagsp, struct ucred **credanonp)
2815 {
2816         return (0);
2817 }
2818
2819 /*
2820  * Support code for hammer2_vfs_mount().  Read, verify, and install the volume
2821  * header into the HMP
2822  *
2823  * XXX read four volhdrs and use the one with the highest TID whose CRC
2824  *     matches.
2825  *
2826  * XXX check iCRCs.
2827  *
2828  * XXX For filesystems w/ fewer than 4 volhdrs, make sure not to write
2829  *     to nonexistent locations.
2830  *
2831  * XXX Record selected volhdr and ring updates to each of 4 volhdrs
2832  */
2833 static
2834 int
2835 hammer2_install_volume_header(hammer2_dev_t *hmp)
2836 {
2837         hammer2_volume_data_t *vd;
2838         struct buf *bp;
2839         hammer2_crc32_t crc0, crc, bcrc0, bcrc;
2840         int error_reported;
2841         int error;
2842         int valid;
2843         int i;
2844
2845         error_reported = 0;
2846         error = 0;
2847         valid = 0;
2848         bp = NULL;
2849
2850         /*
2851          * There are up to 4 copies of the volume header (syncs iterate
2852          * between them so there is no single master).  We don't trust the
2853          * volu_size field, so we don't know precisely how large the
2854          * filesystem is; depend on the OS to return an error if we read
2855          * beyond the block device's EOF.
2856          */
2857         for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
2858                 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2859                               HAMMER2_VOLUME_BYTES, &bp);
2860                 if (error) {
2861                         brelse(bp);
2862                         bp = NULL;
2863                         continue;
2864                 }
2865
2866                 vd = (struct hammer2_volume_data *) bp->b_data;
2867                 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
2868                     (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
2869                         brelse(bp);
2870                         bp = NULL;
2871                         continue;
2872                 }
2873
2874                 if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
2875                         /* XXX: Reversed-endianness filesystem */
2876                         kprintf("hammer2: reverse-endian filesystem detected\n");
2877                         brelse(bp);
2878                         bp = NULL;
2879                         continue;
2880                 }
2881
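                /*
                 * Verify both independently-CRCed sections of the
                 * volume header before trusting any of its fields.
                 */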
2882                 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
2883                 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
2884                                       HAMMER2_VOLUME_ICRC0_SIZE);
2885                 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
2886                 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
2887                                        HAMMER2_VOLUME_ICRC1_SIZE);
2888                 if ((crc0 != crc) || (bcrc0 != bcrc)) {
2889                         kprintf("hammer2 volume header crc "
2890                                 "mismatch copy #%d %08x/%08x %08x/%08x\n",
2891                                 i, crc0, crc, bcrc0, bcrc);
2892                         error_reported = 1;
2893                         brelse(bp);
2894                         bp = NULL;
2895                         continue;
2896                 }
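                /*
                 * Among the copies whose CRCs validate, prefer the one
                 * with the highest mirror_tid.
                 */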
2897                 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
2898                         valid = 1;
2899                         hmp->voldata = *vd;
2900                         hmp->volhdrno = i;
2901                 }
2902                 brelse(bp);
2903                 bp = NULL;
2904         }
2905         if (valid) {
2906                 hmp->volsync = hmp->voldata;
2907                 error = 0;
2908                 if (error_reported || bootverbose || 1) { /* 1/DEBUG */
2909                         kprintf("hammer2: using volume header #%d\n",
2910                                 hmp->volhdrno);
2911                 }
2912         } else {
2913                 error = EINVAL;
2914                 kprintf("hammer2: no valid volume headers found!\n");
2915         }
2916         return (error);
2917 }
2918
2919 /*
2920  * This handles hysteresis on regular file flushes.  Because the BIOs
2921  * are routed to a thread, an excessive number can build up and cause
2922  * long front-end stalls long before the runningbuffspace limit is hit,
2923  * so we implement hammer2_flush_pipe to control the hysteresis.
2925  *
2926  * This is a particular problem when compression is used.
2927  */
2928 void
2929 hammer2_lwinprog_ref(hammer2_pfs_t *pmp)
2930 {
2931         atomic_add_int(&pmp->count_lwinprog, 1);
2932 }
2933
2934 void
2935 hammer2_lwinprog_drop(hammer2_pfs_t *pmp)
2936 {
2937         int lwinprog;
2938
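        /*
         * Wake any waiter once the pipe drains to 2/3 of
         * hammer2_flush_pipe.
         */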
2939         lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
2940         if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
2941             (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
2942                 atomic_clear_int(&pmp->count_lwinprog,
2943                                  HAMMER2_LWINPROG_WAITING);
2944                 wakeup(&pmp->count_lwinprog);
2945         }
2946 }
2947
2948 void
2949 hammer2_lwinprog_wait(hammer2_pfs_t *pmp)
2950 {
2951         int lwinprog;
2952
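        /*
         * Interlocked sleep: set WAITING and re-test the count before
         * sleeping so a racing hammer2_lwinprog_drop() cannot lose
         * the wakeup.
         */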
2953         for (;;) {
2954                 lwinprog = pmp->count_lwinprog;
2955                 cpu_ccfence();
2956                 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2957                         break;
2958                 tsleep_interlock(&pmp->count_lwinprog, 0);
2959                 atomic_set_int(&pmp->count_lwinprog, HAMMER2_LWINPROG_WAITING);
2960                 lwinprog = pmp->count_lwinprog;
2961                 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2962                         break;
2963                 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
2964         }
2965 }
2966
2967 /*
2968  * Manage excessive memory resource use for chain and related
2969  * structures.
2970  */
2971 void
2972 hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
2973 {
2974         uint32_t waiting;
2975         uint32_t count;
2976         uint32_t limit;
2977 #if 0
2978         static int zzticks;
2979 #endif
2980
2981         /*
2982          * Atomically check the dirty-chain count and wait.  Also speed
2983          * up the syncer early to try to avoid hitting the wait.
2984          */
2985         for (;;) {
2986                 waiting = pmp->inmem_dirty_chains;
2987                 cpu_ccfence();
2988                 count = waiting & HAMMER2_DIRTYCHAIN_MASK;
2989
2990                 limit = pmp->mp->mnt_nvnodelistsize / 10;
2991                 if (limit < hammer2_limit_dirty_chains)
2992                         limit = hammer2_limit_dirty_chains;
2993                 if (limit < 1000)
2994                         limit = 1000;
2995
2996 #if 0
2997                 if ((int)(ticks - zzticks) > hz) {
2998                         zzticks = ticks;
2999                         kprintf("count %u %u\n", count, limit);
3000                 }
3001 #endif
3002
3003                 /*
3004                  * Block if there are too many dirty chains present, wait
3005                  * for the flush to clean some out.
3006                  */
3007                 if (count > limit) {
3008                         tsleep_interlock(&pmp->inmem_dirty_chains, 0);
3009                         if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
3010                                                waiting,
3011                                        waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
3012                                 speedup_syncer(pmp->mp);
3013                                 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
3014                                        "chnmem", hz);
3015                         }
3016                         continue;       /* loop on success or fail */
3017                 }
3018
3019                 /*
3020                  * Try to start an early flush before we are forced to block.
3021                  */
3022                 if (count > limit * 7 / 10)
3023                         speedup_syncer(pmp->mp);
3024                 break;
3025         }
3026 }
3027
3028 void
3029 hammer2_pfs_memory_inc(hammer2_pfs_t *pmp)
3030 {
3031         if (pmp) {
3032                 atomic_add_int(&pmp->inmem_dirty_chains, 1);
3033         }
3034 }
3035
3036 void
3037 hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp)
3038 {
3039         uint32_t waiting;
3040
3041         if (pmp == NULL)
3042                 return;
3043
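        /*
         * Atomically decrement the dirty-chain count and clear the
         * WAITING flag in a single CAS, then wake any waiter below.
         */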
3044         for (;;) {
3045                 waiting = pmp->inmem_dirty_chains;
3046                 cpu_ccfence();
3047                 if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
3048                                        waiting,
3049                                        (waiting - 1) &
3050                                         ~HAMMER2_DIRTYCHAIN_WAITING)) {
3051                         break;
3052                 }
3053         }
3054
3055         if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
3056                 wakeup(&pmp->inmem_dirty_chains);
3057 }
3058
3059 /*
3060  * Debugging - recursively dump a chain and its children.
3061  */
3062 void
3063 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
3064 {
3065         hammer2_chain_t *scan;
3066         hammer2_chain_t *parent;
3067
3068         --*countp;
3069         if (*countp == 0) {
3070                 kprintf("%*.*s...\n", tab, tab, "");
3071                 return;
3072         }
3073         if (*countp < 0)
3074                 return;
3075         kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
3076                 tab, tab, "", pfx,
3077                 chain, chain->bref.type,
3078                 chain->bref.key, chain->bref.keybits,
3079                 chain->bref.mirror_tid);
3080
3081         kprintf("%*.*s      [%08x] (%s) refs=%d",
3082                 tab, tab, "",
3083                 chain->flags,
3084                 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
3085                 chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
3086                 chain->refs);
3087
3088         parent = chain->parent;
3089         if (parent)
3090                 kprintf("\n%*.*s      p=%p [pflags %08x prefs %d]",
3091                         tab, tab, "",
3092                         parent, parent->flags, parent->refs);
3093         if (RB_EMPTY(&chain->core.rbtree)) {
3094                 kprintf("\n");
3095         } else {
3096                 kprintf(" {\n");
3097                 RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree)
3098                         hammer2_dump_chain(scan, tab + 4, countp, 'a');
3099                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
3100                         kprintf("%*.*s}(%s)\n", tab, tab, "",
3101                                 chain->data->ipdata.filename);
3102                 else
3103                         kprintf("%*.*s}\n", tab, tab, "");
3104         }
3105 }