/*
 * Copyright (c) 2011-2015 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/nlookup.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/fcntl.h>
#include <sys/buf.h>
#include <sys/uuid.h>
#include <sys/vfsops.h>
#include <sys/sysctl.h>
#include <sys/socket.h>
#include <sys/objcache.h>

#include <sys/proc.h>
#include <sys/namei.h>
#include <sys/mountctl.h>
#include <sys/dirent.h>
#include <sys/uio.h>

#include <sys/mutex.h>
#include <sys/mutex2.h>

#include "hammer2.h"
#include "hammer2_disk.h"
#include "hammer2_mount.h"
#include "hammer2_lz4.h"

#include "zlib/hammer2_zlib.h"

#define REPORT_REFS_ERRORS 1	/* XXX remove me */

MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");

struct hammer2_sync_info {
        hammer2_trans_t trans;
        int error;
        int waitfor;
};
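
/*
 * Note: hammer2_sync_info is presumably filled in by hammer2_vfs_sync()
 * and passed as the opaque data argument to the per-vnode scan;
 * hammer2_sync_scan2() (declared below) accumulates per-vnode sync
 * errors into 'error'.
 */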

TAILQ_HEAD(hammer2_mntlist, hammer2_dev);
TAILQ_HEAD(hammer2_pfslist, hammer2_pfs);
static struct hammer2_mntlist hammer2_mntlist;
static struct hammer2_pfslist hammer2_pfslist;
static struct lock hammer2_mntlk;

int hammer2_debug;
int hammer2_cluster_enable = 1;
int hammer2_hardlink_enable = 1;
int hammer2_flush_pipe = 100;
int hammer2_synchronous_flush = 1;
int hammer2_dio_count;
long hammer2_limit_dirty_chains;
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;
long hammer2_iod_file_write;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;
long hammer2_ioa_file_read;
long hammer2_ioa_meta_read;
long hammer2_ioa_indr_read;
long hammer2_ioa_fmap_read;
long hammer2_ioa_volu_read;
long hammer2_ioa_fmap_write;
long hammer2_ioa_file_write;
long hammer2_ioa_meta_write;
long hammer2_ioa_indr_write;
long hammer2_ioa_volu_write;

MALLOC_DECLARE(C_BUFFER);
MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");

MALLOC_DECLARE(D_BUFFER);
MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");

SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");

SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
           &hammer2_debug, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
           &hammer2_cluster_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
           &hammer2_hardlink_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW,
           &hammer2_synchronous_flush, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
           &hammer2_limit_dirty_chains, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
           &hammer2_dio_count, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
           &hammer2_iod_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
           &hammer2_iod_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
           &hammer2_iod_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
           &hammer2_iod_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
           &hammer2_iod_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
           &hammer2_iod_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
           &hammer2_iod_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
           &hammer2_iod_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
           &hammer2_iod_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
           &hammer2_ioa_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
           &hammer2_ioa_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
           &hammer2_ioa_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
           &hammer2_ioa_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
           &hammer2_ioa_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
           &hammer2_ioa_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
           &hammer2_ioa_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
           &hammer2_ioa_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
           &hammer2_ioa_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
           &hammer2_ioa_volu_write, 0, "");
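
/*
 * The knobs above surface under the vfs.hammer2 sysctl tree; illustrative
 * userland usage (names taken from the declarations above):
 *
 *	sysctl vfs.hammer2.debug=1
 *	sysctl vfs.hammer2.iod_file_write
 */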

static int hammer2_vfs_init(struct vfsconf *conf);
static int hammer2_vfs_uninit(struct vfsconf *vfsp);
static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                struct ucred *cred);
static int hammer2_remount(hammer2_dev_t *, struct mount *, char *,
                                struct vnode *, struct ucred *);
static int hammer2_recovery(hammer2_dev_t *hmp);
static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
                                ino_t ino, struct vnode **vpp);
static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                                struct fid *fhp, struct vnode **vpp);
static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);

static int hammer2_install_volume_header(hammer2_dev_t *hmp);
static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

static void hammer2_update_pmps(hammer2_dev_t *hmp);
static void hammer2_write_thread(void *arg);

static void hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp);
static void hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp,
                                hammer2_dev_t *hmp);

/*
 * Functions for compression in threads,
 * from hammer2_vnops.c
 */
static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                int *errorp);
static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase, int ioflag,
                                int pblksize, int *errorp,
                                int comp_algo, int check_algo);
static void hammer2_zero_check_and_write(struct buf *bp,
                                hammer2_trans_t *trans, hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int ioflag, int pblksize, int *errorp,
                                int check_algo);
static int test_block_zeros(const char *buf, size_t bytes);
static void zero_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                const hammer2_inode_data_t *ripdata,
                                hammer2_cluster_t *cparent,
                                hammer2_key_t lbase,
                                int *errorp);
static void hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp,
                                int ioflag, int pblksize, int *errorp,
                                int check_algo);

/*
 * HAMMER2 vfs operations.
 */
static struct vfsops hammer2_vfsops = {
        .vfs_init       = hammer2_vfs_init,
        .vfs_uninit     = hammer2_vfs_uninit,
        .vfs_sync       = hammer2_vfs_sync,
        .vfs_mount      = hammer2_vfs_mount,
        .vfs_unmount    = hammer2_vfs_unmount,
        .vfs_root       = hammer2_vfs_root,
        .vfs_statfs     = hammer2_vfs_statfs,
        .vfs_statvfs    = hammer2_vfs_statvfs,
        .vfs_vget       = hammer2_vfs_vget,
        .vfs_vptofh     = hammer2_vfs_vptofh,
        .vfs_fhtovp     = hammer2_vfs_fhtovp,
        .vfs_checkexp   = hammer2_vfs_checkexp
};

MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");

VFS_SET(hammer2_vfsops, hammer2, 0);
MODULE_VERSION(hammer2, 1);

static
int
hammer2_vfs_init(struct vfsconf *conf)
{
        static struct objcache_malloc_args margs_read;
        static struct objcache_malloc_args margs_write;

        int error;

        error = 0;

        if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
                error = EINVAL;
        if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
                error = EINVAL;
        if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
                error = EINVAL;

        if (error)
                kprintf("HAMMER2 structure size mismatch; cannot continue.\n");

        margs_read.objsize = 65536;
        margs_read.mtype = D_BUFFER;

        margs_write.objsize = 32768;
        margs_write.mtype = C_BUFFER;

        cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
                                objcache_malloc_free, &margs_read);
        cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
                                0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
                                objcache_malloc_free, &margs_write);
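
        /*
         * (The 64KB and 32KB object sizes above correspond to the
         *  decompression (D_BUFFER) and compression (C_BUFFER) work
         *  buffers declared earlier; the caches are backed directly by
         *  kmalloc via the objcache_malloc_* callbacks.)
         */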

        lockinit(&hammer2_mntlk, "mntlk", 0, 0);
        TAILQ_INIT(&hammer2_mntlist);
        TAILQ_INIT(&hammer2_pfslist);

        hammer2_limit_dirty_chains = desiredvnodes / 10;

        return (error);
}

static
int
hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
{
        objcache_destroy(cache_buffer_read);
        objcache_destroy(cache_buffer_write);
        return 0;
}

/*
 * Core PFS allocator.  Used to allocate the pmp structure for PFS cluster
 * mounts and the spmp structure for media (hmp) structures.
 *
 * pmp->modify_tid tracks new modify_tid transaction ids for front-end
 * transactions.  Note that synchronization does not use this field
 * (typically, frontend operations and synchronization cannot run on the
 * same PFS node at the same time).
 *
 * XXX check locking
 */
hammer2_pfs_t *
hammer2_pfsalloc(hammer2_cluster_t *cluster,
                 const hammer2_inode_data_t *ripdata,
                 hammer2_tid_t modify_tid)
{
        hammer2_chain_t *rchain;
        hammer2_inode_t *iroot;
        hammer2_pfs_t *pmp;
        int count;
        int i;
        int j;

        /*
         * Locate or create the PFS based on the cluster id.  If ripdata
         * is NULL this is a spmp which is unique and is always allocated.
         */
        if (ripdata) {
                TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                        if (bcmp(&pmp->pfs_clid, &ripdata->pfs_clid,
                                 sizeof(pmp->pfs_clid)) == 0) {
                                break;
                        }
                }
        } else {
                pmp = NULL;
        }
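        /*
         * (Several named PFSs may share one pfs_clid and thus one pmp;
         *  the per-label lookup happens separately in hammer2_vfs_mount().)
         */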

        if (pmp == NULL) {
                pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
                hammer2_trans_manage_init(&pmp->tmanage);
                kmalloc_create(&pmp->minode, "HAMMER2-inodes");
                kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
                lockinit(&pmp->lock, "pfslk", 0, 0);
                spin_init(&pmp->inum_spin, "hm2pfsalloc_inum");
                RB_INIT(&pmp->inum_tree);
                TAILQ_INIT(&pmp->unlinkq);
                spin_init(&pmp->list_spin, "hm2pfsalloc_list");

                /*
                 * Save the PFS cluster id, if provided, so later calls
                 * can locate this pmp by cluster id.
                 */
                if (ripdata)
                        pmp->pfs_clid = ripdata->pfs_clid;
                hammer2_mtx_init(&pmp->wthread_mtx, "h2wthr");
                bioq_init(&pmp->wthread_bioq);
                TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);

                /*
                 * The synchronization thread may start too early, make
                 * sure it stays frozen until we are ready to let it go.
                 * XXX
                 */
                /*
                pmp->primary_thr.flags = HAMMER2_SYNCTHR_FROZEN |
                                         HAMMER2_SYNCTHR_REMASTER;
                */
        }

        /*
         * Create the PFS's root inode.
         */
        if ((iroot = pmp->iroot) == NULL) {
                iroot = hammer2_inode_get(pmp, NULL, NULL);
                pmp->iroot = iroot;
                hammer2_inode_ref(iroot);
                hammer2_inode_unlock(iroot, NULL);
        }
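        /*
         * (hammer2_inode_get() appears to return the inode referenced and
         *  locked; the ref taken above is the long-lived pmp->iroot
         *  reference, after which the lock is released.)
         */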

        /*
         * Stop here if no cluster is passed in.
         */
        if (cluster == NULL)
                goto done;

        /*
         * When a cluster is passed in we must add the cluster's chains
         * to the PFS's root inode, update pmp->pfs_types[], and update
         * the synchronization threads.
         *
         * At the moment empty spots can develop due to removals or failures.
         * Ultimately we want to re-fill these spots but doing so might
         * confuse running code. XXX
         */
        hammer2_inode_ref(iroot);
        hammer2_mtx_ex(&iroot->lock);
        j = iroot->cluster.nchains;

        kprintf("add PFS to pmp %p[%d]\n", pmp, j);

        for (i = 0; i < cluster->nchains; ++i) {
                if (j == HAMMER2_MAXCLUSTER)
                        break;
                rchain = cluster->array[i].chain;
                KKASSERT(rchain->pmp == NULL);
                rchain->pmp = pmp;
                hammer2_chain_ref(rchain);
                iroot->cluster.array[j].chain = rchain;
                pmp->pfs_types[j] = ripdata->pfs_type;
                pmp->pfs_names[j] = kstrdup(ripdata->filename, M_HAMMER2);

                /*
                 * If the PFS is already mounted we must account
                 * for the mount_count here.
                 */
                if (pmp->mp)
                        ++rchain->hmp->mount_count;

                /*
                 * May have to fixup dirty chain tracking.  Previous
                 * pmp was NULL so nothing to undo.
                 */
                if (rchain->flags & HAMMER2_CHAIN_MODIFIED)
                        hammer2_pfs_memory_inc(pmp);
                ++j;
        }
        iroot->cluster.nchains = j;

        if (i != cluster->nchains) {
                kprintf("hammer2_mount: cluster full!\n");
                /* XXX fatal error? */
        }

        /*
         * Update nmasters from any PFS inode which is part of the cluster.
         * It is possible that this will result in a value which is too
         * high.  MASTER PFSs are authoritative for pfs_nmasters and will
         * override this value later on.
         *
         * (This informs us of masters that might not currently be
         *  discoverable by this mount).
         */
        if (ripdata && pmp->pfs_nmasters < ripdata->pfs_nmasters) {
                pmp->pfs_nmasters = ripdata->pfs_nmasters;
        }

        /*
         * Count visible masters.  Masters are usually added with
         * ripdata->pfs_nmasters set to 1.  This detects when there
         * are more (XXX and must update the master inodes).
         */
        count = 0;
        for (i = 0; i < iroot->cluster.nchains; ++i) {
                if (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)
                        ++count;
        }
        if (pmp->pfs_nmasters < count)
                pmp->pfs_nmasters = count;

        /*
         * Create missing synchronization threads.
         *
         * Single-node masters (including snapshots) have nothing to
         * synchronize and do not require this thread.
         *
         * Multi-node masters or any number of soft masters, slaves, copy,
         * or other PFS types need the thread.
         *
         * Each thread is responsible for its particular cluster index.
         * We use independent threads so stalls or mismatches related to
         * any given target do not affect other targets.
         */
        for (i = 0; i < iroot->cluster.nchains; ++i) {
                if (pmp->sync_thrs[i].td)
                        continue;
                if ((pmp->pfs_nmasters > 1 &&
                     (pmp->pfs_types[i] == HAMMER2_PFSTYPE_MASTER)) ||
                    pmp->pfs_types[i] != HAMMER2_PFSTYPE_MASTER) {
                        hammer2_syncthr_create(&pmp->sync_thrs[i], pmp, i,
                                               hammer2_syncthr_primary);
                }
        }
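        /*
         * (Restated: a sync thread is created for every cluster element
         *  except a MASTER element in a single-master configuration.)
         */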

        hammer2_mtx_unlock(&iroot->lock);
        hammer2_inode_drop(iroot);
done:
        return pmp;
}

/*
 * Destroy a PFS, typically only occurs after the last mount on a device
 * has gone away.
 */
static void
hammer2_pfsfree(hammer2_pfs_t *pmp)
{
        hammer2_inode_t *iroot;
        int i;

        /*
         * Clean up our reference on iroot.  iroot is not (and should not
         * be) needed by the flush code.
         */
        TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);

        iroot = pmp->iroot;
        if (iroot) {
                for (i = 0; i < iroot->cluster.nchains; ++i)
                        hammer2_syncthr_delete(&pmp->sync_thrs[i]);
#if REPORT_REFS_ERRORS
                if (pmp->iroot->refs != 1)
                        kprintf("PMP->IROOT %p REFS WRONG %d\n",
                                pmp->iroot, pmp->iroot->refs);
#else
                KKASSERT(pmp->iroot->refs == 1);
#endif
                /* ref for pmp->iroot */
                hammer2_inode_drop(pmp->iroot);
                pmp->iroot = NULL;
        }

        kmalloc_destroy(&pmp->mmsg);
        kmalloc_destroy(&pmp->minode);

        kfree(pmp, M_HAMMER2);
}

/*
 * Remove all references to hmp from the pfs list.  Any PFS which becomes
 * empty is terminated and freed.
 *
 * XXX inefficient.
 */
static void
hammer2_pfsfree_scan(hammer2_dev_t *hmp)
{
        hammer2_pfs_t *pmp;
        hammer2_inode_t *iroot;
        hammer2_cluster_t *cluster;
        hammer2_chain_t *rchain;
        int didfreeze;
        int i;

again:
        TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
                if ((iroot = pmp->iroot) == NULL)
                        continue;
                if (hmp->spmp == pmp) {
                        kprintf("unmount hmp %p remove spmp %p\n",
                                hmp, pmp);
                        hmp->spmp = NULL;
                }

                /*
                 * Determine if this PFS is affected.  If it is we must
                 * freeze all management threads and lock its iroot.
                 *
                 * Freezing a management thread forces it idle; operations
                 * in progress will be aborted and it will have to start
                 * over again when unfrozen, or exit if told to exit.
                 */
                cluster = &iroot->cluster;
                for (i = 0; i < cluster->nchains; ++i) {
                        rchain = cluster->array[i].chain;
                        if (rchain == NULL || rchain->hmp != hmp)
                                continue;
                        break;
                }
                if (i != cluster->nchains) {
                        /*
                         * Make sure all synchronization threads are locked
                         * down.
                         */
                        for (i = 0; i < iroot->cluster.nchains; ++i)
                                hammer2_syncthr_freeze(&pmp->sync_thrs[i]);

                        /*
                         * Lock the inode and clean out matching chains.
                         * Note that we cannot use hammer2_inode_lock_*()
                         * here because that would attempt to validate the
                         * cluster that we are in the middle of ripping
                         * apart.
                         *
                         * WARNING! We are working directly on the inode's
                         *          embedded cluster.
                         */
                        hammer2_mtx_ex(&iroot->lock);

                        /*
                         * Remove the chain from matching elements of the PFS.
                         */
                        for (i = 0; i < cluster->nchains; ++i) {
                                rchain = cluster->array[i].chain;
                                if (rchain == NULL || rchain->hmp != hmp)
                                        continue;
                                hammer2_syncthr_delete(&pmp->sync_thrs[i]);
                                rchain = cluster->array[i].chain;
                                cluster->array[i].chain = NULL;
                                pmp->pfs_types[i] = 0;
                                if (pmp->pfs_names[i]) {
                                        kfree(pmp->pfs_names[i], M_HAMMER2);
                                        pmp->pfs_names[i] = NULL;
                                }
                                hammer2_chain_drop(rchain);

                                /* focus hint */
                                if (cluster->focus == rchain)
                                        cluster->focus = NULL;
                        }
                        hammer2_mtx_unlock(&iroot->lock);
                        didfreeze = 1;  /* remaster, unfreeze down below */
                } else {
                        didfreeze = 0;
                }

                /*
                 * Cleanup trailing chains.  Do not reorder chains (for now).
                 * XXX might remove more than we intended.
                 */
                while (i > 0) {
                        if (cluster->array[i - 1].chain)
                                break;
                        --i;
                }
                cluster->nchains = i;

                /*
                 * If the PMP has no elements remaining we can destroy it.
                 * (this will transition management threads from frozen->exit).
                 */
                if (cluster->nchains == 0) {
                        kprintf("unmount hmp %p last ref to PMP=%p\n",
                                hmp, pmp);
                        hammer2_pfsfree(pmp);
                        goto again;
                }

                /*
                 * If elements still remain we need to set the REMASTER
                 * flag and unfreeze it.
                 */
                if (didfreeze) {
                        for (i = 0; i < iroot->cluster.nchains; ++i) {
                                hammer2_syncthr_remaster(&pmp->sync_thrs[i]);
                                hammer2_syncthr_unfreeze(&pmp->sync_thrs[i]);
                        }
                }
        }
}

/*
 * Mount or remount HAMMER2 filesystem from physical media
 *
 *      mountroot
 *              mp              mount point structure
 *              path            NULL
 *              data            <unused>
 *              cred            <unused>
 *
 *      mount
 *              mp              mount point structure
 *              path            path to mount point
 *              data            pointer to argument structure in user space
 *                      volume  volume path (device@LABEL form)
 *                      hflags  user mount flags
 *              cred            user credentials
 *
 * RETURNS:     0       Success
 *              !0      error number
 */
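/*
 * Illustrative invocation (a sketch; exact utility flags may differ):
 *
 *	newfs_hammer2 -L ROOT /dev/ad0s1d
 *	mount_hammer2 /dev/ad0s1d@ROOT /mnt
 *
 * mount_hammer2(8) is what builds the argument structure copied in below.
 */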
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  struct ucred *cred)
{
        struct hammer2_mount_info info;
        hammer2_pfs_t *pmp;
        hammer2_pfs_t *spmp;
        hammer2_dev_t *hmp;
        hammer2_key_t key_next;
        hammer2_key_t key_dummy;
        hammer2_key_t lhc;
        struct vnode *devvp;
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *cparent;
        const hammer2_inode_data_t *ripdata;
        hammer2_blockref_t bref;
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
        size_t done;
        char *dev;
        char *label;
        int ronly = 1;
        int error;
        int cache_index;
        int i;

        hmp = NULL;
        pmp = NULL;
        dev = NULL;
        label = NULL;
        devvp = NULL;
        cache_index = -1;

        kprintf("hammer2_mount\n");

        if (path == NULL) {
                /*
                 * Root mount
                 */
                bzero(&info, sizeof(info));
                info.cluster_fd = -1;
                return (EOPNOTSUPP);
        } else {
                /*
                 * Non-root mount or updating a mount
                 */
                error = copyin(data, &info, sizeof(info));
                if (error)
                        return (error);

                error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
                if (error)
                        return (error);

                /* Extract device and label */
                dev = devstr;
                label = strchr(devstr, '@');
                if (label == NULL ||
                    ((label + 1) - dev) > done) {
                        return (EINVAL);
                }
                *label = '\0';
                label++;
                if (*label == '\0')
                        return (EINVAL);
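
                /* e.g. "/dev/ad0s1d@ROOT" -> dev "/dev/ad0s1d", label "ROOT" */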

                if (mp->mnt_flag & MNT_UPDATE) {
                        /*
                         * Update mount.  Note that pmp->iroot->cluster is
                         * an inode-embedded cluster and thus cannot be
                         * directly locked.
                         *
                         * XXX HAMMER2 needs to implement NFS export via
                         *     mountctl.
                         */
                        pmp = MPTOPMP(mp);
                        cluster = &pmp->iroot->cluster;
                        for (i = 0; i < cluster->nchains; ++i) {
                                if (cluster->array[i].chain == NULL)
                                        continue;
                                hmp = cluster->array[i].chain->hmp;
                                devvp = hmp->devvp;
                                error = hammer2_remount(hmp, mp, path,
                                                        devvp, cred);
                                if (error)
                                        break;
                        }
                        /*hammer2_inode_install_hidden(pmp);*/

                        return error;
                }
        }

        /*
         * HMP device mount
         *
         * Lookup name and verify it refers to a block device.
         */
        error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
        if (error == 0)
                error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
        nlookup_done(&nd);

        if (error == 0) {
                if (vn_isdisk(devvp, &error))
                        error = vfs_mountedon(devvp);
        }

        /*
         * Determine if the device has already been mounted.  After this
         * check hmp will be non-NULL if we are doing a second or
         * subsequent hammer2 mount from the same device.
         */
        lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
        TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                if (hmp->devvp == devvp)
                        break;
        }

        /*
         * Open the device if this isn't a secondary mount and construct
         * the H2 device mount (hmp).
         */
        if (hmp == NULL) {
                hammer2_chain_t *schain;
                hammer2_xid_t xid;

                if (error == 0 && vcount(devvp) > 0)
                        error = EBUSY;

                /*
                 * Now open the device
                 */
                if (error == 0) {
                        ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
                        vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                        error = vinvalbuf(devvp, V_SAVE, 0, 0);
                        if (error == 0) {
                                error = VOP_OPEN(devvp,
                                                 ronly ? FREAD : FREAD | FWRITE,
                                                 FSCRED, NULL);
                        }
                        vn_unlock(devvp);
                }
                if (error && devvp) {
                        vrele(devvp);
                        devvp = NULL;
                }
                if (error) {
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        return error;
                }
                hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
                ksnprintf(hmp->devrepname, sizeof(hmp->devrepname), "%s", dev);
                hmp->ronly = ronly;
                hmp->devvp = devvp;
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);
                spin_init(&hmp->io_spin, "hm2mount_io");
                spin_init(&hmp->list_spin, "hm2mount_list");
                TAILQ_INIT(&hmp->flushq);

                lockinit(&hmp->vollk, "h2vol", 0, 0);

                /*
                 * vchain setup. vchain.data is embedded.
                 * vchain.refs is initialized and will never drop to 0.
                 *
                 * NOTE! voldata is not yet loaded.
                 */
                hmp->vchain.hmp = hmp;
                hmp->vchain.refs = 1;
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;

                hammer2_chain_core_init(&hmp->vchain);
                /* hmp->vchain.u.xxx is left NULL */

                /*
                 * fchain setup.  fchain.data is embedded.
                 * fchain.refs is initialized and will never drop to 0.
                 *
                 * The data is not used but needs to be initialized to
                 * pass assertion muster.  We use this chain primarily
                 * as a placeholder for the freemap's top-level RBTREE
                 * so it does not interfere with the volume's topology
                 * RBTREE.
                 */
                hmp->fchain.hmp = hmp;
                hmp->fchain.refs = 1;
                hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
                hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
                hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);

                hammer2_chain_core_init(&hmp->fchain);
                /* hmp->fchain.u.xxx is left NULL */

                /*
                 * Install the volume header and initialize fields from
                 * voldata.
                 */
                error = hammer2_install_volume_header(hmp);
                if (error) {
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return error;
                }

                /*
                 * Really important to get these right or flush will get
                 * confused.
                 */
                hmp->spmp = hammer2_pfsalloc(NULL, NULL, 0);
                kprintf("alloc spmp %p tid %016jx\n",
                        hmp->spmp, hmp->voldata.mirror_tid);
                spmp = hmp->spmp;

                /*
                 * Dummy-up vchain and fchain's modify_tid.  mirror_tid
                 * is inherited from the volume header.
                 */
                xid = 0;
                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
                hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
                hmp->vchain.pmp = spmp;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
                hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
                hmp->fchain.pmp = spmp;

                /*
                 * First locate the super-root inode, which is key 0
                 * relative to the volume header's blockset.
                 *
                 * Then locate the root inode by scanning the directory keyspace
                 * represented by the label.
                 */
                parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
                schain = hammer2_chain_lookup(&parent, &key_dummy,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
                                      &cache_index, 0);
                hammer2_chain_lookup_done(parent);
                if (schain == NULL) {
                        kprintf("hammer2_mount: invalid super-root\n");
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }
                if (schain->error) {
                        kprintf("hammer2_mount: error %s reading super-root\n",
                                hammer2_error_str(schain->error));
                        hammer2_chain_unlock(schain);
                        hammer2_chain_drop(schain);
                        schain = NULL;
                        hammer2_unmount_helper(mp, NULL, hmp);
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }

                /*
                 * The super-root always uses an inode_tid of 1 when
                 * creating PFSs.
                 */
                spmp->inode_tid = 1;
                spmp->modify_tid = schain->bref.modify_tid;

                /*
                 * Sanity-check schain's pmp and finish initialization.
                 * Any chain belonging to the super-root topology should
                 * have a NULL pmp (not even set to spmp).
                 */
                ripdata = &hammer2_chain_rdata(schain)->ipdata;
                KKASSERT(schain->pmp == NULL);
                spmp->pfs_clid = ripdata->pfs_clid;

                /*
                 * Replace the dummy spmp->iroot with a real one.  It's
                 * easier to just do a wholesale replacement than to try
                 * to update the chain and fixup the iroot fields.
                 *
                 * The returned inode is locked with the supplied cluster.
                 */
                cluster = hammer2_cluster_from_chain(schain);
                hammer2_inode_drop(spmp->iroot);
                spmp->iroot = NULL;
                spmp->iroot = hammer2_inode_get(spmp, NULL, cluster);
                spmp->spmp_hmp = hmp;
                spmp->pfs_types[0] = ripdata->pfs_type;
                hammer2_inode_ref(spmp->iroot);
                hammer2_inode_unlock(spmp->iroot, cluster);
                schain = NULL;
                /* leave spmp->iroot with one ref */

                if ((mp->mnt_flag & MNT_RDONLY) == 0) {
                        error = hammer2_recovery(hmp);
                        /* XXX do something with error */
                }
                hammer2_update_pmps(hmp);
                hammer2_iocom_init(hmp);

                /*
                 * Ref the cluster management messaging descriptor.  The mount
                 * program deals with the other end of the communications pipe.
                 */
                fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
                if (fp) {
                        hammer2_cluster_reconnect(hmp, fp);
                } else {
                        kprintf("hammer2_mount: bad cluster_fd!\n");
                }
        } else {
                spmp = hmp->spmp;
        }

        /*
         * Lookup the mount point under the media-localized super-root.
         * Scanning hammer2_pfslist doesn't help us because it represents
         * PFS cluster ids which can aggregate several named PFSs together.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS);
        lhc = hammer2_dirhash(label, strlen(label));
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                      0);
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE &&
                    strcmp(label,
                       hammer2_cluster_rdata(cluster)->ipdata.filename) == 0) {
                        break;
                }
                cluster = hammer2_cluster_next(cparent, cluster, &key_next,
                                            key_next,
                                            lhc + HAMMER2_DIRHASH_LOMASK, 0);
        }
        hammer2_inode_unlock(spmp->iroot, cparent);

        /*
         * PFS could not be found?
         */
        if (cluster == NULL) {
                kprintf("hammer2_mount: PFS label not found\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EINVAL;
        }

        /*
         * Acquire the pmp structure (it should have already been allocated
         * via hammer2_update_pmps() so do not pass cluster in to add to
         * available chains).
         *
         * Check if the cluster has already been mounted.  A cluster can
         * only be mounted once, use null mounts to mount additional copies.
         */
        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
        hammer2_cluster_bref(cluster, &bref);
        pmp = hammer2_pfsalloc(NULL, ripdata, bref.modify_tid);
        hammer2_cluster_unlock(cluster);
        hammer2_cluster_drop(cluster);

        if (pmp->mp) {
                kprintf("hammer2_mount: PFS already mounted!\n");
                hammer2_unmount_helper(mp, NULL, hmp);
                lockmgr(&hammer2_mntlk, LK_RELEASE);
                hammer2_vfs_unmount(mp, MNT_FORCE);

                return EBUSY;
        }

        /*
         * Finish the mount
         */
        kprintf("hammer2_mount hmp=%p pmp=%p\n", hmp, pmp);

        mp->mnt_flag = MNT_LOCAL;
        mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
        mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */

        /*
         * required mount structure initializations
         */
        mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
        mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;

        mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
        mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;

        /*
         * Optional fields
         */
        mp->mnt_iosize_max = MAXPHYS;

        /*
         * Connect up mount pointers.
         */
        hammer2_mount_helper(mp, pmp);

        lockmgr(&hammer2_mntlk, LK_RELEASE);

        /*
         * A mounted PFS needs a write thread for logical buffers and
         * a hidden directory for deletions of open files.  These features
         * are not used by unmounted PFSs.
         *
         * The logical file buffer bio write thread handles things like
         * physical block assignment and compression.
         */
        pmp->wthread_destroy = 0;
        lwkt_create(hammer2_write_thread, pmp,
                    &pmp->wthread_td, NULL, 0, -1, "h2pfs-%s", label);

        /*
         * With the cluster operational install ihidden.
         * (only applicable to pfs mounts, not applicable to spmp)
         */
        hammer2_inode_install_hidden(pmp);

        /*
         * Finish setup
         */
        vfs_getnewfsid(mp);
        vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
        vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
        vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);

        copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
        bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
        copyinstr(path, mp->mnt_stat.f_mntonname,
                  sizeof(mp->mnt_stat.f_mntonname) - 1,
                  &size);

        /*
         * Initial statfs to prime mnt_stat.
         */
        hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);

        return 0;
}

/*
 * Scan PFSs under the super-root and create hammer2_pfs structures.
 */
static
void
hammer2_update_pmps(hammer2_dev_t *hmp)
{
        const hammer2_inode_data_t *ripdata;
        hammer2_cluster_t *cparent;
        hammer2_cluster_t *cluster;
        hammer2_blockref_t bref;
        hammer2_pfs_t *spmp;
        hammer2_pfs_t *pmp;
        hammer2_key_t key_next;

        /*
         * Lookup mount point under the media-localized super-root.
         *
         * cluster->pmp will incorrectly point to spmp and must be fixed
         * up later on.
         */
        spmp = hmp->spmp;
        cparent = hammer2_inode_lock(spmp->iroot, HAMMER2_RESOLVE_ALWAYS);
        cluster = hammer2_cluster_lookup(cparent, &key_next,
                                         HAMMER2_KEY_MIN,
                                         HAMMER2_KEY_MAX,
                                         0);
        while (cluster) {
                if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE) {
                        ripdata = &hammer2_cluster_rdata(cluster)->ipdata;
                        hammer2_cluster_bref(cluster, &bref);
                        kprintf("ADD LOCAL PFS: %s\n", ripdata->filename);

                        pmp = hammer2_pfsalloc(cluster, ripdata,
                                               bref.modify_tid);
                }
                cluster = hammer2_cluster_next(cparent, cluster,
                                               &key_next,
                                               key_next,
                                               HAMMER2_KEY_MAX,
                                               0);
        }
        hammer2_inode_unlock(spmp->iroot, cparent);
}

/*
 * Handle bioq for strategy write
 */
static
void
hammer2_write_thread(void *arg)
{
        hammer2_pfs_t *pmp;
        struct bio *bio;
        struct buf *bp;
        hammer2_trans_t trans;
        struct vnode *vp;
        hammer2_inode_t *ip;
        hammer2_cluster_t *cparent;
        const hammer2_inode_data_t *ripdata;
        hammer2_key_t lbase;
        int lblksize;
        int pblksize;
        int error;

        pmp = arg;

        hammer2_mtx_ex(&pmp->wthread_mtx);
        for (;;) {
                /*
                 * Wait for work.  Break out and destroy the thread only if
                 * requested and no work remains.
                 */
                if (bioq_first(&pmp->wthread_bioq) == NULL) {
                        if (pmp->wthread_destroy)
                                break;
                        mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx,
                                 0, "h2bioqw", 0);
                        continue;
                }

                /*
                 * Special transaction for logical buffer cache writes.
                 */
                hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE);

                while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
                        /*
                         * dummy bio for synchronization.  The transaction
                         * must be terminated.
                         */
                        if (bio->bio_buf == NULL) {
                                bio->bio_flags |= BIO_DONE;
                                /* bio will become invalid after DONE set */
                                wakeup(bio);
                                break;
                        }

                        /*
                         * else normal bio processing
                         */
                        hammer2_mtx_unlock(&pmp->wthread_mtx);

                        hammer2_lwinprog_drop(pmp);

                        error = 0;
                        bp = bio->bio_buf;
                        vp = bp->b_vp;
                        ip = VTOI(vp);

                        /*
                         * Inode is modified, flush size and mtime changes
                         * to ensure that the file size remains consistent
                         * with the buffers being flushed.
                         *
                         * NOTE: The inode_fsync() call only flushes the
                         *       inode's meta-data state, it doesn't try
                         *       to flush underlying buffers or chains.
                         *
                         * NOTE: hammer2_write_file_core() may indirectly
                         *       modify and modsync the inode.
                         */
                        cparent = hammer2_inode_lock(ip,
                                                     HAMMER2_RESOLVE_ALWAYS);
                        if (ip->flags & (HAMMER2_INODE_RESIZED |
                                         HAMMER2_INODE_MTIME)) {
                                hammer2_inode_fsync(&trans, ip, cparent);
                        }
                        ripdata = &hammer2_cluster_rdata(cparent)->ipdata;
                        lblksize = hammer2_calc_logical(ip, bio->bio_offset,
                                                        &lbase, NULL);
                        pblksize = hammer2_calc_physical(ip, ripdata, lbase);
                        hammer2_write_file_core(bp, &trans, ip, ripdata,
                                                cparent,
                                                lbase, IO_ASYNC,
                                                pblksize, &error);
                        /* ripdata can be invalid after call */
                        hammer2_inode_unlock(ip, cparent);
                        if (error) {
                                kprintf("hammer2: error in buffer write\n");
                                bp->b_flags |= B_ERROR;
                                bp->b_error = EIO;
                        }
                        biodone(bio);
                        hammer2_mtx_ex(&pmp->wthread_mtx);
                }
                hammer2_trans_done(&trans);
        }
        pmp->wthread_destroy = -1;
        wakeup(&pmp->wthread_destroy);

        hammer2_mtx_unlock(&pmp->wthread_mtx);
}

void
hammer2_bioq_sync(hammer2_pfs_t *pmp)
{
        struct bio sync_bio;

        bzero(&sync_bio, sizeof(sync_bio));     /* dummy with no bio_buf */
        hammer2_mtx_ex(&pmp->wthread_mtx);
        if (pmp->wthread_destroy == 0 &&
            TAILQ_FIRST(&pmp->wthread_bioq.queue)) {
                bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
                while ((sync_bio.bio_flags & BIO_DONE) == 0)
                        mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0);
        }
        hammer2_mtx_unlock(&pmp->wthread_mtx);
}
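
/*
 * (The zeroed sync_bio above has bio_buf == NULL; hammer2_write_thread()
 *  recognizes it as a synchronization marker, sets BIO_DONE and wakes the
 *  sleeper instead of performing I/O.)
 */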

/*
 * Return a chain suitable for I/O, creating the chain if necessary
 * and assigning its physical block.  The cluster will be in a modified
 * state.
 *
 * cparent can wind up being anything.
 *
 * NOTE: Special case for data embedded in inode.
 */
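/*
 * (pradix below appears to be the log2 radix form of pblksize; HAMMER2
 *  generally passes block sizes around as radixes, cf. HAMMER2_PBUFRADIX
 *  in the create call further down.)
 */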
static
hammer2_cluster_t *
hammer2_assign_physical(hammer2_trans_t *trans,
                        hammer2_inode_t *ip, hammer2_cluster_t *cparent,
                        hammer2_key_t lbase, int pblksize, int *errorp)
{
        hammer2_cluster_t *cluster;
        hammer2_cluster_t *dparent;
        hammer2_key_t key_dummy;
        int pradix = hammer2_getradix(pblksize);

        /*
         * Locate the chain associated with lbase, return a locked chain.
         * However, do not instantiate any data reference (which utilizes a
         * device buffer) because we will be using direct IO via the
         * logical buffer cache buffer.
         */
        *errorp = 0;
        KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
retry:
        dparent = hammer2_cluster_lookup_init(cparent, 0);
        cluster = hammer2_cluster_lookup(dparent, &key_dummy,
                                     lbase, lbase,
                                     HAMMER2_LOOKUP_NODATA);

        if (cluster == NULL) {
                /*
                 * We found a hole, create a new chain entry.
                 *
                 * NOTE: DATA chains are created without device backing
                 *       store (nor do we want any).
                 */
                *errorp = hammer2_cluster_create(trans, dparent, &cluster,
                                               lbase, HAMMER2_PBUFRADIX,
                                               HAMMER2_BREF_TYPE_DATA,
                                               pblksize, 0);
                if (cluster == NULL) {
                        hammer2_cluster_lookup_done(dparent);
                        panic("hammer2_cluster_create: par=%p error=%d\n",
                                dparent->focus, *errorp);
                        goto retry;
                }
                /*ip->delta_dcount += pblksize;*/
        } else {
                switch (hammer2_cluster_type(cluster)) {
                case HAMMER2_BREF_TYPE_INODE:
1372                         /*
1373                          * The data is embedded in the inode, which requires
1374                          * a bit more finesse.
1375                          */
1376                         hammer2_cluster_modify_ip(trans, ip, cluster, 0);
1377                         break;
1378                 case HAMMER2_BREF_TYPE_DATA:
1379                         if (hammer2_cluster_need_resize(cluster, pblksize)) {
1380                                 hammer2_cluster_resize(trans, ip,
1381                                                      dparent, cluster,
1382                                                      pradix,
1383                                                      HAMMER2_MODIFY_OPTDATA);
1384                         }
1385
1386                         /*
1387                          * DATA buffers must be marked modified whether the
1388                          * data is in a logical buffer or not.  We also have
1389                          * to make this call to fixup the chain data pointers
1390                          * after resizing in case this is an encrypted or
1391                          * compressed buffer.
1392                          */
1393                         hammer2_cluster_modify(trans, cluster,
1394                                                HAMMER2_MODIFY_OPTDATA);
1395                         break;
1396                 default:
1397                         panic("hammer2_assign_physical: bad type");
1398                         /* NOT REACHED */
1399                         break;
1400                 }
1401         }
1402
1403         /*
1404          * Cleanup.  If cluster wound up being the inode itself, i.e.
1405          * the DIRECTDATA case for offset 0, then we need to update cparent.
1406          * The caller expects cparent to not become stale.
1407          */
1408         hammer2_cluster_lookup_done(dparent);
1409         /* dparent = NULL; safety */
1410         return (cluster);
1411 }
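
/*
 * Hedged sketch (disabled): the radix relationship assumed by the
 * hammer2_getradix() call above.  Physical block sizes are powers of
 * two and the radix is log2 of the size.  example_getradix() is a
 * hypothetical stand-in for illustration, not the kernel API.
 */
#if 0
static int
example_getradix(int pblksize)
{
        int radix = 0;

        while ((1 << radix) < pblksize)
                ++radix;
        return (radix);         /* 1024 -> 10, 16384 -> 14, 65536 -> 16 */
}
#endif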
1412
1413 /* 
1414  * bio queued from hammer2_vnops.c.
1415  *
1416  * This is the core write function; it determines which path to take
1417  * depending on the compression settings.  We also have to locate the
1418  * related clusters so we can calculate and set the check data for
1419  * the blockref.
1420  */
1421 static
1422 void
1423 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
1424                         hammer2_inode_t *ip,
1425                         const hammer2_inode_data_t *ripdata,
1426                         hammer2_cluster_t *cparent,
1427                         hammer2_key_t lbase, int ioflag, int pblksize,
1428                         int *errorp)
1429 {
1430         hammer2_cluster_t *cluster;
1431
1432         switch(HAMMER2_DEC_ALGO(ripdata->comp_algo)) {
1433         case HAMMER2_COMP_NONE:
1434                 /*
1435                  * We have to assign physical storage to the buffer
1436                  * we intend to dirty or write now to avoid deadlocks
1437                  * in the strategy code later.
1438                  *
1439                  * This can return NOOFFSET for inode-embedded data.
1440                  * The strategy code will take care of it in that case.
1441                  */
1442                 cluster = hammer2_assign_physical(trans, ip, cparent,
1443                                                 lbase, pblksize,
1444                                                 errorp);
1445                 if (cluster->ddflag) {
1446                         hammer2_inode_data_t *wipdata;
1447
1448                         wipdata = hammer2_cluster_modify_ip(trans, ip,
1449                                                             cluster, 0);
1450                         KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1451                         KKASSERT(bp->b_loffset == 0);
1452                         bcopy(bp->b_data, wipdata->u.data,
1453                               HAMMER2_EMBEDDED_BYTES);
1454                         hammer2_cluster_modsync(cluster);
1455                 } else {
1456                         hammer2_write_bp(cluster, bp, ioflag, pblksize,
1457                                          errorp, ripdata->check_algo);
1458                 }
1459                 /* ripdata can become invalid */
1460                 if (cluster) {
1461                         hammer2_cluster_unlock(cluster);
1462                         hammer2_cluster_drop(cluster);
1463                 }
1464                 break;
1465         case HAMMER2_COMP_AUTOZERO:
1466                 /*
1467                  * Check for zero-fill only
1468                  */
1469                 hammer2_zero_check_and_write(bp, trans, ip,
1470                                     ripdata, cparent, lbase,
1471                                     ioflag, pblksize, errorp,
1472                                     ripdata->check_algo);
1473                 break;
1474         case HAMMER2_COMP_LZ4:
1475         case HAMMER2_COMP_ZLIB:
1476         default:
1477                 /*
1478                  * Check for zero-fill and attempt compression.
1479                  */
1480                 hammer2_compress_and_write(bp, trans, ip,
1481                                            ripdata, cparent,
1482                                            lbase, ioflag,
1483                                            pblksize, errorp,
1484                                            ripdata->comp_algo,
1485                                            ripdata->check_algo);
1486                 break;
1487         }
1488 }
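
/*
 * Hedged sketch (disabled): the dispatch above assumes the compression
 * algorithm and level are packed into the inode's comp_algo field and
 * unpacked by HAMMER2_DEC_ALGO()/HAMMER2_DEC_LEVEL().  The nibble
 * layout shown here is an assumption for illustration only.
 */
#if 0
#define EXAMPLE_DEC_ALGO(n)     ((n) & 15)              /* low nibble */
#define EXAMPLE_DEC_LEVEL(n)    (((n) >> 4) & 15)       /* high nibble */
/* e.g. (6 << 4) | HAMMER2_COMP_ZLIB would select zlib at level 6 */
#endif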
1489
1490 /*
1491  * Generic function that performs the compression in the compressed
1492  * write path.  The compression algorithm is determined by the settings
1493  * obtained from the inode.
1494  */
1495 static
1496 void
1497 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
1498         hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1499         hammer2_cluster_t *cparent,
1500         hammer2_key_t lbase, int ioflag, int pblksize,
1501         int *errorp, int comp_algo, int check_algo)
1502 {
1503         hammer2_cluster_t *cluster;
1504         hammer2_chain_t *chain;
1505         int comp_size;
1506         int comp_block_size;
1507         int i;
1508         char *comp_buffer;
1509
1510         if (test_block_zeros(bp->b_data, pblksize)) {
1511                 zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
1512                 return;
1513         }
1514
1515         comp_size = 0;
1516         comp_buffer = NULL;
1517
1518         KKASSERT(pblksize / 2 <= 32768);
1519                 
1520         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
1521                 z_stream strm_compress;
1522                 int comp_level;
1523                 int ret;
1524
1525                 switch(HAMMER2_DEC_ALGO(comp_algo)) {
1526                 case HAMMER2_COMP_LZ4:
1527                         comp_buffer = objcache_get(cache_buffer_write,
1528                                                    M_INTWAIT);
1529                         comp_size = LZ4_compress_limitedOutput(
1530                                         bp->b_data,
1531                                         &comp_buffer[sizeof(int)],
1532                                         pblksize,
1533                                         pblksize / 2 - sizeof(int));
1534                         /*
1535                          * We need to prefix the output with its
1536                          * size; LZ4 doesn't do it for us.  Account
1537                          * for the related overhead.
1538                          */
1539                         *(int *)comp_buffer = comp_size;
1540                         if (comp_size)
1541                                 comp_size += sizeof(int);
1542                         break;
1543                 case HAMMER2_COMP_ZLIB:
1544                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
1545                         if (comp_level == 0)
1546                                 comp_level = 6; /* default zlib compression */
1547                         else if (comp_level < 6)
1548                                 comp_level = 6; /* clamp to [6,9] */
1549                         else if (comp_level > 9)
1550                                 comp_level = 9; /* clamp to [6,9] */
1551                         ret = deflateInit(&strm_compress, comp_level);
1552                         if (ret != Z_OK) {
1553                                 kprintf("HAMMER2 ZLIB: fatal error "
1554                                         "on deflateInit.\n");
1555                         }
1556
1557                         comp_buffer = objcache_get(cache_buffer_write,
1558                                                    M_INTWAIT);
1559                         strm_compress.next_in = bp->b_data;
1560                         strm_compress.avail_in = pblksize;
1561                         strm_compress.next_out = comp_buffer;
1562                         strm_compress.avail_out = pblksize / 2;
1563                         ret = deflate(&strm_compress, Z_FINISH);
1564                         if (ret == Z_STREAM_END) {
1565                                 comp_size = pblksize / 2 -
1566                                             strm_compress.avail_out;
1567                         } else {
1568                                 comp_size = 0;
1569                         }
1570                         ret = deflateEnd(&strm_compress);
1571                         break;
1572                 default:
1573                         kprintf("Error: unknown compression method %d\n",
1574                                 comp_algo);
1575                         break;
1576                 }
1577         }
1578
1579         if (comp_size == 0) {
1580                 /*
1581                  * compression failed or is turned off; retry only periodically
1582                  */
1583                 comp_block_size = pblksize;     /* safety */
1584                 if (++ip->comp_heuristic > 128)
1585                         ip->comp_heuristic = 8;
1586         } else {
1587                 /*
1588                  * compression succeeded, round up to a power-of-2 block size
1589                  */
1590                 ip->comp_heuristic = 0;
1591                 if (comp_size <= 1024) {
1592                         comp_block_size = 1024;
1593                 } else if (comp_size <= 2048) {
1594                         comp_block_size = 2048;
1595                 } else if (comp_size <= 4096) {
1596                         comp_block_size = 4096;
1597                 } else if (comp_size <= 8192) {
1598                         comp_block_size = 8192;
1599                 } else if (comp_size <= 16384) {
1600                         comp_block_size = 16384;
1601                 } else if (comp_size <= 32768) {
1602                         comp_block_size = 32768;
1603                 } else {
1604                         panic("hammer2: WRITE PATH: "
1605                               "weird comp_size value %d", comp_size);
1606                         /* NOT REACHED */
1607                         comp_block_size = pblksize;
1608                 }
1609         }
1610
1611         cluster = hammer2_assign_physical(trans, ip, cparent,
1612                                           lbase, comp_block_size,
1613                                           errorp);
1614         ripdata = NULL;
1615
1616         if (*errorp) {
1617                 kprintf("WRITE PATH: An error occurred while "
1618                         "assigning physical space.\n");
1619                 KKASSERT(cluster == NULL);
1620                 goto done;
1621         }
1622
1623         if (cluster->ddflag) {
1624                 hammer2_inode_data_t *wipdata;
1625
1626                 wipdata = &hammer2_cluster_wdata(cluster)->ipdata;
1627                 KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1628                 KKASSERT(bp->b_loffset == 0);
1629                 bcopy(bp->b_data, wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1630                 hammer2_cluster_modsync(cluster);
1631         } else
1632         for (i = 0; i < cluster->nchains; ++i) {
1633                 hammer2_io_t *dio;
1634                 char *bdata;
1635
1636                 /* XXX hack */
1637
1638                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1639                         continue;
1640                 chain = cluster->array[i].chain;        /* XXX */
1641                 if (chain == NULL)
1642                         continue;
1643                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1644
1645                 switch(chain->bref.type) {
1646                 case HAMMER2_BREF_TYPE_INODE:
1647                         panic("hammer2_compress_and_write: unexpected inode\n");
1648                         break;
1649                 case HAMMER2_BREF_TYPE_DATA:
1650                         /*
1651                          * Optimize out the read-before-write
1652                          * if possible.
1653                          */
1654                         *errorp = hammer2_io_newnz(chain->hmp,
1655                                                    chain->bref.data_off,
1656                                                    chain->bytes,
1657                                                    &dio);
1658                         if (*errorp) {
1659                                 hammer2_io_brelse(&dio);
1660                                 kprintf("hammer2: WRITE PATH: "
1661                                         "dbp bread error\n");
1662                                 break;
1663                         }
1664                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1665
1666                         /*
1667                          * When loading the block make sure we don't
1668                          * leave garbage after the compressed data.
1669                          */
1670                         if (comp_size) {
1671                                 chain->bref.methods =
1672                                         HAMMER2_ENC_COMP(comp_algo) +
1673                                         HAMMER2_ENC_CHECK(check_algo);
1674                                 bcopy(comp_buffer, bdata, comp_size);
1675                                 if (comp_size != comp_block_size) {
1676                                         bzero(bdata + comp_size,
1677                                               comp_block_size - comp_size);
1678                                 }
1679                         } else {
1680                                 chain->bref.methods =
1681                                         HAMMER2_ENC_COMP(
1682                                                 HAMMER2_COMP_NONE) +
1683                                         HAMMER2_ENC_CHECK(check_algo);
1684                                 bcopy(bp->b_data, bdata, pblksize);
1685                         }
1686
1687                         /*
1688                          * The flush code doesn't calculate check codes for
1689                          * file data (doing so can result in excessive I/O),
1690                          * so we do it here.
1691                          */
1692                         hammer2_chain_setcheck(chain, bdata);
1693
1694                         /*
1695                          * Device buffer is now valid, chain is no longer in
1696                          * the initial state.
1697                          *
1698                          * (No blockref table worries with file data)
1699                          */
1700                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1701
1702                         /* Now write the related bdp. */
1703                         if (ioflag & IO_SYNC) {
1704                                 /*
1705                                  * Synchronous I/O requested.
1706                                  */
1707                                 hammer2_io_bwrite(&dio);
1708                         /*
1709                         } else if ((ioflag & IO_DIRECT) &&
1710                                    loff + n == pblksize) {
1711                                 hammer2_io_bdwrite(&dio);
1712                         */
1713                         } else if (ioflag & IO_ASYNC) {
1714                                 hammer2_io_bawrite(&dio);
1715                         } else {
1716                                 hammer2_io_bdwrite(&dio);
1717                         }
1718                         break;
1719                 default:
1720                         panic("hammer2_compress_and_write: bad chain type %d\n",
1721                                 chain->bref.type);
1722                         /* NOT REACHED */
1723                         break;
1724                 }
1725         }
1726 done:
1727         if (cluster) {
1728                 hammer2_cluster_unlock(cluster);
1729                 hammer2_cluster_drop(cluster);
1730         }
1731         if (comp_buffer)
1732                 objcache_put(cache_buffer_write, comp_buffer);
1733 }
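
/*
 * Hedged sketch (disabled): the read-side counterpart of the LZ4
 * framing written above.  The first sizeof(int) bytes of the stored
 * block hold the compressed length and the LZ4 payload follows.  The
 * function and destination names are hypothetical.
 */
#if 0
static int
example_lz4_decode(const char *bdata, char *out_buffer, int out_size)
{
        int clen = *(const int *)bdata;

        return (LZ4_decompress_safe(&bdata[sizeof(int)], out_buffer,
                                    clen, out_size));
}
#endif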
1734
1735 /*
1736  * Function that performs zero-checking and writing without compression;
1737  * it corresponds to the default zero-checking path.
1738  */
1739 static
1740 void
1741 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1742         hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1743         hammer2_cluster_t *cparent,
1744         hammer2_key_t lbase, int ioflag, int pblksize, int *errorp,
1745         int check_algo)
1746 {
1747         hammer2_cluster_t *cluster;
1748
1749         if (test_block_zeros(bp->b_data, pblksize)) {
1750                 zero_write(bp, trans, ip, ripdata, cparent, lbase, errorp);
1751                 /* ripdata can become invalid */
1752         } else {
1753                 cluster = hammer2_assign_physical(trans, ip, cparent,
1754                                                   lbase, pblksize, errorp);
1755                 hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp,
1756                                  check_algo);
1757                 /* ripdata can become invalid */
1758                 if (cluster) {
1759                         hammer2_cluster_unlock(cluster);
1760                         hammer2_cluster_drop(cluster);
1761                 }
1762         }
1763 }
1764
1765 /*
1766  * A function to test whether a block of data contains only zeros;
1767  * returns TRUE (non-zero) if so.  The block is scanned a long at a time.
1768  */
1769 static
1770 int
1771 test_block_zeros(const char *buf, size_t bytes)
1772 {
1773         size_t i;
1774
1775         for (i = 0; i < bytes; i += sizeof(long)) {
1776                 if (*(const long *)(buf + i) != 0)
1777                         return (0);
1778         }
1779         return (1);
1780 }
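
/*
 * Usage sketch (disabled): test_block_zeros() scans a long at a time,
 * so callers pass buffers whose size is a multiple of sizeof(long),
 * which holds for all HAMMER2 physical block sizes.
 */
#if 0
        char blk[4096] = { 0 };

        KKASSERT(test_block_zeros(blk, sizeof(blk)) != 0);
        blk[100] = 1;
        KKASSERT(test_block_zeros(blk, sizeof(blk)) == 0);
#endif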
1781
1782 /*
1783  * Function to "write" a block that contains only zeros.
1784  */
1785 static
1786 void
1787 zero_write(struct buf *bp, hammer2_trans_t *trans,
1788            hammer2_inode_t *ip, const hammer2_inode_data_t *ripdata,
1789            hammer2_cluster_t *cparent,
1790            hammer2_key_t lbase, int *errorp __unused)
1791 {
1792         hammer2_cluster_t *cluster;
1793         hammer2_key_t key_dummy;
1794
1795         cparent = hammer2_cluster_lookup_init(cparent, 0);
1796         cluster = hammer2_cluster_lookup(cparent, &key_dummy, lbase, lbase,
1797                                      HAMMER2_LOOKUP_NODATA);
1798         if (cluster) {
1799                 if (cluster->ddflag) {
1800                         hammer2_inode_data_t *wipdata;
1801
1802                         wipdata = hammer2_cluster_modify_ip(trans, ip,
1803                                                             cluster, 0);
1804                         KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1805                         KKASSERT(bp->b_loffset == 0);
1806                         bzero(wipdata->u.data, HAMMER2_EMBEDDED_BYTES);
1807                         hammer2_cluster_modsync(cluster);
1808                 } else {
1809                         hammer2_cluster_delete(trans, cparent, cluster,
1810                                                HAMMER2_DELETE_PERMANENT);
1811                 }
1812                 hammer2_cluster_unlock(cluster);
1813                 hammer2_cluster_drop(cluster);
1814         }
1815         hammer2_cluster_lookup_done(cparent);
1816 }
1817
1818 /*
1819  * Function to write the data as-is, without performing any sort of
1820  * compression.  This function is used by the no-compression path and
1821  * by the default zero-checking path.
1822  */
1823 static
1824 void
1825 hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
1826                                 int pblksize, int *errorp, int check_algo)
1827 {
1828         hammer2_chain_t *chain;
1829         hammer2_inode_data_t *wipdata;
1830         hammer2_io_t *dio;
1831         char *bdata;
1832         int error;
1833         int i;
1834
1835         error = 0;      /* XXX TODO below */
1836
1837         for (i = 0; i < cluster->nchains; ++i) {
1838                 if ((cluster->array[i].flags & HAMMER2_CITEM_FEMOD) == 0)
1839                         continue;
1840                 chain = cluster->array[i].chain;        /* XXX */
1841                 if (chain == NULL)
1842                         continue;
1843                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1844
1845                 switch(chain->bref.type) {
1846                 case HAMMER2_BREF_TYPE_INODE:
1847                         wipdata = &hammer2_chain_wdata(chain)->ipdata;
1848                         KKASSERT(wipdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA);
1849                         KKASSERT(bp->b_loffset == 0);
1850                         bcopy(bp->b_data, wipdata->u.data,
1851                               HAMMER2_EMBEDDED_BYTES);
1852                         error = 0;
1853                         break;
1854                 case HAMMER2_BREF_TYPE_DATA:
1855                         error = hammer2_io_newnz(chain->hmp,
1856                                                  chain->bref.data_off,
1857                                                  chain->bytes, &dio);
1858                         if (error) {
1859                                 hammer2_io_bqrelse(&dio);
1860                                 kprintf("hammer2: WRITE PATH: "
1861                                         "dbp bread error\n");
1862                                 break;
1863                         }
1864                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1865
1866                         chain->bref.methods = HAMMER2_ENC_COMP(
1867                                                         HAMMER2_COMP_NONE) +
1868                                               HAMMER2_ENC_CHECK(check_algo);
1869                         bcopy(bp->b_data, bdata, chain->bytes);
1870
1871                         /*
1872                          * The flush code doesn't calculate check codes for
1873                          * file data (doing so can result in excessive I/O),
1874                          * so we do it here.
1875                          */
1876                         hammer2_chain_setcheck(chain, bdata);
1877
1878                         /*
1879                          * Device buffer is now valid, chain is no longer in
1880                          * the initial state.
1881                          *
1882                          * (No blockref table worries with file data)
1883                          */
1884                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1885
1886                         if (ioflag & IO_SYNC) {
1887                                 /*
1888                                  * Synchronous I/O requested.
1889                                  */
1890                                 hammer2_io_bwrite(&dio);
1891                         /*
1892                         } else if ((ioflag & IO_DIRECT) &&
1893                                    loff + n == pblksize) {
1894                                 hammer2_io_bdwrite(&dio);
1895                         */
1896                         } else if (ioflag & IO_ASYNC) {
1897                                 hammer2_io_bawrite(&dio);
1898                         } else {
1899                                 hammer2_io_bdwrite(&dio);
1900                         }
1901                         break;
1902                 default:
1903                         panic("hammer2_write_bp: bad chain type %d\n",
1904                               chain->bref.type);
1905                         /* NOT REACHED */
1906                         error = 0;
1907                         break;
1908                 }
1909                 KKASSERT(error == 0);   /* XXX TODO */
1910         }
1911         *errorp = error;
1912 }
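
/*
 * Hedged sketch (disabled): the bref.methods assignments above fold
 * the compression and check algorithms into a single byte via
 * HAMMER2_ENC_COMP()/HAMMER2_ENC_CHECK().  The decode below assumes
 * compression in the low nibble and check in the high nibble; treat
 * the exact layout as an assumption for illustration.
 */
#if 0
        int comp_method  = bref->methods & 15;
        int check_method = (bref->methods >> 4) & 15;
#endif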
1913
1914 static
1915 int
1916 hammer2_remount(hammer2_dev_t *hmp, struct mount *mp, char *path,
1917                 struct vnode *devvp, struct ucred *cred)
1918 {
1919         int error;
1920
1921         if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
1922                 error = hammer2_recovery(hmp);
1923         } else {
1924                 error = 0;
1925         }
1926         return error;
1927 }
1928
1929 static
1930 int
1931 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1932 {
1933         hammer2_pfs_t *pmp;
1934         int flags;
1935         int error = 0;
1936
1937         pmp = MPTOPMP(mp);
1938
1939         if (pmp == NULL)
1940                 return(0);
1941
1942         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1943
1944         /*
1945          * If mount initialization proceeded far enough we must flush
1946          * its vnodes and sync the underlying mount points.  Three syncs
1947          * are required to fully flush the filesystem (freemap updates lag
1948          * by one flush, and one extra for safety).
1949          */
1950         if (mntflags & MNT_FORCE)
1951                 flags = FORCECLOSE;
1952         else
1953                 flags = 0;
1954         if (pmp->iroot) {
1955                 error = vflush(mp, 0, flags);
1956                 if (error)
1957                         goto failed;
1958                 hammer2_vfs_sync(mp, MNT_WAIT);
1959                 hammer2_vfs_sync(mp, MNT_WAIT);
1960                 hammer2_vfs_sync(mp, MNT_WAIT);
1961         }
1962
1963         if (pmp->wthread_td) {
1964                 hammer2_mtx_ex(&pmp->wthread_mtx);
1965                 pmp->wthread_destroy = 1;
1966                 wakeup(&pmp->wthread_bioq);
1967                 while (pmp->wthread_destroy != -1) {
1968                         mtxsleep(&pmp->wthread_destroy,
1969                                 &pmp->wthread_mtx, 0,
1970                                 "umount-sleep", 0);
1971                 }
1972                 hammer2_mtx_unlock(&pmp->wthread_mtx);
1973                 pmp->wthread_td = NULL;
1974         }
1975
1976         /*
1977          * Cleanup our reference on ihidden.
1978          */
1979         if (pmp->ihidden) {
1980                 hammer2_inode_drop(pmp->ihidden);
1981                 pmp->ihidden = NULL;
1982         }
1983         if (pmp->mp)
1984                 hammer2_unmount_helper(mp, pmp, NULL);
1985
1986         error = 0;
1987 failed:
1988         lockmgr(&hammer2_mntlk, LK_RELEASE);
1989
1990         return (error);
1991 }
1992
1993 /*
1994  * Mount helper, hook the system mount into our PFS.
1995  * The mount lock is held.
1996  *
1997  * We must bump the mount_count on related devices for any
1998  * mounted PFSs.
1999  */
2000 static
2001 void
2002 hammer2_mount_helper(struct mount *mp, hammer2_pfs_t *pmp)
2003 {
2004         hammer2_cluster_t *cluster;
2005         hammer2_chain_t *rchain;
2006         int i;
2007
2008         mp->mnt_data = (qaddr_t)pmp;
2009         pmp->mp = mp;
2010
2011         /*
2012          * After pmp->mp is set we have to adjust hmp->mount_count.
2013          */
2014         cluster = &pmp->iroot->cluster;
2015         for (i = 0; i < cluster->nchains; ++i) {
2016                 rchain = cluster->array[i].chain;
2017                 if (rchain == NULL)
2018                         continue;
2019                 ++rchain->hmp->mount_count;
2020                 kprintf("hammer2_mount hmp=%p ++mount_count=%d\n",
2021                         rchain->hmp, rchain->hmp->mount_count);
2022         }
2023 }
2024
2025 /*
2026  * Mount helper, unhook the system mount from our PFS.
2027  * The mount lock is held.
2028  *
2029  * If hmp is supplied, the mount responsible for being the first to open
2030  * the block device failed, and the block device and all PFSs using the
2031  * block device must be cleaned up.
2032  *
2033  * If pmp is supplied, multiple devices might be backing the PFS and each
2034  * must be disconnected.  This might not be the last PFS using some of the
2035  * underlying devices.  Also, we have to adjust our hmp->mount_count
2036  * accounting for the devices backing the pmp which is now undergoing an
2037  * unmount.
2038  */
2039 static
2040 void
2041 hammer2_unmount_helper(struct mount *mp, hammer2_pfs_t *pmp, hammer2_dev_t *hmp)
2042 {
2043         hammer2_cluster_t *cluster;
2044         hammer2_chain_t *rchain;
2045         struct vnode *devvp;
2046         int dumpcnt;
2047         int ronly = 0;
2048         int i;
2049
2050         /*
2051          * If no device is supplied this is a high-level unmount and we
2052          * have to disconnect the mount, adjust mount_count, and locate devices
2053          * that might now have no mounts.
2054          */
2055         if (pmp) {
2056                 KKASSERT(hmp == NULL);
2057                 KKASSERT((void *)(intptr_t)mp->mnt_data == pmp);
2058                 pmp->mp = NULL;
2059                 mp->mnt_data = NULL;
2060
2061                 /*
2062                  * After pmp->mp is cleared we have to account for
2063                  * mount_count.
2064                  */
2065                 cluster = &pmp->iroot->cluster;
2066                 for (i = 0; i < cluster->nchains; ++i) {
2067                         rchain = cluster->array[i].chain;
2068                         if (rchain == NULL)
2069                                 continue;
2070                         --rchain->hmp->mount_count;
2071                         kprintf("hammer2_unmount hmp=%p --mount_count=%d\n",
2072                                 rchain->hmp, rchain->hmp->mount_count);
2073                         /* scrapping hmp now may invalidate the pmp */
2074                 }
2075 again:
2076                 TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
2077                         if (hmp->mount_count == 0) {
2078                                 hammer2_unmount_helper(NULL, NULL, hmp);
2079                                 goto again;
2080                         }
2081                 }
2082                 return;
2083         }
2084
2085         /*
2086          * Try to terminate the block device.  We can't terminate it if
2087          * there are still PFSs referencing it.
2088          */
2089         kprintf("hammer2_unmount hmp=%p mount_count=%d\n",
2090                 hmp, hmp->mount_count);
2091         if (hmp->mount_count)
2092                 return;
2093
2094         hammer2_pfsfree_scan(hmp);
2095         hammer2_dev_exlock(hmp);        /* XXX order */
2096
2097         /*
2098          * Cycle the volume data lock as a safety (probably not needed any
2099          * more).  To ensure everything is out we need to flush at least
2100          * three times.  (1) The running of the unlinkq can dirty the
2101          * filesystem, (2) a normal flush can dirty the freemap, and
2102          * (3) a final flush ensures the freemap is fully synchronized.
2103          *
2104          * The next mount's recovery scan can clean everything up but we want
2105          * to leave the filesystem in a 100% clean state on a normal unmount.
2106          */
2107 #if 0
2108         hammer2_voldata_lock(hmp);
2109         hammer2_voldata_unlock(hmp);
2110 #endif
2111         hammer2_iocom_uninit(hmp);
2112
2113         if ((hmp->vchain.flags | hmp->fchain.flags) &
2114             HAMMER2_CHAIN_FLUSH_MASK) {
2115                 kprintf("hammer2_unmount: chains left over "
2116                         "after final sync\n");
2117                 kprintf("    vchain %08x\n", hmp->vchain.flags);
2118                 kprintf("    fchain %08x\n", hmp->fchain.flags);
2119
2120                 if (hammer2_debug & 0x0010)
2121                         Debugger("entered debugger");
2122         }
2123
2124         KKASSERT(hmp->spmp == NULL);
2125
2126         /*
2127          * Finish up with the device vnode
2128          */
2129         if ((devvp = hmp->devvp) != NULL) {
2130                 vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
2131                 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
2132                 hmp->devvp = NULL;
2133                 VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
2134                 vn_unlock(devvp);
2135                 vrele(devvp);
2136                 devvp = NULL;
2137         }
2138
2139         /*
2140          * Clear vchain/fchain flags that might prevent final cleanup
2141          * of these chains.
2142          */
2143         if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
2144                 atomic_clear_int(&hmp->vchain.flags,
2145                                  HAMMER2_CHAIN_MODIFIED);
2146                 hammer2_pfs_memory_wakeup(hmp->vchain.pmp);
2147                 hammer2_chain_drop(&hmp->vchain);
2148         }
2149         if (hmp->vchain.flags & HAMMER2_CHAIN_UPDATE) {
2150                 atomic_clear_int(&hmp->vchain.flags,
2151                                  HAMMER2_CHAIN_UPDATE);
2152                 hammer2_chain_drop(&hmp->vchain);
2153         }
2154
2155         if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
2156                 atomic_clear_int(&hmp->fchain.flags,
2157                                  HAMMER2_CHAIN_MODIFIED);
2158                 hammer2_pfs_memory_wakeup(hmp->fchain.pmp);
2159                 hammer2_chain_drop(&hmp->fchain);
2160         }
2161         if (hmp->fchain.flags & HAMMER2_CHAIN_UPDATE) {
2162                 atomic_clear_int(&hmp->fchain.flags,
2163                                  HAMMER2_CHAIN_UPDATE);
2164                 hammer2_chain_drop(&hmp->fchain);
2165         }
2166
2167         /*
2168          * Final drop of embedded freemap root chain to
2169          * clean up fchain.core (fchain structure is not
2170          * flagged ALLOCATED so it is cleaned out and then
2171          * left to rot).
2172          */
2173         hammer2_chain_drop(&hmp->fchain);
2174
2175         /*
2176          * Final drop of embedded volume root chain to clean
2177          * up vchain.core (vchain structure is not flagged
2178          * ALLOCATED so it is cleaned out and then left to
2179          * rot).
2180          */
2181         dumpcnt = 50;
2182         hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
2183         dumpcnt = 50;
2184         hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f');
2185         hammer2_dev_unlock(hmp);
2186         hammer2_chain_drop(&hmp->vchain);
2187
2188         hammer2_io_cleanup(hmp, &hmp->iotree);
2189         if (hmp->iofree_count) {
2190                 kprintf("io_cleanup: %d I/O's left hanging\n",
2191                         hmp->iofree_count);
2192         }
2193
2194         TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
2195         kmalloc_destroy(&hmp->mchain);
2196         kfree(hmp, M_HAMMER2);
2197 }
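
/*
 * Hedged sketch (disabled): the two calling modes of
 * hammer2_unmount_helper() above.  A PFS unmount passes pmp, which
 * drops mount_count on each backing device; the helper then recurses
 * with hmp for any device whose count has reached zero.
 */
#if 0
        hammer2_unmount_helper(mp, pmp, NULL);   /* high-level PFS unhook */
        hammer2_unmount_helper(NULL, NULL, hmp); /* device teardown at 0 */
#endif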
2198
2199 static
2200 int
2201 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
2202              ino_t ino, struct vnode **vpp)
2203 {
2204         kprintf("hammer2_vget\n");
2205         return (EOPNOTSUPP);
2206 }
2207
2208 static
2209 int
2210 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
2211 {
2212         hammer2_pfs_t *pmp;
2213         hammer2_cluster_t *cparent;
2214         int error;
2215         struct vnode *vp;
2216
2217         pmp = MPTOPMP(mp);
2218         if (pmp->iroot == NULL) {
2219                 *vpp = NULL;
2220                 error = EINVAL;
2221         } else {
2222                 cparent = hammer2_inode_lock(pmp->iroot,
2223                                                 HAMMER2_RESOLVE_ALWAYS |
2224                                                 HAMMER2_RESOLVE_SHARED);
2225                 vp = hammer2_igetv(pmp->iroot, cparent, &error);
2226                 hammer2_inode_unlock(pmp->iroot, cparent);
2227                 *vpp = vp;
2228                 if (vp == NULL)
2229                         kprintf("vnodefail\n");
2230         }
2231
2232         return (error);
2233 }
2234
2235 /*
2236  * Filesystem status
2237  *
2238  * XXX incorporate ipdata->inode_quota and data_quota
2239  */
2240 static
2241 int
2242 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
2243 {
2244         hammer2_pfs_t *pmp;
2245         hammer2_dev_t *hmp;
2246         hammer2_blockref_t bref;
2247
2248         pmp = MPTOPMP(mp);
2249         KKASSERT(pmp->iroot->cluster.nchains >= 1);
2250         hmp = pmp->iroot->cluster.focus->hmp;   /* iroot retains focus */
2251         bref = pmp->iroot->cluster.focus->bref; /* no lock */
2252
2253         mp->mnt_stat.f_files = bref.inode_count;
2254         mp->mnt_stat.f_ffree = 0;
2255         mp->mnt_stat.f_blocks = (bref.data_count +
2256                                  hmp->voldata.allocator_free) /
2257                                 mp->mnt_vstat.f_bsize;
2258         mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free /
2259                                 mp->mnt_vstat.f_bsize;
2260         mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
2261
2262         *sbp = mp->mnt_stat;
2263         return (0);
2264 }
2265
2266 static
2267 int
2268 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
2269 {
2270         hammer2_pfs_t *pmp;
2271         hammer2_dev_t *hmp;
2272         hammer2_blockref_t bref;
2273
2274         pmp = MPTOPMP(mp);
2275         KKASSERT(pmp->iroot->cluster.nchains >= 1);
2276         hmp = pmp->iroot->cluster.focus->hmp;   /* iroot retains focus */
2277         bref = pmp->iroot->cluster.focus->bref; /* no lock */
2278
2279         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
2280         mp->mnt_vstat.f_files = bref.inode_count;
2281         mp->mnt_vstat.f_ffree = 0;
2282         mp->mnt_vstat.f_blocks = (bref.data_count +
2283                                  hmp->voldata.allocator_free) /
2284                                 mp->mnt_vstat.f_bsize;
2285         mp->mnt_vstat.f_bfree = hmp->voldata.allocator_free /
2286                                 mp->mnt_vstat.f_bsize;
2287         mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
2288
2289         *sbp = mp->mnt_vstat;
2290         return (0);
2291 }
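
/*
 * Hedged worked example (disabled) of the accounting above, assuming
 * HAMMER2_PBUFSIZE is 65536 (64KB): with allocator_free = 1 GiB,
 * f_bfree = 1073741824 / 65536 = 16384 blocks, and f_blocks is the
 * used+free byte total divided by the same 64KB block size.
 */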
2292
2293 /*
2294  * Mount-time recovery (RW mounts)
2295  *
2296  * Updates to the free block table are allowed to lag flushes by one
2297  * transaction.  After a crash, on the next fresh mount we must do an
2298  * incremental scan of the last committed transaction id and make sure that
2299  * all related blocks have been marked allocated.
2300  *
2301  * The super-root topology and each PFS has its own transaction id domain,
2302  * so we must track PFS boundary transitions.
2303  */
2304 struct hammer2_recovery_elm {
2305         TAILQ_ENTRY(hammer2_recovery_elm) entry;
2306         hammer2_chain_t *chain;
2307         hammer2_tid_t sync_tid;
2308 };
2309
2310 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
2311
2312 struct hammer2_recovery_info {
2313         struct hammer2_recovery_list list;
2314         int     depth;
2315 };
2316
2317 static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2318                         hammer2_chain_t *parent,
2319                         struct hammer2_recovery_info *info,
2320                         hammer2_tid_t sync_tid);
2321
2322 #define HAMMER2_RECOVERY_MAXDEPTH       10
2323
2324 static
2325 int
2326 hammer2_recovery(hammer2_dev_t *hmp)
2327 {
2328         hammer2_trans_t trans;
2329         struct hammer2_recovery_info info;
2330         struct hammer2_recovery_elm *elm;
2331         hammer2_chain_t *parent;
2332         hammer2_tid_t sync_tid;
2333         hammer2_tid_t mirror_tid;
2334         int error;
2335         int cumulative_error = 0;
2336
2337         hammer2_trans_init(&trans, hmp->spmp, 0);
2338
2339         sync_tid = hmp->voldata.freemap_tid;
2340         mirror_tid = hmp->voldata.mirror_tid;
2341
2342         kprintf("hammer2 mount \"%s\": ", hmp->devrepname);
2343         if (sync_tid >= mirror_tid) {
2344                 kprintf(" no recovery needed\n");
2345         } else {
2346                 kprintf(" freemap recovery %016jx-%016jx\n",
2347                         sync_tid + 1, mirror_tid);
2348         }
2349
2350         TAILQ_INIT(&info.list);
2351         info.depth = 0;
2352         parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
2353         cumulative_error = hammer2_recovery_scan(&trans, hmp, parent,
2354                                                  &info, sync_tid);
2355         hammer2_chain_lookup_done(parent);
2356
2357         while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
2358                 TAILQ_REMOVE(&info.list, elm, entry);
2359                 parent = elm->chain;
2360                 sync_tid = elm->sync_tid;
2361                 kfree(elm, M_HAMMER2);
2362
2363                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2364                 error = hammer2_recovery_scan(&trans, hmp, parent,
2365                                               &info,
2366                                               hmp->voldata.freemap_tid);
2367                 hammer2_chain_unlock(parent);
2368                 hammer2_chain_drop(parent);     /* drop elm->chain ref */
2369                 if (error)
2370                         cumulative_error = error;
2371         }
2372         hammer2_trans_done(&trans);
2373
2374         return cumulative_error;
2375 }
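
/*
 * Hedged example (disabled): if voldata.freemap_tid is 0x42 and
 * voldata.mirror_tid is 0x45, the scan above covers transactions
 * 0x43 through 0x45; freemap_tid >= mirror_tid means the freemap is
 * already caught up and no recovery is needed.
 */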
2376
2377 static
2378 int
2379 hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_dev_t *hmp,
2380                       hammer2_chain_t *parent,
2381                       struct hammer2_recovery_info *info,
2382                       hammer2_tid_t sync_tid)
2383 {
2384         const hammer2_inode_data_t *ripdata;
2385         hammer2_chain_t *chain;
2386         int cache_index;
2387         int cumulative_error = 0;
2388         int error;
2389
2390         /*
2391          * Adjust freemap to ensure that the block(s) are marked allocated.
2392          */
2393         if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
2394                 hammer2_freemap_adjust(trans, hmp, &parent->bref,
2395                                        HAMMER2_FREEMAP_DORECOVER);
2396         }
2397
2398         /*
2399          * Check type for recursive scan
2400          */
2401         switch(parent->bref.type) {
2402         case HAMMER2_BREF_TYPE_VOLUME:
2403                 /* data already instantiated */
2404                 break;
2405         case HAMMER2_BREF_TYPE_INODE:
2406                 /*
2407                  * Must instantiate data for DIRECTDATA test and also
2408                  * for recursion.
2409                  */
2410                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2411                 ripdata = &hammer2_chain_rdata(parent)->ipdata;
2412                 if (ripdata->op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2413                         /* not applicable to recovery scan */
2414                         hammer2_chain_unlock(parent);
2415                         return 0;
2416                 }
2417                 hammer2_chain_unlock(parent);
2418                 break;
2419         case HAMMER2_BREF_TYPE_INDIRECT:
2420                 /*
2421                  * Must instantiate data for recursion
2422                  */
2423                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2424                 hammer2_chain_unlock(parent);
2425                 break;
2426         case HAMMER2_BREF_TYPE_DATA:
2427         case HAMMER2_BREF_TYPE_FREEMAP:
2428         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2429         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2430                 /* not applicable to recovery scan */
2431                 return 0;
2433         default:
2434                 return EDOM;
2435         }
2436
2437         /*
2438          * Defer operation if depth limit reached or if we are crossing a
2439          * PFS boundary.
2440          */
2441         if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH) {
2442                 struct hammer2_recovery_elm *elm;
2443
2444                 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
2445                 elm->chain = parent;
2446                 elm->sync_tid = sync_tid;
2447                 hammer2_chain_ref(parent);
2448                 TAILQ_INSERT_TAIL(&info->list, elm, entry);
2449                 /* unlocked by caller */
2450
2451                 return(0);
2452         }
2453
2455         /*
2456          * Recursive scan of the last flushed transaction only.  We are
2457          * doing this without pmp assignments so don't leave the chains
2458          * hanging around after we are done with them.
2459          */
2460         cache_index = 0;
2461         chain = hammer2_chain_scan(parent, NULL, &cache_index,
2462                                    HAMMER2_LOOKUP_NODATA);
2463         while (chain) {
2464                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
2465                 if (chain->bref.mirror_tid > sync_tid) {
2466                         ++info->depth;
2467                         error = hammer2_recovery_scan(trans, hmp, chain,
2468                                                       info, sync_tid);
2469                         --info->depth;
2470                         if (error)
2471                                 cumulative_error = error;
2472                 }
2473
2474                 /*
2475                  * Flush the recovery at the PFS boundary to stage it for
2476                  * the final flush of the super-root topology.
2477                  */
2478                 if ((chain->bref.flags & HAMMER2_BREF_FLAG_PFSROOT) &&
2479                     (chain->flags & HAMMER2_CHAIN_ONFLUSH)) {
2480                         hammer2_flush(trans, chain, 1);
2481                 }
2482                 chain = hammer2_chain_scan(parent, chain, &cache_index,
2483                                            HAMMER2_LOOKUP_NODATA);
2484         }
2485
2486         return cumulative_error;
2487 }
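
/*
 * Hedged, generic user-space sketch (disabled) of the bounded-depth
 * recursion used by hammer2_recovery_scan(): past the depth limit,
 * children are queued instead of recursed into and the caller drains
 * the queue iteratively, bounding stack use regardless of topology
 * depth.  All names here are hypothetical.
 */
#if 0
#include <stdlib.h>

struct node { struct node *child[2]; };
struct work { struct node *n; struct work *next; };

static void
scan(struct node *n, int depth, struct work **defer)
{
        int i;

        if (n == NULL)
                return;
        if (depth >= 10) {                      /* cf. RECOVERY_MAXDEPTH */
                struct work *w = malloc(sizeof(*w));

                w->n = n;
                w->next = *defer;               /* defer, don't recurse */
                *defer = w;
                return;
        }
        for (i = 0; i < 2; ++i)
                scan(n->child[i], depth + 1, defer);
}
#endif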
2488
2489 /*
2490  * Sync a mount point; this is called on a per-mount basis from the
2491  * filesystem syncer process periodically and whenever a user issues
2492  * a sync.
2493  */
2494 int
2495 hammer2_vfs_sync(struct mount *mp, int waitfor)
2496 {
2497         struct hammer2_sync_info info;
2498         hammer2_inode_t *iroot;
2499         hammer2_chain_t *chain;
2500         hammer2_chain_t *parent;
2501         hammer2_pfs_t *pmp;
2502         hammer2_dev_t *hmp;
2503         int flags;
2504         int error;
2505         int total_error;
2506         int i;
2507         int j;
2508
2509         pmp = MPTOPMP(mp);
2510         iroot = pmp->iroot;
2511         KKASSERT(iroot);
2512         KKASSERT(iroot->pmp == pmp);
2513
2514         /*
2515          * We can't acquire locks on existing vnodes while in a transaction
2516          * without risking a deadlock.  This assumes that vfsync() can be
2517          * called without the vnode locked (which it can in DragonFly).
2518          * Otherwise we'd have to implement a multi-pass or flag the lock
2519          * failures and retry.
2520          *
2521          * The reclamation code interlocks with the sync list's token
2522          * (by removing the vnode from the scan list) before unlocking
2523          * the inode, giving us time to ref the inode.
2524          */
2525         /*flags = VMSC_GETVP;*/
2526         flags = 0;
2527         if (waitfor & MNT_LAZY)
2528                 flags |= VMSC_ONEPASS;
2529
2530 #if 0
2531         /*
2532          * Preflush the vnodes using a normal transaction before interlocking
2533          * with a flush transaction.
2534          */
2535         hammer2_trans_init(&info.trans, pmp, 0);
2536         info.error = 0;
2537         info.waitfor = MNT_NOWAIT;
2538         vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
2539         hammer2_trans_done(&info.trans);
2540 #endif
2541
2542         /*
2543          * Start our flush transaction.  This does not return until all
2544          * concurrent transactions have completed and will prevent any
2545          * new transactions from running concurrently, except for the
2546          * buffer cache transactions.
2547          *
2548          * For efficiency, do an async pass first and then make sure with
2549          * a synchronous pass on all related buffer cache buffers.  It
2550          * should theoretically not be possible for any new file buffers
2551          * to be instantiated during this sequence.
2552          */
2553         hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH |
2554                                              HAMMER2_TRANS_PREFLUSH);
2555         hammer2_run_unlinkq(&info.trans, pmp);
2556
2557         info.error = 0;
2558         info.waitfor = MNT_NOWAIT;
2559         vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
2560         info.waitfor = MNT_WAIT;
2561         vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2562
2563         /*
2564          * Clear PREFLUSH.  This prevents (or asserts on) any new logical
2565          * buffer cache flushes which occur during the flush.  Device buffers
2566          * are not affected.
2567          */
2568         hammer2_bioq_sync(info.trans.pmp);
2569         atomic_clear_int(&info.trans.flags, HAMMER2_TRANS_PREFLUSH);
2570
2571         total_error = 0;
2572
2573         /*
2574          * Flush all nodes to synchronize the PFSROOT subtopology to the media.
2575          *
2576          * Note that this flush will not be visible on crash recovery until
2577          * we flush the super-root topology in the next loop.
2578          */
2579         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2580                 chain = iroot->cluster.array[i].chain;
2581                 if (chain == NULL)
2582                         continue;
2583
2584                 hammer2_chain_ref(chain);
2585                 hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
2586                 if (chain->flags & HAMMER2_CHAIN_FLUSH_MASK) {
2587                         hammer2_flush(&info.trans, chain, 1);
2588                         parent = chain->parent;
2589                         KKASSERT(chain->pmp != parent->pmp);
2590                         hammer2_chain_setflush(&info.trans, parent);
2591                 }
2592                 hammer2_chain_unlock(chain);
2593                 hammer2_chain_drop(chain);
2594         }
2595         hammer2_trans_done(&info.trans);
2596
2597         /*
2598          * Flush all volume roots to synchronize PFS flushes with the
2599          * storage media volume header.  This will flush the freemap and
2600          * the superroot topology but stops when it reaches a PFSROOT
2601          * (which we already flushed above).
2602          *
2603          * This is the last step which connects the volume root to the
2604          * PFSROOT dirs flushed above.
2605          *
2606          * Each spmp (representing the hmp's super-root) requires its own
2607          * transaction.
2608          */
2609         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2610                 hammer2_chain_t *tmp;
2611
2612                 chain = iroot->cluster.array[i].chain;
2613                 if (chain == NULL)
2614                         continue;
2615
2616                 hmp = chain->hmp;
2617
2618                 /*
2619                  * We only have to flush each hmp once
2620                  */
2621                 for (j = i - 1; j >= 0; --j) {
2622                         if ((tmp = iroot->cluster.array[j].chain) != NULL) {
2623                                 if (tmp->hmp == hmp)
2624                                         break;
2625                         }
2626                 }
2627                 if (j >= 0)
2628                         continue;

                /*
                 * spmp transaction.  The super-root is never directly
                 * mounted so there shouldn't be any vnodes, let alone any
                 * dirty vnodes associated with it.
                 */
                hammer2_trans_init(&info.trans, hmp->spmp,
                                   HAMMER2_TRANS_ISFLUSH);

                /*
                 * Media mounts have two 'roots', vchain for the topology
                 * and fchain for the free block table.  Flush both.
                 *
                 * Note that the topology and free block table are handled
                 * independently, so the free block table can wind up being
                 * ahead of the topology.  We depend on the bulk free scan
                 * code to deal with any loose ends.
                 */
                hammer2_chain_ref(&hmp->vchain);
                hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
                hammer2_chain_ref(&hmp->fchain);
                hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
                        /*
                         * This will also modify vchain as a side effect,
                         * so mark vchain modified now.
                         */
                        hammer2_voldata_modify(hmp);
                        chain = &hmp->fchain;
                        hammer2_flush(&info.trans, chain, 1);
                        KKASSERT(chain == &hmp->fchain);
                }
                hammer2_chain_unlock(&hmp->fchain);
                hammer2_chain_unlock(&hmp->vchain);
                hammer2_chain_drop(&hmp->fchain);
                /* vchain dropped below */

                hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
                if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_MASK) {
                        chain = &hmp->vchain;
                        hammer2_flush(&info.trans, chain, 1);
                        KKASSERT(chain == &hmp->vchain);
                }
                hammer2_chain_unlock(&hmp->vchain);
                hammer2_chain_drop(&hmp->vchain);

                /*
                 * We can't safely flush the volume header until we have
                 * flushed any device buffers which have built up.
                 *
                 * XXX this isn't being incremental
                 */
                vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
                error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
                vn_unlock(hmp->devvp);

                /*
                 * The flush code sets CHAIN_VOLUMESYNC to indicate that the
                 * volume header needs synchronization via hmp->volsync.
                 *
                 * XXX synchronize the flag & data with only this flush XXX
                 */
                if (error == 0 &&
                    (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
                        struct buf *bp;

                        /*
                         * Synchronize the disk before flushing the volume
                         * header.
                         */
                        bp = getpbuf(NULL);
                        bp->b_bio1.bio_offset = 0;
                        bp->b_bufsize = 0;
                        bp->b_bcount = 0;
                        bp->b_cmd = BUF_CMD_FLUSH;
                        bp->b_bio1.bio_done = biodone_sync;
                        bp->b_bio1.bio_flags |= BIO_SYNC;
                        vn_strategy(hmp->devvp, &bp->b_bio1);
                        biowait(&bp->b_bio1, "h2vol");
                        relpbuf(bp, NULL);

                        /*
                         * Then we can safely flush the version of the
                         * volume header synchronized by the flush code.
                         */
                        i = hmp->volhdrno + 1;
                        if (i >= HAMMER2_NUM_VOLHDRS)
                                i = 0;
                        if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
                            hmp->volsync.volu_size) {
                                i = 0;
                        }
                        kprintf("sync volhdr %d %jd\n",
                                i, (intmax_t)hmp->volsync.volu_size);
                        bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
                                    HAMMER2_PBUFSIZE, 0, 0);
                        atomic_clear_int(&hmp->vchain.flags,
                                         HAMMER2_CHAIN_VOLUMESYNC);
                        bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
                        bawrite(bp);
                        hmp->volhdrno = i;
                }
                if (error)
                        total_error = error;

                hammer2_trans_done(&info.trans);        /* spmp trans */
        }
        return (total_error);
}
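
/*
 * For reference, the volume-header rotation performed above amounts to a
 * round-robin with a size clamp.  A minimal sketch of the same arithmetic
 * (the helper name is hypothetical and the block is illustrative only,
 * excluded from the build):
 */
#if 0
static int
hammer2_next_volhdr(int volhdrno, hammer2_off_t volu_size)
{
        int i = volhdrno + 1;

        if (i >= HAMMER2_NUM_VOLHDRS)
                i = 0;
        /* clamp to copy #0 if the next slot would lie beyond EOF */
        if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE > volu_size)
                i = 0;
        return i;
}
#endif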

/*
 * Sync passes.
 */
static int
hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
{
        struct hammer2_sync_info *info = data;
        hammer2_inode_t *ip;
        int dirty;
        int error;

        /*
         * Degenerate cases.  Note that ip == NULL typically means the
         * syncer vnode itself and we don't want to vclrisdirty() in that
         * situation.
         */
        ip = VTOI(vp);
        if (ip == NULL) {
                return(0);
        }
        if (vp->v_type == VNON || vp->v_type == VBAD) {
                vclrisdirty(vp);
                return(0);
        }

        /*
         * VOP_FSYNC will start a new transaction so replicate some code
         * here to do it inline (see hammer2_vop_fsync()).
         *
         * WARNING: The vfsync interacts with the buffer cache and might
         *          block, we can't hold the inode lock at that time.
         *          However, we MUST ref ip before blocking to ensure that
         *          it isn't ripped out from under us (since we do not
         *          hold a lock on the vnode).
         */
        hammer2_inode_ref(ip);

        /*
         * Latch the MODIFIED state before clearing it; testing the flag
         * after the atomic clear would almost always see zero and turn
         * the conditional vfsync below into a no-op.
         */
        dirty = (ip->flags & HAMMER2_INODE_MODIFIED) != 0;
        atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
        error = 0;
        if (dirty || !RB_EMPTY(&vp->v_rbdirty_tree))
                error = vfsync(vp, info->waitfor, 1, NULL, NULL);
        if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
            RB_EMPTY(&vp->v_rbdirty_tree)) {
                vclrisdirty(vp);
        }

        hammer2_inode_drop(ip);
        if (error)
                info->error = error;
        return(0);
}
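
/*
 * For reference, hammer2_vfs_sync() drives the callback above through
 * vsyncscan().  A minimal sketch of such a scan follows; the exact scan
 * flags shown are an assumption, the real call site earlier in this
 * file is authoritative:
 */
#if 0
        struct hammer2_sync_info info;

        info.error = 0;
        info.waitfor = MNT_NOWAIT;
        vsyncscan(mp, VMSC_GETVP | VMSC_NOWAIT, hammer2_sync_scan2, &info);
        if (info.error)
                total_error = info.error;
#endif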

static
int
hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
{
        return (0);
}

static
int
hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
               struct fid *fhp, struct vnode **vpp)
{
        return (0);
}

static
int
hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                 int *exflagsp, struct ucred **credanonp)
{
        return (0);
}

/*
 * Support code for hammer2_vfs_mount().  Read, verify, and install the
 * volume header into the HMP.
 *
 * XXX read four volhdrs and use the one with the highest TID whose CRC
 *     matches.
 *
 * XXX check iCRCs.
 *
 * XXX For filesystems with fewer than 4 volhdrs, make sure not to write
 *     to nonexistent locations.
 *
 * XXX Record selected volhdr and ring updates to each of 4 volhdrs
 */
static
int
hammer2_install_volume_header(hammer2_dev_t *hmp)
{
        hammer2_volume_data_t *vd;
        struct buf *bp;
        hammer2_crc32_t crc0, crc, bcrc0, bcrc;
        int error_reported;
        int error;
        int valid;
        int i;

        error_reported = 0;
        error = 0;
        valid = 0;
        bp = NULL;

        /*
         * There are up to 4 copies of the volume header (syncs iterate
         * between them so there is no single master).  We don't trust the
         * volu_size field, so we don't know precisely how large the
         * filesystem is; depend on the OS to return an error if we go
         * beyond the block device's EOF.
         */
        for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
                error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
                              HAMMER2_VOLUME_BYTES, &bp);
                if (error) {
                        brelse(bp);
                        bp = NULL;
                        continue;
                }

                vd = (struct hammer2_volume_data *) bp->b_data;
                if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
                    (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
                        brelse(bp);
                        bp = NULL;
                        continue;
                }

                if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
                        /* XXX: Reversed-endianness filesystem */
                        kprintf("hammer2: reverse-endian filesystem "
                                "detected\n");
                        brelse(bp);
                        bp = NULL;
                        continue;
                }

                crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
                crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
                                      HAMMER2_VOLUME_ICRC0_SIZE);
                bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
                bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
                                       HAMMER2_VOLUME_ICRC1_SIZE);
                if ((crc0 != crc) || (bcrc0 != bcrc)) {
                        kprintf("hammer2 volume header crc "
                                "mismatch copy #%d %08x/%08x\n",
                                i, crc0, crc);
                        error_reported = 1;
                        brelse(bp);
                        bp = NULL;
                        continue;
                }
                if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
                        valid = 1;
                        hmp->voldata = *vd;
                        hmp->volhdrno = i;
                }
                brelse(bp);
                bp = NULL;
        }
        if (valid) {
                hmp->volsync = hmp->voldata;
                error = 0;
                if (error_reported || bootverbose || 1) { /* 1/DEBUG */
                        kprintf("hammer2: using volume header #%d\n",
                                hmp->volhdrno);
                }
        } else {
                error = EINVAL;
                kprintf("hammer2: no valid volume headers found!\n");
        }
        return (error);
}
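
/*
 * For reference, each volume header embeds two independently CRCed
 * sections and both must match for a copy to be accepted.  A re-check of
 * a header already in memory could look like the sketch below (the helper
 * name is hypothetical; illustrative only, excluded from the build):
 */
#if 0
static int
hammer2_volhdr_crc_ok(hammer2_volume_data_t *vd, char *base)
{
        return (vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0] ==
                hammer2_icrc32(base + HAMMER2_VOLUME_ICRC0_OFF,
                               HAMMER2_VOLUME_ICRC0_SIZE) &&
                vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1] ==
                hammer2_icrc32(base + HAMMER2_VOLUME_ICRC1_OFF,
                               HAMMER2_VOLUME_ICRC1_SIZE));
}
#endif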

/*
 * This handles hysteresis on regular file flushes.  Because the BIOs are
 * routed to a thread it is possible for an excessive number to build up
 * and cause long front-end stalls well before the runningbuffspace limit
 * is hit, so we implement hammer2_flush_pipe to control the hysteresis.
 *
 * This is a particular problem when compression is used.
 */
void
hammer2_lwinprog_ref(hammer2_pfs_t *pmp)
{
        atomic_add_int(&pmp->count_lwinprog, 1);
}

void
hammer2_lwinprog_drop(hammer2_pfs_t *pmp)
{
        int lwinprog;

        lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
        if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
            (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
                atomic_clear_int(&pmp->count_lwinprog,
                                 HAMMER2_LWINPROG_WAITING);
                wakeup(&pmp->count_lwinprog);
        }
}

void
hammer2_lwinprog_wait(hammer2_pfs_t *pmp)
{
        int lwinprog;

        for (;;) {
                lwinprog = pmp->count_lwinprog;
                cpu_ccfence();
                if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
                        break;
                tsleep_interlock(&pmp->count_lwinprog, 0);
                atomic_set_int(&pmp->count_lwinprog, HAMMER2_LWINPROG_WAITING);
                lwinprog = pmp->count_lwinprog;
                if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
                        break;
                tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
        }
}
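
/*
 * Typical pairing in the logical write path (a sketch only; the actual
 * call sites live in the write/strategy code, not in this file).  Note
 * the asymmetric thresholds: hammer2_lwinprog_wait() blocks once the
 * pipe reaches hammer2_flush_pipe, while hammer2_lwinprog_drop() only
 * wakes waiters after the pipe drains to 2/3 of that, which keeps the
 * front end from thrashing right at the limit.
 */
#if 0
        hammer2_lwinprog_wait(pmp);     /* throttle before queueing a BIO */
        hammer2_lwinprog_ref(pmp);      /* account the in-flight write */
        /* ... hand the logical buffer to the write thread ... */

        /* on completion: */
        hammer2_lwinprog_drop(pmp);     /* may wake a throttled front end */
#endif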

/*
 * Manage excessive memory resource use for chain and related
 * structures.
 */
void
hammer2_pfs_memory_wait(hammer2_pfs_t *pmp)
{
        uint32_t waiting;
        uint32_t count;
        uint32_t limit;
#if 0
        static int zzticks;
#endif

        /*
         * Atomic check condition and wait.  Also do an early speedup of
         * the syncer to try to avoid hitting the wait.
         */
        for (;;) {
                waiting = pmp->inmem_dirty_chains;
                cpu_ccfence();
                count = waiting & HAMMER2_DIRTYCHAIN_MASK;

                limit = pmp->mp->mnt_nvnodelistsize / 10;
                if (limit < hammer2_limit_dirty_chains)
                        limit = hammer2_limit_dirty_chains;
                if (limit < 1000)
                        limit = 1000;

#if 0
                if ((int)(ticks - zzticks) > hz) {
                        zzticks = ticks;
                        kprintf("count %u %u\n", count, limit);
                }
#endif

                /*
                 * Block if there are too many dirty chains present, wait
                 * for the flush to clean some out.
                 */
                if (count > limit) {
                        tsleep_interlock(&pmp->inmem_dirty_chains, 0);
                        if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
                                               waiting,
                                       waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
                                speedup_syncer(pmp->mp);
                                tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
                                       "chnmem", hz);
                        }
                        continue;       /* loop on success or fail */
                }

                /*
                 * Try to start an early flush before we are forced to block.
                 */
                if (count > limit * 7 / 10)
                        speedup_syncer(pmp->mp);
                break;
        }
}

void
hammer2_pfs_memory_inc(hammer2_pfs_t *pmp)
{
        if (pmp) {
                atomic_add_int(&pmp->inmem_dirty_chains, 1);
        }
}

void
hammer2_pfs_memory_wakeup(hammer2_pfs_t *pmp)
{
        uint32_t waiting;

        if (pmp == NULL)
                return;

        for (;;) {
                waiting = pmp->inmem_dirty_chains;
                cpu_ccfence();
                if (atomic_cmpset_int(&pmp->inmem_dirty_chains,
                                       waiting,
                                       (waiting - 1) &
                                        ~HAMMER2_DIRTYCHAIN_WAITING)) {
                        break;
                }
        }

        if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
                wakeup(&pmp->inmem_dirty_chains);
}
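
/*
 * The helpers above implement a simple dirty-chain accounting protocol.
 * A sketch of the intended pairing (the real call sites are spread
 * through the chain and inode code):
 */
#if 0
        hammer2_pfs_memory_wait(pmp);   /* may block while over the limit */
        hammer2_pfs_memory_inc(pmp);    /* a chain is being dirtied */
        /* ... later, when the chain is cleaned or disposed of ... */
        hammer2_pfs_memory_wakeup(pmp); /* drop the count, wake waiters */
#endif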

/*
 * Debugging
 */
void
hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
{
        hammer2_chain_t *scan;
        hammer2_chain_t *parent;

        --*countp;
        if (*countp == 0) {
                kprintf("%*.*s...\n", tab, tab, "");
                return;
        }
        if (*countp < 0)
                return;
        kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
                tab, tab, "", pfx,
                chain, chain->bref.type,
                chain->bref.key, chain->bref.keybits,
                chain->bref.mirror_tid);

        kprintf("%*.*s      [%08x] (%s) refs=%d",
                tab, tab, "",
                chain->flags,
                ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
                chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
                chain->refs);

        parent = chain->parent;
        if (parent)
                kprintf("\n%*.*s      p=%p [pflags %08x prefs %d]",
                        tab, tab, "",
                        parent, parent->flags, parent->refs);
        if (RB_EMPTY(&chain->core.rbtree)) {
                kprintf("\n");
        } else {
                kprintf(" {\n");
                RB_FOREACH(scan, hammer2_chain_tree, &chain->core.rbtree)
                        hammer2_dump_chain(scan, tab + 4, countp, 'a');
                if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
                        kprintf("%*.*s}(%s)\n", tab, tab, "",
                                chain->data->ipdata.filename);
                else
                        kprintf("%*.*s}\n", tab, tab, "");
        }
}
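
/*
 * Example invocation (sketch): dump the topology under a device's vchain,
 * limiting output to roughly 1000 chains.
 */
#if 0
        int dumpcnt = 1000;

        hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
#endif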