hammer2 - more dmsg/separation work
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vfsops.c
1 /*
2  * Copyright (c) 2011-2014 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/nlookup.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/fcntl.h>
42 #include <sys/buf.h>
43 #include <sys/uuid.h>
44 #include <sys/vfsops.h>
45 #include <sys/sysctl.h>
46 #include <sys/socket.h>
47 #include <sys/objcache.h>
48
49 #include <sys/proc.h>
50 #include <sys/namei.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54
55 #include <sys/mutex.h>
56 #include <sys/mutex2.h>
57
58 #include "hammer2.h"
59 #include "hammer2_disk.h"
60 #include "hammer2_mount.h"
61
62 #include "hammer2.h"
63 #include "hammer2_lz4.h"
64
65 #include "zlib/hammer2_zlib.h"
66
67 #define REPORT_REFS_ERRORS 1    /* XXX remove me */
68
69 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");
70
71 struct hammer2_sync_info {
72         hammer2_trans_t trans;
73         int error;
74         int waitfor;
75 };
76
77 TAILQ_HEAD(hammer2_mntlist, hammer2_mount);
78 TAILQ_HEAD(hammer2_pfslist, hammer2_pfsmount);
79 static struct hammer2_mntlist hammer2_mntlist;
80 static struct hammer2_pfslist hammer2_pfslist;
81 static struct lock hammer2_mntlk;
82
/*
 * Global tunables.  Each one is exported through sysctl as
 * vfs.hammer2.<name> (the variable name without its hammer2_ prefix).
 */
int	hammer2_debug;
int	hammer2_cluster_enable = 1;
int	hammer2_hardlink_enable = 1;
int	hammer2_flush_pipe = 100;
int	hammer2_synchronous_flush = 1;
int	hammer2_dio_count;		/* exported read-only */
long	hammer2_limit_dirty_chains;	/* set by hammer2_vfs_init() */

/*
 * I/O statistics, also exported through sysctl (read-write).  There is
 * one counter per category (file, meta, indr, fmap, volu) and direction.
 *
 * NOTE(review): what distinguishes the "iod" set from the "ioa" set is
 * not established by this part of the file - confirm before documenting.
 */
long	hammer2_iod_file_read;
long	hammer2_iod_meta_read;
long	hammer2_iod_indr_read;
long	hammer2_iod_fmap_read;
long	hammer2_iod_volu_read;

long	hammer2_iod_file_write;
long	hammer2_iod_meta_write;
long	hammer2_iod_indr_write;
long	hammer2_iod_fmap_write;
long	hammer2_iod_volu_write;

long	hammer2_ioa_file_read;
long	hammer2_ioa_meta_read;
long	hammer2_ioa_indr_read;
long	hammer2_ioa_fmap_read;
long	hammer2_ioa_volu_read;

long	hammer2_ioa_file_write;
long	hammer2_ioa_meta_write;
long	hammer2_ioa_indr_write;
long	hammer2_ioa_fmap_write;
long	hammer2_ioa_volu_write;
110
111 MALLOC_DECLARE(C_BUFFER);
112 MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");
113
114 MALLOC_DECLARE(D_BUFFER);
115 MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");
116
117 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
118
119 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
120            &hammer2_debug, 0, "");
121 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
122            &hammer2_cluster_enable, 0, "");
123 SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
124            &hammer2_hardlink_enable, 0, "");
125 SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
126            &hammer2_flush_pipe, 0, "");
127 SYSCTL_INT(_vfs_hammer2, OID_AUTO, synchronous_flush, CTLFLAG_RW,
128            &hammer2_synchronous_flush, 0, "");
129 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, limit_dirty_chains, CTLFLAG_RW,
130            &hammer2_limit_dirty_chains, 0, "");
131 SYSCTL_INT(_vfs_hammer2, OID_AUTO, dio_count, CTLFLAG_RD,
132            &hammer2_dio_count, 0, "");
133
134 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
135            &hammer2_iod_file_read, 0, "");
136 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
137            &hammer2_iod_meta_read, 0, "");
138 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
139            &hammer2_iod_indr_read, 0, "");
140 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
141            &hammer2_iod_fmap_read, 0, "");
142 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
143            &hammer2_iod_volu_read, 0, "");
144
145 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
146            &hammer2_iod_file_write, 0, "");
147 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
148            &hammer2_iod_meta_write, 0, "");
149 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
150            &hammer2_iod_indr_write, 0, "");
151 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
152            &hammer2_iod_fmap_write, 0, "");
153 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
154            &hammer2_iod_volu_write, 0, "");
155
156 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
157            &hammer2_ioa_file_read, 0, "");
158 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
159            &hammer2_ioa_meta_read, 0, "");
160 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
161            &hammer2_ioa_indr_read, 0, "");
162 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
163            &hammer2_ioa_fmap_read, 0, "");
164 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
165            &hammer2_ioa_volu_read, 0, "");
166
167 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
168            &hammer2_ioa_file_write, 0, "");
169 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
170            &hammer2_ioa_meta_write, 0, "");
171 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
172            &hammer2_ioa_indr_write, 0, "");
173 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
174            &hammer2_ioa_fmap_write, 0, "");
175 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
176            &hammer2_ioa_volu_write, 0, "");
177
178 static int hammer2_vfs_init(struct vfsconf *conf);
179 static int hammer2_vfs_uninit(struct vfsconf *vfsp);
180 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
181                                 struct ucred *cred);
182 static int hammer2_remount(hammer2_mount_t *, struct mount *, char *,
183                                 struct vnode *, struct ucred *);
184 static int hammer2_recovery(hammer2_mount_t *hmp);
185 static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
186 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
187 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
188                                 struct ucred *cred);
189 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
190                                 struct ucred *cred);
191 static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
192                                 ino_t ino, struct vnode **vpp);
193 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
194                                 struct fid *fhp, struct vnode **vpp);
195 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
196 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
197                                 int *exflagsp, struct ucred **credanonp);
198
199 static int hammer2_install_volume_header(hammer2_mount_t *hmp);
200 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
201
202 static void hammer2_write_thread(void *arg);
203
204 static void hammer2_vfs_unmount_hmp1(struct mount *mp, hammer2_mount_t *hmp);
205 static void hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp);
206
207 /* 
208  * Functions for compression in threads,
209  * from hammer2_vnops.c
210  */
211 static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
212                                 hammer2_inode_t *ip,
213                                 hammer2_inode_data_t *ipdata,
214                                 hammer2_cluster_t *cparent,
215                                 hammer2_key_t lbase, int ioflag, int pblksize,
216                                 int *errorp);
217 static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
218                                 hammer2_inode_t *ip,
219                                 const hammer2_inode_data_t *ipdata,
220                                 hammer2_cluster_t *cparent,
221                                 hammer2_key_t lbase, int ioflag,
222                                 int pblksize, int *errorp, int comp_algo);
223 static void hammer2_zero_check_and_write(struct buf *bp,
224                                 hammer2_trans_t *trans, hammer2_inode_t *ip,
225                                 const hammer2_inode_data_t *ipdata,
226                                 hammer2_cluster_t *cparent,
227                                 hammer2_key_t lbase,
228                                 int ioflag, int pblksize, int *errorp);
229 static int test_block_zeros(const char *buf, size_t bytes);
230 static void zero_write(struct buf *bp, hammer2_trans_t *trans,
231                                 hammer2_inode_t *ip,
232                                 const hammer2_inode_data_t *ipdata,
233                                 hammer2_cluster_t *cparent,
234                                 hammer2_key_t lbase,
235                                 int *errorp);
236 static void hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp,
237                                 int ioflag, int pblksize, int *errorp);
238
239 static int hammer2_rcvdmsg(kdmsg_msg_t *msg);
240 static void hammer2_autodmsg(kdmsg_msg_t *msg);
241
242
243 /*
244  * HAMMER2 vfs operations.
245  */
246 static struct vfsops hammer2_vfsops = {
247         .vfs_init       = hammer2_vfs_init,
248         .vfs_uninit     = hammer2_vfs_uninit,
249         .vfs_sync       = hammer2_vfs_sync,
250         .vfs_mount      = hammer2_vfs_mount,
251         .vfs_unmount    = hammer2_vfs_unmount,
252         .vfs_root       = hammer2_vfs_root,
253         .vfs_statfs     = hammer2_vfs_statfs,
254         .vfs_statvfs    = hammer2_vfs_statvfs,
255         .vfs_vget       = hammer2_vfs_vget,
256         .vfs_vptofh     = hammer2_vfs_vptofh,
257         .vfs_fhtovp     = hammer2_vfs_fhtovp,
258         .vfs_checkexp   = hammer2_vfs_checkexp
259 };
260
261 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
262
263 VFS_SET(hammer2_vfsops, hammer2, 0);
264 MODULE_VERSION(hammer2, 1);
265
266 static
267 int
268 hammer2_vfs_init(struct vfsconf *conf)
269 {
270         static struct objcache_malloc_args margs_read;
271         static struct objcache_malloc_args margs_write;
272
273         int error;
274
275         error = 0;
276
277         if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
278                 error = EINVAL;
279         if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
280                 error = EINVAL;
281         if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
282                 error = EINVAL;
283
284         if (error)
285                 kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
286         
287         margs_read.objsize = 65536;
288         margs_read.mtype = D_BUFFER;
289         
290         margs_write.objsize = 32768;
291         margs_write.mtype = C_BUFFER;
292         
293         cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
294                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
295                                 objcache_malloc_free, &margs_read);
296         cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
297                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
298                                 objcache_malloc_free, &margs_write);
299
300         lockinit(&hammer2_mntlk, "mntlk", 0, 0);
301         TAILQ_INIT(&hammer2_mntlist);
302         TAILQ_INIT(&hammer2_pfslist);
303
304         hammer2_limit_dirty_chains = desiredvnodes / 10;
305
306         hammer2_trans_manage_init();
307
308         return (error);
309 }
310
311 static
312 int
313 hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
314 {
315         objcache_destroy(cache_buffer_read);
316         objcache_destroy(cache_buffer_write);
317         return 0;
318 }
319
320 /*
321  * Core PFS allocator.  Used to allocate the pmp structure for PFS cluster
322  * mounts and the spmp structure for media (hmp) structures.
323  */
324 static hammer2_pfsmount_t *
325 hammer2_pfsalloc(const hammer2_inode_data_t *ipdata, hammer2_tid_t alloc_tid)
326 {
327         hammer2_pfsmount_t *pmp;
328
329         pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
330         kmalloc_create(&pmp->minode, "HAMMER2-inodes");
331         kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
332         lockinit(&pmp->lock, "pfslk", 0, 0);
333         spin_init(&pmp->inum_spin);
334         RB_INIT(&pmp->inum_tree);
335         TAILQ_INIT(&pmp->unlinkq);
336         spin_init(&pmp->unlinkq_spin);
337
338         pmp->alloc_tid = alloc_tid + 1;   /* our first media transaction id */
339         pmp->flush_tid = pmp->alloc_tid;
340         if (ipdata) {
341                 pmp->inode_tid = ipdata->pfs_inum + 1;
342                 pmp->pfs_clid = ipdata->pfs_clid;
343         }
344         mtx_init(&pmp->wthread_mtx);
345         bioq_init(&pmp->wthread_bioq);
346
347         return pmp;
348 }
349
350 /*
351  * Mount or remount HAMMER2 filesystem from physical media
352  *
353  *      mountroot
354  *              mp              mount point structure
355  *              path            NULL
356  *              data            <unused>
357  *              cred            <unused>
358  *
359  *      mount
360  *              mp              mount point structure
361  *              path            path to mount point
362  *              data            pointer to argument structure in user space
363  *                      volume  volume path (device@LABEL form)
364  *                      hflags  user mount flags
365  *              cred            user credentials
366  *
367  * RETURNS:     0       Success
368  *              !0      error number
369  */
370 static
371 int
372 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
373                   struct ucred *cred)
374 {
375         struct hammer2_mount_info info;
376         hammer2_pfsmount_t *pmp;
377         hammer2_pfsmount_t *spmp;
378         hammer2_mount_t *hmp;
379         hammer2_key_t key_next;
380         hammer2_key_t key_dummy;
381         hammer2_key_t lhc;
382         struct vnode *devvp;
383         struct nlookupdata nd;
384         hammer2_chain_t *parent;
385         hammer2_chain_t *rchain;
386         hammer2_cluster_t *cluster;
387         hammer2_cluster_t *cparent;
388         const hammer2_inode_data_t *ipdata;
389         hammer2_blockref_t bref;
390         struct file *fp;
391         char devstr[MNAMELEN];
392         size_t size;
393         size_t done;
394         char *dev;
395         char *label;
396         int ronly = 1;
397         int error;
398         int cache_index;
399         int ddflag;
400         int i;
401
402         hmp = NULL;
403         pmp = NULL;
404         dev = NULL;
405         label = NULL;
406         devvp = NULL;
407         cache_index = -1;
408
409         kprintf("hammer2_mount\n");
410
411         if (path == NULL) {
412                 /*
413                  * Root mount
414                  */
415                 bzero(&info, sizeof(info));
416                 info.cluster_fd = -1;
417                 return (EOPNOTSUPP);
418         } else {
419                 /*
420                  * Non-root mount or updating a mount
421                  */
422                 error = copyin(data, &info, sizeof(info));
423                 if (error)
424                         return (error);
425
426                 error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
427                 if (error)
428                         return (error);
429
430                 /* Extract device and label */
431                 dev = devstr;
432                 label = strchr(devstr, '@');
433                 if (label == NULL ||
434                     ((label + 1) - dev) > done) {
435                         return (EINVAL);
436                 }
437                 *label = '\0';
438                 label++;
439                 if (*label == '\0')
440                         return (EINVAL);
441
442                 if (mp->mnt_flag & MNT_UPDATE) {
443                         /* Update mount */
444                         /* HAMMER2 implements NFS export via mountctl */
445                         pmp = MPTOPMP(mp);
446                         for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
447                                 hmp = pmp->iroot->cluster.array[i]->hmp;
448                                 devvp = hmp->devvp;
449                                 error = hammer2_remount(hmp, mp, path,
450                                                         devvp, cred);
451                                 if (error)
452                                         break;
453                         }
454                         /*hammer2_inode_install_hidden(pmp);*/
455
456                         return error;
457                 }
458         }
459
460         /*
461          * HMP device mount
462          *
463          * Lookup name and verify it refers to a block device.
464          */
465         error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
466         if (error == 0)
467                 error = nlookup(&nd);
468         if (error == 0)
469                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
470         nlookup_done(&nd);
471
472         if (error == 0) {
473                 if (vn_isdisk(devvp, &error))
474                         error = vfs_mountedon(devvp);
475         }
476
477         /*
478          * Determine if the device has already been mounted.  After this
479          * check hmp will be non-NULL if we are doing the second or more
480          * hammer2 mounts from the same device.
481          */
482         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
483         TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
484                 if (hmp->devvp == devvp)
485                         break;
486         }
487
488         /*
489          * Open the device if this isn't a secondary mount and construct
490          * the H2 device mount (hmp).
491          */
492         if (hmp == NULL) {
493                 hammer2_chain_t *schain;
494                 hammer2_xid_t xid;
495
496                 if (error == 0 && vcount(devvp) > 0)
497                         error = EBUSY;
498
499                 /*
500                  * Now open the device
501                  */
502                 if (error == 0) {
503                         ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
504                         vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
505                         error = vinvalbuf(devvp, V_SAVE, 0, 0);
506                         if (error == 0) {
507                                 error = VOP_OPEN(devvp,
508                                                  ronly ? FREAD : FREAD | FWRITE,
509                                                  FSCRED, NULL);
510                         }
511                         vn_unlock(devvp);
512                 }
513                 if (error && devvp) {
514                         vrele(devvp);
515                         devvp = NULL;
516                 }
517                 if (error) {
518                         lockmgr(&hammer2_mntlk, LK_RELEASE);
519                         return error;
520                 }
521                 hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
522                 hmp->ronly = ronly;
523                 hmp->devvp = devvp;
524                 kmalloc_create(&hmp->mchain, "HAMMER2-chains");
525                 TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
526                 RB_INIT(&hmp->iotree);
527
528                 lockinit(&hmp->vollk, "h2vol", 0, 0);
529
530                 /*
531                  * vchain setup. vchain.data is embedded.
532                  * vchain.refs is initialized and will never drop to 0.
533                  *
534                  * NOTE! voldata is not yet loaded.
535                  */
536                 hmp->vchain.hmp = hmp;
537                 hmp->vchain.refs = 1;
538                 hmp->vchain.data = (void *)&hmp->voldata;
539                 hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
540                 hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
541                 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
542                 hmp->vchain.delete_xid = HAMMER2_XID_MAX;
543
544                 hammer2_chain_core_alloc(NULL, &hmp->vchain, NULL);
545                 /* hmp->vchain.u.xxx is left NULL */
546
547                 /*
548                  * fchain setup.  fchain.data is embedded.
549                  * fchain.refs is initialized and will never drop to 0.
550                  *
551                  * The data is not used but needs to be initialized to
552                  * pass assertion muster.  We use this chain primarily
553                  * as a placeholder for the freemap's top-level RBTREE
554                  * so it does not interfere with the volume's topology
555                  * RBTREE.
556                  */
557                 hmp->fchain.hmp = hmp;
558                 hmp->fchain.refs = 1;
559                 hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
560                 hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
561                 hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
562                 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
563                 hmp->fchain.bref.methods =
564                         HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
565                         HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
566                 hmp->fchain.delete_xid = HAMMER2_XID_MAX;
567
568                 hammer2_chain_core_alloc(NULL, &hmp->fchain, NULL);
569                 /* hmp->fchain.u.xxx is left NULL */
570
571                 /*
572                  * Install the volume header and initialize fields from
573                  * voldata.
574                  */
575                 error = hammer2_install_volume_header(hmp);
576                 if (error) {
577                         ++hmp->pmp_count;
578                         hammer2_vfs_unmount_hmp1(mp, hmp);
579                         hammer2_vfs_unmount_hmp2(mp, hmp);
580                         hammer2_vfs_unmount(mp, MNT_FORCE);
581                         return error;
582                 }
583
584                 /*
585                  * Really important to get these right or flush will get
586                  * confused.
587                  */
588                 hmp->spmp = hammer2_pfsalloc(NULL, hmp->voldata.mirror_tid);
589                 kprintf("alloc spmp %p tid %016jx\n",
590                         hmp->spmp, hmp->voldata.mirror_tid);
591                 spmp = hmp->spmp;
592                 spmp->inode_tid = 1;
593
594                 xid = 0;
595                 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
596                 hmp->vchain.bref.modify_tid = hmp->vchain.bref.mirror_tid;
597                 hmp->vchain.modify_xid = xid;
598                 hmp->vchain.update_xlo = xid;
599                 hmp->vchain.update_xhi = xid;
600                 hmp->vchain.pmp = spmp;
601                 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
602                 hmp->fchain.bref.modify_tid = hmp->fchain.bref.mirror_tid;
603                 hmp->fchain.modify_xid = xid;
604                 hmp->fchain.update_xlo = xid;
605                 hmp->fchain.update_xhi = xid;
606                 hmp->fchain.pmp = spmp;
607
608                 /*
609                  * First locate the super-root inode, which is key 0
610                  * relative to the volume header's blockset.
611                  *
612                  * Then locate the root inode by scanning the directory keyspace
613                  * represented by the label.
614                  */
615                 parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
616                 schain = hammer2_chain_lookup(&parent, &key_dummy,
617                                       HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
618                                       &cache_index, 0, &ddflag);
619                 hammer2_chain_lookup_done(parent);
620                 if (schain == NULL) {
621                         kprintf("hammer2_mount: invalid super-root\n");
622                         ++hmp->pmp_count;
623                         hammer2_vfs_unmount_hmp1(mp, hmp);
624                         hammer2_vfs_unmount_hmp2(mp, hmp);
625                         hammer2_vfs_unmount(mp, MNT_FORCE);
626                         return EINVAL;
627                 }
628
629                 /*
630                  * Sanity-check schain's pmp, finish initializing spmp.
631                  */
632                 KKASSERT(schain->pmp == spmp);
633                 spmp->pfs_clid = schain->data->ipdata.pfs_clid;
634
635                 /*
636                  * NOTE: The CHAIN_PFSROOT is not set on the super-root inode.
637                  * NOTE: inode_get sucks up schain's lock.
638                  */
639                 cluster = hammer2_cluster_from_chain(schain);
640                 spmp->iroot = hammer2_inode_get(spmp, NULL, cluster);
641                 spmp->spmp_hmp = hmp;
642                 hammer2_inode_ref(spmp->iroot);
643                 hammer2_inode_unlock_ex(spmp->iroot, cluster);
644                 schain = NULL;
645                 /* leave spmp->iroot with one ref */
646
647                 if ((mp->mnt_flag & MNT_RDONLY) == 0) {
648                         error = hammer2_recovery(hmp);
649                         /* XXX do something with error */
650                 }
651         } else {
652                 spmp = hmp->spmp;
653         }
654         ++hmp->pmp_count;
655
656         /*
657          * Lookup mount point under the media-localized super-root.
658          *
659          * cluster->pmp will incorrectly point to spmp and must be fixed
660          * up later on.
661          */
662         cparent = hammer2_inode_lock_ex(spmp->iroot);
663         lhc = hammer2_dirhash(label, strlen(label));
664         cluster = hammer2_cluster_lookup(cparent, &key_next,
665                                       lhc, lhc + HAMMER2_DIRHASH_LOMASK,
666                                       0, &ddflag);
667         while (cluster) {
668                 if (hammer2_cluster_type(cluster) == HAMMER2_BREF_TYPE_INODE &&
669                     strcmp(label,
670                        hammer2_cluster_data(cluster)->ipdata.filename) == 0) {
671                         break;
672                 }
673                 cluster = hammer2_cluster_next(cparent, cluster, &key_next,
674                                             key_next,
675                                             lhc + HAMMER2_DIRHASH_LOMASK, 0);
676         }
677         hammer2_inode_unlock_ex(spmp->iroot, cparent);
678
679         if (cluster == NULL) {
680                 kprintf("hammer2_mount: PFS label not found\n");
681                 hammer2_vfs_unmount_hmp1(mp, hmp);
682                 hammer2_vfs_unmount_hmp2(mp, hmp);
683                 lockmgr(&hammer2_mntlk, LK_RELEASE);
684                 hammer2_vfs_unmount(mp, MNT_FORCE);
685                 return EINVAL;
686         }
687
688         for (i = 0; i < cluster->nchains; ++i) {
689                 rchain = cluster->array[i];
690                 KKASSERT(rchain->pmp == NULL);
691                 if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
692                         kprintf("hammer2_mount: PFS label already mounted!\n");
693                         hammer2_cluster_unlock(cluster);
694                         hammer2_vfs_unmount_hmp1(mp, hmp);
695                         hammer2_vfs_unmount_hmp2(mp, hmp);
696                         lockmgr(&hammer2_mntlk, LK_RELEASE);
697                         hammer2_vfs_unmount(mp, MNT_FORCE);
698                         return EBUSY;
699                 }
700 #if 0
701                 if (rchain->flags & HAMMER2_CHAIN_RECYCLE) {
702                         kprintf("hammer2_mount: PFS label is recycling\n");
703                         hammer2_cluster_unlock(cluster);
704                         hammer2_vfs_unmount_hmp1(mp, hmp);
705                         hammer2_vfs_unmount_hmp2(mp, hmp);
706                         lockmgr(&hammer2_mntlk, LK_RELEASE);
707                         hammer2_vfs_unmount(mp, MNT_FORCE);
708                         return EBUSY;
709                 }
710 #endif
711         }
712
713         /*
714          * Check to see if the cluster id is already mounted at the mount
715          * point.  If it is, add us to the cluster.
716          */
717         ipdata = &hammer2_cluster_data(cluster)->ipdata;
718         hammer2_cluster_bref(cluster, &bref);
719         TAILQ_FOREACH(pmp, &hammer2_pfslist, mntentry) {
720                 if (pmp->spmp_hmp == NULL &&
721                     bcmp(&pmp->pfs_clid, &ipdata->pfs_clid,
722                          sizeof(pmp->pfs_clid)) == 0) {
723                         break;
724                 }
725         }
726
727         if (pmp) {
728                 int i;
729                 int j;
730
731                 hammer2_inode_ref(pmp->iroot);
732                 ccms_thread_lock(&pmp->iroot->topo_cst, CCMS_STATE_EXCLUSIVE);
733
734                 if (pmp->iroot->cluster.nchains + cluster->nchains >
735                     HAMMER2_MAXCLUSTER) {
736                         kprintf("hammer2_mount: cluster full!\n");
737
738                         ccms_thread_unlock(&pmp->iroot->topo_cst);
739                         hammer2_inode_drop(pmp->iroot);
740
741                         hammer2_cluster_unlock(cluster);
742                         hammer2_vfs_unmount_hmp1(mp, hmp);
743                         hammer2_vfs_unmount_hmp2(mp, hmp);
744                         lockmgr(&hammer2_mntlk, LK_RELEASE);
745                         hammer2_vfs_unmount(mp, MNT_FORCE);
746                         return EBUSY;
747                 }
748                 kprintf("hammer2_vfs_mount: Adding pfs to existing cluster\n");
749                 j = pmp->iroot->cluster.nchains;
750                 for (i = 0; i < cluster->nchains; ++i) {
751                         rchain = cluster->array[i];
752                         KKASSERT(rchain->pmp == NULL);
753                         rchain->pmp = pmp;
754                         hammer2_chain_ref(cluster->array[i]);
755                         pmp->iroot->cluster.array[j] = cluster->array[i];
756                         ++j;
757                 }
758                 pmp->iroot->cluster.nchains = j;
759                 ccms_thread_unlock(&pmp->iroot->topo_cst);
760                 hammer2_inode_drop(pmp->iroot);
761                 hammer2_cluster_unlock(cluster);
762                 lockmgr(&hammer2_mntlk, LK_RELEASE);
763
764                 kprintf("ok\n");
765                 hammer2_inode_install_hidden(pmp);
766
767                 return ERANGE;
768         }
769
770         /*
771          * Block device opened successfully, finish initializing the
772          * mount structure.
773          *
774          * From this point on we have to call hammer2_unmount() on failure.
775          */
776         pmp = hammer2_pfsalloc(ipdata, bref.mirror_tid);
777         kprintf("PMP mirror_tid is %016jx\n", bref.mirror_tid);
778         for (i = 0; i < cluster->nchains; ++i) {
779                 rchain = cluster->array[i];
780                 KKASSERT(rchain->pmp == NULL);
781                 rchain->pmp = pmp;
782                 atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
783         }
784         cluster->pmp = pmp;
785
786         kdmsg_iocom_init(&pmp->iocom, pmp,
787                          KDMSG_IOCOMF_AUTOCONN |
788                          KDMSG_IOCOMF_AUTOSPAN |
789                          KDMSG_IOCOMF_AUTOCIRC,
790                          pmp->mmsg, hammer2_rcvdmsg);
791
792         ccms_domain_init(&pmp->ccms_dom);
793         TAILQ_INSERT_TAIL(&hammer2_pfslist, pmp, mntentry);
794         lockmgr(&hammer2_mntlk, LK_RELEASE);
795
796         kprintf("hammer2_mount hmp=%p pmp=%p pmpcnt=%d\n",
797                 hmp, pmp, hmp->pmp_count);
798
799         mp->mnt_flag = MNT_LOCAL;
800         mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
801         mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */
802
803         /*
804          * required mount structure initializations
805          */
806         mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
807         mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;
808
809         mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
810         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
811
812         /*
813          * Optional fields
814          */
815         mp->mnt_iosize_max = MAXPHYS;
816         mp->mnt_data = (qaddr_t)pmp;
817         pmp->mp = mp;
818
819         /*
820          * After this point hammer2_vfs_unmount() has visibility on hmp
821          * and manual hmp1/hmp2 calls are not needed on fatal errors.
822          */
823         pmp->iroot = hammer2_inode_get(pmp, NULL, cluster);
824         hammer2_inode_ref(pmp->iroot);          /* ref for pmp->iroot */
825         hammer2_inode_unlock_ex(pmp->iroot, cluster);
826
827         /*
828          * The logical file buffer bio write thread handles things
829          * like physical block assignment and compression.
830          *
831          * (only applicable to pfs mounts, not applicable to spmp)
832          */
833         pmp->wthread_destroy = 0;
834         lwkt_create(hammer2_write_thread, pmp,
835                     &pmp->wthread_td, NULL, 0, -1, "hwrite-%s", label);
836
837         /*
838          * Ref the cluster management messaging descriptor.  The mount
839          * program deals with the other end of the communications pipe.
840          */
841         fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
842         if (fp == NULL) {
843                 kprintf("hammer2_mount: bad cluster_fd!\n");
844                 hammer2_vfs_unmount(mp, MNT_FORCE);
845                 return EBADF;
846         }
847         hammer2_cluster_reconnect(pmp, fp);
848
849         /*
850          * With the cluster operational install ihidden.
851          * (only applicable to pfs mounts, not applicable to spmp)
852          */
853         hammer2_inode_install_hidden(pmp);
854
855         /*
856          * Finish setup
857          */
858         vfs_getnewfsid(mp);
859         vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
860         vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
861         vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);
862
863         copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
864         bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
865         bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
866         copyinstr(path, mp->mnt_stat.f_mntonname,
867                   sizeof(mp->mnt_stat.f_mntonname) - 1,
868                   &size);
869
870         /*
871          * Initial statfs to prime mnt_stat.
872          */
873         hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
874         
875         return 0;
876 }
877
878 /*
879  * Handle bioq for strategy write
880  */
881 static
882 void
883 hammer2_write_thread(void *arg)
884 {
885         hammer2_pfsmount_t *pmp;
886         struct bio *bio;
887         struct buf *bp;
888         hammer2_trans_t trans;
889         struct vnode *vp;
890         hammer2_inode_t *ip;
891         hammer2_cluster_t *cparent;
892         hammer2_inode_data_t *wipdata;
893         hammer2_key_t lbase;
894         int lblksize;
895         int pblksize;
896         int error;
897         
898         pmp = arg;
899         
900         mtx_lock(&pmp->wthread_mtx);
901         while (pmp->wthread_destroy == 0) {
902                 if (bioq_first(&pmp->wthread_bioq) == NULL) {
903                         mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx,
904                                  0, "h2bioqw", 0);
905                 }
906                 cparent = NULL;
907
908                 hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE);
909
910                 while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
911                         /*
912                          * dummy bio for synchronization.  The transaction
913                          * must be reinitialized.
914                          */
915                         if (bio->bio_buf == NULL) {
916                                 bio->bio_flags |= BIO_DONE;
917                                 wakeup(bio);
918                                 hammer2_trans_done(&trans);
919                                 hammer2_trans_init(&trans, pmp,
920                                                    HAMMER2_TRANS_BUFCACHE);
921                                 continue;
922                         }
923
924                         /*
925                          * else normal bio processing
926                          */
927                         mtx_unlock(&pmp->wthread_mtx);
928
929                         hammer2_lwinprog_drop(pmp);
930                         
931                         error = 0;
932                         bp = bio->bio_buf;
933                         vp = bp->b_vp;
934                         ip = VTOI(vp);
935
936                         /*
937                          * Inode is modified, flush size and mtime changes
938                          * to ensure that the file size remains consistent
939                          * with the buffers being flushed.
940                          *
941                          * NOTE: The inode_fsync() call only flushes the
942                          *       inode's meta-data state, it doesn't try
943                          *       to flush underlying buffers or chains.
944                          */
945                         cparent = hammer2_inode_lock_ex(ip);
946                         if (ip->flags & (HAMMER2_INODE_RESIZED |
947                                          HAMMER2_INODE_MTIME)) {
948                                 hammer2_inode_fsync(&trans, ip, cparent);
949                         }
950                         wipdata = hammer2_cluster_modify_ip(&trans, ip,
951                                                          cparent, 0);
952                         lblksize = hammer2_calc_logical(ip, bio->bio_offset,
953                                                         &lbase, NULL);
954                         pblksize = hammer2_calc_physical(ip, wipdata, lbase);
955                         hammer2_write_file_core(bp, &trans, ip, wipdata,
956                                                 cparent,
957                                                 lbase, IO_ASYNC,
958                                                 pblksize, &error);
959                         hammer2_cluster_modsync(cparent);
960                         hammer2_inode_unlock_ex(ip, cparent);
961                         if (error) {
962                                 kprintf("hammer2: error in buffer write\n");
963                                 bp->b_flags |= B_ERROR;
964                                 bp->b_error = EIO;
965                         }
966                         biodone(bio);
967                         mtx_lock(&pmp->wthread_mtx);
968                 }
969                 hammer2_trans_done(&trans);
970         }
971         pmp->wthread_destroy = -1;
972         wakeup(&pmp->wthread_destroy);
973         
974         mtx_unlock(&pmp->wthread_mtx);
975 }
976
977 void
978 hammer2_bioq_sync(hammer2_pfsmount_t *pmp)
979 {
980         struct bio sync_bio;
981
982         bzero(&sync_bio, sizeof(sync_bio));     /* dummy with no bio_buf */
983         mtx_lock(&pmp->wthread_mtx);
984         if (pmp->wthread_destroy == 0 &&
985             TAILQ_FIRST(&pmp->wthread_bioq.queue)) {
986                 bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
987                 while ((sync_bio.bio_flags & BIO_DONE) == 0)
988                         mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0);
989         }
990         mtx_unlock(&pmp->wthread_mtx);
991 }
992
993 /* 
994  * Return a chain suitable for I/O, creating the chain if necessary
995  * and assigning its physical block.
996  */
997 static
998 hammer2_cluster_t *
999 hammer2_assign_physical(hammer2_trans_t *trans,
1000                         hammer2_inode_t *ip, hammer2_cluster_t *cparent,
1001                         hammer2_key_t lbase, int pblksize, int *errorp)
1002 {
1003         hammer2_cluster_t *cluster;
1004         hammer2_cluster_t *dparent;
1005         hammer2_key_t key_dummy;
1006         int pradix = hammer2_getradix(pblksize);
1007         int ddflag;
1008
1009         /*
1010          * Locate the chain associated with lbase, return a locked chain.
1011          * However, do not instantiate any data reference (which utilizes a
1012          * device buffer) because we will be using direct IO via the
1013          * logical buffer cache buffer.
1014          */
1015         *errorp = 0;
1016         KKASSERT(pblksize >= HAMMER2_ALLOC_MIN);
1017 retry:
1018         dparent = hammer2_cluster_lookup_init(cparent, 0);
1019         cluster = hammer2_cluster_lookup(dparent, &key_dummy,
1020                                      lbase, lbase,
1021                                      HAMMER2_LOOKUP_NODATA, &ddflag);
1022
1023         if (cluster == NULL) {
1024                 /*
1025                  * We found a hole, create a new chain entry.
1026                  *
1027                  * NOTE: DATA chains are created without device backing
1028                  *       store (nor do we want any).
1029                  */
1030                 *errorp = hammer2_cluster_create(trans, dparent, &cluster,
1031                                                lbase, HAMMER2_PBUFRADIX,
1032                                                HAMMER2_BREF_TYPE_DATA,
1033                                                pblksize);
1034                 if (cluster == NULL) {
1035                         hammer2_cluster_lookup_done(dparent);
1036                         panic("hammer2_cluster_create: par=%p error=%d\n",
1037                                 dparent->focus, *errorp);
1038                         goto retry;
1039                 }
1040                 /*ip->delta_dcount += pblksize;*/
1041         } else {
1042                 switch (hammer2_cluster_type(cluster)) {
1043                 case HAMMER2_BREF_TYPE_INODE:
1044                         /*
1045                          * The data is embedded in the inode.  The
1046                          * caller is responsible for marking the inode
1047                          * modified and copying the data to the embedded
1048                          * area.
1049                          */
1050                         break;
1051                 case HAMMER2_BREF_TYPE_DATA:
1052                         if (hammer2_cluster_bytes(cluster) != pblksize) {
1053                                 hammer2_cluster_resize(trans, ip,
1054                                                      dparent, cluster,
1055                                                      pradix,
1056                                                      HAMMER2_MODIFY_OPTDATA);
1057                         }
1058                         hammer2_cluster_modify(trans, cluster,
1059                                              HAMMER2_MODIFY_OPTDATA);
1060                         break;
1061                 default:
1062                         panic("hammer2_assign_physical: bad type");
1063                         /* NOT REACHED */
1064                         break;
1065                 }
1066         }
1067
1068         /*
1069          * Cleanup.  If cluster wound up being the inode itself, i.e.
1070          * the DIRECTDATA case for offset 0, then we need to update cparent.
1071          * The caller expects cparent to not become stale.
1072          */
1073         hammer2_cluster_lookup_done(dparent);
1074         /* dparent = NULL; safety */
1075         if (cluster && ddflag)
1076                 hammer2_cluster_replace_locked(cparent, cluster);
1077         return (cluster);
1078 }
1079
1080 /* 
1081  * From hammer2_vnops.c.
1082  * The core write function which determines which path to take
1083  * depending on compression settings.
1084  */
1085 static
1086 void
1087 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
1088                         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
1089                         hammer2_cluster_t *cparent,
1090                         hammer2_key_t lbase, int ioflag, int pblksize,
1091                         int *errorp)
1092 {
1093         hammer2_cluster_t *cluster;
1094
1095         switch(HAMMER2_DEC_COMP(ipdata->comp_algo)) {
1096         case HAMMER2_COMP_NONE:
1097                 /*
1098                  * We have to assign physical storage to the buffer
1099                  * we intend to dirty or write now to avoid deadlocks
1100                  * in the strategy code later.
1101                  *
1102                  * This can return NOOFFSET for inode-embedded data.
1103                  * The strategy code will take care of it in that case.
1104                  */
1105                 cluster = hammer2_assign_physical(trans, ip, cparent,
1106                                                 lbase, pblksize,
1107                                                 errorp);
1108                 hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp);
1109                 if (cluster)
1110                         hammer2_cluster_unlock(cluster);
1111                 break;
1112         case HAMMER2_COMP_AUTOZERO:
1113                 /*
1114                  * Check for zero-fill only
1115                  */
1116                 hammer2_zero_check_and_write(bp, trans, ip,
1117                                     ipdata, cparent, lbase,
1118                                     ioflag, pblksize, errorp);
1119                 break;
1120         case HAMMER2_COMP_LZ4:
1121         case HAMMER2_COMP_ZLIB:
1122         default:
1123                 /*
1124                  * Check for zero-fill and attempt compression.
1125                  */
1126                 hammer2_compress_and_write(bp, trans, ip,
1127                                            ipdata, cparent,
1128                                            lbase, ioflag,
1129                                            pblksize, errorp,
1130                                            ipdata->comp_algo);
1131                 break;
1132         }
1133 }
1134
1135 /*
1136  * Generic function that will perform the compression in compression
1137  * write path. The compression algorithm is determined by the settings
1138  * obtained from inode.
1139  */
1140 static
1141 void
1142 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
1143         hammer2_inode_t *ip, const hammer2_inode_data_t *ipdata,
1144         hammer2_cluster_t *cparent,
1145         hammer2_key_t lbase, int ioflag, int pblksize,
1146         int *errorp, int comp_algo)
1147 {
1148         hammer2_cluster_t *cluster;
1149         hammer2_chain_t *chain;
1150         int comp_size;
1151         int comp_block_size;
1152         int i;
1153         char *comp_buffer;
1154
1155         if (test_block_zeros(bp->b_data, pblksize)) {
1156                 zero_write(bp, trans, ip, ipdata, cparent, lbase, errorp);
1157                 return;
1158         }
1159
1160         comp_size = 0;
1161         comp_buffer = NULL;
1162
1163         KKASSERT(pblksize / 2 <= 32768);
1164                 
1165         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
1166                 z_stream strm_compress;
1167                 int comp_level;
1168                 int ret;
1169
1170                 switch(HAMMER2_DEC_COMP(comp_algo)) {
1171                 case HAMMER2_COMP_LZ4:
1172                         comp_buffer = objcache_get(cache_buffer_write,
1173                                                    M_INTWAIT);
1174                         comp_size = LZ4_compress_limitedOutput(
1175                                         bp->b_data,
1176                                         &comp_buffer[sizeof(int)],
1177                                         pblksize,
1178                                         pblksize / 2 - sizeof(int));
1179                         /*
1180                          * We need to prefix with the size, LZ4
1181                          * doesn't do it for us.  Add the related
1182                          * overhead.
1183                          */
1184                         *(int *)comp_buffer = comp_size;
1185                         if (comp_size)
1186                                 comp_size += sizeof(int);
1187                         break;
1188                 case HAMMER2_COMP_ZLIB:
1189                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
1190                         if (comp_level == 0)
1191                                 comp_level = 6; /* default zlib compression */
1192                         else if (comp_level < 6)
1193                                 comp_level = 6;
1194                         else if (comp_level > 9)
1195                                 comp_level = 9;
1196                         ret = deflateInit(&strm_compress, comp_level);
1197                         if (ret != Z_OK) {
1198                                 kprintf("HAMMER2 ZLIB: fatal error "
1199                                         "on deflateInit.\n");
1200                         }
1201
1202                         comp_buffer = objcache_get(cache_buffer_write,
1203                                                    M_INTWAIT);
1204                         strm_compress.next_in = bp->b_data;
1205                         strm_compress.avail_in = pblksize;
1206                         strm_compress.next_out = comp_buffer;
1207                         strm_compress.avail_out = pblksize / 2;
1208                         ret = deflate(&strm_compress, Z_FINISH);
1209                         if (ret == Z_STREAM_END) {
1210                                 comp_size = pblksize / 2 -
1211                                             strm_compress.avail_out;
1212                         } else {
1213                                 comp_size = 0;
1214                         }
1215                         ret = deflateEnd(&strm_compress);
1216                         break;
1217                 default:
1218                         kprintf("Error: Unknown compression method.\n");
1219                         kprintf("Comp_method = %d.\n", comp_algo);
1220                         break;
1221                 }
1222         }
1223
1224         if (comp_size == 0) {
1225                 /*
1226                  * compression failed or turned off
1227                  */
1228                 comp_block_size = pblksize;     /* safety */
1229                 if (++ip->comp_heuristic > 128)
1230                         ip->comp_heuristic = 8;
1231         } else {
1232                 /*
1233                  * compression succeeded
1234                  */
1235                 ip->comp_heuristic = 0;
1236                 if (comp_size <= 1024) {
1237                         comp_block_size = 1024;
1238                 } else if (comp_size <= 2048) {
1239                         comp_block_size = 2048;
1240                 } else if (comp_size <= 4096) {
1241                         comp_block_size = 4096;
1242                 } else if (comp_size <= 8192) {
1243                         comp_block_size = 8192;
1244                 } else if (comp_size <= 16384) {
1245                         comp_block_size = 16384;
1246                 } else if (comp_size <= 32768) {
1247                         comp_block_size = 32768;
1248                 } else {
1249                         panic("hammer2: WRITE PATH: "
1250                               "Weird comp_size value.");
1251                         /* NOT REACHED */
1252                         comp_block_size = pblksize;
1253                 }
1254         }
1255
1256         cluster = hammer2_assign_physical(trans, ip, cparent,
1257                                           lbase, comp_block_size,
1258                                           errorp);
1259         ipdata = &hammer2_cluster_data(cparent)->ipdata;
1260
1261         if (*errorp) {
1262                 kprintf("WRITE PATH: An error occurred while "
1263                         "assigning physical space.\n");
1264                 KKASSERT(cluster == NULL);
1265                 goto done;
1266         }
1267
1268         for (i = 0; i < cluster->nchains; ++i) {
1269                 hammer2_io_t *dio;
1270                 char *bdata;
1271                 int temp_check;
1272
1273                 chain = cluster->array[i];
1274                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1275
1276                 switch(chain->bref.type) {
1277                 case HAMMER2_BREF_TYPE_INODE:
1278                         KKASSERT(chain->data->ipdata.op_flags &
1279                                  HAMMER2_OPFLAG_DIRECTDATA);
1280                         KKASSERT(bp->b_loffset == 0);
1281                         bcopy(bp->b_data, chain->data->ipdata.u.data,
1282                               HAMMER2_EMBEDDED_BYTES);
1283                         break;
1284                 case HAMMER2_BREF_TYPE_DATA:
1285                         temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);
1286
1287                         /*
1288                          * Optimize out the read-before-write
1289                          * if possible.
1290                          */
1291                         *errorp = hammer2_io_newnz(chain->hmp,
1292                                                    chain->bref.data_off,
1293                                                    chain->bytes,
1294                                                    &dio);
1295                         if (*errorp) {
1296                                 hammer2_io_brelse(&dio);
1297                                 kprintf("hammer2: WRITE PATH: "
1298                                         "dbp bread error\n");
1299                                 break;
1300                         }
1301                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1302
1303                         /*
1304                          * When loading the block make sure we don't
1305                          * leave garbage after the compressed data.
1306                          */
1307                         if (comp_size) {
1308                                 chain->bref.methods =
1309                                         HAMMER2_ENC_COMP(comp_algo) +
1310                                         HAMMER2_ENC_CHECK(temp_check);
1311                                 bcopy(comp_buffer, bdata, comp_size);
1312                                 if (comp_size != comp_block_size) {
1313                                         bzero(bdata + comp_size,
1314                                               comp_block_size - comp_size);
1315                                 }
1316                         } else {
1317                                 chain->bref.methods =
1318                                         HAMMER2_ENC_COMP(
1319                                                 HAMMER2_COMP_NONE) +
1320                                         HAMMER2_ENC_CHECK(temp_check);
1321                                 bcopy(bp->b_data, bdata, pblksize);
1322                         }
1323
1324                         /*
1325                          * Device buffer is now valid, chain is no
1326                          * longer in the initial state.
1327                          */
1328                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1329
1330                         /* Now write the related bdp. */
1331                         if (ioflag & IO_SYNC) {
1332                                 /*
1333                                  * Synchronous I/O requested.
1334                                  */
1335                                 hammer2_io_bwrite(&dio);
1336                         /*
1337                         } else if ((ioflag & IO_DIRECT) &&
1338                                    loff + n == pblksize) {
1339                                 hammer2_io_bdwrite(&dio);
1340                         */
1341                         } else if (ioflag & IO_ASYNC) {
1342                                 hammer2_io_bawrite(&dio);
1343                         } else {
1344                                 hammer2_io_bdwrite(&dio);
1345                         }
1346                         break;
1347                 default:
1348                         panic("hammer2_write_bp: bad chain type %d\n",
1349                                 chain->bref.type);
1350                         /* NOT REACHED */
1351                         break;
1352                 }
1353         }
1354 done:
1355         if (cluster)
1356                 hammer2_cluster_unlock(cluster);
1357         if (comp_buffer)
1358                 objcache_put(cache_buffer_write, comp_buffer);
1359 }
1360
1361 /*
1362  * Function that performs zero-checking and writing without compression,
1363  * it corresponds to default zero-checking path.
1364  */
1365 static
1366 void
1367 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1368         hammer2_inode_t *ip, const hammer2_inode_data_t *ipdata,
1369         hammer2_cluster_t *cparent,
1370         hammer2_key_t lbase, int ioflag, int pblksize, int *errorp)
1371 {
1372         hammer2_cluster_t *cluster;
1373
1374         if (test_block_zeros(bp->b_data, pblksize)) {
1375                 zero_write(bp, trans, ip, ipdata, cparent, lbase, errorp);
1376         } else {
1377                 cluster = hammer2_assign_physical(trans, ip, cparent,
1378                                                   lbase, pblksize, errorp);
1379                 hammer2_write_bp(cluster, bp, ioflag, pblksize, errorp);
1380                 if (cluster)
1381                         hammer2_cluster_unlock(cluster);
1382         }
1383 }
1384
/*
 * Test whether a block of data contains only zeros.
 *
 * buf   - start of the block; it is scanned a long at a time, so it is
 *         expected to be suitably aligned for long accesses
 * bytes - number of bytes to examine
 *
 * Returns TRUE (non-zero) if all 'bytes' bytes are zero, 0 otherwise.
 * An empty block (bytes == 0) counts as all zeros.
 *
 * Only whole longs that lie entirely inside the block are read as longs;
 * a trailing remainder shorter than sizeof(long) is checked bytewise so
 * that nothing beyond buf + bytes is ever accessed.
 */
static
int
test_block_zeros(const char *buf, size_t bytes)
{
        size_t wordbytes;
        size_t i;

        /* portion of the block made up of complete longs */
        wordbytes = bytes - (bytes % sizeof(long));

        for (i = 0; i < wordbytes; i += sizeof(long)) {
                if (*(const long *)(buf + i) != 0)
                        return (0);
        }

        /* remaining tail, if bytes is not a multiple of sizeof(long) */
        for (; i < bytes; ++i) {
                if (buf[i] != 0)
                        return (0);
        }
        return (1);
}
1401
1402 /*
1403  * Function to "write" a block that contains only zeros.
1404  */
1405 static
1406 void
1407 zero_write(struct buf *bp, hammer2_trans_t *trans,
1408         hammer2_inode_t *ip, const hammer2_inode_data_t *ipdata,
1409         hammer2_cluster_t *cparent,
1410         hammer2_key_t lbase, int *errorp __unused)
1411 {
1412         hammer2_cluster_t *cluster;
1413         hammer2_media_data_t *data;
1414         hammer2_key_t key_dummy;
1415         int ddflag;
1416
1417         cparent = hammer2_cluster_lookup_init(cparent, 0);
1418         cluster = hammer2_cluster_lookup(cparent, &key_dummy, lbase, lbase,
1419                                      HAMMER2_LOOKUP_NODATA, &ddflag);
1420         if (cluster) {
1421                 data = hammer2_cluster_wdata(cluster);
1422
1423                 if (ddflag) {
1424                         KKASSERT(cluster->focus->flags &
1425                                  HAMMER2_CHAIN_MODIFIED);
1426                         bzero(data->ipdata.u.data, HAMMER2_EMBEDDED_BYTES);
1427                         hammer2_cluster_modsync(cluster);
1428                 } else {
1429                         hammer2_cluster_delete(trans, cluster, 0);
1430                 }
1431                 hammer2_cluster_unlock(cluster);
1432         }
1433         hammer2_cluster_lookup_done(cparent);
1434 }
1435
1436 /*
1437  * Function to write the data as it is, without performing any sort of
1438  * compression. This function is used in path without compression and
1439  * default zero-checking path.
1440  */
1441 static
1442 void
1443 hammer2_write_bp(hammer2_cluster_t *cluster, struct buf *bp, int ioflag,
1444                                 int pblksize, int *errorp)
1445 {
1446         hammer2_chain_t *chain;
1447         hammer2_io_t *dio;
1448         char *bdata;
1449         int error;
1450         int i;
1451         int temp_check;
1452
1453         error = 0;      /* XXX TODO below */
1454
1455         for (i = 0; i < cluster->nchains; ++i) {
1456                 chain = cluster->array[i];
1457
1458                 temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);
1459
1460                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1461
1462                 switch(chain->bref.type) {
1463                 case HAMMER2_BREF_TYPE_INODE:
1464                         KKASSERT(chain->data->ipdata.op_flags &
1465                                  HAMMER2_OPFLAG_DIRECTDATA);
1466                         KKASSERT(bp->b_loffset == 0);
1467                         bcopy(bp->b_data, chain->data->ipdata.u.data,
1468                               HAMMER2_EMBEDDED_BYTES);
1469                         error = 0;
1470                         break;
1471                 case HAMMER2_BREF_TYPE_DATA:
1472                         error = hammer2_io_newnz(chain->hmp,
1473                                                  chain->bref.data_off,
1474                                                  chain->bytes, &dio);
1475                         if (error) {
1476                                 hammer2_io_bqrelse(&dio);
1477                                 kprintf("hammer2: WRITE PATH: "
1478                                         "dbp bread error\n");
1479                                 break;
1480                         }
1481                         bdata = hammer2_io_data(dio, chain->bref.data_off);
1482
1483                         chain->bref.methods = HAMMER2_ENC_COMP(
1484                                                         HAMMER2_COMP_NONE) +
1485                                               HAMMER2_ENC_CHECK(temp_check);
1486                         bcopy(bp->b_data, bdata, chain->bytes);
1487
1488                         /*
1489                          * Device buffer is now valid, chain is no
1490                          * longer in the initial state.
1491                          */
1492                         atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1493
1494                         if (ioflag & IO_SYNC) {
1495                                 /*
1496                                  * Synchronous I/O requested.
1497                                  */
1498                                 hammer2_io_bwrite(&dio);
1499                         /*
1500                         } else if ((ioflag & IO_DIRECT) &&
1501                                    loff + n == pblksize) {
1502                                 hammer2_io_bdwrite(&dio);
1503                         */
1504                         } else if (ioflag & IO_ASYNC) {
1505                                 hammer2_io_bawrite(&dio);
1506                         } else {
1507                                 hammer2_io_bdwrite(&dio);
1508                         }
1509                         break;
1510                 default:
1511                         panic("hammer2_write_bp: bad chain type %d\n",
1512                               chain->bref.type);
1513                         /* NOT REACHED */
1514                         error = 0;
1515                         break;
1516                 }
1517                 KKASSERT(error == 0);   /* XXX TODO */
1518         }
1519         *errorp = error;
1520 }
1521
1522 static
1523 int
1524 hammer2_remount(hammer2_mount_t *hmp, struct mount *mp, char *path,
1525                 struct vnode *devvp, struct ucred *cred)
1526 {
1527         int error;
1528
1529         if (hmp->ronly && (mp->mnt_kern_flag & MNTK_WANTRDWR)) {
1530                 error = hammer2_recovery(hmp);
1531         } else {
1532                 error = 0;
1533         }
1534         return error;
1535 }
1536
1537 static
1538 int
1539 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1540 {
1541         hammer2_pfsmount_t *pmp;
1542         hammer2_mount_t *hmp;
1543         hammer2_chain_t *rchain;
1544         hammer2_cluster_t *cluster;
1545         int flags;
1546         int error = 0;
1547         int i;
1548
1549         pmp = MPTOPMP(mp);
1550
1551         if (pmp == NULL)
1552                 return(0);
1553
1554         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1555         TAILQ_REMOVE(&hammer2_pfslist, pmp, mntentry);
1556
1557         /*
1558          * If mount initialization proceeded far enough we must flush
1559          * its vnodes.
1560          */
1561         if (mntflags & MNT_FORCE)
1562                 flags = FORCECLOSE;
1563         else
1564                 flags = 0;
1565         if (pmp->iroot) {
1566                 error = vflush(mp, 0, flags);
1567                 if (error)
1568                         goto failed;
1569         }
1570
1571         ccms_domain_uninit(&pmp->ccms_dom);
1572         kdmsg_iocom_uninit(&pmp->iocom);        /* XXX chain dependency */
1573
1574         if (pmp->wthread_td) {
1575                 mtx_lock(&pmp->wthread_mtx);
1576                 pmp->wthread_destroy = 1;
1577                 wakeup(&pmp->wthread_bioq);
1578                 while (pmp->wthread_destroy != -1) {
1579                         mtxsleep(&pmp->wthread_destroy,
1580                                 &pmp->wthread_mtx, 0,
1581                                 "umount-sleep", 0);
1582                 }
1583                 mtx_unlock(&pmp->wthread_mtx);
1584                 pmp->wthread_td = NULL;
1585         }
1586
1587         /*
1588          * Cleanup our reference on ihidden.
1589          */
1590         if (pmp->ihidden) {
1591                 hammer2_inode_drop(pmp->ihidden);
1592                 pmp->ihidden = NULL;
1593         }
1594
1595         /*
1596          * Cleanup our reference on iroot.  iroot is (should) not be needed
1597          * by the flush code.
1598          */
1599         if (pmp->iroot) {
1600                 cluster = &pmp->iroot->cluster;
1601                 for (i = 0; i < pmp->iroot->cluster.nchains; ++i) {
1602                         rchain = pmp->iroot->cluster.array[i];
1603                         if (rchain == NULL)
1604                                 continue;
1605                         hmp = rchain->hmp;
1606                         hammer2_vfs_unmount_hmp1(mp, hmp);
1607
1608                         atomic_clear_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
1609 #if REPORT_REFS_ERRORS
1610                         if (rchain->refs != 1)
1611                                 kprintf("PMP->RCHAIN %p REFS WRONG %d\n",
1612                                         rchain, rchain->refs);
1613 #else
1614                         KKASSERT(rchain->refs == 1);
1615 #endif
1616                         hammer2_chain_drop(rchain);
1617                         cluster->array[i] = NULL;
1618                         hammer2_vfs_unmount_hmp2(mp, hmp);
1619                 }
1620                 cluster->focus = NULL;
1621
1622 #if REPORT_REFS_ERRORS
1623                 if (pmp->iroot->refs != 1)
1624                         kprintf("PMP->IROOT %p REFS WRONG %d\n",
1625                                 pmp->iroot, pmp->iroot->refs);
1626 #else
1627                 KKASSERT(pmp->iroot->refs == 1);
1628 #endif
1629                 /* ref for pmp->iroot */
1630                 hammer2_inode_drop(pmp->iroot);
1631                 pmp->iroot = NULL;
1632         }
1633
1634         pmp->mp = NULL;
1635         mp->mnt_data = NULL;
1636
1637         kmalloc_destroy(&pmp->mmsg);
1638         kmalloc_destroy(&pmp->minode);
1639
1640         kfree(pmp, M_HAMMER2);
1641         error = 0;
1642
1643 failed:
1644         lockmgr(&hammer2_mntlk, LK_RELEASE);
1645
1646         return (error);
1647 }
1648
1649 static
1650 void
1651 hammer2_vfs_unmount_hmp1(struct mount *mp, hammer2_mount_t *hmp)
1652 {
1653         hammer2_mount_exlock(hmp);
1654         --hmp->pmp_count;
1655
1656         kprintf("hammer2_unmount hmp=%p pmpcnt=%d\n", hmp, hmp->pmp_count);
1657
1658         /*
1659          * Flush any left over chains.  The voldata lock is only used
1660          * to synchronize against HAMMER2_CHAIN_MODIFIED_AUX.
1661          *
1662          * Flush twice to ensure that the freemap is completely
1663          * synchronized.  If we only do it once the next mount's
1664          * recovery scan will have to do some fixups (which isn't
1665          * bad, but we don't want it to have to do it except when
1666          * recovering from a crash).
1667          */
1668         hammer2_voldata_lock(hmp);
1669         if (((hmp->vchain.flags | hmp->fchain.flags) &
1670              HAMMER2_CHAIN_MODIFIED) ||
1671             hmp->vchain.update_xhi > hmp->vchain.update_xlo ||
1672             hmp->fchain.update_xhi > hmp->fchain.update_xlo) {
1673                 hammer2_voldata_unlock(hmp);
1674                 hammer2_vfs_sync(mp, MNT_WAIT);
1675                 /*hammer2_vfs_sync(mp, MNT_WAIT);*/
1676         } else {
1677                 hammer2_voldata_unlock(hmp);
1678         }
1679         if (hmp->pmp_count == 0) {
1680                 if (((hmp->vchain.flags | hmp->fchain.flags) &
1681                      HAMMER2_CHAIN_MODIFIED) ||
1682                     hmp->vchain.update_xhi > hmp->vchain.update_xlo ||
1683                     hmp->fchain.update_xhi > hmp->fchain.update_xlo) {
1684                         kprintf("hammer2_unmount: chains left over "
1685                                 "after final sync\n");
1686                         kprintf("    vchain %08x update_xlo/hi %08x/%08x\n",
1687                                 hmp->vchain.flags,
1688                                 hmp->vchain.update_xlo,
1689                                 hmp->vchain.update_xhi);
1690                         kprintf("    fchain %08x update_xhi/hi %08x/%08x\n",
1691                                 hmp->fchain.flags,
1692                                 hmp->fchain.update_xlo,
1693                                 hmp->fchain.update_xhi);
1694
1695                         if (hammer2_debug & 0x0010)
1696                                 Debugger("entered debugger");
1697                 }
1698         }
1699 }
1700
1701 static
1702 void
1703 hammer2_vfs_unmount_hmp2(struct mount *mp, hammer2_mount_t *hmp)
1704 {
1705         hammer2_pfsmount_t *spmp;
1706         struct vnode *devvp;
1707         int dumpcnt;
1708         int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
1709
1710         /*
1711          * If no PFS's left drop the master hammer2_mount for the
1712          * device.
1713          */
1714         if (hmp->pmp_count == 0) {
1715                 /*
1716                  * Clean up SPMP and the super-root inode
1717                  */
1718                 spmp = hmp->spmp;
1719                 if (spmp) {
1720                         if (spmp->iroot) {
1721                                 hammer2_inode_drop(spmp->iroot);
1722                                 spmp->iroot = NULL;
1723                         }
1724                         hmp->spmp = NULL;
1725                         kmalloc_destroy(&spmp->mmsg);
1726                         kmalloc_destroy(&spmp->minode);
1727                         kfree(spmp, M_HAMMER2);
1728                 }
1729
1730                 /*
1731                  * Finish up with the device vnode
1732                  */
1733                 if ((devvp = hmp->devvp) != NULL) {
1734                         vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
1735                         vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
1736                         hmp->devvp = NULL;
1737                         VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE), NULL);
1738                         vn_unlock(devvp);
1739                         vrele(devvp);
1740                         devvp = NULL;
1741                 }
1742
1743                 /*
1744                  * Clear vchain/fchain flags that might prevent final cleanup
1745                  * of these chains.
1746                  */
1747                 if (hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) {
1748                         atomic_clear_int(&hmp->vchain.flags,
1749                                          HAMMER2_CHAIN_MODIFIED);
1750                         hammer2_chain_drop(&hmp->vchain);
1751                 }
1752                 if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_CREATE) {
1753                         atomic_clear_int(&hmp->vchain.flags,
1754                                          HAMMER2_CHAIN_FLUSH_CREATE);
1755                         hammer2_chain_drop(&hmp->vchain);
1756                 }
1757                 if (hmp->vchain.flags & HAMMER2_CHAIN_FLUSH_DELETE) {
1758                         atomic_clear_int(&hmp->vchain.flags,
1759                                          HAMMER2_CHAIN_FLUSH_DELETE);
1760                         hammer2_chain_drop(&hmp->vchain);
1761                 }
1762
1763                 if (hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) {
1764                         atomic_clear_int(&hmp->fchain.flags,
1765                                          HAMMER2_CHAIN_MODIFIED);
1766                         hammer2_chain_drop(&hmp->fchain);
1767                 }
1768                 if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_CREATE) {
1769                         atomic_clear_int(&hmp->fchain.flags,
1770                                          HAMMER2_CHAIN_FLUSH_CREATE);
1771                         hammer2_chain_drop(&hmp->fchain);
1772                 }
1773                 if (hmp->fchain.flags & HAMMER2_CHAIN_FLUSH_DELETE) {
1774                         atomic_clear_int(&hmp->fchain.flags,
1775                                          HAMMER2_CHAIN_FLUSH_DELETE);
1776                         hammer2_chain_drop(&hmp->fchain);
1777                 }
1778
1779                 /*
1780                  * Final drop of embedded freemap root chain to
1781                  * clean up fchain.core (fchain structure is not
1782                  * flagged ALLOCATED so it is cleaned out and then
1783                  * left to rot).
1784                  */
1785                 hammer2_chain_drop(&hmp->fchain);
1786
1787                 /*
1788                  * Final drop of embedded volume root chain to clean
1789                  * up vchain.core (vchain structure is not flagged
1790                  * ALLOCATED so it is cleaned out and then left to
1791                  * rot).
1792                  */
1793                 dumpcnt = 50;
1794                 hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt, 'v');
1795                 dumpcnt = 50;
1796                 hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt, 'f');
1797                 hammer2_mount_unlock(hmp);
1798                 hammer2_chain_drop(&hmp->vchain);
1799
1800                 hammer2_io_cleanup(hmp, &hmp->iotree);
1801                 if (hmp->iofree_count) {
1802                         kprintf("io_cleanup: %d I/O's left hanging\n",
1803                                 hmp->iofree_count);
1804                 }
1805
1806                 TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
1807                 kmalloc_destroy(&hmp->mchain);
1808                 kfree(hmp, M_HAMMER2);
1809         } else {
1810                 hammer2_mount_unlock(hmp);
1811         }
1812 }
1813
1814 static
1815 int
1816 hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
1817              ino_t ino, struct vnode **vpp)
1818 {
1819         kprintf("hammer2_vget\n");
1820         return (EOPNOTSUPP);
1821 }
1822
1823 static
1824 int
1825 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
1826 {
1827         hammer2_pfsmount_t *pmp;
1828         hammer2_cluster_t *cparent;
1829         int error;
1830         struct vnode *vp;
1831
1832         pmp = MPTOPMP(mp);
1833         if (pmp->iroot == NULL) {
1834                 *vpp = NULL;
1835                 error = EINVAL;
1836         } else {
1837                 cparent = hammer2_inode_lock_sh(pmp->iroot);
1838                 vp = hammer2_igetv(pmp->iroot, cparent, &error);
1839                 hammer2_inode_unlock_sh(pmp->iroot, cparent);
1840                 *vpp = vp;
1841                 if (vp == NULL)
1842                         kprintf("vnodefail\n");
1843         }
1844
1845         return (error);
1846 }
1847
1848 /*
1849  * Filesystem status
1850  *
1851  * XXX incorporate ipdata->inode_quota and data_quota
1852  */
1853 static
1854 int
1855 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
1856 {
1857         hammer2_pfsmount_t *pmp;
1858         hammer2_mount_t *hmp;
1859
1860         pmp = MPTOPMP(mp);
1861         KKASSERT(pmp->iroot->cluster.nchains >= 1);
1862         hmp = pmp->iroot->cluster.focus->hmp;   /* XXX */
1863
1864         mp->mnt_stat.f_files = pmp->inode_count;
1865         mp->mnt_stat.f_ffree = 0;
1866         mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1867         mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1868         mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
1869
1870         *sbp = mp->mnt_stat;
1871         return (0);
1872 }
1873
1874 static
1875 int
1876 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
1877 {
1878         hammer2_pfsmount_t *pmp;
1879         hammer2_mount_t *hmp;
1880
1881         pmp = MPTOPMP(mp);
1882         KKASSERT(pmp->iroot->cluster.nchains >= 1);
1883         hmp = pmp->iroot->cluster.focus->hmp;   /* XXX */
1884
1885         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1886         mp->mnt_vstat.f_files = pmp->inode_count;
1887         mp->mnt_vstat.f_ffree = 0;
1888         mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1889         mp->mnt_vstat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1890         mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
1891
1892         *sbp = mp->mnt_vstat;
1893         return (0);
1894 }
1895
1896 /*
1897  * Mount-time recovery (RW mounts)
1898  *
1899  * Updates to the free block table are allowed to lag flushes by one
1900  * transaction.  In case of a crash, then on a fresh mount we must do an
1901  * incremental scan of the last committed transaction id and make sure that
1902  * all related blocks have been marked allocated.
1903  *
1904  * The super-root topology and each PFS has its own transaction id domain,
1905  * so we must track PFS boundary transitions.
1906  */
1907 struct hammer2_recovery_elm {
1908         TAILQ_ENTRY(hammer2_recovery_elm) entry;
1909         hammer2_chain_t *chain;
1910         hammer2_tid_t sync_tid;
1911 };
1912
1913 TAILQ_HEAD(hammer2_recovery_list, hammer2_recovery_elm);
1914
1915 struct hammer2_recovery_info {
1916         struct hammer2_recovery_list list;
1917         int     depth;
1918 };
1919
1920 static int hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
1921                         hammer2_chain_t *parent,
1922                         struct hammer2_recovery_info *info,
1923                         hammer2_tid_t sync_tid);
1924
1925 #define HAMMER2_RECOVERY_MAXDEPTH       10
1926
1927 static
1928 int
1929 hammer2_recovery(hammer2_mount_t *hmp)
1930 {
1931         hammer2_trans_t trans;
1932         struct hammer2_recovery_info info;
1933         struct hammer2_recovery_elm *elm;
1934         hammer2_chain_t *parent;
1935         hammer2_tid_t sync_tid;
1936         int error;
1937         int cumulative_error = 0;
1938
1939         hammer2_trans_init(&trans, hmp->spmp, 0);
1940
1941         sync_tid = 0;
1942         TAILQ_INIT(&info.list);
1943         info.depth = 0;
1944         parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
1945         cumulative_error = hammer2_recovery_scan(&trans, hmp, parent,
1946                                                  &info, sync_tid);
1947         hammer2_chain_lookup_done(parent);
1948
1949         while ((elm = TAILQ_FIRST(&info.list)) != NULL) {
1950                 TAILQ_REMOVE(&info.list, elm, entry);
1951                 parent = elm->chain;
1952                 sync_tid = elm->sync_tid;
1953                 kfree(elm, M_HAMMER2);
1954
1955                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS |
1956                                            HAMMER2_RESOLVE_NOREF);
1957                 error = hammer2_recovery_scan(&trans, hmp, parent,
1958                                               &info, sync_tid);
1959                 hammer2_chain_unlock(parent);
1960                 if (error)
1961                         cumulative_error = error;
1962         }
1963         hammer2_trans_done(&trans);
1964
1965         return cumulative_error;
1966 }
1967
1968 static
1969 int
1970 hammer2_recovery_scan(hammer2_trans_t *trans, hammer2_mount_t *hmp,
1971                       hammer2_chain_t *parent,
1972                       struct hammer2_recovery_info *info,
1973                       hammer2_tid_t sync_tid)
1974 {
1975         hammer2_chain_t *chain;
1976         int cache_index;
1977         int cumulative_error = 0;
1978         int pfs_boundary = 0;
1979         int error;
1980
1981         /*
1982          * Adjust freemap to ensure that the block(s) are marked allocated.
1983          */
1984         if (parent->bref.type != HAMMER2_BREF_TYPE_VOLUME) {
1985                 hammer2_freemap_adjust(trans, hmp, &parent->bref,
1986                                        HAMMER2_FREEMAP_DORECOVER);
1987         }
1988
1989         /*
1990          * Check type for recursive scan
1991          */
1992         switch(parent->bref.type) {
1993         case HAMMER2_BREF_TYPE_VOLUME:
1994                 /* data already instantiated */
1995                 break;
1996         case HAMMER2_BREF_TYPE_INODE:
1997                 /*
1998                  * Must instantiate data for DIRECTDATA test and also
1999                  * for recursion.
2000                  */
2001                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2002                 if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
2003                         /* not applicable to recovery scan */
2004                         hammer2_chain_unlock(parent);
2005                         return 0;
2006                 }
2007                 if ((parent->data->ipdata.op_flags & HAMMER2_OPFLAG_PFSROOT) &&
2008                     info->depth != 0) {
2009                         pfs_boundary = 1;
2010                         sync_tid = parent->bref.mirror_tid - 1;
2011                 }
2012                 hammer2_chain_unlock(parent);
2013                 break;
2014         case HAMMER2_BREF_TYPE_INDIRECT:
2015                 /*
2016                  * Must instantiate data for recursion
2017                  */
2018                 hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS);
2019                 hammer2_chain_unlock(parent);
2020                 break;
2021         case HAMMER2_BREF_TYPE_DATA:
2022         case HAMMER2_BREF_TYPE_FREEMAP:
2023         case HAMMER2_BREF_TYPE_FREEMAP_NODE:
2024         case HAMMER2_BREF_TYPE_FREEMAP_LEAF:
2025                 /* not applicable to recovery scan */
2026                 return 0;
2027                 break;
2028         default:
2029                 return EDOM;
2030         }
2031
2032         /*
2033          * Defer operation if depth limit reached or if we are crossing a
2034          * PFS boundary.
2035          */
2036         if (info->depth >= HAMMER2_RECOVERY_MAXDEPTH || pfs_boundary) {
2037                 struct hammer2_recovery_elm *elm;
2038
2039                 elm = kmalloc(sizeof(*elm), M_HAMMER2, M_ZERO | M_WAITOK);
2040                 elm->chain = parent;
2041                 elm->sync_tid = sync_tid;
2042                 hammer2_chain_ref(parent);
2043                 TAILQ_INSERT_TAIL(&info->list, elm, entry);
2044                 /* unlocked by caller */
2045
2046                 return(0);
2047         }
2048
2049
2050         /*
2051          * Recursive scan of the last flushed transaction only.  We are
2052          * doing this without pmp assignments so don't leave the chains
2053          * hanging around after we are done with them.
2054          */
2055         cache_index = 0;
2056         chain = hammer2_chain_scan(parent, NULL, &cache_index,
2057                                    HAMMER2_LOOKUP_NODATA);
2058         while (chain) {
2059                 atomic_set_int(&chain->flags, HAMMER2_CHAIN_RELEASE);
2060                 if (chain->bref.mirror_tid >= sync_tid) {
2061                         ++info->depth;
2062                         error = hammer2_recovery_scan(trans, hmp, chain,
2063                                                       info, sync_tid);
2064                         --info->depth;
2065                         if (error)
2066                                 cumulative_error = error;
2067                 }
2068                 chain = hammer2_chain_scan(parent, chain, &cache_index,
2069                                            HAMMER2_LOOKUP_NODATA);
2070         }
2071
2072         return cumulative_error;
2073 }
2074
2075 /*
2076  * Sync the entire filesystem; this is called from the filesystem syncer
2077  * process periodically and whenever a user calls sync(1) on the hammer
2078  * mountpoint.
2079  *
2080  * Currently is actually called from the syncer! \o/
2081  *
2082  * This task will have to snapshot the state of the dirty inode chain.
2083  * From that, it will have to make sure all of the inodes on the dirty
2084  * chain have IO initiated. We make sure that io is initiated for the root
2085  * block.
2086  *
2087  * If waitfor is set, we wait for media to acknowledge the new rootblock.
2088  *
2089  * THINKS: side A vs side B, to have sync not stall all I/O?
2090  */
2091 int
2092 hammer2_vfs_sync(struct mount *mp, int waitfor)
2093 {
2094         struct hammer2_sync_info info;
2095         hammer2_inode_t *iroot;
2096         hammer2_chain_t *chain;
2097         hammer2_chain_t *parent;
2098         hammer2_pfsmount_t *pmp;
2099         hammer2_mount_t *hmp;
2100         int flags;
2101         int error;
2102         int total_error;
2103         int force_fchain;
2104         int i;
2105         int j;
2106
2107         pmp = MPTOPMP(mp);
2108         iroot = pmp->iroot;
2109         KKASSERT(iroot);
2110         KKASSERT(iroot->pmp == pmp);
2111
2112         /*
2113          * We can't acquire locks on existing vnodes while in a transaction
2114          * without risking a deadlock.  This assumes that vfsync() can be
2115          * called without the vnode locked (which it can in DragonFly).
2116          * Otherwise we'd have to implement a multi-pass or flag the lock
2117          * failures and retry.
2118          *
2119          * The reclamation code interlocks with the sync list's token
2120          * (by removing the vnode from the scan list) before unlocking
2121          * the inode, giving us time to ref the inode.
2122          */
2123         /*flags = VMSC_GETVP;*/
2124         flags = 0;
2125         if (waitfor & MNT_LAZY)
2126                 flags |= VMSC_ONEPASS;
2127
2128         /*
2129          * Start our flush transaction.  This does not return until all
2130          * concurrent transactions have completed and will prevent any
2131          * new transactions from running concurrently, except for the
2132          * buffer cache transactions.
2133          *
2134          * For efficiency do an async pass before making sure with a
2135          * synchronous pass on all related buffer cache buffers.  It
2136          * should theoretically not be possible for any new file buffers
2137          * to be instantiated during this sequence.
2138          */
2139         hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH |
2140                                              HAMMER2_TRANS_PREFLUSH);
2141         hammer2_run_unlinkq(&info.trans, pmp);
2142
2143         info.error = 0;
2144         info.waitfor = MNT_NOWAIT;
2145         vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
2146         info.waitfor = MNT_WAIT;
2147         vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2148
2149         /*
2150          * Clear PREFLUSH.  This prevents (or asserts on) any new logical
2151          * buffer cache flushes which occur during the flush.  Device buffers
2152          * are not affected.
2153          */
2154
2155 #if 0
2156         if (info.error == 0 && (waitfor & MNT_WAIT)) {
2157                 info.waitfor = waitfor;
2158                     vsyncscan(mp, flags, hammer2_sync_scan2, &info);
2159
2160         }
2161 #endif
2162         hammer2_bioq_sync(info.trans.pmp);
2163         atomic_clear_int(&info.trans.flags, HAMMER2_TRANS_PREFLUSH);
2164
2165         total_error = 0;
2166
2167         /*
2168          * Flush all storage elements making up the cluster
2169          *
2170          * We must also flush any deleted siblings because the super-root
2171          * flush won't do it for us.  They all must be staged or the
2172          * super-root flush will not be able to update its block table
2173          * properly.
2174          *
2175          * XXX currently done serially instead of concurrently
2176          */
2177         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2178                 chain = iroot->cluster.array[i];
2179                 if (chain) {
2180                         hammer2_chain_lock(chain, HAMMER2_RESOLVE_ALWAYS);
2181                         hammer2_flush(&info.trans, &chain);
2182                         hammer2_chain_unlock(chain);
2183                 }
2184                 if (chain) {
2185                         hammer2_chain_t *nchain;
2186                         chain = TAILQ_FIRST(&chain->core->ownerq);
2187                         hammer2_chain_ref(chain);
2188                         while (chain) {
2189                                 hammer2_chain_lock(chain,
2190                                                    HAMMER2_RESOLVE_ALWAYS);
2191                                 hammer2_flush(&info.trans, &chain);
2192                                 hammer2_chain_unlock(chain);
2193                                 nchain = TAILQ_NEXT(chain, core_entry);
2194                                 if (nchain)
2195                                         hammer2_chain_ref(nchain);
2196                                 hammer2_chain_drop(chain);
2197                                 chain = nchain;
2198                         }
2199                 }
2200         }
2201 #if 0
2202         hammer2_trans_done(&info.trans);
2203 #endif
2204
2205         /*
2206          * Flush all volume roots to synchronize PFS flushes with the
2207          * storage media.  Use a super-root transaction for each one.
2208          *
2209          * The flush code will detect super-root -> pfs-root chain
2210          * transitions using the last pfs-root flush.
2211          */
2212         for (i = 0; iroot && i < iroot->cluster.nchains; ++i) {
2213                 chain = iroot->cluster.array[i];
2214                 if (chain == NULL)
2215                         continue;
2216
2217                 hmp = chain->hmp;
2218
2219                 /*
2220                  * We only have to flush each hmp once
2221                  */
2222                 for (j = i - 1; j >= 0; --j) {
2223                         if (iroot->cluster.array[j] &&
2224                             iroot->cluster.array[j]->hmp == hmp)
2225                                 break;
2226                 }
2227                 if (j >= 0)
2228                         continue;
2229                 hammer2_trans_spmp(&info.trans, hmp->spmp);
2230
2231                 /*
2232                  * Force an update of the XID from the PFS root to the
2233                  * topology root.  We couldn't do this from the PFS
2234                  * transaction because a SPMP transaction is needed.
2235                  * This does not modify blocks, instead what it does is
2236                  * allow the flush code to find the transition point and
2237                  * then update on the way back up.
2238                  */
2239                 parent = TAILQ_LAST(&chain->above->ownerq, h2_core_list);
2240                 KKASSERT(chain->pmp != parent->pmp);
2241                 hammer2_chain_setsubmod(&info.trans, parent);
2242
2243                 /*
2244                  * Media mounts have two 'roots', vchain for the topology
2245                  * and fchain for the free block table.  Flush both.
2246                  *
2247                  * Note that the topology and free block table are handled
2248                  * independently, so the free block table can wind up being
2249                  * ahead of the topology.  We depend on the bulk free scan
2250                  * code to deal with any loose ends.
2251                  */
2252                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2253                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2254                 if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
2255                     hmp->fchain.update_xhi > hmp->fchain.update_xlo) {
2256                         /*
2257                          * This will also modify vchain as a side effect,
2258                          * mark vchain as modified now.
2259                          */
2260                         hammer2_voldata_modify(hmp);
2261                         chain = &hmp->fchain;
2262                         hammer2_flush(&info.trans, &chain);
2263                         KKASSERT(chain == &hmp->fchain);
2264                 }
2265                 hammer2_chain_unlock(&hmp->fchain);
2266                 hammer2_chain_unlock(&hmp->vchain);
2267
2268                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
2269                 if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) ||
2270                     hmp->vchain.update_xhi > hmp->vchain.update_xlo) {
2271                         chain = &hmp->vchain;
2272                         hammer2_flush(&info.trans, &chain);
2273                         KKASSERT(chain == &hmp->vchain);
2274                         force_fchain = 1;
2275                 } else {
2276                         force_fchain = 0;
2277                 }
2278                 hammer2_chain_unlock(&hmp->vchain);
2279
2280 #if 0
2281                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
2282                 if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
2283                     hmp->fchain.update_xhi > hmp->fchain.update_xlo ||
2284                     force_fchain) {
2285                         /* this will also modify vchain as a side effect */
2286                         chain = &hmp->fchain;
2287                         hammer2_flush(&info.trans, &chain);
2288                         KKASSERT(chain == &hmp->fchain);
2289                 }
2290                 hammer2_chain_unlock(&hmp->fchain);
2291 #endif
2292
2293                 error = 0;
2294
2295                 /*
2296                  * We can't safely flush the volume header until we have
2297                  * flushed any device buffers which have built up.
2298                  *
2299                  * XXX this isn't being incremental
2300                  */
2301                 vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
2302                 error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
2303                 vn_unlock(hmp->devvp);
2304
2305                 /*
2306                  * The flush code sets CHAIN_VOLUMESYNC to indicate that the
2307                  * volume header needs synchronization via hmp->volsync.
2308                  *
2309                  * XXX synchronize the flag & data with only this flush XXX
2310                  */
2311                 if (error == 0 &&
2312                     (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
2313                         struct buf *bp;
2314
2315                         /*
2316                          * Synchronize the disk before flushing the volume
2317                          * header.
2318                          */
2319                         bp = getpbuf(NULL);
2320                         bp->b_bio1.bio_offset = 0;
2321                         bp->b_bufsize = 0;
2322                         bp->b_bcount = 0;
2323                         bp->b_cmd = BUF_CMD_FLUSH;
2324                         bp->b_bio1.bio_done = biodone_sync;
2325                         bp->b_bio1.bio_flags |= BIO_SYNC;
2326                         vn_strategy(hmp->devvp, &bp->b_bio1);
2327                         biowait(&bp->b_bio1, "h2vol");
2328                         relpbuf(bp, NULL);
2329
2330                         /*
2331                          * Then we can safely flush the version of the
2332                          * volume header synchronized by the flush code.
2333                          */
2334                         i = hmp->volhdrno + 1;
2335                         if (i >= HAMMER2_NUM_VOLHDRS)
2336                                 i = 0;
2337                         if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
2338                             hmp->volsync.volu_size) {
2339                                 i = 0;
2340                         }
2341                         kprintf("sync volhdr %d %jd\n",
2342                                 i, (intmax_t)hmp->volsync.volu_size);
2343                         bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2344                                     HAMMER2_PBUFSIZE, 0, 0);
2345                         atomic_clear_int(&hmp->vchain.flags,
2346                                          HAMMER2_CHAIN_VOLUMESYNC);
2347                         bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
2348                         bawrite(bp);
2349                         hmp->volhdrno = i;
2350                 }
2351                 if (error)
2352                         total_error = error;
2353
2354 #if 0
2355                 hammer2_trans_done(&info.trans);
2356 #endif
2357         }
2358         hammer2_trans_done(&info.trans);
2359
2360         return (total_error);
2361 }
2362
2363 /*
2364  * Sync passes.
2365  */
2366 static int
2367 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
2368 {
2369         struct hammer2_sync_info *info = data;
2370         hammer2_inode_t *ip;
2371         int error;
2372
2373         /*
2374          *
2375          */
2376         ip = VTOI(vp);
2377         if (ip == NULL)
2378                 return(0);
2379         if (vp->v_type == VNON || vp->v_type == VBAD) {
2380                 vclrisdirty(vp);
2381                 return(0);
2382         }
2383         if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
2384             RB_EMPTY(&vp->v_rbdirty_tree)) {
2385                 vclrisdirty(vp);
2386                 return(0);
2387         }
2388
2389         /*
2390          * VOP_FSYNC will start a new transaction so replicate some code
2391          * here to do it inline (see hammer2_vop_fsync()).
2392          *
2393          * WARNING: The vfsync interacts with the buffer cache and might
2394          *          block, we can't hold the inode lock at that time.
2395          *          However, we MUST ref ip before blocking to ensure that
2396          *          it isn't ripped out from under us (since we do not
2397          *          hold a lock on the vnode).
2398          */
2399         hammer2_inode_ref(ip);
2400         atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
2401         if (vp)
2402                 vfsync(vp, MNT_NOWAIT, 1, NULL, NULL);
2403
2404         hammer2_inode_drop(ip);
2405 #if 1
2406         error = 0;
2407         if (error)
2408                 info->error = error;
2409 #endif
2410         return(0);
2411 }
2412
/*
 * Convert a vnode into a file handle (not implemented).
 *
 * Nothing is stored through fhp, so the operation is reported as
 * unsupported rather than as a success that leaves the caller with a
 * handle this function never filled in.
 *
 * Returns EOPNOTSUPP.
 */
static
int
hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
{
        return (EOPNOTSUPP);
}
2419
/*
 * Convert a file handle into a vnode (not implemented).
 *
 * Nothing is stored through vpp, so the operation is reported as
 * unsupported rather than as a success that leaves *vpp unset for the
 * caller.
 *
 * Returns EOPNOTSUPP.
 */
static
int
hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
               struct fid *fhp, struct vnode **vpp)
{
        return (EOPNOTSUPP);
}
2427
/*
 * Check export permissions for a network address (not implemented).
 *
 * Neither *exflagsp nor *credanonp is stored, so the operation is
 * reported as unsupported rather than as a success that leaves both
 * outputs unset for the caller.
 *
 * Returns EOPNOTSUPP.
 */
static
int
hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                 int *exflagsp, struct ucred **credanonp)
{
        return (EOPNOTSUPP);
}
2435
2436 /*
2437  * Support code for hammer2_mount().  Read, verify, and install the volume
2438  * header into the HMP
2439  *
2440  * XXX read four volhdrs and use the one with the highest TID whos CRC
2441  *     matches.
2442  *
2443  * XXX check iCRCs.
2444  *
2445  * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to
2446  *     nonexistant locations.
2447  *
2448  * XXX Record selected volhdr and ring updates to each of 4 volhdrs
2449  */
2450 static
2451 int
2452 hammer2_install_volume_header(hammer2_mount_t *hmp)
2453 {
2454         hammer2_volume_data_t *vd;
2455         struct buf *bp;
2456         hammer2_crc32_t crc0, crc, bcrc0, bcrc;
2457         int error_reported;
2458         int error;
2459         int valid;
2460         int i;
2461
2462         error_reported = 0;
2463         error = 0;
2464         valid = 0;
2465         bp = NULL;
2466
2467         /*
2468          * There are up to 4 copies of the volume header (syncs iterate
2469          * between them so there is no single master).  We don't trust the
2470          * volu_size field so we don't know precisely how large the filesystem
2471          * is, so depend on the OS to return an error if we go beyond the
2472          * block device's EOF.
2473          */
2474         for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
2475                 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
2476                               HAMMER2_VOLUME_BYTES, &bp);
2477                 if (error) {
2478                         brelse(bp);
2479                         bp = NULL;
2480                         continue;
2481                 }
2482
2483                 vd = (struct hammer2_volume_data *) bp->b_data;
2484                 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
2485                     (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
2486                         brelse(bp);
2487                         bp = NULL;
2488                         continue;
2489                 }
2490
2491                 if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
2492                         /* XXX: Reversed-endianness filesystem */
2493                         kprintf("hammer2: reverse-endian filesystem detected");
2494                         brelse(bp);
2495                         bp = NULL;
2496                         continue;
2497                 }
2498
2499                 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
2500                 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
2501                                       HAMMER2_VOLUME_ICRC0_SIZE);
2502                 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
2503                 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
2504                                        HAMMER2_VOLUME_ICRC1_SIZE);
2505                 if ((crc0 != crc) || (bcrc0 != bcrc)) {
2506                         kprintf("hammer2 volume header crc "
2507                                 "mismatch copy #%d %08x/%08x\n",
2508                                 i, crc0, crc);
2509                         error_reported = 1;
2510                         brelse(bp);
2511                         bp = NULL;
2512                         continue;
2513                 }
2514                 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
2515                         valid = 1;
2516                         hmp->voldata = *vd;
2517                         hmp->volhdrno = i;
2518                 }
2519                 brelse(bp);
2520                 bp = NULL;
2521         }
2522         if (valid) {
2523                 hmp->volsync = hmp->voldata;
2524                 error = 0;
2525                 if (error_reported || bootverbose || 1) { /* 1/DEBUG */
2526                         kprintf("hammer2: using volume header #%d\n",
2527                                 hmp->volhdrno);
2528                 }
2529         } else {
2530                 error = EINVAL;
2531                 kprintf("hammer2: no valid volume headers found!\n");
2532         }
2533         return (error);
2534 }
2535
2536 /*
2537  * Reconnect using the passed file pointer.  The caller must ref the
2538  * fp for us.
2539  */
2540 void
2541 hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
2542 {
2543         const hammer2_inode_data_t *ipdata;
2544         hammer2_cluster_t *cparent;
2545         hammer2_mount_t *hmp;
2546         size_t name_len;
2547
2548         hmp = pmp->iroot->cluster.focus->hmp;   /* XXX */
2549
2550         /*
2551          * Closes old comm descriptor, kills threads, cleans up
2552          * states, then installs the new descriptor and creates
2553          * new threads.
2554          */
2555         kdmsg_iocom_reconnect(&pmp->iocom, fp, "hammer2");
2556
2557         /*
2558          * Setup LNK_CONN fields for autoinitiated state machine
2559          */
2560         cparent = hammer2_inode_lock_ex(pmp->iroot);
2561         ipdata = &hammer2_cluster_data(cparent)->ipdata;
2562         pmp->iocom.auto_lnk_conn.pfs_clid = ipdata->pfs_clid;
2563         pmp->iocom.auto_lnk_conn.pfs_fsid = ipdata->pfs_fsid;
2564         pmp->iocom.auto_lnk_conn.pfs_type = ipdata->pfs_type;
2565         pmp->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
2566         pmp->iocom.auto_lnk_conn.peer_type = hmp->voldata.peer_type;
2567
2568         /*
2569          * Filter adjustment.  Clients do not need visibility into other
2570          * clients (otherwise millions of clients would present a serious
2571          * problem).  The fs_label also serves to restrict the namespace.
2572          */
2573         pmp->iocom.auto_lnk_conn.peer_mask = 1LLU << DMSG_PEER_HAMMER2;
2574         pmp->iocom.auto_lnk_conn.pfs_mask = (uint64_t)-1;
2575         switch (ipdata->pfs_type) {
2576         case DMSG_PFSTYPE_CLIENT:
2577                 pmp->iocom.auto_lnk_conn.peer_mask &=
2578                                 ~(1LLU << DMSG_PFSTYPE_CLIENT);
2579                 break;
2580         default:
2581                 break;
2582         }
2583
2584         name_len = ipdata->name_len;
2585         if (name_len >= sizeof(pmp->iocom.auto_lnk_conn.fs_label))
2586                 name_len = sizeof(pmp->iocom.auto_lnk_conn.fs_label) - 1;
2587         bcopy(ipdata->filename,
2588               pmp->iocom.auto_lnk_conn.fs_label,
2589               name_len);
2590         pmp->iocom.auto_lnk_conn.fs_label[name_len] = 0;
2591
2592         /*
2593          * Setup LNK_SPAN fields for autoinitiated state machine
2594          */
2595         pmp->iocom.auto_lnk_span.pfs_clid = ipdata->pfs_clid;
2596         pmp->iocom.auto_lnk_span.pfs_fsid = ipdata->pfs_fsid;
2597         pmp->iocom.auto_lnk_span.pfs_type = ipdata->pfs_type;
2598         pmp->iocom.auto_lnk_span.peer_type = hmp->voldata.peer_type;
2599         pmp->iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1;
2600         name_len = ipdata->name_len;
2601         if (name_len >= sizeof(pmp->iocom.auto_lnk_span.fs_label))
2602                 name_len = sizeof(pmp->iocom.auto_lnk_span.fs_label) - 1;
2603         bcopy(ipdata->filename,
2604               pmp->iocom.auto_lnk_span.fs_label,
2605               name_len);
2606         pmp->iocom.auto_lnk_span.fs_label[name_len] = 0;
2607         hammer2_inode_unlock_ex(pmp->iroot, cparent);
2608
2609         kdmsg_iocom_autoinitiate(&pmp->iocom, hammer2_autodmsg);
2610 }
2611
2612 static int
2613 hammer2_rcvdmsg(kdmsg_msg_t *msg)
2614 {
2615         switch(msg->any.head.cmd & DMSGF_TRANSMASK) {
2616         case DMSG_DBG_SHELL:
2617                 /*
2618                  * (non-transaction)
2619                  * Execute shell command (not supported atm)
2620                  */
2621                 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2622                 break;
2623         case DMSG_DBG_SHELL | DMSGF_REPLY:
2624                 /*
2625                  * (non-transaction)
2626                  */
2627                 if (msg->aux_data) {
2628                         msg->aux_data[msg->aux_size - 1] = 0;
2629                         kprintf("HAMMER2 DBG: %s\n", msg->aux_data);
2630                 }
2631                 break;
2632         default:
2633                 /*
2634                  * Unsupported message received.  We only need to
2635                  * reply if it's a transaction in order to close our end.
2636                  * Ignore any one-way messages are any further messages
2637                  * associated with the transaction.
2638                  *
2639                  * NOTE: This case also includes DMSG_LNK_ERROR messages
2640                  *       which might be one-way, replying to those would
2641                  *       cause an infinite ping-pong.
2642                  */
2643                 if (msg->any.head.cmd & DMSGF_CREATE)
2644                         kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2645                 break;
2646         }
2647         return(0);
2648 }
2649
2650 /*
2651  * This function is called after KDMSG has automatically handled processing
2652  * of a LNK layer message (typically CONN, SPAN, or CIRC).
2653  *
2654  * We tag off the LNK_CONN to trigger our LNK_VOLCONF messages which
2655  * advertises all available hammer2 super-root volumes.
2656  */
2657 static void
2658 hammer2_autodmsg(kdmsg_msg_t *msg)
2659 {
2660         hammer2_pfsmount_t *pmp = msg->iocom->handle;
2661         hammer2_mount_t *hmp = pmp->iroot->cluster.focus->hmp; /* XXX */
2662         int copyid;
2663
2664         /*
2665          * We only care about replies to our LNK_CONN auto-request.  kdmsg
2666          * has already processed the reply, we use this calback as a shim
2667          * to know when we can advertise available super-root volumes.
2668          */
2669         if ((msg->any.head.cmd & DMSGF_TRANSMASK) !=
2670             (DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_REPLY) ||
2671             msg->state == NULL) {
2672                 return;
2673         }
2674
2675         kprintf("LNK_CONN REPLY RECEIVED CMD %08x\n", msg->any.head.cmd);
2676
2677         if (msg->any.head.cmd & DMSGF_CREATE) {
2678                 kprintf("HAMMER2: VOLDATA DUMP\n");
2679
2680                 /*
2681                  * Dump the configuration stored in the volume header
2682                  */
2683                 hammer2_voldata_lock(hmp);
2684                 for (copyid = 0; copyid < HAMMER2_COPYID_COUNT; ++copyid) {
2685                         if (hmp->voldata.copyinfo[copyid].copyid == 0)
2686                                 continue;
2687                         hammer2_volconf_update(pmp, copyid);
2688                 }
2689                 hammer2_voldata_unlock(hmp);
2690         }
2691         if ((msg->any.head.cmd & DMSGF_DELETE) &&
2692             msg->state && (msg->state->txcmd & DMSGF_DELETE) == 0) {
2693                 kprintf("HAMMER2: CONN WAS TERMINATED\n");
2694         }
2695 }
2696
2697 /*
2698  * Volume configuration updates are passed onto the userland service
2699  * daemon via the open LNK_CONN transaction.
2700  */
2701 void
2702 hammer2_volconf_update(hammer2_pfsmount_t *pmp, int index)
2703 {
2704         hammer2_mount_t *hmp = pmp->iroot->cluster.focus->hmp;  /* XXX */
2705         kdmsg_msg_t *msg;
2706
2707         /* XXX interlock against connection state termination */
2708         kprintf("volconf update %p\n", pmp->iocom.conn_state);
2709         if (pmp->iocom.conn_state) {
2710                 kprintf("TRANSMIT VOLCONF VIA OPEN CONN TRANSACTION\n");
2711                 msg = kdmsg_msg_alloc_state(pmp->iocom.conn_state,
2712                                             DMSG_LNK_HAMMER2_VOLCONF,
2713                                             NULL, NULL);
2714                 H2_LNK_VOLCONF(msg)->copy = hmp->voldata.copyinfo[index];
2715                 H2_LNK_VOLCONF(msg)->mediaid = hmp->voldata.fsid;
2716                 H2_LNK_VOLCONF(msg)->index = index;
2717                 kdmsg_msg_write(msg);
2718         }
2719 }
2720
2721 /*
2722  * This handles hysteresis on regular file flushes.  Because the BIOs are
2723  * routed to a thread it is possible for an excessive number to build up
2724  * and cause long front-end stalls long before the runningbuffspace limit
2725  * is hit, so we implement hammer2_flush_pipe to control the
2726  * hysteresis.
2727  *
2728  * This is a particular problem when compression is used.
2729  */
2730 void
2731 hammer2_lwinprog_ref(hammer2_pfsmount_t *pmp)
2732 {
2733         atomic_add_int(&pmp->count_lwinprog, 1);
2734 }
2735
2736 void
2737 hammer2_lwinprog_drop(hammer2_pfsmount_t *pmp)
2738 {
2739         int lwinprog;
2740
2741         lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
2742         if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
2743             (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
2744                 atomic_clear_int(&pmp->count_lwinprog,
2745                                  HAMMER2_LWINPROG_WAITING);
2746                 wakeup(&pmp->count_lwinprog);
2747         }
2748 }
2749
2750 void
2751 hammer2_lwinprog_wait(hammer2_pfsmount_t *pmp)
2752 {
2753         int lwinprog;
2754
2755         for (;;) {
2756                 lwinprog = pmp->count_lwinprog;
2757                 cpu_ccfence();
2758                 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2759                         break;
2760                 tsleep_interlock(&pmp->count_lwinprog, 0);
2761                 atomic_set_int(&pmp->count_lwinprog, HAMMER2_LWINPROG_WAITING);
2762                 lwinprog = pmp->count_lwinprog;
2763                 if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
2764                         break;
2765                 tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
2766         }
2767 }
2768
2769 /*
2770  * Manage excessive memory resource use for chain and related
2771  * structures.
2772  */
2773 void
2774 hammer2_pfs_memory_wait(hammer2_pfsmount_t *pmp)
2775 {
2776         long waiting;
2777         long count;
2778         long limit;
2779 #if 0
2780         static int zzticks;
2781 #endif
2782
2783         /*
2784          * Atomic check condition and wait.  Also do an early speedup of
2785          * the syncer to try to avoid hitting the wait.
2786          */
2787         for (;;) {
2788                 waiting = pmp->inmem_dirty_chains;
2789                 cpu_ccfence();
2790                 count = waiting & HAMMER2_DIRTYCHAIN_MASK;
2791
2792                 limit = pmp->mp->mnt_nvnodelistsize / 10;
2793                 if (limit < hammer2_limit_dirty_chains)
2794                         limit = hammer2_limit_dirty_chains;
2795                 if (limit < 1000)
2796                         limit = 1000;
2797
2798 #if 0
2799                 if ((int)(ticks - zzticks) > hz) {
2800                         zzticks = ticks;
2801                         kprintf("count %ld %ld\n", count, limit);
2802                 }
2803 #endif
2804
2805                 /*
2806                  * Block if there are too many dirty chains present, wait
2807                  * for the flush to clean some out.
2808                  */
2809                 if (count > limit) {
2810                         tsleep_interlock(&pmp->inmem_dirty_chains, 0);
2811                         if (atomic_cmpset_long(&pmp->inmem_dirty_chains,
2812                                                waiting,
2813                                        waiting | HAMMER2_DIRTYCHAIN_WAITING)) {
2814                                 speedup_syncer(pmp->mp);
2815                                 tsleep(&pmp->inmem_dirty_chains, PINTERLOCKED,
2816                                        "chnmem", hz);
2817                         }
2818                         continue;       /* loop on success or fail */
2819                 }
2820
2821                 /*
2822                  * Try to start an early flush before we are forced to block.
2823                  */
2824                 if (count > limit * 7 / 10)
2825                         speedup_syncer(pmp->mp);
2826                 break;
2827         }
2828 }
2829
2830 void
2831 hammer2_pfs_memory_inc(hammer2_pfsmount_t *pmp)
2832 {
2833         if (pmp)
2834                 atomic_add_long(&pmp->inmem_dirty_chains, 1);
2835 }
2836
2837 void
2838 hammer2_pfs_memory_wakeup(hammer2_pfsmount_t *pmp)
2839 {
2840         long waiting;
2841
2842         if (pmp == NULL)
2843                 return;
2844
2845         for (;;) {
2846                 waiting = pmp->inmem_dirty_chains;
2847                 cpu_ccfence();
2848                 if (atomic_cmpset_long(&pmp->inmem_dirty_chains,
2849                                        waiting,
2850                                        (waiting - 1) &
2851                                         ~HAMMER2_DIRTYCHAIN_WAITING)) {
2852                         break;
2853                 }
2854         }
2855
2856         if (waiting & HAMMER2_DIRTYCHAIN_WAITING)
2857                 wakeup(&pmp->inmem_dirty_chains);
2858 }
2859
2860 /*
2861  * Debugging
2862  */
2863 void
2864 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp, char pfx)
2865 {
2866         hammer2_chain_t *scan;
2867         hammer2_chain_t *first_parent;
2868
2869         --*countp;
2870         if (*countp == 0) {
2871                 kprintf("%*.*s...\n", tab, tab, "");
2872                 return;
2873         }
2874         if (*countp < 0)
2875                 return;
2876         first_parent = chain->core ? TAILQ_FIRST(&chain->core->ownerq) : NULL;
2877         kprintf("%*.*s%c-chain %p.%d %016jx/%d mir=%016jx\n",
2878                 tab, tab, "", pfx,
2879                 chain, chain->bref.type,
2880                 chain->bref.key, chain->bref.keybits,
2881                 chain->bref.mirror_tid);
2882
2883         kprintf("%*.*s      [%08x] (%s) mod=%08x del=%08x "
2884                 "lo=%08x hi=%08x refs=%d\n",
2885                 tab, tab, "",
2886                 chain->flags,
2887                 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
2888                 chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
2889                 chain->modify_xid,
2890                 chain->delete_xid,
2891                 chain->update_xlo,
2892                 chain->update_xhi,
2893                 chain->refs);
2894
2895         kprintf("%*.*s      core %p [%08x]",
2896                 tab, tab, "",
2897                 chain->core, (chain->core ? chain->core->flags : 0));
2898
2899         if (first_parent)
2900                 kprintf("\n%*.*s      fp=%p np=%p [fpflags %08x fprefs %d",
2901                         tab, tab, "",
2902                         first_parent,
2903                         (first_parent ? TAILQ_NEXT(first_parent, core_entry) :
2904                                         NULL),
2905                         first_parent->flags,
2906                         first_parent->refs);
2907         if (chain->core == NULL || RB_EMPTY(&chain->core->rbtree))
2908                 kprintf("\n");
2909         else
2910                 kprintf(" {\n");
2911         if (chain->core) {
2912                 RB_FOREACH(scan, hammer2_chain_tree, &chain->core->rbtree)
2913                         hammer2_dump_chain(scan, tab + 4, countp, 'a');
2914                 RB_FOREACH(scan, hammer2_chain_tree, &chain->core->dbtree)
2915                         hammer2_dump_chain(scan, tab + 4, countp, 'r');
2916                 TAILQ_FOREACH(scan, &chain->core->dbq, db_entry)
2917                         hammer2_dump_chain(scan, tab + 4, countp, 'd');
2918         }
2919         if (chain->core && !RB_EMPTY(&chain->core->rbtree)) {
2920                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
2921                         kprintf("%*.*s}(%s)\n", tab, tab, "",
2922                                 chain->data->ipdata.filename);
2923                 else
2924                         kprintf("%*.*s}\n", tab, tab, "");
2925         }
2926 }