/*
 * hammer2 - performance, stabilization
 * DragonFly: sys/vfs/hammer2/hammer2_vfsops.c
 */
1 /*-
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/nlookup.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/fcntl.h>
42 #include <sys/buf.h>
43 #include <sys/uuid.h>
44 #include <sys/vfsops.h>
45 #include <sys/sysctl.h>
46 #include <sys/socket.h>
47 #include <sys/objcache.h>
48
49 #include <sys/proc.h>
50 #include <sys/namei.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54
55 #include <sys/mutex.h>
56 #include <sys/mutex2.h>
57
58 #include "hammer2.h"
59 #include "hammer2_disk.h"
60 #include "hammer2_mount.h"
61
62 #include "hammer2.h"
63 #include "hammer2_lz4.h"
64
65 #include "zlib/hammer2_zlib.h"
66
#define REPORT_REFS_ERRORS 1    /* XXX remove me */

/* Malloc type backing the compression/decompression object caches below */
MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");

/*
 * Per-sync context passed to the vnode scan callback (hammer2_sync_scan2).
 */
struct hammer2_sync_info {
        hammer2_trans_t trans;  /* flush transaction for this sync pass */
        int error;              /* sticky first error encountered */
        int waitfor;            /* sync wait flags (presumably MNT_WAIT et al
                                   -- confirm against hammer2_vfs_sync) */
};

/*
 * Global list of H2 device mounts (hmp structures), protected by
 * hammer2_mntlk.  Multiple PFS mounts may share one device mount.
 */
TAILQ_HEAD(hammer2_mntlist, hammer2_mount);
static struct hammer2_mntlist hammer2_mntlist;
static struct lock hammer2_mntlk;

/*
 * Run-time tunables and I/O statistics counters, exported through the
 * vfs.hammer2 sysctl tree registered below.  The iod_* / ioa_* counters
 * are split by data type (file/meta/indirect/freemap/volume); the exact
 * device-vs-aggregate distinction is maintained by the I/O paths
 * elsewhere in the filesystem (not visible in this file).
 */
int hammer2_debug;
int hammer2_cluster_enable = 1;
int hammer2_hardlink_enable = 1;
int hammer2_flush_pipe = 100;
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;
long hammer2_iod_file_write;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;
long hammer2_ioa_file_read;
long hammer2_ioa_meta_read;
long hammer2_ioa_indr_read;
long hammer2_ioa_fmap_read;
long hammer2_ioa_volu_read;
long hammer2_ioa_fmap_write;
long hammer2_ioa_file_write;
long hammer2_ioa_meta_write;
long hammer2_ioa_indr_write;
long hammer2_ioa_volu_write;

/* Buffer pools used by the compression (C_) and decompression (D_) paths */
MALLOC_DECLARE(C_BUFFER);
MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");

MALLOC_DECLARE(D_BUFFER);
MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");

/*
 * sysctl registrations for the tunables and counters above.
 */
SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");

SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
           &hammer2_debug, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
           &hammer2_cluster_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
           &hammer2_hardlink_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, flush_pipe, CTLFLAG_RW,
           &hammer2_flush_pipe, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
           &hammer2_iod_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
           &hammer2_iod_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
           &hammer2_iod_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
           &hammer2_iod_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
           &hammer2_iod_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
           &hammer2_iod_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
           &hammer2_iod_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
           &hammer2_iod_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
           &hammer2_iod_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
           &hammer2_ioa_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
           &hammer2_ioa_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
           &hammer2_ioa_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
           &hammer2_ioa_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
           &hammer2_ioa_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
           &hammer2_ioa_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
           &hammer2_ioa_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
           &hammer2_ioa_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
           &hammer2_ioa_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
           &hammer2_ioa_volu_write, 0, "");
166
/*
 * Forward declarations for the vfsops vector entries and local helpers
 * defined in this file.
 */
static int hammer2_vfs_init(struct vfsconf *conf);
static int hammer2_vfs_uninit(struct vfsconf *vfsp);
static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                struct ucred *cred);
static int hammer2_remount(hammer2_mount_t *, char *, struct vnode *,
                                struct ucred *);
static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
                                ino_t ino, struct vnode **vpp);
static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                                struct fid *fhp, struct vnode **vpp);
static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);

static int hammer2_install_volume_header(hammer2_mount_t *hmp);
static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

/* Per-PFS logical buffer write worker (see hammer2_write_thread below) */
static void hammer2_write_thread(void *arg);

/* 
 * Functions for compression in threads,
 * from hammer2_vnops.c
 */
static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_inode_data_t *ipdata,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                int *errorp);
static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_inode_data_t *ipdata,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase, int ioflag,
                                int pblksize, int *errorp, int comp_algo);
static void hammer2_zero_check_and_write(struct buf *bp,
                                hammer2_trans_t *trans, hammer2_inode_t *ip,
                                hammer2_inode_data_t *ipdata,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase,
                                int ioflag, int pblksize, int *errorp);
static int test_block_zeros(const char *buf, size_t bytes);
static void zero_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_inode_data_t *ipdata,
                                hammer2_chain_t **parentp, 
                                hammer2_key_t lbase,
                                int *errorp);
static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
                                int ioflag, int pblksize, int *errorp);

/* Cluster messaging receive/auto-reply handlers */
static int hammer2_rcvdmsg(kdmsg_msg_t *msg);
static void hammer2_autodmsg(kdmsg_msg_t *msg);
226
227
228 /*
229  * HAMMER2 vfs operations.
230  */
231 static struct vfsops hammer2_vfsops = {
232         .vfs_init       = hammer2_vfs_init,
233         .vfs_uninit = hammer2_vfs_uninit,
234         .vfs_sync       = hammer2_vfs_sync,
235         .vfs_mount      = hammer2_vfs_mount,
236         .vfs_unmount    = hammer2_vfs_unmount,
237         .vfs_root       = hammer2_vfs_root,
238         .vfs_statfs     = hammer2_vfs_statfs,
239         .vfs_statvfs    = hammer2_vfs_statvfs,
240         .vfs_vget       = hammer2_vfs_vget,
241         .vfs_vptofh     = hammer2_vfs_vptofh,
242         .vfs_fhtovp     = hammer2_vfs_fhtovp,
243         .vfs_checkexp   = hammer2_vfs_checkexp
244 };
245
246 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
247
248 VFS_SET(hammer2_vfsops, hammer2, 0);
249 MODULE_VERSION(hammer2, 1);
250
251 static
252 int
253 hammer2_vfs_init(struct vfsconf *conf)
254 {
255         static struct objcache_malloc_args margs_read;
256         static struct objcache_malloc_args margs_write;
257
258         int error;
259
260         error = 0;
261
262         if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
263                 error = EINVAL;
264         if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
265                 error = EINVAL;
266         if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
267                 error = EINVAL;
268
269         if (error)
270                 kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
271         
272         margs_read.objsize = 65536;
273         margs_read.mtype = D_BUFFER;
274         
275         margs_write.objsize = 32768;
276         margs_write.mtype = C_BUFFER;
277         
278         cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
279                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
280                                 objcache_malloc_free, &margs_read);
281         cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
282                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
283                                 objcache_malloc_free, &margs_write);
284
285         lockinit(&hammer2_mntlk, "mntlk", 0, 0);
286         TAILQ_INIT(&hammer2_mntlist);
287
288         return (error);
289 }
290
291 static
292 int
293 hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
294 {
295         objcache_destroy(cache_buffer_read);
296         objcache_destroy(cache_buffer_write);
297         return 0;
298 }
299
/*
 * Mount or remount HAMMER2 fileystem from physical media
 *
 *      mountroot
 *              mp              mount point structure
 *              path            NULL
 *              data            <unused>
 *              cred            <unused>
 *
 *      mount
 *              mp              mount point structure
 *              path            path to mount point
 *              data            pointer to argument structure in user space
 *                      volume  volume path (device@LABEL form)
 *                      hflags  user mount flags
 *              cred            user credentials
 *
 * RETURNS:     0       Success
 *              !0      error number
 */
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  struct ucred *cred)
{
        struct hammer2_mount_info info;
        hammer2_pfsmount_t *pmp;        /* per-PFS mount (one per mount()) */
        hammer2_mount_t *hmp;           /* per-device mount (shared) */
        hammer2_key_t key_next;
        hammer2_key_t key_dummy;
        hammer2_key_t lhc;              /* directory hash of the PFS label */
        struct vnode *devvp;
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_chain_t *schain;        /* super-root inode chain */
        hammer2_chain_t *rchain;        /* PFS root inode chain */
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
        size_t done;
        char *dev;                      /* device part of "device@LABEL" */
        char *label;                    /* label part of "device@LABEL" */
        int ronly = 1;
        int error;
        int cache_index;
        int i;

        hmp = NULL;
        pmp = NULL;
        dev = NULL;
        label = NULL;
        devvp = NULL;
        cache_index = -1;

        kprintf("hammer2_mount\n");

        if (path == NULL) {
                /*
                 * Root mount
                 */
                bzero(&info, sizeof(info));
                info.cluster_fd = -1;
                /* root mount is not yet implemented */
                return (EOPNOTSUPP);
        } else {
                /*
                 * Non-root mount or updating a mount
                 */
                error = copyin(data, &info, sizeof(info));
                if (error)
                        return (error);

                error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
                if (error)
                        return (error);

                /* Extract device and label */
                dev = devstr;
                label = strchr(devstr, '@');
                if (label == NULL ||
                    ((label + 1) - dev) > done) {
                        return (EINVAL);
                }
                /* split "device@LABEL" in place; label must be non-empty */
                *label = '\0';
                label++;
                if (*label == '\0')
                        return (EINVAL);

                if (mp->mnt_flag & MNT_UPDATE) {
                        /* Update mount */
                        /* HAMMER2 implements NFS export via mountctl */
                        pmp = MPTOPMP(mp);
                        /* remount every device backing this PFS cluster */
                        for (i = 0; i < pmp->cluster.nchains; ++i) {
                                hmp = pmp->cluster.chains[i]->hmp;
                                devvp = hmp->devvp;
                                error = hammer2_remount(hmp, path, devvp, cred);
                                if (error)
                                        break;
                        }
                        return error;
                }
        }

        /*
         * PFS mount
         *
         * Lookup name and verify it refers to a block device.
         */
        error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
        if (error == 0)
                error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
        nlookup_done(&nd);

        if (error == 0) {
                if (vn_isdisk(devvp, &error))
                        error = vfs_mountedon(devvp);
        }

        /*
         * Determine if the device has already been mounted.  After this
         * check hmp will be non-NULL if we are doing the second or more
         * hammer2 mounts from the same device.
         */
        lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
        TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                if (hmp->devvp == devvp)
                        break;
        }

        /*
         * Open the device if this isn't a secondary mount and construct
         * the H2 device mount (hmp).
         */
        if (hmp == NULL) {
                /* primary mount must have exclusive use of the device */
                if (error == 0 && vcount(devvp) > 0)
                        error = EBUSY;

                /*
                 * Now open the device
                 */
                if (error == 0) {
                        ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
                        vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                        error = vinvalbuf(devvp, V_SAVE, 0, 0);
                        if (error == 0) {
                                error = VOP_OPEN(devvp,
                                                 ronly ? FREAD : FREAD | FWRITE,
                                                 FSCRED, NULL);
                        }
                        vn_unlock(devvp);
                }
                if (error && devvp) {
                        vrele(devvp);
                        devvp = NULL;
                }
                if (error) {
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        return error;
                }
                hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
                hmp->ronly = ronly;
                hmp->devvp = devvp;
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
                RB_INIT(&hmp->iotree);

                lockinit(&hmp->alloclk, "h2alloc", 0, 0);
                lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);
                TAILQ_INIT(&hmp->transq);

                /*
                 * vchain setup. vchain.data is embedded.
                 * vchain.refs is initialized and will never drop to 0.
                 */
                hmp->vchain.hmp = hmp;
                hmp->vchain.refs = 1;
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.delete_tid = HAMMER2_MAX_TID;
                hammer2_chain_core_alloc(NULL, &hmp->vchain, NULL);
                /* hmp->vchain.u.xxx is left NULL */

                /*
                 * fchain setup.  fchain.data is embedded.
                 * fchain.refs is initialized and will never drop to 0.
                 *
                 * The data is not used but needs to be initialized to
                 * pass assertion muster.  We use this chain primarily
                 * as a placeholder for the freemap's top-level RBTREE
                 * so it does not interfere with the volume's topology
                 * RBTREE.
                 */
                hmp->fchain.hmp = hmp;
                hmp->fchain.refs = 1;
                hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
                hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
                hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
                hmp->fchain.delete_tid = HAMMER2_MAX_TID;

                hammer2_chain_core_alloc(NULL, &hmp->fchain, NULL);
                /* hmp->fchain.u.xxx is left NULL */

                /*
                 * Install the volume header
                 */
                error = hammer2_install_volume_header(hmp);
                if (error) {
                        /*
                         * NOTE(review): this path returns with
                         * hammer2_mntlk still held (acquired above);
                         * confirm hammer2_vfs_unmount() releases it or
                         * this leaks/deadlocks on the lock.
                         */
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return error;
                }

                hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
                hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;

                /*
                 * First locate the super-root inode, which is key 0
                 * relative to the volume header's blockset.
                 *
                 * Then locate the root inode by scanning the directory keyspace
                 * represented by the label.
                 */
                parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
                schain = hammer2_chain_lookup(&parent, &key_dummy,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
                                      &cache_index, 0);
                hammer2_chain_lookup_done(parent);
                if (schain == NULL) {
                        kprintf("hammer2_mount: invalid super-root\n");
                        /*
                         * NOTE(review): same concern as above --
                         * hammer2_mntlk is still held on this return path.
                         */
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }

                /*
                 * NOTE: inode_get sucks up schain's lock.
                 */
                atomic_set_int(&schain->flags, HAMMER2_CHAIN_PFSROOT);
                hmp->sroot = hammer2_inode_get(NULL, NULL, schain);
                hammer2_inode_ref(hmp->sroot);
                hammer2_inode_unlock_ex(hmp->sroot, schain);
                schain = NULL;
                /* leave hmp->sroot with one ref */
        }

        /*
         * Block device opened successfully, finish initializing the
         * mount structure.
         *
         * From this point on we have to call hammer2_unmount() on failure.
         */
        pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);

        kmalloc_create(&pmp->minode, "HAMMER2-inodes");
        kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");

        spin_init(&pmp->inum_spin);
        RB_INIT(&pmp->inum_tree);

        /* cluster messaging endpoint; connected to userland further below */
        kdmsg_iocom_init(&pmp->iocom, pmp,
                         KDMSG_IOCOMF_AUTOCONN |
                         KDMSG_IOCOMF_AUTOSPAN |
                         KDMSG_IOCOMF_AUTOCIRC,
                         pmp->mmsg, hammer2_rcvdmsg);

        ccms_domain_init(&pmp->ccms_dom);
        ++hmp->pmp_count;
        lockmgr(&hammer2_mntlk, LK_RELEASE);
        kprintf("hammer2_mount hmp=%p pmp=%p pmpcnt=%d\n",
                hmp, pmp, hmp->pmp_count);

        mp->mnt_flag = MNT_LOCAL;
        mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
        mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */

        /*
         * required mount structure initializations
         */
        mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
        mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;

        mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
        mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;

        /*
         * Optional fields
         */
        mp->mnt_iosize_max = MAXPHYS;
        mp->mnt_data = (qaddr_t)pmp;
        pmp->mp = mp;

        /*
         * Lookup mount point under the media-localized super-root.
         */
        parent = hammer2_inode_lock_ex(hmp->sroot);
        lhc = hammer2_dirhash(label, strlen(label));
        rchain = hammer2_chain_lookup(&parent, &key_next,
                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                      &cache_index, 0);
        /*
         * Iterate hash collisions within the label's keyspace until an
         * inode whose filename matches exactly is found.
         */
        while (rchain) {
                if (rchain->bref.type == HAMMER2_BREF_TYPE_INODE &&
                    strcmp(label, rchain->data->ipdata.filename) == 0) {
                        break;
                }
                rchain = hammer2_chain_next(&parent, rchain, &key_next,
                                            key_next,
                                            lhc + HAMMER2_DIRHASH_LOMASK,
                                            &cache_index, 0);
        }
        hammer2_inode_unlock_ex(hmp->sroot, parent);

        if (rchain == NULL) {
                kprintf("hammer2_mount: PFS label not found\n");
                --hmp->pmp_count;
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EINVAL;
        }
        if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
                hammer2_chain_unlock(rchain);
                kprintf("hammer2_mount: PFS label already mounted!\n");
                --hmp->pmp_count;
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EBUSY;
        }
#if 0
        if (rchain->flags & HAMMER2_CHAIN_RECYCLE) {
                kprintf("hammer2_mount: PFS label currently recycling\n");
                --hmp->pmp_count;
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EBUSY;
        }
#endif

        atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);

        /*
         * NOTE: *_get() integrates chain's lock into the inode lock.
         */
        hammer2_chain_ref(rchain);              /* for pmp->rchain */
        pmp->cluster.nchains = 1;
        pmp->cluster.chains[0] = rchain;
        pmp->iroot = hammer2_inode_get(pmp, NULL, rchain);
        hammer2_inode_ref(pmp->iroot);          /* ref for pmp->iroot */

        KKASSERT(rchain->pmp == NULL);          /* tracking pmp for rchain */
        rchain->pmp = pmp;
        atomic_add_long(&pmp->inmem_chains, 1);

        hammer2_inode_unlock_ex(pmp->iroot, rchain);

        kprintf("iroot %p\n", pmp->iroot);

        /*
         * The logical file buffer bio write thread handles things
         * like physical block assignment and compression.
         */
        mtx_init(&pmp->wthread_mtx);
        bioq_init(&pmp->wthread_bioq);
        pmp->wthread_destroy = 0;
        lwkt_create(hammer2_write_thread, pmp,
                    &pmp->wthread_td, NULL, 0, -1, "hwrite-%s", label);

        /*
         * Ref the cluster management messaging descriptor.  The mount
         * program deals with the other end of the communications pipe.
         */
        fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
        if (fp == NULL) {
                kprintf("hammer2_mount: bad cluster_fd!\n");
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EBADF;
        }
        hammer2_cluster_reconnect(pmp, fp);

        /*
         * Finish setup
         */
        vfs_getnewfsid(mp);
        vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
        vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
        vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);

        /*
         * Best-effort copies; the volume string was already validated by
         * the copyinstr into devstr above, so errors are ignored here.
         */
        copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
        bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
        copyinstr(path, mp->mnt_stat.f_mntonname,
                  sizeof(mp->mnt_stat.f_mntonname) - 1,
                  &size);

        /*
         * Initial statfs to prime mnt_stat.
         */
        hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
        
        return 0;
}
699
/*
 * Handle bioq for strategy write
 *
 * Dedicated per-PFS worker thread.  Logical file buffers queued on
 * pmp->wthread_bioq are dequeued here and pushed into the media
 * topology (physical block assignment, optional compression) via
 * hammer2_write_file_core() under a BUFCACHE transaction.
 *
 * Locking: wthread_mtx is held at the top of both loops while the bioq
 * and the wthread_destroy flag are inspected, and is dropped around the
 * actual buffer write, which takes the inode lock instead.
 *
 * A queued bio with a NULL bio_buf is a synchronization marker
 * (inserted by hammer2_bioq_sync()): it is acknowledged with BIO_DONE +
 * wakeup and forces the transaction to be cycled.
 */
static
void
hammer2_write_thread(void *arg)
{
        hammer2_pfsmount_t *pmp;
        struct bio *bio;
        struct buf *bp;
        hammer2_trans_t trans;
        struct vnode *vp;
        hammer2_inode_t *ip;
        hammer2_chain_t *parent;
        hammer2_chain_t **parentp;
        hammer2_inode_data_t *ipdata;
        hammer2_key_t lbase;
        int lblksize;
        int pblksize;
        int error;
        
        pmp = arg;
        
        mtx_lock(&pmp->wthread_mtx);
        while (pmp->wthread_destroy == 0) {
                /* sleep (mutex atomically released) until work arrives */
                if (bioq_first(&pmp->wthread_bioq) == NULL) {
                        mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx,
                                 0, "h2bioqw", 0);
                }
                parent = NULL;
                parentp = &parent;

                hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE);

                while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
                        /*
                         * dummy bio for synchronization.  The transaction
                         * must be reinitialized.
                         */
                        if (bio->bio_buf == NULL) {
                                bio->bio_flags |= BIO_DONE;
                                wakeup(bio);    /* unblock the syncer */
                                hammer2_trans_done(&trans);
                                hammer2_trans_init(&trans, pmp,
                                                   HAMMER2_TRANS_BUFCACHE);
                                continue;
                        }

                        /*
                         * else normal bio processing
                         */
                        mtx_unlock(&pmp->wthread_mtx);

                        /* one queued logical write now in progress */
                        hammer2_lwinprog_drop(pmp);
                        
                        error = 0;
                        bp = bio->bio_buf;
                        vp = bp->b_vp;
                        ip = VTOI(vp);

                        /*
                         * Inode is modified, flush size and mtime changes
                         * to ensure that the file size remains consistent
                         * with the buffers being flushed.
                         */
                        parent = hammer2_inode_lock_ex(ip);
                        if (ip->flags & (HAMMER2_INODE_RESIZED |
                                         HAMMER2_INODE_MTIME)) {
                                hammer2_inode_fsync(&trans, ip, parentp);
                        }
                        ipdata = hammer2_chain_modify_ip(&trans, ip,
                                                         parentp, 0);
                        lblksize = hammer2_calc_logical(ip, bio->bio_offset,
                                                        &lbase, NULL);
                        pblksize = hammer2_calc_physical(ip, lbase);
                        hammer2_write_file_core(bp, &trans, ip, ipdata,
                                                parentp,
                                                lbase, IO_ASYNC,
                                                pblksize, &error);
                        hammer2_inode_unlock_ex(ip, parent);
                        if (error) {
                                kprintf("hammer2: error in buffer write\n");
                                bp->b_flags |= B_ERROR;
                                bp->b_error = EIO;
                        }
                        biodone(bio);
                        mtx_lock(&pmp->wthread_mtx);
                }
                hammer2_trans_done(&trans);
        }
        /* signal the requester (unmount path) that this thread has exited */
        pmp->wthread_destroy = -1;
        wakeup(&pmp->wthread_destroy);
        
        mtx_unlock(&pmp->wthread_mtx);
}
795
796 void
797 hammer2_bioq_sync(hammer2_pfsmount_t *pmp)
798 {
799         struct bio sync_bio;
800
801         bzero(&sync_bio, sizeof(sync_bio));     /* dummy with no bio_buf */
802         mtx_lock(&pmp->wthread_mtx);
803         if (pmp->wthread_destroy == 0) {
804                 if (TAILQ_EMPTY(&pmp->wthread_bioq.queue)) {
805                        bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
806                        wakeup(&pmp->wthread_bioq);
807                 } else {
808                        bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
809                 }
810                 while ((sync_bio.bio_flags & BIO_DONE) == 0)
811                         mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0);
812         }
813         mtx_unlock(&pmp->wthread_mtx);
814 }
815
/* 
 * Return a chain suitable for I/O, creating the chain if necessary
 * and assigning its physical block.
 *
 * The chain is looked up (or created) with LOOKUP_NODATA, i.e. no
 * device buffer is instantiated, because the caller performs direct
 * I/O through the logical buffer cache.  The returned chain is locked;
 * *errorp is set on creation failure.  An INODE-type chain indicates
 * the data is embedded in the inode (DIRECTDATA) and the caller must
 * handle that case itself.
 */
static
hammer2_chain_t *
hammer2_assign_physical(hammer2_trans_t *trans,
			hammer2_inode_t *ip, hammer2_chain_t **parentp,
			hammer2_key_t lbase, int pblksize, int *errorp)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_off_t pbase;	/* NOTE(review): assigned but never read */
	hammer2_key_t key_dummy;
	int pradix = hammer2_getradix(pblksize);
	int cache_index = -1;

	/*
	 * Locate the chain associated with lbase, return a locked chain.
	 * However, do not instantiate any data reference (which utilizes a
	 * device buffer) because we will be using direct IO via the
	 * logical buffer cache buffer.
	 */
	*errorp = 0;
	KKASSERT(pblksize >= HAMMER2_MIN_ALLOC);
retry:
	parent = *parentp;
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); /* extra lock */
	chain = hammer2_chain_lookup(&parent, &key_dummy,
				     lbase, lbase,
				     &cache_index, HAMMER2_LOOKUP_NODATA);

	if (chain == NULL) {
		/*
		 * We found a hole, create a new chain entry.
		 *
		 * NOTE: DATA chains are created without device backing
		 *       store (nor do we want any).
		 */
		*errorp = hammer2_chain_create(trans, &parent, &chain,
					       lbase, HAMMER2_PBUFRADIX,
					       HAMMER2_BREF_TYPE_DATA,
					       pblksize);
		if (chain == NULL) {
			hammer2_chain_lookup_done(parent);
			panic("hammer2_chain_create: par=%p error=%d\n",
				parent, *errorp);
			/*
			 * NOTE(review): the panic above makes this goto
			 * unreachable; presumably a leftover from an
			 * earlier EAGAIN-style retry path — confirm.
			 */
			goto retry;
		}

		pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
		/*ip->delta_dcount += pblksize;*/
	} else {
		switch (chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * The data is embedded in the inode.  The
			 * caller is responsible for marking the inode
			 * modified and copying the data to the embedded
			 * area.
			 */
			pbase = NOOFFSET;
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Existing data block.  Resize it if the new
			 * physical block size differs, then mark it
			 * modified (OPTDATA: no device buffer needed).
			 */
			if (chain->bytes != pblksize) {
				hammer2_chain_resize(trans, ip,
						     parent, &chain,
						     pradix,
						     HAMMER2_MODIFY_OPTDATA);
			}
			hammer2_chain_modify(trans, &chain,
					     HAMMER2_MODIFY_OPTDATA);
			pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
			break;
		default:
			panic("hammer2_assign_physical: bad type");
			/* NOT REACHED */
			pbase = NOOFFSET;
			break;
		}
	}

	/*
	 * Cleanup.  If chain wound up being the inode (i.e. DIRECTDATA),
	 * we might have to replace *parentp.
	 */
	hammer2_chain_lookup_done(parent);
	if (chain) {
		if (*parentp != chain &&
		    (*parentp)->core == chain->core) {
			/*
			 * Swap the caller's parent for the chain,
			 * transferring the existing lock and taking a
			 * fresh one for the returned chain.
			 */
			parent = *parentp;
			*parentp = chain;		/* eats lock */
			hammer2_chain_unlock(parent);
			hammer2_chain_lock(chain, 0);	/* need another */
		}
		/* else chain already locked for return */
	}
	return (chain);
}
915
916 /* 
917  * From hammer2_vnops.c.
918  * The core write function which determines which path to take
919  * depending on compression settings.
920  */
921 static
922 void
923 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
924                         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
925                         hammer2_chain_t **parentp,
926                         hammer2_key_t lbase, int ioflag, int pblksize,
927                         int *errorp)
928 {
929         hammer2_chain_t *chain;
930
931         switch(HAMMER2_DEC_COMP(ipdata->comp_algo)) {
932         case HAMMER2_COMP_NONE:
933                 /*
934                  * We have to assign physical storage to the buffer
935                  * we intend to dirty or write now to avoid deadlocks
936                  * in the strategy code later.
937                  *
938                  * This can return NOOFFSET for inode-embedded data.
939                  * The strategy code will take care of it in that case.
940                  */
941                 chain = hammer2_assign_physical(trans, ip, parentp,
942                                                 lbase, pblksize,
943                                                 errorp);
944                 hammer2_write_bp(chain, bp, ioflag, pblksize, errorp);
945                 if (chain)
946                         hammer2_chain_unlock(chain);
947                 break;
948         case HAMMER2_COMP_AUTOZERO:
949                 /*
950                  * Check for zero-fill only
951                  */
952                 hammer2_zero_check_and_write(bp, trans, ip,
953                                     ipdata, parentp, lbase,
954                                     ioflag, pblksize, errorp);
955                 break;
956         case HAMMER2_COMP_LZ4:
957         case HAMMER2_COMP_ZLIB:
958         default:
959                 /*
960                  * Check for zero-fill and attempt compression.
961                  */
962                 hammer2_compress_and_write(bp, trans, ip,
963                                            ipdata, parentp,
964                                            lbase, ioflag,
965                                            pblksize, errorp,
966                                            ipdata->comp_algo);
967                 break;
968         }
969         ipdata = &ip->chain->data->ipdata;      /* reload */
970 }
971
/*
 * From hammer2_vnops.c
 * Generic function that will perform the compression in compression
 * write path. The compression algorithm is determined by the settings
 * obtained from inode.
 *
 * Flow:
 *  1. All-zero blocks are converted into holes via zero_write().
 *  2. Otherwise the data may be compressed (LZ4 or ZLIB) into a
 *     scratch buffer, targeting at most pblksize / 2 bytes of output.
 *  3. Physical storage sized to the power-of-2 bucket that fits the
 *     (compressed or raw) data is assigned and the result is copied
 *     into the device buffer and written out per ioflag.
 *
 * ip->comp_heuristic throttles compression: each failure increments
 * it, and once it exceeds 128 it is reset to 8 so that compression is
 * only retried on every 8th write (when the low 3 bits are 0).  A
 * success resets it to 0.
 */
static
void
hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
	hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
	hammer2_chain_t **parentp,
	hammer2_key_t lbase, int ioflag, int pblksize,
	int *errorp, int comp_algo)
{
	hammer2_chain_t *chain;
	int comp_size;		/* compressed size, 0 = store raw */
	int comp_block_size;	/* power-of-2 allocation bucket */
	char *comp_buffer;	/* scratch buffer from objcache */

	/*
	 * All-zero block: record a hole instead of allocating storage.
	 */
	if (test_block_zeros(bp->b_data, pblksize)) {
		zero_write(bp, trans, ip, ipdata, parentp, lbase, errorp);
		return;
	}

	comp_size = 0;
	comp_buffer = NULL;

	/*
	 * Compression targets pblksize / 2 bytes of output; presumably
	 * the cache_buffer_write objcache buffers hold 32KB — confirm.
	 */
	KKASSERT(pblksize / 2 <= 32768);

	/*
	 * Skip the compression attempt entirely when the heuristic
	 * says recent attempts have not been paying off (see above).
	 */
	if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
		z_stream strm_compress;
		int comp_level;
		int ret;

		switch(HAMMER2_DEC_COMP(comp_algo)) {
		case HAMMER2_COMP_LZ4:
			/*
			 * Budget leaves room for the int size prefix;
			 * LZ4_compress_limitedOutput() returns 0 if the
			 * output would not fit.
			 */
			comp_buffer = objcache_get(cache_buffer_write,
						   M_INTWAIT);
			comp_size = LZ4_compress_limitedOutput(
					bp->b_data,
					&comp_buffer[sizeof(int)],
					pblksize,
					pblksize / 2 - sizeof(int));
			/*
			 * We need to prefix with the size, LZ4
			 * doesn't do it for us.  Add the related
			 * overhead.
			 */
			*(int *)comp_buffer = comp_size;
			if (comp_size)
				comp_size += sizeof(int);
			break;
		case HAMMER2_COMP_ZLIB:
			/* clamp requested level into [6, 9] */
			comp_level = HAMMER2_DEC_LEVEL(comp_algo);
			if (comp_level == 0)
				comp_level = 6;	/* default zlib compression */
			else if (comp_level < 6)
				comp_level = 6;
			else if (comp_level > 9)
				comp_level = 9;
			ret = deflateInit(&strm_compress, comp_level);
			if (ret != Z_OK) {
				/*
				 * NOTE(review): only logged — deflate()
				 * is still attempted below on the
				 * uninitialized stream; confirm intended.
				 */
				kprintf("HAMMER2 ZLIB: fatal error "
					"on deflateInit.\n");
			}

			comp_buffer = objcache_get(cache_buffer_write,
						   M_INTWAIT);
			strm_compress.next_in = bp->b_data;
			strm_compress.avail_in = pblksize;
			strm_compress.next_out = comp_buffer;
			strm_compress.avail_out = pblksize / 2;
			ret = deflate(&strm_compress, Z_FINISH);
			if (ret == Z_STREAM_END) {
				comp_size = pblksize / 2 -
					    strm_compress.avail_out;
			} else {
				/* did not fit in pblksize / 2 */
				comp_size = 0;
			}
			ret = deflateEnd(&strm_compress);
			break;
		default:
			/* unknown method: fall through to raw store */
			kprintf("Error: Unknown compression method.\n");
			kprintf("Comp_method = %d.\n", comp_algo);
			break;
		}
	}

	if (comp_size == 0) {
		/*
		 * compression failed or turned off
		 */
		comp_block_size = pblksize;	/* safety */
		if (++ip->comp_heuristic > 128)
			ip->comp_heuristic = 8;
	} else {
		/*
		 * compression succeeded; round the compressed size up
		 * to the smallest power-of-2 allocation bucket.
		 */
		ip->comp_heuristic = 0;
		if (comp_size <= 1024) {
			comp_block_size = 1024;
		} else if (comp_size <= 2048) {
			comp_block_size = 2048;
		} else if (comp_size <= 4096) {
			comp_block_size = 4096;
		} else if (comp_size <= 8192) {
			comp_block_size = 8192;
		} else if (comp_size <= 16384) {
			comp_block_size = 16384;
		} else if (comp_size <= 32768) {
			comp_block_size = 32768;
		} else {
			panic("hammer2: WRITE PATH: "
			      "Weird comp_size value.");
			/* NOT REACHED */
			comp_block_size = pblksize;
		}
	}

	chain = hammer2_assign_physical(trans, ip, parentp,
					lbase, comp_block_size,
					errorp);
	ipdata = &ip->chain->data->ipdata;	/* RELOAD */

	if (*errorp) {
		kprintf("WRITE PATH: An error occurred while "
			"assigning physical space.\n");
		KKASSERT(chain == NULL);
	} else {
		/* Get device offset */
		hammer2_io_t *dio;
		char *bdata;
		int temp_check;

		KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);

		switch(chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * Inode-embedded data: copy the raw buffer
			 * (compression is not applicable here).
			 */
			KKASSERT(chain->data->ipdata.op_flags &
				 HAMMER2_OPFLAG_DIRECTDATA);
			KKASSERT(bp->b_loffset == 0);
			bcopy(bp->b_data, chain->data->ipdata.u.data,
			      HAMMER2_EMBEDDED_BYTES);
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/* preserve the existing check method */
			temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);

			/*
			 * Optimize out the read-before-write
			 * if possible.
			 */
			*errorp = hammer2_io_newnz(chain->hmp,
						   chain->bref.data_off,
						   chain->bytes,
						   &dio);
			if (*errorp) {
				hammer2_io_brelse(&dio);
				kprintf("hammer2: WRITE PATH: "
					"dbp bread error\n");
				break;
			}
			bdata = hammer2_io_data(dio, chain->bref.data_off);

			/*
			 * When loading the block make sure we don't
			 * leave garbage after the compressed data.
			 */
			if (comp_size) {
				chain->bref.methods =
					HAMMER2_ENC_COMP(comp_algo) +
					HAMMER2_ENC_CHECK(temp_check);
				bcopy(comp_buffer, bdata, comp_size);
				if (comp_size != comp_block_size) {
					bzero(bdata + comp_size,
					      comp_block_size - comp_size);
				}
			} else {
				chain->bref.methods =
					HAMMER2_ENC_COMP(
						HAMMER2_COMP_NONE) +
					HAMMER2_ENC_CHECK(temp_check);
				bcopy(bp->b_data, bdata, pblksize);
			}

			/*
			 * Device buffer is now valid, chain is no
			 * longer in the initial state.
			 */
			atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

			/* Now write the related bdp. */
			if (ioflag & IO_SYNC) {
				/*
				 * Synchronous I/O requested.
				 */
				hammer2_io_bwrite(&dio);
			/*
			} else if ((ioflag & IO_DIRECT) &&
				   loff + n == pblksize) {
				hammer2_io_bdwrite(&dio);
			*/
			} else if (ioflag & IO_ASYNC) {
				hammer2_io_bawrite(&dio);
			} else {
				hammer2_io_bdwrite(&dio);
			}
			break;
		default:
			panic("hammer2_write_bp: bad chain type %d\n",
				chain->bref.type);
			/* NOT REACHED */
			break;
		}

		hammer2_chain_unlock(chain);
	}
	if (comp_buffer)
		objcache_put(cache_buffer_write, comp_buffer);
}
1192
1193 /*
1194  * Function that performs zero-checking and writing without compression,
1195  * it corresponds to default zero-checking path.
1196  */
1197 static
1198 void
1199 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1200         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
1201         hammer2_chain_t **parentp,
1202         hammer2_key_t lbase, int ioflag, int pblksize, int *errorp)
1203 {
1204         hammer2_chain_t *chain;
1205
1206         if (test_block_zeros(bp->b_data, pblksize)) {
1207                 zero_write(bp, trans, ip, ipdata, parentp, lbase, errorp);
1208         } else {
1209                 chain = hammer2_assign_physical(trans, ip, parentp,
1210                                                 lbase, pblksize, errorp);
1211                 hammer2_write_bp(chain, bp, ioflag, pblksize, errorp);
1212                 if (chain)
1213                         hammer2_chain_unlock(chain);
1214         }
1215 }
1216
/*
 * A function to test whether a block of data contains only zeros,
 * returns TRUE (non-zero) if the block is all zeros.
 *
 * Scans a word at a time when the buffer is long-aligned, then
 * finishes byte-wise.  The original version read a full long at every
 * offset i < bytes, which walked past the end of the buffer whenever
 * bytes was not a multiple of sizeof(long), and assumed alignment.
 */
static
int
test_block_zeros(const char *buf, size_t bytes)
{
	size_t i = 0;

	/*
	 * Word-at-a-time over the aligned body; only full words that
	 * fit entirely inside the buffer are dereferenced.
	 */
	if (((uintptr_t)buf & (sizeof(long) - 1)) == 0) {
		for (; i + sizeof(long) <= bytes; i += sizeof(long)) {
			if (*(const long *)(buf + i) != 0)
				return (0);
		}
	}

	/* Byte-wise tail (and unaligned fallback). */
	for (; i < bytes; ++i) {
		if (buf[i] != 0)
			return (0);
	}
	return (1);
}
1233
/*
 * Function to "write" a block that contains only zeros.
 *
 * Instead of allocating storage, any existing DATA chain covering
 * lbase is deleted so the block reads back as a hole.  Inode-embedded
 * (DIRECTDATA) data is zeroed in place.  If no chain exists there is
 * nothing to do.  errorp is accepted for signature symmetry with the
 * other write paths but is never set.
 */
static
void
zero_write(struct buf *bp, hammer2_trans_t *trans, hammer2_inode_t *ip,
	hammer2_inode_data_t *ipdata, hammer2_chain_t **parentp,
	hammer2_key_t lbase, int *errorp __unused)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_key_t key_dummy;
	int cache_index = -1;

	parent = hammer2_chain_lookup_init(*parentp, 0);

	chain = hammer2_chain_lookup(&parent, &key_dummy, lbase, lbase,
				     &cache_index, HAMMER2_LOOKUP_NODATA);
	if (chain) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
			/* embedded data: zero the embedded area */
			bzero(chain->data->ipdata.u.data,
			      HAMMER2_EMBEDDED_BYTES);
		} else {
			/* regular data block: delete it -> hole */
			hammer2_chain_delete(trans, chain, 0);
		}
		hammer2_chain_unlock(chain);
	}
	hammer2_chain_lookup_done(parent);
}
1263
/*
 * Function to write the data as it is, without performing any sort of
 * compression. This function is used in path without compression and
 * default zero-checking path.
 *
 * chain may be an INODE chain (DIRECTDATA: the data is copied into the
 * inode's embedded area) or a DATA chain (the data is copied into the
 * backing device buffer and the write is issued according to ioflag).
 * *errorp reflects the device-buffer acquisition; the copies
 * themselves cannot fail.
 */
static
void
hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
				int pblksize, int *errorp)
{
	hammer2_io_t *dio;
	char *bdata;
	int error;
	/* preserve the chain's existing check method */
	int temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);

	KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/* inode-embedded data */
		KKASSERT(chain->data->ipdata.op_flags &
			 HAMMER2_OPFLAG_DIRECTDATA);
		KKASSERT(bp->b_loffset == 0);
		bcopy(bp->b_data, chain->data->ipdata.u.data,
		      HAMMER2_EMBEDDED_BYTES);
		error = 0;
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/*
		 * newnz avoids the read-before-write when the buffer
		 * will be fully overwritten.
		 */
		error = hammer2_io_newnz(chain->hmp, chain->bref.data_off,
					 chain->bytes, &dio);
		if (error) {
			hammer2_io_bqrelse(&dio);
			kprintf("hammer2: WRITE PATH: dbp bread error\n");
			break;
		}
		bdata = hammer2_io_data(dio, chain->bref.data_off);

		/* stored uncompressed; keep the check method */
		chain->bref.methods = HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
				      HAMMER2_ENC_CHECK(temp_check);
		bcopy(bp->b_data, bdata, chain->bytes);

		/*
		 * Device buffer is now valid, chain is no
		 * longer in the initial state.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

		if (ioflag & IO_SYNC) {
			/*
			 * Synchronous I/O requested.
			 */
			hammer2_io_bwrite(&dio);
		/*
		} else if ((ioflag & IO_DIRECT) && loff + n == pblksize) {
			hammer2_io_bdwrite(&dio);
		*/
		} else if (ioflag & IO_ASYNC) {
			hammer2_io_bawrite(&dio);
		} else {
			/* default: delayed write */
			hammer2_io_bdwrite(&dio);
		}
		break;
	default:
		panic("hammer2_write_bp: bad chain type %d\n",
		      chain->bref.type);
		/* NOT REACHED */
		error = 0;
		break;
	}
	*errorp = error;
}
1334
/*
 * Remount request.  Currently a no-op stub — no mount-option or
 * read-only/read-write transitions are processed — and always
 * reports success.
 */
static
int
hammer2_remount(hammer2_mount_t *hmp, char *path, struct vnode *devvp,
		struct ucred *cred)
{
	return (0);
}
1342
/*
 * Unmount a hammer2 PFS mount.
 *
 * Sequence: uninit the cache-coherency domain and cluster messaging,
 * flush vnodes, shut down the background write thread, then for each
 * chain in the PFS cluster sync out any remaining modified chains,
 * drop the PFS root inode and cluster chain, and — when the last PFS
 * on a device goes away — tear down the master hammer2_mount (device
 * vnode, vchain/fchain, I/O tree) as well.
 *
 * Returns 0 on success or the vflush() error (in which case the
 * unmount is aborted and only the vnode flush has been attempted).
 */
static
int
hammer2_vfs_unmount(struct mount *mp, int mntflags)
{
	hammer2_pfsmount_t *pmp;
	hammer2_mount_t *hmp;
	hammer2_chain_t *rchain;
	int flags;
	int error = 0;
	int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
	int dumpcnt;
	int i;
	struct vnode *devvp;

	pmp = MPTOPMP(mp);

	ccms_domain_uninit(&pmp->ccms_dom);
	kdmsg_iocom_uninit(&pmp->iocom);	/* XXX chain dependency */

	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);

	/*
	 * If mount initialization proceeded far enough we must flush
	 * its vnodes.
	 */
	if (mntflags & MNT_FORCE)
		flags = FORCECLOSE;
	else
		flags = 0;
	if (pmp->iroot) {
		error = vflush(mp, 0, flags);
		if (error)
			goto failed;
	}

	/*
	 * Stop the background write thread: signal destroy, wake it,
	 * and wait for it to acknowledge by setting wthread_destroy
	 * to -1 (see the thread's exit path).
	 */
	if (pmp->wthread_td) {
		mtx_lock(&pmp->wthread_mtx);
		pmp->wthread_destroy = 1;
		wakeup(&pmp->wthread_bioq);
		while (pmp->wthread_destroy != -1) {
			mtxsleep(&pmp->wthread_destroy,
				&pmp->wthread_mtx, 0,
				"umount-sleep", 0);
		}
		mtx_unlock(&pmp->wthread_mtx);
		pmp->wthread_td = NULL;
	}

	for (i = 0; i < pmp->cluster.nchains; ++i) {
		hmp = pmp->cluster.chains[i]->hmp;

		hammer2_mount_exlock(hmp);

		--hmp->pmp_count;
		kprintf("hammer2_unmount hmp=%p pmpcnt=%d\n",
			hmp, hmp->pmp_count);

		/*
		 * Flush any left over chains.  The voldata lock is only used
		 * to synchronize against HAMMER2_CHAIN_MODIFIED_AUX.
		 */
		hammer2_voldata_lock(hmp);
		if (((hmp->vchain.flags | hmp->fchain.flags) &
		     HAMMER2_CHAIN_MODIFIED) ||
		    hmp->vchain.core->update_hi > hmp->voldata.mirror_tid ||
		    hmp->fchain.core->update_hi > hmp->voldata.freemap_tid) {
			hammer2_voldata_unlock(hmp, 0);
			/* sync twice to settle dependent flushes */
			hammer2_vfs_sync(mp, MNT_WAIT);
			hammer2_vfs_sync(mp, MNT_WAIT);
		} else {
			hammer2_voldata_unlock(hmp, 0);
		}
		if (hmp->pmp_count == 0) {
			/* re-check: everything should now be clean */
			if (((hmp->vchain.flags | hmp->fchain.flags) &
			     HAMMER2_CHAIN_MODIFIED) ||
			    (hmp->vchain.core->update_hi >
			     hmp->voldata.mirror_tid) ||
			    (hmp->fchain.core->update_hi >
			     hmp->voldata.freemap_tid)) {
				kprintf("hammer2_unmount: chains left over "
					"after final sync\n");
				if (hammer2_debug & 0x0010)
					Debugger("entered debugger");
			}
		}

		/*
		 * Cleanup the root and super-root chain elements
		 * (which should be clean).
		 *
		 * NOTE(review): pmp->iroot is per-PFS, not per-cluster
		 * chain, so this drop effectively happens on the first
		 * loop iteration only (iroot is NULLed afterwards).
		 */
		if (pmp->iroot) {
#if REPORT_REFS_ERRORS
			if (pmp->iroot->refs != 1)
				kprintf("PMP->IROOT %p REFS WRONG %d\n",
					pmp->iroot, pmp->iroot->refs);
#else
			KKASSERT(pmp->iroot->refs == 1);
#endif
			/* ref for pmp->iroot */
			hammer2_inode_drop(pmp->iroot);
			pmp->iroot = NULL;
		}

		rchain = pmp->cluster.chains[i];
		if (rchain) {
			atomic_clear_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
#if REPORT_REFS_ERRORS
			if (rchain->refs != 1)
				kprintf("PMP->RCHAIN %p REFS WRONG %d\n",
					rchain, rchain->refs);
#else
			KKASSERT(rchain->refs == 1);
#endif
			hammer2_chain_drop(rchain);
			pmp->cluster.chains[i] = NULL;
		}

		/*
		 * If no PFS's left drop the master hammer2_mount for the
		 * device.
		 */
		if (hmp->pmp_count == 0) {
			if (hmp->sroot) {
				hammer2_inode_drop(hmp->sroot);
				hmp->sroot = NULL;
			}

			/*
			 * Finish up with the device vnode
			 */
			if ((devvp = hmp->devvp) != NULL) {
				vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
				hmp->devvp = NULL;
				VOP_CLOSE(devvp,
					  (ronly ? FREAD : FREAD|FWRITE));
				vrele(devvp);
				devvp = NULL;
			}

			/*
			 * Final drop of embedded freemap root chain to
			 * clean up fchain.core (fchain structure is not
			 * flagged ALLOCATED so it is cleaned out and then
			 * left to rot).
			 */
			hammer2_chain_drop(&hmp->fchain);

			/*
			 * Final drop of embedded volume root chain to clean
			 * up vchain.core (vchain structure is not flagged
			 * ALLOCATED so it is cleaned out and then left to
			 * rot).
			 */
			/* debug: dump up to 50 chains from each tree */
			dumpcnt = 50;
			hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt);
			dumpcnt = 50;
			hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt);
			hammer2_mount_unlock(hmp);
			hammer2_chain_drop(&hmp->vchain);

			hammer2_io_cleanup(hmp, &hmp->iotree);
			if (hmp->iofree_count) {
				kprintf("io_cleanup: %d I/O's left hanging\n",
					hmp->iofree_count);
			}

			TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
			kmalloc_destroy(&hmp->mchain);
			kfree(hmp, M_HAMMER2);
		} else {
			hammer2_mount_unlock(hmp);
		}
	}

	pmp->mp = NULL;
	mp->mnt_data = NULL;

	kmalloc_destroy(&pmp->mmsg);
	kmalloc_destroy(&pmp->minode);

	kfree(pmp, M_HAMMER2);
	error = 0;

failed:
	lockmgr(&hammer2_mntlk, LK_RELEASE);

	return (error);
}
1531
/*
 * VFS_VGET: obtain a vnode from a raw inode number.
 *
 * Not implemented for HAMMER2; always fails with EOPNOTSUPP.  The
 * kprintf is a debugging breadcrumb so any stray caller is visible
 * on the console.
 */
static
int
hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
	     ino_t ino, struct vnode **vpp)
{
	kprintf("hammer2_vget\n");
	return (EOPNOTSUPP);
}
1540
1541 static
1542 int
1543 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
1544 {
1545         hammer2_pfsmount_t *pmp;
1546         hammer2_chain_t *parent;
1547         int error;
1548         struct vnode *vp;
1549
1550         pmp = MPTOPMP(mp);
1551         if (pmp->iroot == NULL) {
1552                 *vpp = NULL;
1553                 error = EINVAL;
1554         } else {
1555                 parent = hammer2_inode_lock_sh(pmp->iroot);
1556                 vp = hammer2_igetv(pmp->iroot, &error);
1557                 hammer2_inode_unlock_sh(pmp->iroot, parent);
1558                 *vpp = vp;
1559                 if (vp == NULL)
1560                         kprintf("vnodefail\n");
1561         }
1562
1563         return (error);
1564 }
1565
1566 /*
1567  * Filesystem status
1568  *
1569  * XXX incorporate ipdata->inode_quota and data_quota
1570  */
1571 static
1572 int
1573 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
1574 {
1575         hammer2_pfsmount_t *pmp;
1576         hammer2_mount_t *hmp;
1577
1578         pmp = MPTOPMP(mp);
1579         KKASSERT(pmp->cluster.nchains >= 1);
1580         hmp = pmp->cluster.chains[0]->hmp;      /* XXX */
1581
1582         mp->mnt_stat.f_files = pmp->inode_count;
1583         mp->mnt_stat.f_ffree = 0;
1584         mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1585         mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1586         mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
1587
1588         *sbp = mp->mnt_stat;
1589         return (0);
1590 }
1591
1592 static
1593 int
1594 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
1595 {
1596         hammer2_pfsmount_t *pmp;
1597         hammer2_mount_t *hmp;
1598
1599         pmp = MPTOPMP(mp);
1600         KKASSERT(pmp->cluster.nchains >= 1);
1601         hmp = pmp->cluster.chains[0]->hmp;      /* XXX */
1602
1603         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1604         mp->mnt_vstat.f_files = pmp->inode_count;
1605         mp->mnt_vstat.f_ffree = 0;
1606         mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1607         mp->mnt_vstat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1608         mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
1609
1610         *sbp = mp->mnt_vstat;
1611         return (0);
1612 }
1613
1614 /*
1615  * Sync the entire filesystem; this is called from the filesystem syncer
1616  * process periodically and whenever a user calls sync(1) on the hammer
1617  * mountpoint.
1618  *
1619  * Currently is actually called from the syncer! \o/
1620  *
1621  * This task will have to snapshot the state of the dirty inode chain.
1622  * From that, it will have to make sure all of the inodes on the dirty
1623  * chain have IO initiated. We make sure that io is initiated for the root
1624  * block.
1625  *
1626  * If waitfor is set, we wait for media to acknowledge the new rootblock.
1627  *
1628  * THINKS: side A vs side B, to have sync not stall all I/O?
1629  */
1630 int
1631 hammer2_vfs_sync(struct mount *mp, int waitfor)
1632 {
1633         struct hammer2_sync_info info;
1634         hammer2_chain_t *chain;
1635         hammer2_pfsmount_t *pmp;
1636         hammer2_mount_t *hmp;
1637         int flags;
1638         int error;
1639         int total_error;
1640         int force_fchain;
1641         int i;
1642
1643         pmp = MPTOPMP(mp);
1644
1645         /*
1646          * We can't acquire locks on existing vnodes while in a transaction
1647          * without risking a deadlock.  This assumes that vfsync() can be
1648          * called without the vnode locked (which it can in DragonFly).
1649          * Otherwise we'd have to implement a multi-pass or flag the lock
1650          * failures and retry.
1651          *
1652          * The reclamation code interlocks with the sync list's token
1653          * (by removing the vnode from the scan list) before unlocking
1654          * the inode, giving us time to ref the inode.
1655          */
1656         /*flags = VMSC_GETVP;*/
1657         flags = 0;
1658         if (waitfor & MNT_LAZY)
1659                 flags |= VMSC_ONEPASS;
1660
1661         /*
1662          * Initialize a normal transaction and sync everything out, then
1663          * wait for pending I/O to finish (so it gets a transaction id
1664          * that the meta-data flush will catch).
1665          */
1666         hammer2_trans_init(&info.trans, pmp, 0);
1667         info.error = 0;
1668         info.waitfor = MNT_NOWAIT;
1669         vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
1670
1671         if (info.error == 0 && (waitfor & MNT_WAIT)) {
1672                 info.waitfor = waitfor;
1673                     vsyncscan(mp, flags, hammer2_sync_scan2, &info);
1674
1675         }
1676         hammer2_trans_done(&info.trans);
1677         hammer2_bioq_sync(info.trans.pmp);
1678
1679         /*
1680          * Start the flush transaction and flush all meta-data.
1681          */
1682         hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH);
1683
1684         total_error = 0;
1685         for (i = 0; i < pmp->cluster.nchains; ++i) {
1686                 hmp = pmp->cluster.chains[i]->hmp;
1687
1688                 /*
1689                  * Media mounts have two 'roots', vchain for the topology
1690                  * and fchain for the free block table.  Flush both.
1691                  *
1692                  * Note that the topology and free block table are handled
1693                  * independently, so the free block table can wind up being
1694                  * ahead of the topology.  We depend on the bulk free scan
1695                  * code to deal with any loose ends.
1696                  */
1697                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1698                 if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) ||
1699                     hmp->vchain.core->update_hi > hmp->voldata.mirror_tid) {
1700                         chain = &hmp->vchain;
1701                         hammer2_chain_flush(&info.trans, &chain);
1702                         KKASSERT(chain == &hmp->vchain);
1703                         hmp->voldata.mirror_tid = chain->bref.mirror_tid;
1704                         force_fchain = 1;
1705                 } else {
1706                         force_fchain = 0;
1707                 }
1708                 hammer2_chain_unlock(&hmp->vchain);
1709
1710                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1711                 if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
1712                     hmp->fchain.core->update_hi > hmp->voldata.freemap_tid ||
1713                     force_fchain) {
1714                         /* this will also modify vchain as a side effect */
1715                         chain = &hmp->fchain;
1716                         hammer2_chain_flush(&info.trans, &chain);
1717                         KKASSERT(chain == &hmp->fchain);
1718                         hmp->voldata.freemap_tid = chain->bref.mirror_tid;
1719                 }
1720                 hammer2_chain_unlock(&hmp->fchain);
1721
1722                 error = 0;
1723
1724                 /*
1725                  * We can't safely flush the volume header until we have
1726                  * flushed any device buffers which have built up.
1727                  *
1728                  * XXX this isn't being incremental
1729                  */
1730                 vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
1731                 error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
1732                 vn_unlock(hmp->devvp);
1733
1734                 /*
1735                  * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1736                  * volume header needs synchronization via hmp->volsync.
1737                  *
1738                  * XXX synchronize the flag & data with only this flush XXX
1739                  */
1740                 if (error == 0 &&
1741                     (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1742                         struct buf *bp;
1743
1744                         /*
1745                          * Synchronize the disk before flushing the volume
1746                          * header.
1747                          */
1748                         bp = getpbuf(NULL);
1749                         bp->b_bio1.bio_offset = 0;
1750                         bp->b_bufsize = 0;
1751                         bp->b_bcount = 0;
1752                         bp->b_cmd = BUF_CMD_FLUSH;
1753                         bp->b_bio1.bio_done = biodone_sync;
1754                         bp->b_bio1.bio_flags |= BIO_SYNC;
1755                         vn_strategy(hmp->devvp, &bp->b_bio1);
1756                         biowait(&bp->b_bio1, "h2vol");
1757                         relpbuf(bp, NULL);
1758
1759                         /*
1760                          * Then we can safely flush the version of the
1761                          * volume header synchronized by the flush code.
1762                          */
1763                         i = hmp->volhdrno + 1;
1764                         if (i >= HAMMER2_NUM_VOLHDRS)
1765                                 i = 0;
1766                         if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1767                             hmp->volsync.volu_size) {
1768                                 i = 0;
1769                         }
1770                         kprintf("sync volhdr %d %jd\n",
1771                                 i, (intmax_t)hmp->volsync.volu_size);
1772                         bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
1773                                     HAMMER2_PBUFSIZE, 0, 0);
1774                         atomic_clear_int(&hmp->vchain.flags,
1775                                          HAMMER2_CHAIN_VOLUMESYNC);
1776                         bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
1777                         bawrite(bp);
1778                         hmp->volhdrno = i;
1779                 }
1780                 if (error)
1781                         total_error = error;
1782         }
1783         hammer2_trans_done(&info.trans);
1784
1785         return (total_error);
1786 }
1787
1788 /*
1789  * Sync passes.
1790  *
1791  * NOTE: We don't test update_lo/update_hi or MOVED here because the fsync
1792  *       code won't flush on those flags.  The syncer code above will do a
1793  *       general meta-data flush globally that will catch these flags.
1794  */
1795
1796 static int
1797 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1798 {
1799         struct hammer2_sync_info *info = data;
1800         hammer2_inode_t *ip;
1801         int error;
1802
1803         /*
1804          *
1805          */
1806         ip = VTOI(vp);
1807         if (ip == NULL)
1808                 return(0);
1809         if (vp->v_type == VNON || vp->v_type == VBAD) {
1810                 vclrisdirty(vp);
1811                 return(0);
1812         }
1813         if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
1814             RB_EMPTY(&vp->v_rbdirty_tree)) {
1815                 vclrisdirty(vp);
1816                 return(0);
1817         }
1818
1819         /*
1820          * VOP_FSYNC will start a new transaction so replicate some code
1821          * here to do it inline (see hammer2_vop_fsync()).
1822          *
1823          * WARNING: The vfsync interacts with the buffer cache and might
1824          *          block, we can't hold the inode lock at that time.
1825          *          However, we MUST ref ip before blocking to ensure that
1826          *          it isn't ripped out from under us (since we do not
1827          *          hold a lock on the vnode).
1828          */
1829         hammer2_inode_ref(ip);
1830         atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1831         if (vp)
1832                 vfsync(vp, MNT_NOWAIT, 1, NULL, NULL);
1833
1834 #if 0
1835         /*
1836          * XXX this interferes with flush operations mainly because the
1837          *     same transaction id is being used by asynchronous buffer
1838          *     operations above and can be reordered after the flush
1839          *     below.
1840          */
1841         parent = hammer2_inode_lock_ex(ip);
1842         hammer2_chain_flush(&info->trans, &parent);
1843         hammer2_inode_unlock_ex(ip, parent);
1844 #endif
1845         hammer2_inode_drop(ip);
1846         error = 0;
1847 #if 0
1848         error = VOP_FSYNC(vp, MNT_NOWAIT, 0);
1849 #endif
1850         if (error)
1851                 info->error = error;
1852         return(0);
1853 }
1854
/*
 * VFS_VPTOFH: convert a vnode to an NFS file handle.
 *
 * Stub: returns 0 (success) without encoding anything into *fhp.
 * NOTE(review): succeeding while leaving the fid uninitialized looks
 * questionable for real NFS export use -- confirm before enabling
 * exports.
 */
static
int
hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
{
	return (0);
}
1861
/*
 * VFS_FHTOVP: convert an NFS file handle back to a vnode.
 *
 * Stub: returns 0 without looking anything up or setting *vpp.
 * NOTE(review): *vpp is left untouched on this "success" return --
 * confirm callers tolerate that before enabling exports.
 */
static
int
hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
	       struct fid *fhp, struct vnode **vpp)
{
	return (0);
}
1869
/*
 * VFS_CHECKEXP: validate an NFS client against the export list.
 *
 * Stub: returns 0 without inspecting the address or filling in
 * *exflagsp / *credanonp.
 * NOTE(review): as written this effectively admits any client with
 * unset export flags -- confirm before enabling exports.
 */
static
int
hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
		 int *exflagsp, struct ucred **credanonp)
{
	return (0);
}
1877
/*
 * Support code for hammer2_mount().  Read, verify, and install the volume
 * header into the HMP.
 *
 * Scans up to HAMMER2_NUM_VOLHDRS copies; the copy with the highest
 * mirror_tid whose two section iCRCs check out is installed.
 *
 * XXX read four volhdrs and use the one with the highest TID whose CRC
 *     matches.
 *
 * XXX check iCRCs.
 *
 * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to
 *     nonexistent locations.
 *
 * XXX Record selected volhdr and ring updates to each of 4 volhdrs
 */
static
int
hammer2_install_volume_header(hammer2_mount_t *hmp)
{
	hammer2_volume_data_t *vd;
	struct buf *bp;
	hammer2_crc32_t crc0, crc, bcrc0, bcrc;
	int error_reported;	/* at least one copy failed its CRC check */
	int error;
	int valid;		/* a usable volume header has been found */
	int i;

	error_reported = 0;
	error = 0;
	valid = 0;
	bp = NULL;

	/*
	 * There are up to 4 copies of the volume header (syncs iterate
	 * between them so there is no single master).  We don't trust the
	 * volu_size field so we don't know precisely how large the filesystem
	 * is, so depend on the OS to return an error if we go beyond the
	 * block device's EOF.
	 */
	for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
		error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
			      HAMMER2_VOLUME_BYTES, &bp);
		if (error) {
			/* read error (e.g. past device EOF): skip copy */
			brelse(bp);
			bp = NULL;
			continue;
		}

		vd = (struct hammer2_volume_data *) bp->b_data;
		if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
		    (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
			/* not a hammer2 volume header at all, skip copy */
			brelse(bp);
			bp = NULL;
			continue;
		}

		if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
			/* XXX: Reversed-endianness filesystem */
			kprintf("hammer2: reverse-endian filesystem detected");
			brelse(bp);
			bp = NULL;
			continue;
		}

		/*
		 * Verify both section iCRCs before trusting this copy.
		 */
		crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
		crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
				      HAMMER2_VOLUME_ICRC0_SIZE);
		bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
		bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
				       HAMMER2_VOLUME_ICRC1_SIZE);
		if ((crc0 != crc) || (bcrc0 != bcrc)) {
			kprintf("hammer2 volume header crc "
				"mismatch copy #%d %08x/%08x\n",
				i, crc0, crc);
			error_reported = 1;
			brelse(bp);
			bp = NULL;
			continue;
		}

		/*
		 * Keep the copy with the highest mirror_tid seen so far.
		 */
		if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
			valid = 1;
			hmp->voldata = *vd;
			hmp->volhdrno = i;
		}
		brelse(bp);
		bp = NULL;
	}
	if (valid) {
		/* volsync is the copy the sync code will write back out */
		hmp->volsync = hmp->voldata;
		error = 0;
		if (error_reported || bootverbose || 1) { /* 1/DEBUG */
			kprintf("hammer2: using volume header #%d\n",
				hmp->volhdrno);
		}
	} else {
		error = EINVAL;
		kprintf("hammer2: no valid volume headers found!\n");
	}
	return (error);
}
1977
/*
 * Reconnect using the passed file pointer.  The caller must ref the
 * fp for us.
 *
 * Tears down the existing iocom and re-registers the PFS with the new
 * descriptor, seeding the auto-initiated LNK_CONN/LNK_SPAN state machine
 * fields from the PFS root inode's media record.
 */
void
hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
{
	hammer2_inode_data_t *ipdata;
	hammer2_chain_t *parent;
	hammer2_mount_t *hmp;
	size_t name_len;

	hmp = pmp->cluster.chains[0]->hmp;	/* XXX */

	/*
	 * Closes old comm descriptor, kills threads, cleans up
	 * states, then installs the new descriptor and creates
	 * new threads.
	 */
	kdmsg_iocom_reconnect(&pmp->iocom, fp, "hammer2");

	/*
	 * Setup LNK_CONN fields for autoinitiated state machine.  The
	 * PFS root inode is held exclusively locked while we copy its
	 * identification data out of the media record.
	 */
	parent = hammer2_inode_lock_ex(pmp->iroot);
	ipdata = &parent->data->ipdata;
	pmp->iocom.auto_lnk_conn.pfs_clid = ipdata->pfs_clid;
	pmp->iocom.auto_lnk_conn.pfs_fsid = ipdata->pfs_fsid;
	pmp->iocom.auto_lnk_conn.pfs_type = ipdata->pfs_type;
	pmp->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
	pmp->iocom.auto_lnk_conn.peer_type = hmp->voldata.peer_type;

	/*
	 * Filter adjustment.  Clients do not need visibility into other
	 * clients (otherwise millions of clients would present a serious
	 * problem).  The fs_label also serves to restrict the namespace.
	 */
	pmp->iocom.auto_lnk_conn.peer_mask = 1LLU << HAMMER2_PEER_HAMMER2;
	pmp->iocom.auto_lnk_conn.pfs_mask = (uint64_t)-1;
	switch (ipdata->pfs_type) {
	case DMSG_PFSTYPE_CLIENT:
		pmp->iocom.auto_lnk_conn.peer_mask &=
				~(1LLU << DMSG_PFSTYPE_CLIENT);
		break;
	default:
		break;
	}

	/*
	 * Copy the PFS name into fs_label, truncated to fit and always
	 * NUL-terminated.
	 */
	name_len = ipdata->name_len;
	if (name_len >= sizeof(pmp->iocom.auto_lnk_conn.fs_label))
		name_len = sizeof(pmp->iocom.auto_lnk_conn.fs_label) - 1;
	bcopy(ipdata->filename,
	      pmp->iocom.auto_lnk_conn.fs_label,
	      name_len);
	pmp->iocom.auto_lnk_conn.fs_label[name_len] = 0;

	/*
	 * Setup LNK_SPAN fields for autoinitiated state machine
	 * (same identification data, same bounded label copy).
	 */
	pmp->iocom.auto_lnk_span.pfs_clid = ipdata->pfs_clid;
	pmp->iocom.auto_lnk_span.pfs_fsid = ipdata->pfs_fsid;
	pmp->iocom.auto_lnk_span.pfs_type = ipdata->pfs_type;
	pmp->iocom.auto_lnk_span.peer_type = hmp->voldata.peer_type;
	pmp->iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1;
	name_len = ipdata->name_len;
	if (name_len >= sizeof(pmp->iocom.auto_lnk_span.fs_label))
		name_len = sizeof(pmp->iocom.auto_lnk_span.fs_label) - 1;
	bcopy(ipdata->filename,
	      pmp->iocom.auto_lnk_span.fs_label,
	      name_len);
	pmp->iocom.auto_lnk_span.fs_label[name_len] = 0;
	hammer2_inode_unlock_ex(pmp->iroot, parent);

	kdmsg_iocom_autoinitiate(&pmp->iocom, hammer2_autodmsg);
}
2053
2054 static int
2055 hammer2_rcvdmsg(kdmsg_msg_t *msg)
2056 {
2057         switch(msg->any.head.cmd & DMSGF_TRANSMASK) {
2058         case DMSG_DBG_SHELL:
2059                 /*
2060                  * (non-transaction)
2061                  * Execute shell command (not supported atm)
2062                  */
2063                 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2064                 break;
2065         case DMSG_DBG_SHELL | DMSGF_REPLY:
2066                 /*
2067                  * (non-transaction)
2068                  */
2069                 if (msg->aux_data) {
2070                         msg->aux_data[msg->aux_size - 1] = 0;
2071                         kprintf("HAMMER2 DBG: %s\n", msg->aux_data);
2072                 }
2073                 break;
2074         default:
2075                 /*
2076                  * Unsupported message received.  We only need to
2077                  * reply if it's a transaction in order to close our end.
2078                  * Ignore any one-way messages are any further messages
2079                  * associated with the transaction.
2080                  *
2081                  * NOTE: This case also includes DMSG_LNK_ERROR messages
2082                  *       which might be one-way, replying to those would
2083                  *       cause an infinite ping-pong.
2084                  */
2085                 if (msg->any.head.cmd & DMSGF_CREATE)
2086                         kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2087                 break;
2088         }
2089         return(0);
2090 }
2091
/*
 * This function is called after KDMSG has automatically handled processing
 * of a LNK layer message (typically CONN, SPAN, or CIRC).
 *
 * We tag off the LNK_CONN to trigger our LNK_VOLCONF messages which
 * advertises all available hammer2 super-root volumes.
 */
static void
hammer2_autodmsg(kdmsg_msg_t *msg)
{
	hammer2_pfsmount_t *pmp = msg->iocom->handle;
	hammer2_mount_t *hmp = pmp->cluster.chains[0]->hmp; /* XXX */
	int copyid;

	/*
	 * We only care about replies to our LNK_CONN auto-request.  kdmsg
	 * has already processed the reply, we use this callback as a shim
	 * to know when we can advertise available super-root volumes.
	 */
	if ((msg->any.head.cmd & DMSGF_TRANSMASK) !=
	    (DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_REPLY) ||
	    msg->state == NULL) {
		return;
	}

	kprintf("LNK_CONN REPLY RECEIVED CMD %08x\n", msg->any.head.cmd);

	if (msg->any.head.cmd & DMSGF_CREATE) {
		kprintf("HAMMER2: VOLDATA DUMP\n");

		/*
		 * Dump the configuration stored in the volume header.
		 * Slots with copyid 0 are unused and skipped.
		 */
		hammer2_voldata_lock(hmp);
		for (copyid = 0; copyid < HAMMER2_COPYID_COUNT; ++copyid) {
			if (hmp->voldata.copyinfo[copyid].copyid == 0)
				continue;
			hammer2_volconf_update(pmp, copyid);
		}
		hammer2_voldata_unlock(hmp, 0);
	}
	/* remote closed its side of the transaction but we have not */
	if ((msg->any.head.cmd & DMSGF_DELETE) &&
	    msg->state && (msg->state->txcmd & DMSGF_DELETE) == 0) {
		kprintf("HAMMER2: CONN WAS TERMINATED\n");
	}
}
2138
2139 /*
2140  * Volume configuration updates are passed onto the userland service
2141  * daemon via the open LNK_CONN transaction.
2142  */
2143 void
2144 hammer2_volconf_update(hammer2_pfsmount_t *pmp, int index)
2145 {
2146         hammer2_mount_t *hmp = pmp->cluster.chains[0]->hmp;     /* XXX */
2147         kdmsg_msg_t *msg;
2148
2149         /* XXX interlock against connection state termination */
2150         kprintf("volconf update %p\n", pmp->iocom.conn_state);
2151         if (pmp->iocom.conn_state) {
2152                 kprintf("TRANSMIT VOLCONF VIA OPEN CONN TRANSACTION\n");
2153                 msg = kdmsg_msg_alloc_state(pmp->iocom.conn_state,
2154                                             DMSG_LNK_VOLCONF, NULL, NULL);
2155                 msg->any.lnk_volconf.copy = hmp->voldata.copyinfo[index];
2156                 msg->any.lnk_volconf.mediaid = hmp->voldata.fsid;
2157                 msg->any.lnk_volconf.index = index;
2158                 kdmsg_msg_write(msg);
2159         }
2160 }
2161
/*
 * This handles hysteresis on regular file flushes.  Because the BIOs are
 * routed to a thread it is possible for an excessive number to build up
 * and cause long front-end stalls long before the runningbuffspace limit
 * is hit, so we implement hammer2_flush_pipe to control the
 * hysteresis.
 *
 * This is a particular problem when compression is used.
 */

/*
 * Account for one more logical write in progress (paired with
 * hammer2_lwinprog_drop()).
 */
void
hammer2_lwinprog_ref(hammer2_pfsmount_t *pmp)
{
	atomic_add_int(&pmp->count_lwinprog, 1);
}
2176
/*
 * Account for a completed logical write.  Once the pipe drains to 2/3
 * of hammer2_flush_pipe, wake up any writers stalled in
 * hammer2_lwinprog_wait() -- the 2/3 threshold provides hysteresis so
 * waiters aren't woken on every single completion.
 */
void
hammer2_lwinprog_drop(hammer2_pfsmount_t *pmp)
{
	int lwinprog;

	/* fetchadd returns the PRE-decrement value, flag bit included */
	lwinprog = atomic_fetchadd_int(&pmp->count_lwinprog, -1);
	if ((lwinprog & HAMMER2_LWINPROG_WAITING) &&
	    (lwinprog & HAMMER2_LWINPROG_MASK) <= hammer2_flush_pipe * 2 / 3) {
		atomic_clear_int(&pmp->count_lwinprog,
				 HAMMER2_LWINPROG_WAITING);
		wakeup(&pmp->count_lwinprog);
	}
}
2190
/*
 * Block the caller while the number of logical writes in progress is at
 * or above hammer2_flush_pipe.  Uses the tsleep_interlock/tsleep pattern:
 * the WAITING flag is set after the interlock so a concurrent
 * hammer2_lwinprog_drop() cannot issue a wakeup we would miss, and the
 * count is re-checked after setting the flag before actually sleeping.
 */
void
hammer2_lwinprog_wait(hammer2_pfsmount_t *pmp)
{
	int lwinprog;

	for (;;) {
		lwinprog = pmp->count_lwinprog;
		cpu_ccfence();	/* force refetch of count each iteration */
		if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
			break;
		tsleep_interlock(&pmp->count_lwinprog, 0);
		atomic_set_int(&pmp->count_lwinprog, HAMMER2_LWINPROG_WAITING);
		/* recheck after interlock to close the wakeup race */
		lwinprog = pmp->count_lwinprog;
		if ((lwinprog & HAMMER2_LWINPROG_MASK) < hammer2_flush_pipe)
			break;
		tsleep(&pmp->count_lwinprog, PINTERLOCKED, "h2wpipe", hz);
	}
}
2209
/*
 * Debugging: recursively dump a chain and its children to the console.
 *
 * *countp is a shared line budget across the whole recursion: it is
 * decremented on entry, an ellipsis is printed exactly when it reaches
 * zero, and further (negative) calls return silently.  tab is the
 * current indentation width.
 */
void
hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp)
{
	hammer2_chain_layer_t *layer;
	hammer2_chain_t *scan;
	hammer2_chain_t *first_parent;

	--*countp;
	if (*countp == 0) {
		kprintf("%*.*s...\n", tab, tab, "");
		return;
	}
	if (*countp < 0)
		return;
	first_parent = chain->core ? TAILQ_FIRST(&chain->core->ownerq) : NULL;
	kprintf("%*.*schain %p.%d %016jx/%d mir=%016jx\n",
		tab, tab, "",
		chain, chain->bref.type,
		chain->bref.key, chain->bref.keybits,
		chain->bref.mirror_tid);

	/* filename is only meaningful for inode chains with data present */
	kprintf("%*.*s      [%08x] (%s) dt=%016jx refs=%d\n",
		tab, tab, "",
		chain->flags,
		((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
		chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
		chain->delete_tid,
		chain->refs);

	kprintf("%*.*s      core %p [%08x] lo=%08jx hi=%08jx fp=%p np=%p",
		tab, tab, "",
		chain->core, (chain->core ? chain->core->flags : 0),
		(chain->core ? chain->core->update_lo : -1),
		(chain->core ? chain->core->update_hi : -1),
		first_parent,
		(first_parent ? TAILQ_NEXT(chain, core_entry) : NULL));

	/* NOTE(review): the '[' below is never closed in the output */
	if (first_parent)
		kprintf(" [fpflags %08x fprefs %d\n",
			first_parent->flags,
			first_parent->refs);
	if (chain->core == NULL || TAILQ_EMPTY(&chain->core->layerq))
		kprintf("\n");
	else
		kprintf(" {\n");
	/* recurse into every child of every layer, indenting 4 more */
	if (chain->core) {
		TAILQ_FOREACH(layer, &chain->core->layerq, entry) {
			RB_FOREACH(scan, hammer2_chain_tree, &layer->rbtree) {
				hammer2_dump_chain(scan, tab + 4, countp);
			}
		}
	}
	if (chain->core && !TAILQ_EMPTY(&chain->core->layerq)) {
		if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
			kprintf("%*.*s}(%s)\n", tab, tab, "",
				chain->data->ipdata.filename);
		else
			kprintf("%*.*s}\n", tab, tab, "");
	}
}