hammer2 - Stabilization pass, more flush refactoring
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vfsops.c
1 /*-
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/nlookup.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/fcntl.h>
42 #include <sys/buf.h>
43 #include <sys/uuid.h>
44 #include <sys/vfsops.h>
45 #include <sys/sysctl.h>
46 #include <sys/socket.h>
47 #include <sys/objcache.h>
48
49 #include <sys/proc.h>
50 #include <sys/namei.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54
55 #include <sys/mutex.h>
56 #include <sys/mutex2.h>
57
58 #include "hammer2.h"
59 #include "hammer2_disk.h"
60 #include "hammer2_mount.h"
61
62 #include "hammer2.h"
63 #include "hammer2_lz4.h"
64
65 #include "zlib/hammer2_zlib.h"
66
67 #define REPORT_REFS_ERRORS 1    /* XXX remove me */
68
69 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");
70
71 struct hammer2_sync_info {
72         hammer2_trans_t trans;
73         int error;
74         int waitfor;
75 };
76
77 TAILQ_HEAD(hammer2_mntlist, hammer2_mount);
78 static struct hammer2_mntlist hammer2_mntlist;
79 static struct lock hammer2_mntlk;
80
/*
 * Tunables, exported read/write under vfs.hammer2.* by the sysctl
 * declarations in this file.
 */
int hammer2_debug;
int hammer2_cluster_enable = 1;
int hammer2_hardlink_enable = 1;

/*
 * Per-category I/O counters, exported under vfs.hammer2.* as well.
 * Each family has one counter per category: file, meta, indr, fmap, volu.
 *
 * NOTE(review): the exact distinction between the "iod" and "ioa"
 * families is not established anywhere in this part of the file.
 */
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;

long hammer2_iod_file_write;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;

long hammer2_ioa_file_read;
long hammer2_ioa_meta_read;
long hammer2_ioa_indr_read;
long hammer2_ioa_fmap_read;
long hammer2_ioa_volu_read;

long hammer2_ioa_file_write;
long hammer2_ioa_meta_write;
long hammer2_ioa_indr_write;
long hammer2_ioa_fmap_write;
long hammer2_ioa_volu_write;
104
105 MALLOC_DECLARE(C_BUFFER);
106 MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");
107
108 MALLOC_DECLARE(D_BUFFER);
109 MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");
110
111 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
112
113 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
114            &hammer2_debug, 0, "");
115 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
116            &hammer2_cluster_enable, 0, "");
117 SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
118            &hammer2_hardlink_enable, 0, "");
119
120 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
121            &hammer2_iod_file_read, 0, "");
122 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
123            &hammer2_iod_meta_read, 0, "");
124 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
125            &hammer2_iod_indr_read, 0, "");
126 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
127            &hammer2_iod_fmap_read, 0, "");
128 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
129            &hammer2_iod_volu_read, 0, "");
130
131 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
132            &hammer2_iod_file_write, 0, "");
133 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
134            &hammer2_iod_meta_write, 0, "");
135 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
136            &hammer2_iod_indr_write, 0, "");
137 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
138            &hammer2_iod_fmap_write, 0, "");
139 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
140            &hammer2_iod_volu_write, 0, "");
141
142 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
143            &hammer2_ioa_file_read, 0, "");
144 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
145            &hammer2_ioa_meta_read, 0, "");
146 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
147            &hammer2_ioa_indr_read, 0, "");
148 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
149            &hammer2_ioa_fmap_read, 0, "");
150 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
151            &hammer2_ioa_volu_read, 0, "");
152
153 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
154            &hammer2_ioa_file_write, 0, "");
155 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
156            &hammer2_ioa_meta_write, 0, "");
157 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
158            &hammer2_ioa_indr_write, 0, "");
159 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
160            &hammer2_ioa_fmap_write, 0, "");
161 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
162            &hammer2_ioa_volu_write, 0, "");
163
164 static int hammer2_vfs_init(struct vfsconf *conf);
165 static int hammer2_vfs_uninit(struct vfsconf *vfsp);
166 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
167                                 struct ucred *cred);
168 static int hammer2_remount(hammer2_mount_t *, char *, struct vnode *,
169                                 struct ucred *);
170 static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
171 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
172 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
173                                 struct ucred *cred);
174 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
175                                 struct ucred *cred);
176 static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
177                                 ino_t ino, struct vnode **vpp);
178 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
179                                 struct fid *fhp, struct vnode **vpp);
180 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
181 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
182                                 int *exflagsp, struct ucred **credanonp);
183
184 static int hammer2_install_volume_header(hammer2_mount_t *hmp);
185 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
186
187 static void hammer2_write_thread(void *arg);
188
189 /* 
190  * Functions for compression in threads,
191  * from hammer2_vnops.c
192  */
193 static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
194                                 hammer2_inode_t *ip,
195                                 hammer2_inode_data_t *ipdata,
196                                 hammer2_chain_t **parentp,
197                                 hammer2_key_t lbase, int ioflag, int pblksize,
198                                 int *errorp);
199 static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
200                                 hammer2_inode_t *ip,
201                                 hammer2_inode_data_t *ipdata,
202                                 hammer2_chain_t **parentp,
203                                 hammer2_key_t lbase, int ioflag,
204                                 int pblksize, int *errorp, int comp_algo);
205 static void hammer2_zero_check_and_write(struct buf *bp,
206                                 hammer2_trans_t *trans, hammer2_inode_t *ip,
207                                 hammer2_inode_data_t *ipdata,
208                                 hammer2_chain_t **parentp,
209                                 hammer2_key_t lbase,
210                                 int ioflag, int pblksize, int *errorp);
211 static int test_block_zeros(const char *buf, size_t bytes);
212 static void zero_write(struct buf *bp, hammer2_trans_t *trans,
213                                 hammer2_inode_t *ip,
214                                 hammer2_inode_data_t *ipdata,
215                                 hammer2_chain_t **parentp, 
216                                 hammer2_key_t lbase,
217                                 int *errorp);
218 static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
219                                 int ioflag, int pblksize, int *errorp);
220
221 static int hammer2_rcvdmsg(kdmsg_msg_t *msg);
222 static void hammer2_autodmsg(kdmsg_msg_t *msg);
223
224
225 /*
226  * HAMMER2 vfs operations.
227  */
228 static struct vfsops hammer2_vfsops = {
229         .vfs_init       = hammer2_vfs_init,
230         .vfs_uninit = hammer2_vfs_uninit,
231         .vfs_sync       = hammer2_vfs_sync,
232         .vfs_mount      = hammer2_vfs_mount,
233         .vfs_unmount    = hammer2_vfs_unmount,
234         .vfs_root       = hammer2_vfs_root,
235         .vfs_statfs     = hammer2_vfs_statfs,
236         .vfs_statvfs    = hammer2_vfs_statvfs,
237         .vfs_vget       = hammer2_vfs_vget,
238         .vfs_vptofh     = hammer2_vfs_vptofh,
239         .vfs_fhtovp     = hammer2_vfs_fhtovp,
240         .vfs_checkexp   = hammer2_vfs_checkexp
241 };
242
243 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
244
245 VFS_SET(hammer2_vfsops, hammer2, 0);
246 MODULE_VERSION(hammer2, 1);
247
248 static
249 int
250 hammer2_vfs_init(struct vfsconf *conf)
251 {
252         static struct objcache_malloc_args margs_read;
253         static struct objcache_malloc_args margs_write;
254
255         int error;
256
257         error = 0;
258
259         if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
260                 error = EINVAL;
261         if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
262                 error = EINVAL;
263         if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
264                 error = EINVAL;
265
266         if (error)
267                 kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
268         
269         margs_read.objsize = 65536;
270         margs_read.mtype = D_BUFFER;
271         
272         margs_write.objsize = 32768;
273         margs_write.mtype = C_BUFFER;
274         
275         cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
276                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
277                                 objcache_malloc_free, &margs_read);
278         cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
279                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
280                                 objcache_malloc_free, &margs_write);
281
282         lockinit(&hammer2_mntlk, "mntlk", 0, 0);
283         TAILQ_INIT(&hammer2_mntlist);
284
285         return (error);
286 }
287
288 static
289 int
290 hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
291 {
292         objcache_destroy(cache_buffer_read);
293         objcache_destroy(cache_buffer_write);
294         return 0;
295 }
296
297 /*
298  * Mount or remount HAMMER2 fileystem from physical media
299  *
300  *      mountroot
301  *              mp              mount point structure
302  *              path            NULL
303  *              data            <unused>
304  *              cred            <unused>
305  *
306  *      mount
307  *              mp              mount point structure
308  *              path            path to mount point
309  *              data            pointer to argument structure in user space
310  *                      volume  volume path (device@LABEL form)
311  *                      hflags  user mount flags
312  *              cred            user credentials
313  *
314  * RETURNS:     0       Success
315  *              !0      error number
316  */
317 static
318 int
319 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
320                   struct ucred *cred)
321 {
322         struct hammer2_mount_info info;
323         hammer2_pfsmount_t *pmp;
324         hammer2_mount_t *hmp;
325         hammer2_key_t key_next;
326         hammer2_key_t key_dummy;
327         hammer2_key_t lhc;
328         struct vnode *devvp;
329         struct nlookupdata nd;
330         hammer2_chain_t *parent;
331         hammer2_chain_t *schain;
332         hammer2_chain_t *rchain;
333         struct file *fp;
334         char devstr[MNAMELEN];
335         size_t size;
336         size_t done;
337         char *dev;
338         char *label;
339         int ronly = 1;
340         int error;
341         int cache_index;
342         int i;
343
344         hmp = NULL;
345         pmp = NULL;
346         dev = NULL;
347         label = NULL;
348         devvp = NULL;
349         cache_index = -1;
350
351         kprintf("hammer2_mount\n");
352
353         if (path == NULL) {
354                 /*
355                  * Root mount
356                  */
357                 bzero(&info, sizeof(info));
358                 info.cluster_fd = -1;
359                 return (EOPNOTSUPP);
360         } else {
361                 /*
362                  * Non-root mount or updating a mount
363                  */
364                 error = copyin(data, &info, sizeof(info));
365                 if (error)
366                         return (error);
367
368                 error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
369                 if (error)
370                         return (error);
371
372                 /* Extract device and label */
373                 dev = devstr;
374                 label = strchr(devstr, '@');
375                 if (label == NULL ||
376                     ((label + 1) - dev) > done) {
377                         return (EINVAL);
378                 }
379                 *label = '\0';
380                 label++;
381                 if (*label == '\0')
382                         return (EINVAL);
383
384                 if (mp->mnt_flag & MNT_UPDATE) {
385                         /* Update mount */
386                         /* HAMMER2 implements NFS export via mountctl */
387                         pmp = MPTOPMP(mp);
388                         for (i = 0; i < pmp->cluster.nchains; ++i) {
389                                 hmp = pmp->cluster.chains[i]->hmp;
390                                 devvp = hmp->devvp;
391                                 error = hammer2_remount(hmp, path, devvp, cred);
392                                 if (error)
393                                         break;
394                         }
395                         return error;
396                 }
397         }
398
399         /*
400          * PFS mount
401          *
402          * Lookup name and verify it refers to a block device.
403          */
404         error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
405         if (error == 0)
406                 error = nlookup(&nd);
407         if (error == 0)
408                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
409         nlookup_done(&nd);
410
411         if (error == 0) {
412                 if (vn_isdisk(devvp, &error))
413                         error = vfs_mountedon(devvp);
414         }
415
416         /*
417          * Determine if the device has already been mounted.  After this
418          * check hmp will be non-NULL if we are doing the second or more
419          * hammer2 mounts from the same device.
420          */
421         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
422         TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
423                 if (hmp->devvp == devvp)
424                         break;
425         }
426
427         /*
428          * Open the device if this isn't a secondary mount and construct
429          * the H2 device mount (hmp).
430          */
431         if (hmp == NULL) {
432                 if (error == 0 && vcount(devvp) > 0)
433                         error = EBUSY;
434
435                 /*
436                  * Now open the device
437                  */
438                 if (error == 0) {
439                         ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
440                         vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
441                         error = vinvalbuf(devvp, V_SAVE, 0, 0);
442                         if (error == 0) {
443                                 error = VOP_OPEN(devvp,
444                                                  ronly ? FREAD : FREAD | FWRITE,
445                                                  FSCRED, NULL);
446                         }
447                         vn_unlock(devvp);
448                 }
449                 if (error && devvp) {
450                         vrele(devvp);
451                         devvp = NULL;
452                 }
453                 if (error) {
454                         lockmgr(&hammer2_mntlk, LK_RELEASE);
455                         return error;
456                 }
457                 hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
458                 hmp->ronly = ronly;
459                 hmp->devvp = devvp;
460                 kmalloc_create(&hmp->mchain, "HAMMER2-chains");
461                 TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
462
463                 lockinit(&hmp->alloclk, "h2alloc", 0, 0);
464                 lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);
465                 TAILQ_INIT(&hmp->transq);
466
467                 /*
468                  * vchain setup. vchain.data is embedded.
469                  * vchain.refs is initialized and will never drop to 0.
470                  */
471                 hmp->vchain.hmp = hmp;
472                 hmp->vchain.refs = 1;
473                 hmp->vchain.data = (void *)&hmp->voldata;
474                 hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
475                 hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
476                 hmp->vchain.delete_tid = HAMMER2_MAX_TID;
477                 hammer2_chain_core_alloc(NULL, &hmp->vchain, NULL);
478                 /* hmp->vchain.u.xxx is left NULL */
479
480                 /*
481                  * fchain setup.  fchain.data is embedded.
482                  * fchain.refs is initialized and will never drop to 0.
483                  *
484                  * The data is not used but needs to be initialized to
485                  * pass assertion muster.  We use this chain primarily
486                  * as a placeholder for the freemap's top-level RBTREE
487                  * so it does not interfere with the volume's topology
488                  * RBTREE.
489                  */
490                 hmp->fchain.hmp = hmp;
491                 hmp->fchain.refs = 1;
492                 hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
493                 hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
494                 hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
495                 hmp->fchain.bref.methods =
496                         HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
497                         HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
498                 hmp->fchain.delete_tid = HAMMER2_MAX_TID;
499
500                 hammer2_chain_core_alloc(NULL, &hmp->fchain, NULL);
501                 /* hmp->fchain.u.xxx is left NULL */
502
503                 /*
504                  * Install the volume header
505                  */
506                 error = hammer2_install_volume_header(hmp);
507                 if (error) {
508                         hammer2_vfs_unmount(mp, MNT_FORCE);
509                         return error;
510                 }
511
512                 hmp->vchain.bref.mirror_tid = hmp->voldata.mirror_tid;
513                 hmp->fchain.bref.mirror_tid = hmp->voldata.freemap_tid;
514
515                 /*
516                  * First locate the super-root inode, which is key 0
517                  * relative to the volume header's blockset.
518                  *
519                  * Then locate the root inode by scanning the directory keyspace
520                  * represented by the label.
521                  */
522                 parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
523                 schain = hammer2_chain_lookup(&parent, &key_dummy,
524                                       HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
525                                       &cache_index, 0);
526                 hammer2_chain_lookup_done(parent);
527                 if (schain == NULL) {
528                         kprintf("hammer2_mount: invalid super-root\n");
529                         hammer2_vfs_unmount(mp, MNT_FORCE);
530                         return EINVAL;
531                 }
532
533                 /*
534                  * NOTE: inode_get sucks up schain's lock.
535                  */
536                 atomic_set_int(&schain->flags, HAMMER2_CHAIN_PFSROOT);
537                 hmp->sroot = hammer2_inode_get(NULL, NULL, schain);
538                 hammer2_inode_ref(hmp->sroot);
539                 hammer2_inode_unlock_ex(hmp->sroot, schain);
540                 schain = NULL;
541                 /* leave hmp->sroot with one ref */
542         }
543
544         /*
545          * Block device opened successfully, finish initializing the
546          * mount structure.
547          *
548          * From this point on we have to call hammer2_unmount() on failure.
549          */
550         pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
551
552         kmalloc_create(&pmp->minode, "HAMMER2-inodes");
553         kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
554
555         spin_init(&pmp->inum_spin);
556         RB_INIT(&pmp->inum_tree);
557
558         kdmsg_iocom_init(&pmp->iocom, pmp,
559                          KDMSG_IOCOMF_AUTOCONN |
560                          KDMSG_IOCOMF_AUTOSPAN |
561                          KDMSG_IOCOMF_AUTOCIRC,
562                          pmp->mmsg, hammer2_rcvdmsg);
563
564         ccms_domain_init(&pmp->ccms_dom);
565         ++hmp->pmp_count;
566         lockmgr(&hammer2_mntlk, LK_RELEASE);
567         kprintf("hammer2_mount hmp=%p pmp=%p pmpcnt=%d\n",
568                 hmp, pmp, hmp->pmp_count);
569
570         mp->mnt_flag = MNT_LOCAL;
571         mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
572         mp->mnt_kern_flag |= MNTK_THR_SYNC;     /* new vsyncscan semantics */
573
574         /*
575          * required mount structure initializations
576          */
577         mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
578         mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;
579
580         mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
581         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
582
583         /*
584          * Optional fields
585          */
586         mp->mnt_iosize_max = MAXPHYS;
587         mp->mnt_data = (qaddr_t)pmp;
588         pmp->mp = mp;
589
590         /*
591          * Lookup mount point under the media-localized super-root.
592          */
593         parent = hammer2_inode_lock_ex(hmp->sroot);
594         lhc = hammer2_dirhash(label, strlen(label));
595         rchain = hammer2_chain_lookup(&parent, &key_next,
596                                       lhc, lhc + HAMMER2_DIRHASH_LOMASK,
597                                       &cache_index, 0);
598         while (rchain) {
599                 if (rchain->bref.type == HAMMER2_BREF_TYPE_INODE &&
600                     strcmp(label, rchain->data->ipdata.filename) == 0) {
601                         break;
602                 }
603                 rchain = hammer2_chain_next(&parent, rchain, &key_next,
604                                             key_next,
605                                             lhc + HAMMER2_DIRHASH_LOMASK,
606                                             &cache_index, 0);
607         }
608         hammer2_inode_unlock_ex(hmp->sroot, parent);
609
610         if (rchain == NULL) {
611                 kprintf("hammer2_mount: PFS label not found\n");
612                 --hmp->pmp_count;
613                 hammer2_vfs_unmount(mp, MNT_FORCE);
614                 return EINVAL;
615         }
616         if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
617                 hammer2_chain_unlock(rchain);
618                 kprintf("hammer2_mount: PFS label already mounted!\n");
619                 --hmp->pmp_count;
620                 hammer2_vfs_unmount(mp, MNT_FORCE);
621                 return EBUSY;
622         }
623 #if 0
624         if (rchain->flags & HAMMER2_CHAIN_RECYCLE) {
625                 kprintf("hammer2_mount: PFS label currently recycling\n");
626                 --hmp->pmp_count;
627                 hammer2_vfs_unmount(mp, MNT_FORCE);
628                 return EBUSY;
629         }
630 #endif
631
632         atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
633
634         /*
635          * NOTE: *_get() integrates chain's lock into the inode lock.
636          */
637         hammer2_chain_ref(rchain);              /* for pmp->rchain */
638         pmp->cluster.nchains = 1;
639         pmp->cluster.chains[0] = rchain;
640         pmp->iroot = hammer2_inode_get(pmp, NULL, rchain);
641         hammer2_inode_ref(pmp->iroot);          /* ref for pmp->iroot */
642
643         KKASSERT(rchain->pmp == NULL);          /* tracking pmp for rchain */
644         rchain->pmp = pmp;
645         atomic_add_long(&pmp->inmem_chains, 1);
646
647         hammer2_inode_unlock_ex(pmp->iroot, rchain);
648
649         kprintf("iroot %p\n", pmp->iroot);
650
651         /*
652          * The logical file buffer bio write thread handles things
653          * like physical block assignment and compression.
654          */
655         mtx_init(&pmp->wthread_mtx);
656         bioq_init(&pmp->wthread_bioq);
657         pmp->wthread_destroy = 0;
658         lwkt_create(hammer2_write_thread, pmp,
659                     &pmp->wthread_td, NULL, 0, -1, "hwrite-%s", label);
660
661         /*
662          * Ref the cluster management messaging descriptor.  The mount
663          * program deals with the other end of the communications pipe.
664          */
665         fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
666         if (fp == NULL) {
667                 kprintf("hammer2_mount: bad cluster_fd!\n");
668                 hammer2_vfs_unmount(mp, MNT_FORCE);
669                 return EBADF;
670         }
671         hammer2_cluster_reconnect(pmp, fp);
672
673         /*
674          * Finish setup
675          */
676         vfs_getnewfsid(mp);
677         vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
678         vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
679         vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);
680
681         copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
682         bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
683         bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
684         copyinstr(path, mp->mnt_stat.f_mntonname,
685                   sizeof(mp->mnt_stat.f_mntonname) - 1,
686                   &size);
687
688         /*
689          * Initial statfs to prime mnt_stat.
690          */
691         hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
692         
693         return 0;
694 }
695
696 /*
697  * Handle bioq for strategy write
698  */
699 static
700 void
701 hammer2_write_thread(void *arg)
702 {
703         hammer2_pfsmount_t *pmp;
704         struct bio *bio;
705         struct buf *bp;
706         hammer2_trans_t trans;
707         struct vnode *vp;
708         hammer2_inode_t *ip;
709         hammer2_chain_t *parent;
710         hammer2_chain_t **parentp;
711         hammer2_inode_data_t *ipdata;
712         hammer2_key_t lbase;
713         int lblksize;
714         int pblksize;
715         int error;
716         
717         pmp = arg;
718         
719         mtx_lock(&pmp->wthread_mtx);
720         while (pmp->wthread_destroy == 0) {
721                 if (bioq_first(&pmp->wthread_bioq) == NULL) {
722                         mtxsleep(&pmp->wthread_bioq, &pmp->wthread_mtx,
723                                  0, "h2bioqw", 0);
724                 }
725                 parent = NULL;
726                 parentp = &parent;
727
728                 hammer2_trans_init(&trans, pmp, HAMMER2_TRANS_BUFCACHE);
729
730                 while ((bio = bioq_takefirst(&pmp->wthread_bioq)) != NULL) {
731                         /*
732                          * dummy bio for synchronization.  The transaction
733                          * must be reinitialized.
734                          */
735                         if (bio->bio_buf == NULL) {
736                                 bio->bio_flags |= BIO_DONE;
737                                 wakeup(bio);
738                                 hammer2_trans_done(&trans);
739                                 hammer2_trans_init(&trans, pmp,
740                                                    HAMMER2_TRANS_BUFCACHE);
741                                 continue;
742                         }
743
744                         /*
745                          * else normal bio processing
746                          */
747                         mtx_unlock(&pmp->wthread_mtx);
748                         
749                         error = 0;
750                         bp = bio->bio_buf;
751                         vp = bp->b_vp;
752                         ip = VTOI(vp);
753
754                         /*
755                          * Inode is modified, flush size and mtime changes
756                          * to ensure that the file size remains consistent
757                          * with the buffers being flushed.
758                          */
759                         parent = hammer2_inode_lock_ex(ip);
760                         if (ip->flags & (HAMMER2_INODE_RESIZED |
761                                          HAMMER2_INODE_MTIME)) {
762                                 hammer2_inode_fsync(&trans, ip, parentp);
763                         }
764                         ipdata = hammer2_chain_modify_ip(&trans, ip,
765                                                          parentp, 0);
766                         lblksize = hammer2_calc_logical(ip, bio->bio_offset,
767                                                         &lbase, NULL);
768                         pblksize = hammer2_calc_physical(ip, lbase);
769                         hammer2_write_file_core(bp, &trans, ip, ipdata,
770                                                 parentp,
771                                                 lbase, IO_ASYNC,
772                                                 pblksize, &error);
773                         hammer2_inode_unlock_ex(ip, parent);
774                         if (error) {
775                                 kprintf("hammer2: error in buffer write\n");
776                                 bp->b_flags |= B_ERROR;
777                                 bp->b_error = EIO;
778                         }
779                         biodone(bio);
780                         mtx_lock(&pmp->wthread_mtx);
781                 }
782                 hammer2_trans_done(&trans);
783         }
784         pmp->wthread_destroy = -1;
785         wakeup(&pmp->wthread_destroy);
786         
787         mtx_unlock(&pmp->wthread_mtx);
788 }
789
790 void
791 hammer2_bioq_sync(hammer2_pfsmount_t *pmp)
792 {
793         struct bio sync_bio;
794
795         bzero(&sync_bio, sizeof(sync_bio));     /* dummy with no bio_buf */
796         mtx_lock(&pmp->wthread_mtx);
797         if (pmp->wthread_destroy == 0) {
798                 if (TAILQ_EMPTY(&pmp->wthread_bioq.queue)) {
799                        bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
800                        wakeup(&pmp->wthread_bioq);
801                 } else {
802                        bioq_insert_tail(&pmp->wthread_bioq, &sync_bio);
803                 }
804                 while ((sync_bio.bio_flags & BIO_DONE) == 0)
805                         mtxsleep(&sync_bio, &pmp->wthread_mtx, 0, "h2bioq", 0);
806         }
807         mtx_unlock(&pmp->wthread_mtx);
808 }
809
810 /* 
811  * Return a chain suitable for I/O, creating the chain if necessary
812  * and assigning its physical block.
813  */
814 static
815 hammer2_chain_t *
816 hammer2_assign_physical(hammer2_trans_t *trans,
817                         hammer2_inode_t *ip, hammer2_chain_t **parentp,
818                         hammer2_key_t lbase, int pblksize, int *errorp)
819 {
820         hammer2_chain_t *parent;
821         hammer2_chain_t *chain;
822         hammer2_off_t pbase;
823         hammer2_key_t key_dummy;
824         int pradix = hammer2_getradix(pblksize);
825         int cache_index = -1;
826
827         /*
828          * Locate the chain associated with lbase, return a locked chain.
829          * However, do not instantiate any data reference (which utilizes a
830          * device buffer) because we will be using direct IO via the
831          * logical buffer cache buffer.
832          */
833         *errorp = 0;
834         KKASSERT(pblksize >= HAMMER2_MIN_ALLOC);
835 retry:
836         parent = *parentp;
837         hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); /* extra lock */
838         chain = hammer2_chain_lookup(&parent, &key_dummy,
839                                      lbase, lbase,
840                                      &cache_index, HAMMER2_LOOKUP_NODATA);
841
842         if (chain == NULL) {
843                 /*
844                  * We found a hole, create a new chain entry.
845                  *
846                  * NOTE: DATA chains are created without device backing
847                  *       store (nor do we want any).
848                  */
849                 *errorp = hammer2_chain_create(trans, &parent, &chain,
850                                                lbase, HAMMER2_PBUFRADIX,
851                                                HAMMER2_BREF_TYPE_DATA,
852                                                pblksize);
853                 if (chain == NULL) {
854                         hammer2_chain_lookup_done(parent);
855                         panic("hammer2_chain_create: par=%p error=%d\n",
856                                 parent, *errorp);
857                         goto retry;
858                 }
859
860                 pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
861                 /*ip->delta_dcount += pblksize;*/
862         } else {
863                 switch (chain->bref.type) {
864                 case HAMMER2_BREF_TYPE_INODE:
865                         /*
866                          * The data is embedded in the inode.  The
867                          * caller is responsible for marking the inode
868                          * modified and copying the data to the embedded
869                          * area.
870                          */
871                         pbase = NOOFFSET;
872                         break;
873                 case HAMMER2_BREF_TYPE_DATA:
874                         if (chain->bytes != pblksize) {
875                                 hammer2_chain_resize(trans, ip,
876                                                      parent, &chain,
877                                                      pradix,
878                                                      HAMMER2_MODIFY_OPTDATA);
879                         }
880                         hammer2_chain_modify(trans, &chain,
881                                              HAMMER2_MODIFY_OPTDATA);
882                         pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
883                         break;
884                 default:
885                         panic("hammer2_assign_physical: bad type");
886                         /* NOT REACHED */
887                         pbase = NOOFFSET;
888                         break;
889                 }
890         }
891
892         /*
893          * Cleanup.  If chain wound up being the inode (i.e. DIRECTDATA),
894          * we might have to replace *parentp.
895          */
896         hammer2_chain_lookup_done(parent);
897         if (chain) {
898                 if (*parentp != chain &&
899                     (*parentp)->core == chain->core) {
900                         parent = *parentp;
901                         *parentp = chain;               /* eats lock */
902                         hammer2_chain_unlock(parent);
903                         hammer2_chain_lock(chain, 0);   /* need another */
904                 }
905                 /* else chain already locked for return */
906         }
907         return (chain);
908 }
909
910 /* 
911  * From hammer2_vnops.c.
912  * The core write function which determines which path to take
913  * depending on compression settings.
914  */
915 static
916 void
917 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
918                         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
919                         hammer2_chain_t **parentp,
920                         hammer2_key_t lbase, int ioflag, int pblksize,
921                         int *errorp)
922 {
923         hammer2_chain_t *chain;
924
925         switch(HAMMER2_DEC_COMP(ipdata->comp_algo)) {
926         case HAMMER2_COMP_NONE:
927                 /*
928                  * We have to assign physical storage to the buffer
929                  * we intend to dirty or write now to avoid deadlocks
930                  * in the strategy code later.
931                  *
932                  * This can return NOOFFSET for inode-embedded data.
933                  * The strategy code will take care of it in that case.
934                  */
935                 chain = hammer2_assign_physical(trans, ip, parentp,
936                                                 lbase, pblksize,
937                                                 errorp);
938                 hammer2_write_bp(chain, bp, ioflag, pblksize, errorp);
939                 if (chain)
940                         hammer2_chain_unlock(chain);
941                 break;
942         case HAMMER2_COMP_AUTOZERO:
943                 /*
944                  * Check for zero-fill only
945                  */
946                 hammer2_zero_check_and_write(bp, trans, ip,
947                                     ipdata, parentp, lbase,
948                                     ioflag, pblksize, errorp);
949                 break;
950         case HAMMER2_COMP_LZ4:
951         case HAMMER2_COMP_ZLIB:
952         default:
953                 /*
954                  * Check for zero-fill and attempt compression.
955                  */
956                 hammer2_compress_and_write(bp, trans, ip,
957                                            ipdata, parentp,
958                                            lbase, ioflag,
959                                            pblksize, errorp,
960                                            ipdata->comp_algo);
961                 break;
962         }
963         ipdata = &ip->chain->data->ipdata;      /* reload */
964 }
965
966 /*
967  * From hammer2_vnops.c
968  * Generic function that will perform the compression in compression
969  * write path. The compression algorithm is determined by the settings
970  * obtained from inode.
971  */
972 static
973 void
974 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
975         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
976         hammer2_chain_t **parentp,
977         hammer2_key_t lbase, int ioflag, int pblksize,
978         int *errorp, int comp_algo)
979 {
980         hammer2_chain_t *chain;
981         int comp_size;
982         int comp_block_size;
983         char *comp_buffer;
984
985         if (test_block_zeros(bp->b_data, pblksize)) {
986                 zero_write(bp, trans, ip, ipdata, parentp, lbase, errorp);
987                 return;
988         }
989
990         comp_size = 0;
991         comp_buffer = NULL;
992
993         KKASSERT(pblksize / 2 <= 32768);
994                 
995         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
996                 z_stream strm_compress;
997                 int comp_level;
998                 int ret;
999
1000                 switch(HAMMER2_DEC_COMP(comp_algo)) {
1001                 case HAMMER2_COMP_LZ4:
1002                         comp_buffer = objcache_get(cache_buffer_write,
1003                                                    M_INTWAIT);
1004                         comp_size = LZ4_compress_limitedOutput(
1005                                         bp->b_data,
1006                                         &comp_buffer[sizeof(int)],
1007                                         pblksize,
1008                                         pblksize / 2 - sizeof(int));
1009                         /*
1010                          * We need to prefix with the size, LZ4
1011                          * doesn't do it for us.  Add the related
1012                          * overhead.
1013                          */
1014                         *(int *)comp_buffer = comp_size;
1015                         if (comp_size)
1016                                 comp_size += sizeof(int);
1017                         break;
1018                 case HAMMER2_COMP_ZLIB:
1019                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
1020                         if (comp_level == 0)
1021                                 comp_level = 6; /* default zlib compression */
1022                         else if (comp_level < 6)
1023                                 comp_level = 6;
1024                         else if (comp_level > 9)
1025                                 comp_level = 9;
1026                         ret = deflateInit(&strm_compress, comp_level);
1027                         if (ret != Z_OK) {
1028                                 kprintf("HAMMER2 ZLIB: fatal error "
1029                                         "on deflateInit.\n");
1030                         }
1031
1032                         comp_buffer = objcache_get(cache_buffer_write,
1033                                                    M_INTWAIT);
1034                         strm_compress.next_in = bp->b_data;
1035                         strm_compress.avail_in = pblksize;
1036                         strm_compress.next_out = comp_buffer;
1037                         strm_compress.avail_out = pblksize / 2;
1038                         ret = deflate(&strm_compress, Z_FINISH);
1039                         if (ret == Z_STREAM_END) {
1040                                 comp_size = pblksize / 2 -
1041                                             strm_compress.avail_out;
1042                         } else {
1043                                 comp_size = 0;
1044                         }
1045                         ret = deflateEnd(&strm_compress);
1046                         break;
1047                 default:
1048                         kprintf("Error: Unknown compression method.\n");
1049                         kprintf("Comp_method = %d.\n", comp_algo);
1050                         break;
1051                 }
1052         }
1053
1054         if (comp_size == 0) {
1055                 /*
1056                  * compression failed or turned off
1057                  */
1058                 comp_block_size = pblksize;     /* safety */
1059                 if (++ip->comp_heuristic > 128)
1060                         ip->comp_heuristic = 8;
1061         } else {
1062                 /*
1063                  * compression succeeded
1064                  */
1065                 ip->comp_heuristic = 0;
1066                 if (comp_size <= 1024) {
1067                         comp_block_size = 1024;
1068                 } else if (comp_size <= 2048) {
1069                         comp_block_size = 2048;
1070                 } else if (comp_size <= 4096) {
1071                         comp_block_size = 4096;
1072                 } else if (comp_size <= 8192) {
1073                         comp_block_size = 8192;
1074                 } else if (comp_size <= 16384) {
1075                         comp_block_size = 16384;
1076                 } else if (comp_size <= 32768) {
1077                         comp_block_size = 32768;
1078                 } else {
1079                         panic("hammer2: WRITE PATH: "
1080                               "Weird comp_size value.");
1081                         /* NOT REACHED */
1082                         comp_block_size = pblksize;
1083                 }
1084         }
1085
1086         chain = hammer2_assign_physical(trans, ip, parentp,
1087                                         lbase, comp_block_size,
1088                                         errorp);
1089         ipdata = &ip->chain->data->ipdata;      /* RELOAD */
1090
1091         if (*errorp) {
1092                 kprintf("WRITE PATH: An error occurred while "
1093                         "assigning physical space.\n");
1094                 KKASSERT(chain == NULL);
1095         } else {
1096                 /* Get device offset */
1097                 hammer2_off_t pbase;
1098                 hammer2_off_t pmask;
1099                 hammer2_off_t peof;
1100                 size_t boff;
1101                 size_t psize;
1102                 struct buf *dbp;
1103                 int temp_check;
1104
1105                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1106
1107                 switch(chain->bref.type) {
1108                 case HAMMER2_BREF_TYPE_INODE:
1109                         KKASSERT(chain->data->ipdata.op_flags &
1110                                  HAMMER2_OPFLAG_DIRECTDATA);
1111                         KKASSERT(bp->b_loffset == 0);
1112                         bcopy(bp->b_data, chain->data->ipdata.u.data,
1113                               HAMMER2_EMBEDDED_BYTES);
1114                         break;
1115                 case HAMMER2_BREF_TYPE_DATA:
1116                         psize = hammer2_devblksize(chain->bytes);
1117                         pmask = (hammer2_off_t)psize - 1;
1118                         pbase = chain->bref.data_off & ~pmask;
1119                         boff = chain->bref.data_off &
1120                                (HAMMER2_OFF_MASK & pmask);
1121                         peof = (pbase + HAMMER2_SEGMASK64) &
1122                                ~HAMMER2_SEGMASK64;
1123                         temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);
1124
1125                         /*
1126                          * Optimize out the read-before-write
1127                          * if possible.
1128                          */
1129                         if (comp_block_size == psize) {
1130                                 dbp = getblk(chain->hmp->devvp, pbase,
1131                                              psize, 0, 0);
1132                         } else {
1133                                 *errorp = bread(chain->hmp->devvp,
1134                                                 pbase, psize, &dbp);
1135                                 if (*errorp) {
1136                                         kprintf("hammer2: WRITE PATH: "
1137                                                 "dbp bread error\n");
1138                                         break;
1139                                 }
1140                         }
1141
1142                         /*
1143                          * When loading the block make sure we don't
1144                          * leave garbage after the compressed data.
1145                          */
1146                         if (comp_size) {
1147                                 chain->bref.methods =
1148                                         HAMMER2_ENC_COMP(comp_algo) +
1149                                         HAMMER2_ENC_CHECK(temp_check);
1150                                 bcopy(comp_buffer, dbp->b_data + boff,
1151                                       comp_size);
1152                                 if (comp_size != comp_block_size) {
1153                                         bzero(dbp->b_data + boff +
1154                                                 comp_size,
1155                                               comp_block_size -
1156                                                 comp_size);
1157                                 }
1158                         } else {
1159                                 chain->bref.methods =
1160                                         HAMMER2_ENC_COMP(
1161                                                 HAMMER2_COMP_NONE) +
1162                                         HAMMER2_ENC_CHECK(temp_check);
1163                                 bcopy(bp->b_data, dbp->b_data + boff,
1164                                       pblksize);
1165                         }
1166
1167                         /*
1168                          * Device buffer is now valid, chain is no
1169                          * longer in the initial state.
1170                          */
1171                         atomic_clear_int(&chain->flags,
1172                                          HAMMER2_CHAIN_INITIAL);
1173
1174                         /* Now write the related bdp. */
1175                         if (ioflag & IO_SYNC) {
1176                                 /*
1177                                  * Synchronous I/O requested.
1178                                  */
1179                                 bwrite(dbp);
1180                         /*
1181                         } else if ((ioflag & IO_DIRECT) &&
1182                                    loff + n == pblksize) {
1183                                 bdwrite(dbp);
1184                         */
1185                         } else if (ioflag & IO_ASYNC) {
1186                                 bawrite(dbp);
1187                         } else if (hammer2_cluster_enable) {
1188                                 cluster_write(dbp, peof,
1189                                               HAMMER2_PBUFSIZE,
1190                                               4/*XXX*/);
1191                         } else {
1192                                 bdwrite(dbp);
1193                         }
1194                         break;
1195                 default:
1196                         panic("hammer2_write_bp: bad chain type %d\n",
1197                                 chain->bref.type);
1198                         /* NOT REACHED */
1199                         break;
1200                 }
1201
1202                 hammer2_chain_unlock(chain);
1203         }
1204         if (comp_buffer)
1205                 objcache_put(cache_buffer_write, comp_buffer);
1206 }
1207
1208 /*
1209  * Function that performs zero-checking and writing without compression,
1210  * it corresponds to default zero-checking path.
1211  */
1212 static
1213 void
1214 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1215         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
1216         hammer2_chain_t **parentp,
1217         hammer2_key_t lbase, int ioflag, int pblksize, int *errorp)
1218 {
1219         hammer2_chain_t *chain;
1220
1221         if (test_block_zeros(bp->b_data, pblksize)) {
1222                 zero_write(bp, trans, ip, ipdata, parentp, lbase, errorp);
1223         } else {
1224                 chain = hammer2_assign_physical(trans, ip, parentp,
1225                                                 lbase, pblksize, errorp);
1226                 hammer2_write_bp(chain, bp, ioflag, pblksize, errorp);
1227                 if (chain)
1228                         hammer2_chain_unlock(chain);
1229         }
1230 }
1231
/*
 * A function to test whether a block of data contains only zeros,
 * returns TRUE (non-zero) if the block is all zeros.
 *
 * buf   - start of the block; it is read a long at a time, so it is
 *         expected to be suitably aligned for long accesses
 * bytes - number of bytes to examine; 0 yields TRUE
 *
 * Whole longs are compared first.  Any remaining bytes (when bytes is
 * not a multiple of sizeof(long)) are compared individually so that
 * nothing beyond buf + bytes is ever read.
 */
static
int
test_block_zeros(const char *buf, size_t bytes)
{
	const long *words = (const long *)buf;
	size_t nwords = bytes / sizeof(long);
	size_t i;

	for (i = 0; i < nwords; ++i) {
		if (words[i] != 0)
			return (0);
	}

	/* trailing partial word, if any */
	for (i = nwords * sizeof(long); i < bytes; ++i) {
		if (buf[i] != 0)
			return (0);
	}
	return (1);
}
1248
1249 /*
1250  * Function to "write" a block that contains only zeros.
1251  */
1252 static
1253 void
1254 zero_write(struct buf *bp, hammer2_trans_t *trans, hammer2_inode_t *ip,
1255         hammer2_inode_data_t *ipdata, hammer2_chain_t **parentp,
1256         hammer2_key_t lbase, int *errorp __unused)
1257 {
1258         hammer2_chain_t *parent;
1259         hammer2_chain_t *chain;
1260         hammer2_key_t key_dummy;
1261         int cache_index = -1;
1262
1263         parent = hammer2_chain_lookup_init(*parentp, 0);
1264
1265         chain = hammer2_chain_lookup(&parent, &key_dummy, lbase, lbase,
1266                                      &cache_index, HAMMER2_LOOKUP_NODATA);
1267         if (chain) {
1268                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1269                         bzero(chain->data->ipdata.u.data,
1270                               HAMMER2_EMBEDDED_BYTES);
1271                 } else {
1272                         hammer2_chain_delete(trans, chain, 0);
1273                 }
1274                 hammer2_chain_unlock(chain);
1275         }
1276         hammer2_chain_lookup_done(parent);
1277 }
1278
1279 /*
1280  * Function to write the data as it is, without performing any sort of
1281  * compression. This function is used in path without compression and
1282  * default zero-checking path.
1283  */
1284 static
1285 void
1286 hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
1287                                 int pblksize, int *errorp)
1288 {
1289         hammer2_off_t pbase;
1290         hammer2_off_t pmask;
1291         hammer2_off_t peof;
1292         struct buf *dbp;
1293         size_t boff;
1294         size_t psize;
1295         int error;
1296         int temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);
1297
1298         KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1299
1300         switch(chain->bref.type) {
1301         case HAMMER2_BREF_TYPE_INODE:
1302                 KKASSERT(chain->data->ipdata.op_flags &
1303                          HAMMER2_OPFLAG_DIRECTDATA);
1304                 KKASSERT(bp->b_loffset == 0);
1305                 bcopy(bp->b_data, chain->data->ipdata.u.data,
1306                       HAMMER2_EMBEDDED_BYTES);
1307                 error = 0;
1308                 break;
1309         case HAMMER2_BREF_TYPE_DATA:
1310                 psize = hammer2_devblksize(chain->bytes);
1311                 pmask = (hammer2_off_t)psize - 1;
1312                 pbase = chain->bref.data_off & ~pmask;
1313                 boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
1314                 peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
1315
1316                 if (psize == pblksize) {
1317                         dbp = getblk(chain->hmp->devvp, pbase,
1318                                      psize, 0, 0);
1319                         error = 0;
1320                 } else {
1321                         error = bread(chain->hmp->devvp, pbase, psize, &dbp);
1322                         if (error) {
1323                                 kprintf("hammer2: WRITE PATH: "
1324                                         "dbp bread error\n");
1325                                 break;
1326                         }
1327                 }
1328
1329                 chain->bref.methods = HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
1330                                       HAMMER2_ENC_CHECK(temp_check);
1331                 bcopy(bp->b_data, dbp->b_data + boff, chain->bytes);
1332                 
1333                 /*
1334                  * Device buffer is now valid, chain is no
1335                  * longer in the initial state.
1336                  */
1337                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1338
1339                 if (ioflag & IO_SYNC) {
1340                         /*
1341                          * Synchronous I/O requested.
1342                          */
1343                         bwrite(dbp);
1344                 /*
1345                 } else if ((ioflag & IO_DIRECT) && loff + n == pblksize) {
1346                         bdwrite(dbp);
1347                 */
1348                 } else if (ioflag & IO_ASYNC) {
1349                         bawrite(dbp);
1350                 } else if (hammer2_cluster_enable) {
1351                         cluster_write(dbp, peof, HAMMER2_PBUFSIZE, 4/*XXX*/);
1352                 } else {
1353                         bdwrite(dbp);
1354                 }
1355                 break;
1356         default:
1357                 panic("hammer2_write_bp: bad chain type %d\n",
1358                       chain->bref.type);
1359                 /* NOT REACHED */
1360                 error = 0;
1361                 break;
1362         }
1363         *errorp = error;
1364 }
1365
1366 static
1367 int
1368 hammer2_remount(hammer2_mount_t *hmp, char *path, struct vnode *devvp,
1369                 struct ucred *cred)
1370 {
1371         return (0);
1372 }
1373
1374 static
1375 int
1376 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1377 {
1378         hammer2_pfsmount_t *pmp;
1379         hammer2_mount_t *hmp;
1380         hammer2_chain_t *rchain;
1381         int flags;
1382         int error = 0;
1383         int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
1384         int dumpcnt;
1385         int i;
1386         struct vnode *devvp;
1387
1388         pmp = MPTOPMP(mp);
1389
1390         ccms_domain_uninit(&pmp->ccms_dom);
1391         kdmsg_iocom_uninit(&pmp->iocom);        /* XXX chain dependency */
1392
1393         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1394
1395         /*
1396          * If mount initialization proceeded far enough we must flush
1397          * its vnodes.
1398          */
1399         if (mntflags & MNT_FORCE)
1400                 flags = FORCECLOSE;
1401         else
1402                 flags = 0;
1403         if (pmp->iroot) {
1404                 error = vflush(mp, 0, flags);
1405                 if (error)
1406                         goto failed;
1407         }
1408
1409         if (pmp->wthread_td) {
1410                 mtx_lock(&pmp->wthread_mtx);
1411                 pmp->wthread_destroy = 1;
1412                 wakeup(&pmp->wthread_bioq);
1413                 while (pmp->wthread_destroy != -1) {
1414                         mtxsleep(&pmp->wthread_destroy,
1415                                 &pmp->wthread_mtx, 0,
1416                                 "umount-sleep", 0);
1417                 }
1418                 mtx_unlock(&pmp->wthread_mtx);
1419                 pmp->wthread_td = NULL;
1420         }
1421
1422         for (i = 0; i < pmp->cluster.nchains; ++i) {
1423                 hmp = pmp->cluster.chains[i]->hmp;
1424
1425                 hammer2_mount_exlock(hmp);
1426
1427                 --hmp->pmp_count;
1428                 kprintf("hammer2_unmount hmp=%p pmpcnt=%d\n",
1429                         hmp, hmp->pmp_count);
1430
1431                 /*
1432                  * Flush any left over chains.  The voldata lock is only used
1433                  * to synchronize against HAMMER2_CHAIN_MODIFIED_AUX.
1434                  */
1435                 hammer2_voldata_lock(hmp);
1436                 if (((hmp->vchain.flags | hmp->fchain.flags) &
1437                      HAMMER2_CHAIN_MODIFIED) ||
1438                     hmp->vchain.core->update_tid > hmp->voldata.mirror_tid ||
1439                     hmp->fchain.core->update_tid > hmp->voldata.freemap_tid) {
1440                         hammer2_voldata_unlock(hmp, 0);
1441                         hammer2_vfs_sync(mp, MNT_WAIT);
1442                         hammer2_vfs_sync(mp, MNT_WAIT);
1443                 } else {
1444                         hammer2_voldata_unlock(hmp, 0);
1445                 }
1446                 if (hmp->pmp_count == 0) {
1447                         if (((hmp->vchain.flags | hmp->fchain.flags) &
1448                              HAMMER2_CHAIN_MODIFIED) ||
1449                             (hmp->vchain.core->update_tid >
1450                              hmp->voldata.mirror_tid) ||
1451                             (hmp->fchain.core->update_tid >
1452                              hmp->voldata.freemap_tid)) {
1453                                 kprintf("hammer2_unmount: chains left over "
1454                                         "after final sync\n");
1455                                 if (hammer2_debug & 0x0010)
1456                                         Debugger("entered debugger");
1457                         }
1458                 }
1459
1460                 /*
1461                  * Cleanup the root and super-root chain elements
1462                  * (which should be clean).
1463                  */
1464                 if (pmp->iroot) {
1465 #if REPORT_REFS_ERRORS
1466                         if (pmp->iroot->refs != 1)
1467                                 kprintf("PMP->IROOT %p REFS WRONG %d\n",
1468                                         pmp->iroot, pmp->iroot->refs);
1469 #else
1470                         KKASSERT(pmp->iroot->refs == 1);
1471 #endif
1472                         /* ref for pmp->iroot */
1473                         hammer2_inode_drop(pmp->iroot);
1474                         pmp->iroot = NULL;
1475                 }
1476
1477                 rchain = pmp->cluster.chains[i];
1478                 if (rchain) {
1479                         atomic_clear_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
1480 #if REPORT_REFS_ERRORS
1481                         if (rchain->refs != 1)
1482                                 kprintf("PMP->RCHAIN %p REFS WRONG %d\n",
1483                                         rchain, rchain->refs);
1484 #else
1485                         KKASSERT(rchain->refs == 1);
1486 #endif
1487                         hammer2_chain_drop(rchain);
1488                         pmp->cluster.chains[i] = NULL;
1489                 }
1490
1491                 /*
1492                  * If no PFS's left drop the master hammer2_mount for the
1493                  * device.
1494                  */
1495                 if (hmp->pmp_count == 0) {
1496                         if (hmp->sroot) {
1497                                 hammer2_inode_drop(hmp->sroot);
1498                                 hmp->sroot = NULL;
1499                         }
1500
1501                         /*
1502                          * Finish up with the device vnode
1503                          */
1504                         if ((devvp = hmp->devvp) != NULL) {
1505                                 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
1506                                 hmp->devvp = NULL;
1507                                 VOP_CLOSE(devvp,
1508                                           (ronly ? FREAD : FREAD|FWRITE));
1509                                 vrele(devvp);
1510                                 devvp = NULL;
1511                         }
1512
1513                         /*
1514                          * Final drop of embedded freemap root chain to
1515                          * clean up fchain.core (fchain structure is not
1516                          * flagged ALLOCATED so it is cleaned out and then
1517                          * left to rot).
1518                          */
1519                         hammer2_chain_drop(&hmp->fchain);
1520
1521                         /*
1522                          * Final drop of embedded volume root chain to clean
1523                          * up vchain.core (vchain structure is not flagged
1524                          * ALLOCATED so it is cleaned out and then left to
1525                          * rot).
1526                          */
1527                         dumpcnt = 50;
1528                         hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt);
1529                         dumpcnt = 50;
1530                         hammer2_dump_chain(&hmp->fchain, 0, &dumpcnt);
1531                         hammer2_mount_unlock(hmp);
1532                         hammer2_chain_drop(&hmp->vchain);
1533
1534                         TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
1535                         kmalloc_destroy(&hmp->mchain);
1536                         kfree(hmp, M_HAMMER2);
1537                 } else {
1538                         hammer2_mount_unlock(hmp);
1539                 }
1540         }
1541
1542         pmp->mp = NULL;
1543         mp->mnt_data = NULL;
1544
1545         kmalloc_destroy(&pmp->mmsg);
1546         kmalloc_destroy(&pmp->minode);
1547
1548         kfree(pmp, M_HAMMER2);
1549         error = 0;
1550
1551 failed:
1552         lockmgr(&hammer2_mntlk, LK_RELEASE);
1553
1554         return (error);
1555 }
1556
/*
 * VFS vget entry point.
 *
 * Looking up a vnode by inode number is not implemented.  The call is
 * logged to the console and rejected with EOPNOTSUPP; *vpp is not touched.
 */
static int
hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
                 ino_t ino, struct vnode **vpp)
{
        kprintf("hammer2_vget\n");

        return (EOPNOTSUPP);
}
1565
1566 static
1567 int
1568 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
1569 {
1570         hammer2_pfsmount_t *pmp;
1571         hammer2_chain_t *parent;
1572         int error;
1573         struct vnode *vp;
1574
1575         pmp = MPTOPMP(mp);
1576         if (pmp->iroot == NULL) {
1577                 *vpp = NULL;
1578                 error = EINVAL;
1579         } else {
1580                 parent = hammer2_inode_lock_sh(pmp->iroot);
1581                 vp = hammer2_igetv(pmp->iroot, &error);
1582                 hammer2_inode_unlock_sh(pmp->iroot, parent);
1583                 *vpp = vp;
1584                 if (vp == NULL)
1585                         kprintf("vnodefail\n");
1586         }
1587
1588         return (error);
1589 }
1590
1591 /*
1592  * Filesystem status
1593  *
1594  * XXX incorporate ipdata->inode_quota and data_quota
1595  */
1596 static
1597 int
1598 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
1599 {
1600         hammer2_pfsmount_t *pmp;
1601         hammer2_mount_t *hmp;
1602
1603         pmp = MPTOPMP(mp);
1604         KKASSERT(pmp->cluster.nchains >= 1);
1605         hmp = pmp->cluster.chains[0]->hmp;      /* XXX */
1606
1607         mp->mnt_stat.f_files = pmp->inode_count;
1608         mp->mnt_stat.f_ffree = 0;
1609         mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1610         mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1611         mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
1612
1613         *sbp = mp->mnt_stat;
1614         return (0);
1615 }
1616
1617 static
1618 int
1619 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
1620 {
1621         hammer2_pfsmount_t *pmp;
1622         hammer2_mount_t *hmp;
1623
1624         pmp = MPTOPMP(mp);
1625         KKASSERT(pmp->cluster.nchains >= 1);
1626         hmp = pmp->cluster.chains[0]->hmp;      /* XXX */
1627
1628         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1629         mp->mnt_vstat.f_files = pmp->inode_count;
1630         mp->mnt_vstat.f_ffree = 0;
1631         mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1632         mp->mnt_vstat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1633         mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
1634
1635         *sbp = mp->mnt_vstat;
1636         return (0);
1637 }
1638
1639 /*
1640  * Sync the entire filesystem; this is called from the filesystem syncer
1641  * process periodically and whenever a user calls sync(1) on the hammer
1642  * mountpoint.
1643  *
1644  * Currently is actually called from the syncer! \o/
1645  *
1646  * This task will have to snapshot the state of the dirty inode chain.
1647  * From that, it will have to make sure all of the inodes on the dirty
1648  * chain have IO initiated. We make sure that io is initiated for the root
1649  * block.
1650  *
1651  * If waitfor is set, we wait for media to acknowledge the new rootblock.
1652  *
1653  * THINKS: side A vs side B, to have sync not stall all I/O?
1654  */
1655 int
1656 hammer2_vfs_sync(struct mount *mp, int waitfor)
1657 {
1658         struct hammer2_sync_info info;
1659         hammer2_chain_t *chain;
1660         hammer2_pfsmount_t *pmp;
1661         hammer2_mount_t *hmp;
1662         int flags;
1663         int error;
1664         int total_error;
1665         int force_fchain;
1666         int i;
1667
1668         pmp = MPTOPMP(mp);
1669
1670         /*
1671          * We can't acquire locks on existing vnodes while in a transaction
1672          * without risking a deadlock.  This assumes that vfsync() can be
1673          * called without the vnode locked (which it can in DragonFly).
1674          * Otherwise we'd have to implement a multi-pass or flag the lock
1675          * failures and retry.
1676          *
1677          * The reclamation code interlocks with the sync list's token
1678          * (by removing the vnode from the scan list) before unlocking
1679          * the inode, giving us time to ref the inode.
1680          */
1681         /*flags = VMSC_GETVP;*/
1682         flags = 0;
1683         if (waitfor & MNT_LAZY)
1684                 flags |= VMSC_ONEPASS;
1685
1686         /*
1687          * Initialize a normal transaction and sync everything out, then
1688          * wait for pending I/O to finish (so it gets a transaction id
1689          * that the meta-data flush will catch).
1690          */
1691         hammer2_trans_init(&info.trans, pmp, 0);
1692         info.error = 0;
1693         info.waitfor = MNT_NOWAIT;
1694         vsyncscan(mp, flags | VMSC_NOWAIT, hammer2_sync_scan2, &info);
1695
1696         if (info.error == 0 && (waitfor & MNT_WAIT)) {
1697                 info.waitfor = waitfor;
1698                     vsyncscan(mp, flags, hammer2_sync_scan2, &info);
1699
1700         }
1701         hammer2_trans_done(&info.trans);
1702         hammer2_bioq_sync(info.trans.pmp);
1703
1704         /*
1705          * Start the flush transaction and flush all meta-data.
1706          */
1707         hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH);
1708
1709         total_error = 0;
1710         for (i = 0; i < pmp->cluster.nchains; ++i) {
1711                 hmp = pmp->cluster.chains[i]->hmp;
1712
1713                 /*
1714                  * Media mounts have two 'roots', vchain for the topology
1715                  * and fchain for the free block table.  Flush both.
1716                  *
1717                  * Note that the topology and free block table are handled
1718                  * independently, so the free block table can wind up being
1719                  * ahead of the topology.  We depend on the bulk free scan
1720                  * code to deal with any loose ends.
1721                  */
1722                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1723                 if ((hmp->vchain.flags & HAMMER2_CHAIN_MODIFIED) ||
1724                     hmp->vchain.core->update_tid > hmp->voldata.mirror_tid) {
1725                         chain = &hmp->vchain;
1726                         hammer2_chain_flush(&info.trans, &chain);
1727                         KKASSERT(chain == &hmp->vchain);
1728                         force_fchain = 1;
1729                 } else {
1730                         force_fchain = 0;
1731                 }
1732                 hammer2_chain_unlock(&hmp->vchain);
1733
1734                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1735                 if ((hmp->fchain.flags & HAMMER2_CHAIN_MODIFIED) ||
1736                     hmp->fchain.core->update_tid > hmp->voldata.freemap_tid ||
1737                     force_fchain) {
1738                         /* this will also modify vchain as a side effect */
1739                         chain = &hmp->fchain;
1740                         hammer2_chain_flush(&info.trans, &chain);
1741                         KKASSERT(chain == &hmp->fchain);
1742                 }
1743                 hammer2_chain_unlock(&hmp->fchain);
1744
1745                 error = 0;
1746
1747                 /*
1748                  * We can't safely flush the volume header until we have
1749                  * flushed any device buffers which have built up.
1750                  *
1751                  * XXX this isn't being incremental
1752                  */
1753                 vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
1754                 error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
1755                 vn_unlock(hmp->devvp);
1756
1757                 /*
1758                  * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1759                  * volume header needs synchronization via hmp->volsync.
1760                  *
1761                  * XXX synchronize the flag & data with only this flush XXX
1762                  */
1763                 if (error == 0 &&
1764                     (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1765                         struct buf *bp;
1766
1767                         /*
1768                          * Synchronize the disk before flushing the volume
1769                          * header.
1770                          */
1771                         bp = getpbuf(NULL);
1772                         bp->b_bio1.bio_offset = 0;
1773                         bp->b_bufsize = 0;
1774                         bp->b_bcount = 0;
1775                         bp->b_cmd = BUF_CMD_FLUSH;
1776                         bp->b_bio1.bio_done = biodone_sync;
1777                         bp->b_bio1.bio_flags |= BIO_SYNC;
1778                         vn_strategy(hmp->devvp, &bp->b_bio1);
1779                         biowait(&bp->b_bio1, "h2vol");
1780                         relpbuf(bp, NULL);
1781
1782                         /*
1783                          * Then we can safely flush the version of the
1784                          * volume header synchronized by the flush code.
1785                          */
1786                         i = hmp->volhdrno + 1;
1787                         if (i >= HAMMER2_NUM_VOLHDRS)
1788                                 i = 0;
1789                         if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1790                             hmp->volsync.volu_size) {
1791                                 i = 0;
1792                         }
1793                         kprintf("sync volhdr %d %jd\n",
1794                                 i, (intmax_t)hmp->volsync.volu_size);
1795                         bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
1796                                     HAMMER2_PBUFSIZE, 0, 0);
1797                         atomic_clear_int(&hmp->vchain.flags,
1798                                          HAMMER2_CHAIN_VOLUMESYNC);
1799                         bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
1800                         bawrite(bp);
1801                         hmp->volhdrno = i;
1802                 }
1803                 if (error)
1804                         total_error = error;
1805         }
1806         hammer2_trans_done(&info.trans);
1807
1808         return (total_error);
1809 }
1810
1811 /*
1812  * Sync passes.
1813  *
1814  * NOTE: We don't test update_tid or MOVED here because the fsync code
1815  *       won't flush on those flags.  The syncer code above will do a
1816  *       general meta-data flush globally that will catch these flags.
1817  */
1818
1819 static int
1820 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1821 {
1822         struct hammer2_sync_info *info = data;
1823         hammer2_inode_t *ip;
1824         int error;
1825
1826         /*
1827          *
1828          */
1829         ip = VTOI(vp);
1830         if (ip == NULL)
1831                 return(0);
1832         if (vp->v_type == VNON || vp->v_type == VBAD) {
1833                 vclrisdirty(vp);
1834                 return(0);
1835         }
1836         if ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
1837             RB_EMPTY(&vp->v_rbdirty_tree)) {
1838                 vclrisdirty(vp);
1839                 return(0);
1840         }
1841
1842         /*
1843          * VOP_FSYNC will start a new transaction so replicate some code
1844          * here to do it inline (see hammer2_vop_fsync()).
1845          *
1846          * WARNING: The vfsync interacts with the buffer cache and might
1847          *          block, we can't hold the inode lock at that time.
1848          *          However, we MUST ref ip before blocking to ensure that
1849          *          it isn't ripped out from under us (since we do not
1850          *          hold a lock on the vnode).
1851          */
1852         hammer2_inode_ref(ip);
1853         atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1854         if (vp)
1855                 vfsync(vp, MNT_NOWAIT, 1, NULL, NULL);
1856
1857 #if 0
1858         /*
1859          * XXX this interferes with flush operations mainly because the
1860          *     same transaction id is being used by asynchronous buffer
1861          *     operations above and can be reordered after the flush
1862          *     below.
1863          */
1864         parent = hammer2_inode_lock_ex(ip);
1865         hammer2_chain_flush(&info->trans, &parent);
1866         hammer2_inode_unlock_ex(ip, parent);
1867 #endif
1868         hammer2_inode_drop(ip);
1869         error = 0;
1870 #if 0
1871         error = VOP_FSYNC(vp, MNT_NOWAIT, 0);
1872 #endif
1873         if (error)
1874                 info->error = error;
1875         return(0);
1876 }
1877
/*
 * VFS vnode-to-file-handle entry point.
 *
 * Placeholder: nothing is stored in *fhp and success (0) is reported.
 */
static int
hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
{
        return 0;
}
1884
/*
 * VFS file-handle-to-vnode entry point.
 *
 * Placeholder: *vpp is left untouched and success (0) is reported.
 */
static int
hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                   struct fid *fhp, struct vnode **vpp)
{
        return 0;
}
1892
/*
 * VFS export check entry point.
 *
 * Placeholder: neither *exflagsp nor *credanonp is filled in and
 * success (0) is reported.
 */
static int
hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                     int *exflagsp, struct ucred **credanonp)
{
        return 0;
}
1900
1901 /*
1902  * Support code for hammer2_mount().  Read, verify, and install the volume
1903  * header into the HMP
1904  *
1905  * XXX read four volhdrs and use the one with the highest TID whos CRC
1906  *     matches.
1907  *
1908  * XXX check iCRCs.
1909  *
1910  * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to
1911  *     nonexistant locations.
1912  *
1913  * XXX Record selected volhdr and ring updates to each of 4 volhdrs
1914  */
1915 static
1916 int
1917 hammer2_install_volume_header(hammer2_mount_t *hmp)
1918 {
1919         hammer2_volume_data_t *vd;
1920         struct buf *bp;
1921         hammer2_crc32_t crc0, crc, bcrc0, bcrc;
1922         int error_reported;
1923         int error;
1924         int valid;
1925         int i;
1926
1927         error_reported = 0;
1928         error = 0;
1929         valid = 0;
1930         bp = NULL;
1931
1932         /*
1933          * There are up to 4 copies of the volume header (syncs iterate
1934          * between them so there is no single master).  We don't trust the
1935          * volu_size field so we don't know precisely how large the filesystem
1936          * is, so depend on the OS to return an error if we go beyond the
1937          * block device's EOF.
1938          */
1939         for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
1940                 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
1941                               HAMMER2_VOLUME_BYTES, &bp);
1942                 if (error) {
1943                         brelse(bp);
1944                         bp = NULL;
1945                         continue;
1946                 }
1947
1948                 vd = (struct hammer2_volume_data *) bp->b_data;
1949                 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
1950                     (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
1951                         brelse(bp);
1952                         bp = NULL;
1953                         continue;
1954                 }
1955
1956                 if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
1957                         /* XXX: Reversed-endianness filesystem */
1958                         kprintf("hammer2: reverse-endian filesystem detected");
1959                         brelse(bp);
1960                         bp = NULL;
1961                         continue;
1962                 }
1963
1964                 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
1965                 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
1966                                       HAMMER2_VOLUME_ICRC0_SIZE);
1967                 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
1968                 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
1969                                        HAMMER2_VOLUME_ICRC1_SIZE);
1970                 if ((crc0 != crc) || (bcrc0 != bcrc)) {
1971                         kprintf("hammer2 volume header crc "
1972                                 "mismatch copy #%d %08x/%08x\n",
1973                                 i, crc0, crc);
1974                         error_reported = 1;
1975                         brelse(bp);
1976                         bp = NULL;
1977                         continue;
1978                 }
1979                 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
1980                         valid = 1;
1981                         hmp->voldata = *vd;
1982                         hmp->volhdrno = i;
1983                 }
1984                 brelse(bp);
1985                 bp = NULL;
1986         }
1987         if (valid) {
1988                 hmp->volsync = hmp->voldata;
1989                 error = 0;
1990                 if (error_reported || bootverbose || 1) { /* 1/DEBUG */
1991                         kprintf("hammer2: using volume header #%d\n",
1992                                 hmp->volhdrno);
1993                 }
1994         } else {
1995                 error = EINVAL;
1996                 kprintf("hammer2: no valid volume headers found!\n");
1997         }
1998         return (error);
1999 }
2000
2001 /*
2002  * Reconnect using the passed file pointer.  The caller must ref the
2003  * fp for us.
2004  */
2005 void
2006 hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
2007 {
2008         hammer2_inode_data_t *ipdata;
2009         hammer2_chain_t *parent;
2010         hammer2_mount_t *hmp;
2011         size_t name_len;
2012
2013         hmp = pmp->cluster.chains[0]->hmp;      /* XXX */
2014
2015         /*
2016          * Closes old comm descriptor, kills threads, cleans up
2017          * states, then installs the new descriptor and creates
2018          * new threads.
2019          */
2020         kdmsg_iocom_reconnect(&pmp->iocom, fp, "hammer2");
2021
2022         /*
2023          * Setup LNK_CONN fields for autoinitiated state machine
2024          */
2025         parent = hammer2_inode_lock_ex(pmp->iroot);
2026         ipdata = &parent->data->ipdata;
2027         pmp->iocom.auto_lnk_conn.pfs_clid = ipdata->pfs_clid;
2028         pmp->iocom.auto_lnk_conn.pfs_fsid = ipdata->pfs_fsid;
2029         pmp->iocom.auto_lnk_conn.pfs_type = ipdata->pfs_type;
2030         pmp->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
2031         pmp->iocom.auto_lnk_conn.peer_type = hmp->voldata.peer_type;
2032
2033         /*
2034          * Filter adjustment.  Clients do not need visibility into other
2035          * clients (otherwise millions of clients would present a serious
2036          * problem).  The fs_label also serves to restrict the namespace.
2037          */
2038         pmp->iocom.auto_lnk_conn.peer_mask = 1LLU << HAMMER2_PEER_HAMMER2;
2039         pmp->iocom.auto_lnk_conn.pfs_mask = (uint64_t)-1;
2040         switch (ipdata->pfs_type) {
2041         case DMSG_PFSTYPE_CLIENT:
2042                 pmp->iocom.auto_lnk_conn.peer_mask &=
2043                                 ~(1LLU << DMSG_PFSTYPE_CLIENT);
2044                 break;
2045         default:
2046                 break;
2047         }
2048
2049         name_len = ipdata->name_len;
2050         if (name_len >= sizeof(pmp->iocom.auto_lnk_conn.fs_label))
2051                 name_len = sizeof(pmp->iocom.auto_lnk_conn.fs_label) - 1;
2052         bcopy(ipdata->filename,
2053               pmp->iocom.auto_lnk_conn.fs_label,
2054               name_len);
2055         pmp->iocom.auto_lnk_conn.fs_label[name_len] = 0;
2056
2057         /*
2058          * Setup LNK_SPAN fields for autoinitiated state machine
2059          */
2060         pmp->iocom.auto_lnk_span.pfs_clid = ipdata->pfs_clid;
2061         pmp->iocom.auto_lnk_span.pfs_fsid = ipdata->pfs_fsid;
2062         pmp->iocom.auto_lnk_span.pfs_type = ipdata->pfs_type;
2063         pmp->iocom.auto_lnk_span.peer_type = hmp->voldata.peer_type;
2064         pmp->iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1;
2065         name_len = ipdata->name_len;
2066         if (name_len >= sizeof(pmp->iocom.auto_lnk_span.fs_label))
2067                 name_len = sizeof(pmp->iocom.auto_lnk_span.fs_label) - 1;
2068         bcopy(ipdata->filename,
2069               pmp->iocom.auto_lnk_span.fs_label,
2070               name_len);
2071         pmp->iocom.auto_lnk_span.fs_label[name_len] = 0;
2072         hammer2_inode_unlock_ex(pmp->iroot, parent);
2073
2074         kdmsg_iocom_autoinitiate(&pmp->iocom, hammer2_autodmsg);
2075 }
2076
2077 static int
2078 hammer2_rcvdmsg(kdmsg_msg_t *msg)
2079 {
2080         switch(msg->any.head.cmd & DMSGF_TRANSMASK) {
2081         case DMSG_DBG_SHELL:
2082                 /*
2083                  * (non-transaction)
2084                  * Execute shell command (not supported atm)
2085                  */
2086                 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2087                 break;
2088         case DMSG_DBG_SHELL | DMSGF_REPLY:
2089                 /*
2090                  * (non-transaction)
2091                  */
2092                 if (msg->aux_data) {
2093                         msg->aux_data[msg->aux_size - 1] = 0;
2094                         kprintf("HAMMER2 DBG: %s\n", msg->aux_data);
2095                 }
2096                 break;
2097         default:
2098                 /*
2099                  * Unsupported message received.  We only need to
2100                  * reply if it's a transaction in order to close our end.
2101                  * Ignore any one-way messages are any further messages
2102                  * associated with the transaction.
2103                  *
2104                  * NOTE: This case also includes DMSG_LNK_ERROR messages
2105                  *       which might be one-way, replying to those would
2106                  *       cause an infinite ping-pong.
2107                  */
2108                 if (msg->any.head.cmd & DMSGF_CREATE)
2109                         kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2110                 break;
2111         }
2112         return(0);
2113 }
2114
2115 /*
2116  * This function is called after KDMSG has automatically handled processing
2117  * of a LNK layer message (typically CONN, SPAN, or CIRC).
2118  *
2119  * We tag off the LNK_CONN to trigger our LNK_VOLCONF messages which
2120  * advertises all available hammer2 super-root volumes.
2121  */
2122 static void
2123 hammer2_autodmsg(kdmsg_msg_t *msg)
2124 {
2125         hammer2_pfsmount_t *pmp = msg->iocom->handle;
2126         hammer2_mount_t *hmp = pmp->cluster.chains[0]->hmp; /* XXX */
2127         int copyid;
2128
2129         /*
2130          * We only care about replies to our LNK_CONN auto-request.  kdmsg
2131          * has already processed the reply, we use this calback as a shim
2132          * to know when we can advertise available super-root volumes.
2133          */
2134         if ((msg->any.head.cmd & DMSGF_TRANSMASK) !=
2135             (DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_REPLY) ||
2136             msg->state == NULL) {
2137                 return;
2138         }
2139
2140         kprintf("LNK_CONN REPLY RECEIVED CMD %08x\n", msg->any.head.cmd);
2141
2142         if (msg->any.head.cmd & DMSGF_CREATE) {
2143                 kprintf("HAMMER2: VOLDATA DUMP\n");
2144
2145                 /*
2146                  * Dump the configuration stored in the volume header
2147                  */
2148                 hammer2_voldata_lock(hmp);
2149                 for (copyid = 0; copyid < HAMMER2_COPYID_COUNT; ++copyid) {
2150                         if (hmp->voldata.copyinfo[copyid].copyid == 0)
2151                                 continue;
2152                         hammer2_volconf_update(pmp, copyid);
2153                 }
2154                 hammer2_voldata_unlock(hmp, 0);
2155         }
2156         if ((msg->any.head.cmd & DMSGF_DELETE) &&
2157             msg->state && (msg->state->txcmd & DMSGF_DELETE) == 0) {
2158                 kprintf("HAMMER2: CONN WAS TERMINATED\n");
2159         }
2160 }
2161
2162 /*
2163  * Volume configuration updates are passed onto the userland service
2164  * daemon via the open LNK_CONN transaction.
2165  */
2166 void
2167 hammer2_volconf_update(hammer2_pfsmount_t *pmp, int index)
2168 {
2169         hammer2_mount_t *hmp = pmp->cluster.chains[0]->hmp;     /* XXX */
2170         kdmsg_msg_t *msg;
2171
2172         /* XXX interlock against connection state termination */
2173         kprintf("volconf update %p\n", pmp->iocom.conn_state);
2174         if (pmp->iocom.conn_state) {
2175                 kprintf("TRANSMIT VOLCONF VIA OPEN CONN TRANSACTION\n");
2176                 msg = kdmsg_msg_alloc_state(pmp->iocom.conn_state,
2177                                             DMSG_LNK_VOLCONF, NULL, NULL);
2178                 msg->any.lnk_volconf.copy = hmp->voldata.copyinfo[index];
2179                 msg->any.lnk_volconf.mediaid = hmp->voldata.fsid;
2180                 msg->any.lnk_volconf.index = index;
2181                 kdmsg_msg_write(msg);
2182         }
2183 }
2184
2185 void
2186 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp)
2187 {
2188         hammer2_chain_layer_t *layer;
2189         hammer2_chain_t *scan;
2190         hammer2_chain_t *first_parent;
2191
2192         --*countp;
2193         if (*countp == 0) {
2194                 kprintf("%*.*s...\n", tab, tab, "");
2195                 return;
2196         }
2197         if (*countp < 0)
2198                 return;
2199         first_parent = chain->core ? TAILQ_FIRST(&chain->core->ownerq) : NULL;
2200         kprintf("%*.*schain %p.%d %016jx/%d mir=%016jx\n",
2201                 tab, tab, "",
2202                 chain, chain->bref.type,
2203                 chain->bref.key, chain->bref.keybits,
2204                 chain->bref.mirror_tid);
2205
2206         kprintf("%*.*s      [%08x] (%s) dt=%016jx refs=%d\n",
2207                 tab, tab, "",
2208                 chain->flags,
2209                 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
2210                 chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
2211                 chain->delete_tid,
2212                 chain->refs);
2213
2214         kprintf("%*.*s      core %p [%08x] fp=%p np=%p",
2215                 tab, tab, "",
2216                 chain->core, (chain->core ? chain->core->flags : 0),
2217                 first_parent,
2218                 (first_parent ? TAILQ_NEXT(chain, core_entry) : NULL));
2219
2220         if (first_parent)
2221                 kprintf(" [fpflags %08x fprefs %d\n",
2222                         first_parent->flags,
2223                         first_parent->refs);
2224         if (chain->core == NULL || TAILQ_EMPTY(&chain->core->layerq))
2225                 kprintf("\n");
2226         else
2227                 kprintf(" {\n");
2228         if (chain->core) {
2229                 TAILQ_FOREACH(layer, &chain->core->layerq, entry) {
2230                         RB_FOREACH(scan, hammer2_chain_tree, &layer->rbtree) {
2231                                 hammer2_dump_chain(scan, tab + 4, countp);
2232                         }
2233                 }
2234         }
2235         if (chain->core && !TAILQ_EMPTY(&chain->core->layerq)) {
2236                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
2237                         kprintf("%*.*s}(%s)\n", tab, tab, "",
2238                                 chain->data->ipdata.filename);
2239                 else
2240                         kprintf("%*.*s}\n", tab, tab, "");
2241         }
2242 }