hammer2 - Revamp hammer2_cluster structure part 1
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vfsops.c
1 /*-
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/nlookup.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/fcntl.h>
42 #include <sys/buf.h>
43 #include <sys/uuid.h>
44 #include <sys/vfsops.h>
45 #include <sys/sysctl.h>
46 #include <sys/socket.h>
47 #include <sys/objcache.h>
48
49 #include <sys/proc.h>
50 #include <sys/namei.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54
55 #include <sys/mutex.h>
56 #include <sys/mutex2.h>
57
58 #include "hammer2.h"
59 #include "hammer2_disk.h"
60 #include "hammer2_mount.h"
61
62 #include "hammer2.h"
63 #include "hammer2_lz4.h"
64
65 #include "zlib/hammer2_zlib.h"
66
67 #define REPORT_REFS_ERRORS 1    /* XXX remove me */
68
69 MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");
70
71 struct hammer2_sync_info {   /* NOTE(review): users are outside this view; presumably the 'data' cookie for hammer2_sync_scan1/2 -- confirm */
72         hammer2_trans_t trans;  /* transaction carried along with the sync state */
73         int error;              /* presumably the accumulated error for the pass -- confirm */
74         int waitfor;            /* presumably the waitfor argument of hammer2_vfs_sync() -- confirm */
75 };
76
77 TAILQ_HEAD(hammer2_mntlist, hammer2_mount);     /* list head type for hammer2_mount entries (linked via mntentry) */
78 static struct hammer2_mntlist hammer2_mntlist;  /* device mounts; scanned by devvp in hammer2_vfs_mount() */
79 static struct lock hammer2_mntlk;               /* taken LK_EXCLUSIVE around the list scan/insert in hammer2_vfs_mount() */
80
81 int hammer2_debug;
82 int hammer2_cluster_enable = 1;
83 int hammer2_hardlink_enable = 1;
84 long hammer2_iod_file_read;
85 long hammer2_iod_meta_read;
86 long hammer2_iod_indr_read;
87 long hammer2_iod_fmap_read;
88 long hammer2_iod_volu_read;
89 long hammer2_iod_file_write;
90 long hammer2_iod_meta_write;
91 long hammer2_iod_indr_write;
92 long hammer2_iod_fmap_write;
93 long hammer2_iod_volu_write;
94 long hammer2_ioa_file_read;
95 long hammer2_ioa_meta_read;
96 long hammer2_ioa_indr_read;
97 long hammer2_ioa_fmap_read;
98 long hammer2_ioa_volu_read;
99 long hammer2_ioa_fmap_write;
100 long hammer2_ioa_file_write;
101 long hammer2_ioa_meta_write;
102 long hammer2_ioa_indr_write;
103 long hammer2_ioa_volu_write;
104
105 MALLOC_DECLARE(C_BUFFER);
106 MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");
107
108 MALLOC_DECLARE(D_BUFFER);
109 MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");
110
111 MALLOC_DECLARE(W_BIOQUEUE);
112 MALLOC_DEFINE(W_BIOQUEUE, "wbioqueue", "Writing bio queue.");
113
114 MALLOC_DECLARE(W_MTX);
115 MALLOC_DEFINE(W_MTX, "wmutex", "Mutex for write thread.");
116
117 SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");
118
119 SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
120            &hammer2_debug, 0, "");
121 SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
122            &hammer2_cluster_enable, 0, "");
123 SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
124            &hammer2_hardlink_enable, 0, "");
125
126 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
127            &hammer2_iod_file_read, 0, "");
128 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
129            &hammer2_iod_meta_read, 0, "");
130 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
131            &hammer2_iod_indr_read, 0, "");
132 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
133            &hammer2_iod_fmap_read, 0, "");
134 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
135            &hammer2_iod_volu_read, 0, "");
136
137 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
138            &hammer2_iod_file_write, 0, "");
139 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
140            &hammer2_iod_meta_write, 0, "");
141 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
142            &hammer2_iod_indr_write, 0, "");
143 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
144            &hammer2_iod_fmap_write, 0, "");
145 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
146            &hammer2_iod_volu_write, 0, "");
147
148 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
149            &hammer2_ioa_file_read, 0, "");
150 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
151            &hammer2_ioa_meta_read, 0, "");
152 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
153            &hammer2_ioa_indr_read, 0, "");
154 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
155            &hammer2_ioa_fmap_read, 0, "");
156 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
157            &hammer2_ioa_volu_read, 0, "");
158
159 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
160            &hammer2_ioa_file_write, 0, "");
161 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
162            &hammer2_ioa_meta_write, 0, "");
163 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
164            &hammer2_ioa_indr_write, 0, "");
165 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
166            &hammer2_ioa_fmap_write, 0, "");
167 SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
168            &hammer2_ioa_volu_write, 0, "");
169
170 static int hammer2_vfs_init(struct vfsconf *conf);
171 static int hammer2_vfs_uninit(struct vfsconf *vfsp);
172 static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
173                                 struct ucred *cred);
174 static int hammer2_remount(hammer2_mount_t *, char *, struct vnode *,
175                                 struct ucred *);
176 static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
177 static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
178 static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
179                                 struct ucred *cred);
180 static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
181                                 struct ucred *cred);
182 static int hammer2_vfs_sync(struct mount *mp, int waitfor);
183 static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
184                                 ino_t ino, struct vnode **vpp);
185 static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
186                                 struct fid *fhp, struct vnode **vpp);
187 static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
188 static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
189                                 int *exflagsp, struct ucred **credanonp);
190
191 static int hammer2_install_volume_header(hammer2_mount_t *hmp);
192 static int hammer2_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
193 static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);
194
195 static void hammer2_write_thread(void *arg);
196
197 /* 
198  * Functions for compression in threads,
199  * from hammer2_vnops.c
200  */
201 static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
202                                 hammer2_inode_t *ip,
203                                 hammer2_inode_data_t *ipdata,
204                                 hammer2_chain_t **parentp,
205                                 hammer2_key_t lbase, int ioflag, int pblksize,
206                                 int *errorp);
207 static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
208                                 hammer2_inode_t *ip,
209                                 hammer2_inode_data_t *ipdata,
210                                 hammer2_chain_t **parentp,
211                                 hammer2_key_t lbase, int ioflag,
212                                 int pblksize, int *errorp, int comp_algo);
213 static void hammer2_zero_check_and_write(struct buf *bp,
214                                 hammer2_trans_t *trans, hammer2_inode_t *ip,
215                                 hammer2_inode_data_t *ipdata,
216                                 hammer2_chain_t **parentp,
217                                 hammer2_key_t lbase,
218                                 int ioflag, int pblksize, int *errorp);
219 static int test_block_zeros(const char *buf, size_t bytes);
220 static void zero_write(struct buf *bp, hammer2_trans_t *trans,
221                                 hammer2_inode_t *ip,
222                                 hammer2_inode_data_t *ipdata,
223                                 hammer2_chain_t **parentp, 
224                                 hammer2_key_t lbase,
225                                 int *errorp);
226 static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
227                                 int ioflag, int pblksize, int *errorp);
228
229 static int hammer2_rcvdmsg(kdmsg_msg_t *msg);
230 static void hammer2_autodmsg(kdmsg_msg_t *msg);
231
232
233 /*
234  * HAMMER2 vfs operations.
235  */
236 static struct vfsops hammer2_vfsops = {       /* handed to VFS_SET() under the name "hammer2" */
237         .vfs_init       = hammer2_vfs_init,     /* creates the buffer objcaches and the mount list/lock */
238         .vfs_uninit = hammer2_vfs_uninit,       /* destroys the buffer objcaches */
239         .vfs_sync       = hammer2_vfs_sync,
240         .vfs_mount      = hammer2_vfs_mount,    /* handles both new mounts and MNT_UPDATE */
241         .vfs_unmount    = hammer2_vfs_unmount,
242         .vfs_root       = hammer2_vfs_root,
243         .vfs_statfs     = hammer2_vfs_statfs,
244         .vfs_statvfs    = hammer2_vfs_statvfs,
245         .vfs_vget       = hammer2_vfs_vget,
246         .vfs_vptofh     = hammer2_vfs_vptofh,
247         .vfs_fhtovp     = hammer2_vfs_fhtovp,
248         .vfs_checkexp   = hammer2_vfs_checkexp
249 };
250
251 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
252
253 VFS_SET(hammer2_vfsops, hammer2, 0);
254 MODULE_VERSION(hammer2, 1);
255
256 static
257 int
258 hammer2_vfs_init(struct vfsconf *conf)
259 {
260         static struct objcache_malloc_args margs_read;
261         static struct objcache_malloc_args margs_write;
262
263         int error;
264
265         error = 0;
266
267         if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
268                 error = EINVAL;
269         if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
270                 error = EINVAL;
271         if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
272                 error = EINVAL;
273
274         if (error)
275                 kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
276         
277         margs_read.objsize = 65536;
278         margs_read.mtype = D_BUFFER;
279         
280         margs_write.objsize = 32768;
281         margs_write.mtype = C_BUFFER;
282         
283         cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
284                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
285                                 objcache_malloc_free, &margs_read);
286         cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
287                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
288                                 objcache_malloc_free, &margs_write);
289
290         lockinit(&hammer2_mntlk, "mntlk", 0, 0);
291         TAILQ_INIT(&hammer2_mntlist);
292
293         return (error);
294 }
295
296 static
297 int
298 hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
299 {
300         objcache_destroy(cache_buffer_read);
301         objcache_destroy(cache_buffer_write);
302         return 0;
303 }
304
305 /*
306  * Mount or remount HAMMER2 fileystem from physical media
307  *
308  *      mountroot
309  *              mp              mount point structure
310  *              path            NULL
311  *              data            <unused>
312  *              cred            <unused>
313  *
314  *      mount
315  *              mp              mount point structure
316  *              path            path to mount point
317  *              data            pointer to argument structure in user space
318  *                      volume  volume path (device@LABEL form)
319  *                      hflags  user mount flags
320  *              cred            user credentials
321  *
322  * RETURNS:     0       Success
323  *              !0      error number
324  */
325 static
326 int
327 hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
328                   struct ucred *cred)
329 {
330         struct hammer2_mount_info info;
331         hammer2_pfsmount_t *pmp;
332         hammer2_mount_t *hmp;
333         hammer2_key_t key_next;
334         hammer2_key_t key_dummy;
335         hammer2_key_t lhc;
336         struct vnode *devvp;
337         struct nlookupdata nd;
338         hammer2_chain_t *parent;
339         hammer2_chain_t *schain;
340         hammer2_chain_t *rchain;
341         struct file *fp;
342         char devstr[MNAMELEN];
343         size_t size;
344         size_t done;
345         char *dev;
346         char *label;
347         int ronly = 1;
348         int error;
349         int cache_index;
350         int i;
351
352         hmp = NULL;
353         pmp = NULL;
354         dev = NULL;
355         label = NULL;
356         devvp = NULL;
357         cache_index = -1;
358
359         kprintf("hammer2_mount\n");
360
361         if (path == NULL) {
362                 /*
363                  * Root mount
364                  */
365                 bzero(&info, sizeof(info));
366                 info.cluster_fd = -1;
367                 return (EOPNOTSUPP);
368         } else {
369                 /*
370                  * Non-root mount or updating a mount
371                  */
372                 error = copyin(data, &info, sizeof(info));
373                 if (error)
374                         return (error);
375
376                 error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
377                 if (error)
378                         return (error);
379
380                 /* Extract device and label */
381                 dev = devstr;
382                 label = strchr(devstr, '@');
383                 if (label == NULL ||
384                     ((label + 1) - dev) > done) {
385                         return (EINVAL);
386                 }
387                 *label = '\0';
388                 label++;
389                 if (*label == '\0')
390                         return (EINVAL);
391
392                 if (mp->mnt_flag & MNT_UPDATE) {
393                         /* Update mount */
394                         /* HAMMER2 implements NFS export via mountctl */
395                         pmp = MPTOPMP(mp);
396                         for (i = 0; i < pmp->cluster.nchains; ++i) {
397                                 hmp = pmp->cluster.chains[i]->hmp;
398                                 devvp = hmp->devvp;
399                                 error = hammer2_remount(hmp, path, devvp, cred);
400                                 if (error)
401                                         break;
402                         }
403                         return error;
404                 }
405         }
406
407         /*
408          * PFS mount
409          *
410          * Lookup name and verify it refers to a block device.
411          */
412         error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
413         if (error == 0)
414                 error = nlookup(&nd);
415         if (error == 0)
416                 error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
417         nlookup_done(&nd);
418
419         if (error == 0) {
420                 if (vn_isdisk(devvp, &error))
421                         error = vfs_mountedon(devvp);
422         }
423
424         /*
425          * Determine if the device has already been mounted.  After this
426          * check hmp will be non-NULL if we are doing the second or more
427          * hammer2 mounts from the same device.
428          */
429         lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
430         TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
431                 if (hmp->devvp == devvp)
432                         break;
433         }
434
435         /*
436          * Open the device if this isn't a secondary mount and construct
437          * the H2 device mount (hmp).
438          */
439         if (hmp == NULL) {
440                 if (error == 0 && vcount(devvp) > 0)
441                         error = EBUSY;
442
443                 /*
444                  * Now open the device
445                  */
446                 if (error == 0) {
447                         ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
448                         vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
449                         error = vinvalbuf(devvp, V_SAVE, 0, 0);
450                         if (error == 0) {
451                                 error = VOP_OPEN(devvp,
452                                                  ronly ? FREAD : FREAD | FWRITE,
453                                                  FSCRED, NULL);
454                         }
455                         vn_unlock(devvp);
456                 }
457                 if (error && devvp) {
458                         vrele(devvp);
459                         devvp = NULL;
460                 }
461                 if (error) {
462                         lockmgr(&hammer2_mntlk, LK_RELEASE);
463                         return error;
464                 }
465                 hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
466                 hmp->ronly = ronly;
467                 hmp->devvp = devvp;
468                 kmalloc_create(&hmp->mchain, "HAMMER2-chains");
469                 TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);
470
471                 lockinit(&hmp->alloclk, "h2alloc", 0, 0);
472                 lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);
473                 TAILQ_INIT(&hmp->transq);
474
475                 /*
476                  * vchain setup. vchain.data is embedded.
477                  * vchain.refs is initialized and will never drop to 0.
478                  */
479                 hmp->vchain.hmp = hmp;
480                 hmp->vchain.refs = 1;
481                 hmp->vchain.data = (void *)&hmp->voldata;
482                 hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
483                 hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
484                 hmp->vchain.delete_tid = HAMMER2_MAX_TID;
485                 hammer2_chain_core_alloc(NULL, &hmp->vchain, NULL);
486                 /* hmp->vchain.u.xxx is left NULL */
487
488                 /*
489                  * fchain setup.  fchain.data is embedded.
490                  * fchain.refs is initialized and will never drop to 0.
491                  *
492                  * The data is not used but needs to be initialized to
493                  * pass assertion muster.  We use this chain primarily
494                  * as a placeholder for the freemap's top-level RBTREE
495                  * so it does not interfere with the volume's topology
496                  * RBTREE.
497                  */
498                 hmp->fchain.hmp = hmp;
499                 hmp->fchain.refs = 1;
500                 hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
501                 hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
502                 hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
503                 hmp->fchain.bref.methods =
504                         HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
505                         HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
506                 hmp->fchain.delete_tid = HAMMER2_MAX_TID;
507
508                 hammer2_chain_core_alloc(NULL, &hmp->fchain, NULL);
509                 /* hmp->fchain.u.xxx is left NULL */
510
511                 /*
512                  * Install the volume header
513                  */
514                 error = hammer2_install_volume_header(hmp);
515                 if (error) {
516                         hammer2_vfs_unmount(mp, MNT_FORCE);
517                         return error;
518                 }
519
520                 /*
521                  * First locate the super-root inode, which is key 0
522                  * relative to the volume header's blockset.
523                  *
524                  * Then locate the root inode by scanning the directory keyspace
525                  * represented by the label.
526                  */
527                 parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
528                 schain = hammer2_chain_lookup(&parent, &key_dummy,
529                                       HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY,
530                                       &cache_index, 0);
531                 hammer2_chain_lookup_done(parent);
532                 if (schain == NULL) {
533                         kprintf("hammer2_mount: invalid super-root\n");
534                         hammer2_vfs_unmount(mp, MNT_FORCE);
535                         return EINVAL;
536                 }
537
538                 /*
539                  * NOTE: inode_get sucks up schain's lock.
540                  */
541                 atomic_set_int(&schain->flags, HAMMER2_CHAIN_PFSROOT);
542                 hmp->sroot = hammer2_inode_get(NULL, NULL, schain);
543                 hammer2_inode_ref(hmp->sroot);
544                 hammer2_inode_unlock_ex(hmp->sroot, schain);
545                 schain = NULL;
546                 /* leave hmp->sroot with one ref */
547                 
548                 mtx_init(&hmp->wthread_mtx);
549                 bioq_init(&hmp->wthread_bioq);
550                 hmp->wthread_destroy = 0;
551         
552                 /*
553                  * Launch threads.
554                  */
555                 lwkt_create(hammer2_write_thread, hmp,
556                                 NULL, NULL, 0, -1, "hammer2-write");
557         }
558
559         /*
560          * Block device opened successfully, finish initializing the
561          * mount structure.
562          *
563          * From this point on we have to call hammer2_unmount() on failure.
564          */
565         pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
566
567         kmalloc_create(&pmp->minode, "HAMMER2-inodes");
568         kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");
569
570         spin_init(&pmp->inum_spin);
571         RB_INIT(&pmp->inum_tree);
572
573         kdmsg_iocom_init(&pmp->iocom, pmp,
574                          KDMSG_IOCOMF_AUTOCONN |
575                          KDMSG_IOCOMF_AUTOSPAN |
576                          KDMSG_IOCOMF_AUTOCIRC,
577                          pmp->mmsg, hammer2_rcvdmsg);
578
579         ccms_domain_init(&pmp->ccms_dom);
580         ++hmp->pmp_count;
581         lockmgr(&hammer2_mntlk, LK_RELEASE);
582         kprintf("hammer2_mount hmp=%p pmp=%p pmpcnt=%d\n",
583                 hmp, pmp, hmp->pmp_count);
584
585         mp->mnt_flag = MNT_LOCAL;
586         mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */
587
588         /*
589          * required mount structure initializations
590          */
591         mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
592         mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;
593
594         mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
595         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
596
597         /*
598          * Optional fields
599          */
600         mp->mnt_iosize_max = MAXPHYS;
601         mp->mnt_data = (qaddr_t)pmp;
602         pmp->mp = mp;
603
604         /*
605          * Lookup mount point under the media-localized super-root.
606          */
607         parent = hammer2_inode_lock_ex(hmp->sroot);
608         lhc = hammer2_dirhash(label, strlen(label));
609         rchain = hammer2_chain_lookup(&parent, &key_next,
610                                       lhc, lhc + HAMMER2_DIRHASH_LOMASK,
611                                       &cache_index, 0);
612         while (rchain) {
613                 if (rchain->bref.type == HAMMER2_BREF_TYPE_INODE &&
614                     strcmp(label, rchain->data->ipdata.filename) == 0) {
615                         break;
616                 }
617                 rchain = hammer2_chain_next(&parent, rchain, &key_next,
618                                             key_next,
619                                             lhc + HAMMER2_DIRHASH_LOMASK,
620                                             &cache_index, 0);
621         }
622         hammer2_inode_unlock_ex(hmp->sroot, parent);
623
624         if (rchain == NULL) {
625                 kprintf("hammer2_mount: PFS label not found\n");
626                 hammer2_vfs_unmount(mp, MNT_FORCE);
627                 return EINVAL;
628         }
629         if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
630                 hammer2_chain_unlock(rchain);
631                 kprintf("hammer2_mount: PFS label already mounted!\n");
632                 hammer2_vfs_unmount(mp, MNT_FORCE);
633                 return EBUSY;
634         }
635         if (rchain->flags & HAMMER2_CHAIN_RECYCLE) {
636                 kprintf("hammer2_mount: PFS label currently recycling\n");
637                 hammer2_vfs_unmount(mp, MNT_FORCE);
638                 return EBUSY;
639         }
640
641         atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
642
643         /*
644          * NOTE: *_get() integrates chain's lock into the inode lock.
645          */
646         hammer2_chain_ref(rchain);              /* for pmp->rchain */
647         pmp->cluster.nchains = 1;
648         pmp->cluster.chains[0] = rchain;
649         pmp->iroot = hammer2_inode_get(pmp, NULL, rchain);
650         hammer2_inode_ref(pmp->iroot);          /* ref for pmp->iroot */
651
652         KKASSERT(rchain->pmp == NULL);          /* tracking pmp for rchain */
653         rchain->pmp = pmp;
654         atomic_add_long(&pmp->inmem_chains, 1);
655
656         hammer2_inode_unlock_ex(pmp->iroot, rchain);
657
658         kprintf("iroot %p\n", pmp->iroot);
659
660         /*
661          * Ref the cluster management messaging descriptor.  The mount
662          * program deals with the other end of the communications pipe.
663          */
664         fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
665         if (fp == NULL) {
666                 kprintf("hammer2_mount: bad cluster_fd!\n");
667                 hammer2_vfs_unmount(mp, MNT_FORCE);
668                 return EBADF;
669         }
670         hammer2_cluster_reconnect(pmp, fp);
671
672         /*
673          * Finish setup
674          */
675         vfs_getnewfsid(mp);
676         vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
677         vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
678         vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);
679
680         copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
681         bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
682         bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
683         copyinstr(path, mp->mnt_stat.f_mntonname,
684                   sizeof(mp->mnt_stat.f_mntonname) - 1,
685                   &size);
686
687         /*
688          * Initial statfs to prime mnt_stat.
689          */
690         hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);
691         
692         return 0;
693 }
694
695 /*
696  * Handle bioq for strategy write
697  */
698 static
699 void
700 hammer2_write_thread(void *arg)
701 {
702         hammer2_mount_t* hmp;
703         struct bio *bio;
704         struct buf *bp;
705         hammer2_trans_t trans;
706         struct vnode *vp;
707         hammer2_inode_t *last_ip;
708         hammer2_inode_t *ip;
709         hammer2_chain_t *parent;
710         hammer2_chain_t **parentp;
711         hammer2_inode_data_t *ipdata;
712         hammer2_key_t lbase;
713         int lblksize;
714         int pblksize;
715         int error;
716         
717         hmp = arg;
718         
719         mtx_lock(&hmp->wthread_mtx);
720         while (hmp->wthread_destroy == 0) {
721                 if (bioq_first(&hmp->wthread_bioq) == NULL) {
722                         mtxsleep(&hmp->wthread_bioq, &hmp->wthread_mtx,
723                                  0, "h2bioqw", 0);
724                 }
725                 last_ip = NULL;
726                 parent = NULL;
727                 parentp = &parent;
728
729                 while ((bio = bioq_takefirst(&hmp->wthread_bioq)) != NULL) {
730                         mtx_unlock(&hmp->wthread_mtx);
731                         
732                         error = 0;
733                         bp = bio->bio_buf;
734                         vp = bp->b_vp;
735                         ip = VTOI(vp);
736
737                         /*
738                          * Cache transaction for multi-buffer flush efficiency.
739                          * Lock the ip separately for each buffer to allow
740                          * interleaving with frontend writes.
741                          */
742                         if (last_ip != ip) {
743                                 if (last_ip)
744                                         hammer2_trans_done(&trans);
745                                 hammer2_trans_init(&trans, ip->pmp,
746                                                    HAMMER2_TRANS_BUFCACHE);
747                                 last_ip = ip;
748                         }
749                         parent = hammer2_inode_lock_ex(ip);
750
751                         /*
752                          * Inode is modified, flush size and mtime changes
753                          * to ensure that the file size remains consistent
754                          * with the buffers being flushed.
755                          */
756                         if (ip->flags & (HAMMER2_INODE_RESIZED |
757                                          HAMMER2_INODE_MTIME)) {
758                                 hammer2_inode_fsync(&trans, ip, parentp);
759                         }
760                         ipdata = hammer2_chain_modify_ip(&trans, ip,
761                                                          parentp, 0);
762                         lblksize = hammer2_calc_logical(ip, bio->bio_offset,
763                                                         &lbase, NULL);
764                         pblksize = hammer2_calc_physical(ip, lbase);
765                         hammer2_write_file_core(bp, &trans, ip, ipdata,
766                                                 parentp,
767                                                 lbase, IO_ASYNC,
768                                                 pblksize, &error);
769                         hammer2_inode_unlock_ex(ip, parent);
770                         if (error) {
771                                 kprintf("hammer2: error in buffer write\n");
772                                 bp->b_flags |= B_ERROR;
773                                 bp->b_error = EIO;
774                         }
775                         biodone(bio);
776                         mtx_lock(&hmp->wthread_mtx);
777                 }
778
779                 /*
780                  * Clean out transaction cache
781                  */
782                 if (last_ip)
783                         hammer2_trans_done(&trans);
784         }
785         hmp->wthread_destroy = -1;
786         wakeup(&hmp->wthread_destroy);
787         
788         mtx_unlock(&hmp->wthread_mtx);
789 }
790
791 /* 
792  * Return a chain suitable for I/O, creating the chain if necessary
793  * and assigning its physical block.
794  */
795 static
796 hammer2_chain_t *
797 hammer2_assign_physical(hammer2_trans_t *trans,
798                         hammer2_inode_t *ip, hammer2_chain_t **parentp,
799                         hammer2_key_t lbase, int pblksize, int *errorp)
800 {
801         hammer2_chain_t *parent;
802         hammer2_chain_t *chain;
803         hammer2_off_t pbase;
804         hammer2_key_t key_dummy;
805         int pradix = hammer2_getradix(pblksize);
806         int cache_index = -1;
807
808         /*
809          * Locate the chain associated with lbase, return a locked chain.
810          * However, do not instantiate any data reference (which utilizes a
811          * device buffer) because we will be using direct IO via the
812          * logical buffer cache buffer.
813          */
814         *errorp = 0;
815         KKASSERT(pblksize >= HAMMER2_MIN_ALLOC);
816 retry:
817         parent = *parentp;
818         hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); /* extra lock */
819         chain = hammer2_chain_lookup(&parent, &key_dummy,
820                                      lbase, lbase,
821                                      &cache_index, HAMMER2_LOOKUP_NODATA);
822
823         if (chain == NULL) {
824                 /*
825                  * We found a hole, create a new chain entry.
826                  *
827                  * NOTE: DATA chains are created without device backing
828                  *       store (nor do we want any).
829                  */
830                 *errorp = hammer2_chain_create(trans, &parent, &chain,
831                                                lbase, HAMMER2_PBUFRADIX,
832                                                HAMMER2_BREF_TYPE_DATA,
833                                                pblksize);
834                 if (chain == NULL) {
835                         hammer2_chain_lookup_done(parent);
836                         panic("hammer2_chain_create: par=%p error=%d\n",
837                                 parent, *errorp);
838                         goto retry;
839                 }
840
841                 pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
842                 /*ip->delta_dcount += pblksize;*/
843         } else {
844                 switch (chain->bref.type) {
845                 case HAMMER2_BREF_TYPE_INODE:
846                         /*
847                          * The data is embedded in the inode.  The
848                          * caller is responsible for marking the inode
849                          * modified and copying the data to the embedded
850                          * area.
851                          */
852                         pbase = NOOFFSET;
853                         break;
854                 case HAMMER2_BREF_TYPE_DATA:
855                         if (chain->bytes != pblksize) {
856                                 hammer2_chain_resize(trans, ip,
857                                                      parent, &chain,
858                                                      pradix,
859                                                      HAMMER2_MODIFY_OPTDATA);
860                         }
861                         hammer2_chain_modify(trans, &chain,
862                                              HAMMER2_MODIFY_OPTDATA);
863                         pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
864                         break;
865                 default:
866                         panic("hammer2_assign_physical: bad type");
867                         /* NOT REACHED */
868                         pbase = NOOFFSET;
869                         break;
870                 }
871         }
872
873         /*
874          * Cleanup.  If chain wound up being the inode (i.e. DIRECTDATA),
875          * we might have to replace *parentp.
876          */
877         hammer2_chain_lookup_done(parent);
878         if (chain) {
879                 if (*parentp != chain &&
880                     (*parentp)->core == chain->core) {
881                         parent = *parentp;
882                         *parentp = chain;               /* eats lock */
883                         hammer2_chain_unlock(parent);
884                         hammer2_chain_lock(chain, 0);   /* need another */
885                 }
886                 /* else chain already locked for return */
887         }
888         return (chain);
889 }
890
891 /* 
892  * From hammer2_vnops.c.
893  * The core write function which determines which path to take
894  * depending on compression settings.
895  */
896 static
897 void
898 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
899                         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
900                         hammer2_chain_t **parentp,
901                         hammer2_key_t lbase, int ioflag, int pblksize,
902                         int *errorp)
903 {
904         hammer2_chain_t *chain;
905
906         switch(HAMMER2_DEC_COMP(ipdata->comp_algo)) {
907         case HAMMER2_COMP_NONE:
908                 /*
909                  * We have to assign physical storage to the buffer
910                  * we intend to dirty or write now to avoid deadlocks
911                  * in the strategy code later.
912                  *
913                  * This can return NOOFFSET for inode-embedded data.
914                  * The strategy code will take care of it in that case.
915                  */
916                 chain = hammer2_assign_physical(trans, ip, parentp,
917                                                 lbase, pblksize,
918                                                 errorp);
919                 hammer2_write_bp(chain, bp, ioflag, pblksize, errorp);
920                 if (chain)
921                         hammer2_chain_unlock(chain);
922                 break;
923         case HAMMER2_COMP_AUTOZERO:
924                 /*
925                  * Check for zero-fill only
926                  */
927                 hammer2_zero_check_and_write(bp, trans, ip,
928                                     ipdata, parentp, lbase,
929                                     ioflag, pblksize, errorp);
930                 break;
931         case HAMMER2_COMP_LZ4:
932         case HAMMER2_COMP_ZLIB:
933         default:
934                 /*
935                  * Check for zero-fill and attempt compression.
936                  */
937                 hammer2_compress_and_write(bp, trans, ip,
938                                            ipdata, parentp,
939                                            lbase, ioflag,
940                                            pblksize, errorp,
941                                            ipdata->comp_algo);
942                 break;
943         }
944         ipdata = &ip->chain->data->ipdata;      /* reload */
945 }
946
947 /*
948  * From hammer2_vnops.c
949  * Generic function that will perform the compression in compression
950  * write path. The compression algorithm is determined by the settings
951  * obtained from inode.
952  */
953 static
954 void
955 hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
956         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
957         hammer2_chain_t **parentp,
958         hammer2_key_t lbase, int ioflag, int pblksize,
959         int *errorp, int comp_algo)
960 {
961         hammer2_chain_t *chain;
962         int comp_size;
963         int comp_block_size;
964         char *comp_buffer;
965
966         if (test_block_zeros(bp->b_data, pblksize)) {
967                 zero_write(bp, trans, ip, ipdata, parentp, lbase, errorp);
968                 return;
969         }
970
971         comp_size = 0;
972         comp_buffer = NULL;
973
974         KKASSERT(pblksize / 2 <= 32768);
975                 
976         if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
977                 z_stream strm_compress;
978                 int comp_level;
979                 int ret;
980
981                 switch(HAMMER2_DEC_COMP(comp_algo)) {
982                 case HAMMER2_COMP_LZ4:
983                         comp_buffer = objcache_get(cache_buffer_write,
984                                                    M_INTWAIT);
985                         comp_size = LZ4_compress_limitedOutput(
986                                         bp->b_data,
987                                         &comp_buffer[sizeof(int)],
988                                         pblksize,
989                                         pblksize / 2 - sizeof(int));
990                         /*
991                          * We need to prefix with the size, LZ4
992                          * doesn't do it for us.  Add the related
993                          * overhead.
994                          */
995                         *(int *)comp_buffer = comp_size;
996                         if (comp_size)
997                                 comp_size += sizeof(int);
998                         break;
999                 case HAMMER2_COMP_ZLIB:
1000                         comp_level = HAMMER2_DEC_LEVEL(comp_algo);
1001                         if (comp_level == 0)
1002                                 comp_level = 6; /* default zlib compression */
1003                         else if (comp_level < 6)
1004                                 comp_level = 6;
1005                         else if (comp_level > 9)
1006                                 comp_level = 9;
1007                         ret = deflateInit(&strm_compress, comp_level);
1008                         if (ret != Z_OK) {
1009                                 kprintf("HAMMER2 ZLIB: fatal error "
1010                                         "on deflateInit.\n");
1011                         }
1012
1013                         comp_buffer = objcache_get(cache_buffer_write,
1014                                                    M_INTWAIT);
1015                         strm_compress.next_in = bp->b_data;
1016                         strm_compress.avail_in = pblksize;
1017                         strm_compress.next_out = comp_buffer;
1018                         strm_compress.avail_out = pblksize / 2;
1019                         ret = deflate(&strm_compress, Z_FINISH);
1020                         if (ret == Z_STREAM_END) {
1021                                 comp_size = pblksize / 2 -
1022                                             strm_compress.avail_out;
1023                         } else {
1024                                 comp_size = 0;
1025                         }
1026                         ret = deflateEnd(&strm_compress);
1027                         break;
1028                 default:
1029                         kprintf("Error: Unknown compression method.\n");
1030                         kprintf("Comp_method = %d.\n", comp_algo);
1031                         break;
1032                 }
1033         }
1034
1035         if (comp_size == 0) {
1036                 /*
1037                  * compression failed or turned off
1038                  */
1039                 comp_block_size = pblksize;     /* safety */
1040                 if (++ip->comp_heuristic > 128)
1041                         ip->comp_heuristic = 8;
1042         } else {
1043                 /*
1044                  * compression succeeded
1045                  */
1046                 ip->comp_heuristic = 0;
1047                 if (comp_size <= 1024) {
1048                         comp_block_size = 1024;
1049                 } else if (comp_size <= 2048) {
1050                         comp_block_size = 2048;
1051                 } else if (comp_size <= 4096) {
1052                         comp_block_size = 4096;
1053                 } else if (comp_size <= 8192) {
1054                         comp_block_size = 8192;
1055                 } else if (comp_size <= 16384) {
1056                         comp_block_size = 16384;
1057                 } else if (comp_size <= 32768) {
1058                         comp_block_size = 32768;
1059                 } else {
1060                         panic("hammer2: WRITE PATH: "
1061                               "Weird comp_size value.");
1062                         /* NOT REACHED */
1063                         comp_block_size = pblksize;
1064                 }
1065         }
1066
1067         chain = hammer2_assign_physical(trans, ip, parentp,
1068                                         lbase, comp_block_size,
1069                                         errorp);
1070         ipdata = &ip->chain->data->ipdata;      /* RELOAD */
1071
1072         if (*errorp) {
1073                 kprintf("WRITE PATH: An error occurred while "
1074                         "assigning physical space.\n");
1075                 KKASSERT(chain == NULL);
1076         } else {
1077                 /* Get device offset */
1078                 hammer2_off_t pbase;
1079                 hammer2_off_t pmask;
1080                 hammer2_off_t peof;
1081                 size_t boff;
1082                 size_t psize;
1083                 struct buf *dbp;
1084                 int temp_check;
1085
1086                 KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1087
1088                 switch(chain->bref.type) {
1089                 case HAMMER2_BREF_TYPE_INODE:
1090                         KKASSERT(chain->data->ipdata.op_flags &
1091                                  HAMMER2_OPFLAG_DIRECTDATA);
1092                         KKASSERT(bp->b_loffset == 0);
1093                         bcopy(bp->b_data, chain->data->ipdata.u.data,
1094                               HAMMER2_EMBEDDED_BYTES);
1095                         break;
1096                 case HAMMER2_BREF_TYPE_DATA:
1097                         psize = hammer2_devblksize(chain->bytes);
1098                         pmask = (hammer2_off_t)psize - 1;
1099                         pbase = chain->bref.data_off & ~pmask;
1100                         boff = chain->bref.data_off &
1101                                (HAMMER2_OFF_MASK & pmask);
1102                         peof = (pbase + HAMMER2_SEGMASK64) &
1103                                ~HAMMER2_SEGMASK64;
1104                         temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);
1105
1106                         /*
1107                          * Optimize out the read-before-write
1108                          * if possible.
1109                          */
1110                         if (comp_block_size == psize) {
1111                                 dbp = getblk(chain->hmp->devvp, pbase,
1112                                              psize, 0, 0);
1113                         } else {
1114                                 *errorp = bread(chain->hmp->devvp,
1115                                                 pbase, psize, &dbp);
1116                                 if (*errorp) {
1117                                         kprintf("hammer2: WRITE PATH: "
1118                                                 "dbp bread error\n");
1119                                         break;
1120                                 }
1121                         }
1122
1123                         /*
1124                          * When loading the block make sure we don't
1125                          * leave garbage after the compressed data.
1126                          */
1127                         if (comp_size) {
1128                                 chain->bref.methods =
1129                                         HAMMER2_ENC_COMP(comp_algo) +
1130                                         HAMMER2_ENC_CHECK(temp_check);
1131                                 bcopy(comp_buffer, dbp->b_data + boff,
1132                                       comp_size);
1133                                 if (comp_size != comp_block_size) {
1134                                         bzero(dbp->b_data + boff +
1135                                                 comp_size,
1136                                               comp_block_size -
1137                                                 comp_size);
1138                                 }
1139                         } else {
1140                                 chain->bref.methods =
1141                                         HAMMER2_ENC_COMP(
1142                                                 HAMMER2_COMP_NONE) +
1143                                         HAMMER2_ENC_CHECK(temp_check);
1144                                 bcopy(bp->b_data, dbp->b_data + boff,
1145                                       pblksize);
1146                         }
1147
1148                         /*
1149                          * Device buffer is now valid, chain is no
1150                          * longer in the initial state.
1151                          */
1152                         atomic_clear_int(&chain->flags,
1153                                          HAMMER2_CHAIN_INITIAL);
1154
1155                         /* Now write the related bdp. */
1156                         if (ioflag & IO_SYNC) {
1157                                 /*
1158                                  * Synchronous I/O requested.
1159                                  */
1160                                 bwrite(dbp);
1161                         /*
1162                         } else if ((ioflag & IO_DIRECT) &&
1163                                    loff + n == pblksize) {
1164                                 bdwrite(dbp);
1165                         */
1166                         } else if (ioflag & IO_ASYNC) {
1167                                 bawrite(dbp);
1168                         } else if (hammer2_cluster_enable) {
1169                                 cluster_write(dbp, peof,
1170                                               HAMMER2_PBUFSIZE,
1171                                               4/*XXX*/);
1172                         } else {
1173                                 bdwrite(dbp);
1174                         }
1175                         break;
1176                 default:
1177                         panic("hammer2_write_bp: bad chain type %d\n",
1178                                 chain->bref.type);
1179                         /* NOT REACHED */
1180                         break;
1181                 }
1182
1183                 hammer2_chain_unlock(chain);
1184         }
1185         if (comp_buffer)
1186                 objcache_put(cache_buffer_write, comp_buffer);
1187 }
1188
1189 /*
1190  * Function that performs zero-checking and writing without compression,
1191  * it corresponds to default zero-checking path.
1192  */
1193 static
1194 void
1195 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1196         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
1197         hammer2_chain_t **parentp,
1198         hammer2_key_t lbase, int ioflag, int pblksize, int *errorp)
1199 {
1200         hammer2_chain_t *chain;
1201
1202         if (test_block_zeros(bp->b_data, pblksize)) {
1203                 zero_write(bp, trans, ip, ipdata, parentp, lbase, errorp);
1204         } else {
1205                 chain = hammer2_assign_physical(trans, ip, parentp,
1206                                                 lbase, pblksize, errorp);
1207                 hammer2_write_bp(chain, bp, ioflag, pblksize, errorp);
1208                 if (chain)
1209                         hammer2_chain_unlock(chain);
1210         }
1211 }
1212
/*
 * Test whether a block of data contains only zeros.
 *
 * buf   - start of the block.  It is read one long at a time, so it is
 *         assumed to be suitably aligned for long accesses.
 * bytes - number of bytes to examine.
 *
 * Returns TRUE (non-zero) if all bytes are zero (including when bytes
 * is 0), and 0 as soon as a non-zero value is found.
 *
 * Whole longs are compared first; any remaining tail of fewer than
 * sizeof(long) bytes is compared byte by byte so that nothing beyond
 * buf + bytes is ever read.
 */
static
int
test_block_zeros(const char *buf, size_t bytes)
{
        size_t nwords = bytes / sizeof(long);
        size_t i;

        for (i = 0; i < nwords; ++i) {
                if (((const long *)buf)[i] != 0)
                        return (0);
        }
        for (i = nwords * sizeof(long); i < bytes; ++i) {
                if (buf[i] != 0)
                        return (0);
        }
        return (1);
}
1229
1230 /*
1231  * Function to "write" a block that contains only zeros.
1232  */
1233 static
1234 void
1235 zero_write(struct buf *bp, hammer2_trans_t *trans, hammer2_inode_t *ip,
1236         hammer2_inode_data_t *ipdata, hammer2_chain_t **parentp,
1237         hammer2_key_t lbase, int *errorp __unused)
1238 {
1239         hammer2_chain_t *parent;
1240         hammer2_chain_t *chain;
1241         hammer2_key_t key_dummy;
1242         int cache_index = -1;
1243
1244         parent = hammer2_chain_lookup_init(*parentp, 0);
1245
1246         chain = hammer2_chain_lookup(&parent, &key_dummy, lbase, lbase,
1247                                      &cache_index, HAMMER2_LOOKUP_NODATA);
1248         if (chain) {
1249                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1250                         bzero(chain->data->ipdata.u.data,
1251                               HAMMER2_EMBEDDED_BYTES);
1252                 } else {
1253                         hammer2_chain_delete(trans, chain, 0);
1254                 }
1255                 hammer2_chain_unlock(chain);
1256         }
1257         hammer2_chain_lookup_done(parent);
1258 }
1259
1260 /*
1261  * Function to write the data as it is, without performing any sort of
1262  * compression. This function is used in path without compression and
1263  * default zero-checking path.
1264  */
1265 static
1266 void
1267 hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
1268                                 int pblksize, int *errorp)
1269 {
1270         hammer2_off_t pbase;
1271         hammer2_off_t pmask;
1272         hammer2_off_t peof;
1273         struct buf *dbp;
1274         size_t boff;
1275         size_t psize;
1276         int error;
1277         int temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);
1278
1279         KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
1280
1281         switch(chain->bref.type) {
1282         case HAMMER2_BREF_TYPE_INODE:
1283                 KKASSERT(chain->data->ipdata.op_flags &
1284                          HAMMER2_OPFLAG_DIRECTDATA);
1285                 KKASSERT(bp->b_loffset == 0);
1286                 bcopy(bp->b_data, chain->data->ipdata.u.data,
1287                       HAMMER2_EMBEDDED_BYTES);
1288                 error = 0;
1289                 break;
1290         case HAMMER2_BREF_TYPE_DATA:
1291                 psize = hammer2_devblksize(chain->bytes);
1292                 pmask = (hammer2_off_t)psize - 1;
1293                 pbase = chain->bref.data_off & ~pmask;
1294                 boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
1295                 peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
1296
1297                 if (psize == pblksize) {
1298                         dbp = getblk(chain->hmp->devvp, pbase,
1299                                      psize, 0, 0);
1300                         error = 0;
1301                 } else {
1302                         error = bread(chain->hmp->devvp, pbase, psize, &dbp);
1303                         if (error) {
1304                                 kprintf("hammer2: WRITE PATH: "
1305                                         "dbp bread error\n");
1306                                 break;
1307                         }
1308                 }
1309
1310                 chain->bref.methods = HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
1311                                       HAMMER2_ENC_CHECK(temp_check);
1312                 bcopy(bp->b_data, dbp->b_data + boff, chain->bytes);
1313                 
1314                 /*
1315                  * Device buffer is now valid, chain is no
1316                  * longer in the initial state.
1317                  */
1318                 atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
1319
1320                 if (ioflag & IO_SYNC) {
1321                         /*
1322                          * Synchronous I/O requested.
1323                          */
1324                         bwrite(dbp);
1325                 /*
1326                 } else if ((ioflag & IO_DIRECT) && loff + n == pblksize) {
1327                         bdwrite(dbp);
1328                 */
1329                 } else if (ioflag & IO_ASYNC) {
1330                         bawrite(dbp);
1331                 } else if (hammer2_cluster_enable) {
1332                         cluster_write(dbp, peof, HAMMER2_PBUFSIZE, 4/*XXX*/);
1333                 } else {
1334                         bdwrite(dbp);
1335                 }
1336                 break;
1337         default:
1338                 panic("hammer2_write_bp: bad chain type %d\n",
1339                       chain->bref.type);
1340                 /* NOT REACHED */
1341                 error = 0;
1342                 break;
1343         }
1344         *errorp = error;
1345 }
1346
1347 static
1348 int
1349 hammer2_remount(hammer2_mount_t *hmp, char *path, struct vnode *devvp,
1350                 struct ucred *cred)
1351 {
1352         return (0);
1353 }
1354
1355 static
1356 int
1357 hammer2_vfs_unmount(struct mount *mp, int mntflags)
1358 {
1359         hammer2_pfsmount_t *pmp;
1360         hammer2_mount_t *hmp;
1361         hammer2_chain_t *rchain;
1362         int flags;
1363         int error = 0;
1364         int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
1365         int dumpcnt;
1366         int i;
1367         struct vnode *devvp;
1368
1369         pmp = MPTOPMP(mp);
1370
1371         ccms_domain_uninit(&pmp->ccms_dom);
1372         kdmsg_iocom_uninit(&pmp->iocom);        /* XXX chain dependency */
1373
1374         for (i = 0; i < pmp->cluster.nchains; ++i) {
1375                 hmp = pmp->cluster.chains[i]->hmp;
1376
1377                 flags = 0;
1378
1379                 if (mntflags & MNT_FORCE)
1380                         flags |= FORCECLOSE;
1381
1382                 hammer2_mount_exlock(hmp);
1383
1384                 /*
1385                  * If mount initialization proceeded far enough we must flush
1386                  * its vnodes.
1387                  */
1388                 if (pmp->iroot)
1389                         error = vflush(mp, 0, flags);
1390
1391                 if (error) {
1392                         hammer2_mount_unlock(hmp);
1393                         return error;
1394                 }
1395
1396                 lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
1397                 --hmp->pmp_count;
1398                 kprintf("hammer2_unmount hmp=%p pmpcnt=%d\n",
1399                         hmp, hmp->pmp_count);
1400
1401                 /*
1402                  * Flush any left over chains.  The voldata lock is only used
1403                  * to synchronize against HAMMER2_CHAIN_MODIFIED_AUX.
1404                  */
1405                 hammer2_voldata_lock(hmp);
1406                 if ((hmp->vchain.flags | hmp->fchain.flags) &
1407                     (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_SUBMODIFIED)) {
1408                         hammer2_voldata_unlock(hmp, 0);
1409                         hammer2_vfs_sync(mp, MNT_WAIT);
1410                         hammer2_vfs_sync(mp, MNT_WAIT);
1411                 } else {
1412                         hammer2_voldata_unlock(hmp, 0);
1413                 }
1414                 if (hmp->pmp_count == 0) {
1415                         if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
1416                                                  HAMMER2_CHAIN_SUBMODIFIED)) {
1417                                 kprintf("hammer2_unmount: chains left over "
1418                                         "after final sync\n");
1419                                 if (hammer2_debug & 0x0010)
1420                                         Debugger("entered debugger");
1421                         }
1422                 }
1423
1424                 /*
1425                  * Cleanup the root and super-root chain elements
1426                  * (which should be clean).
1427                  */
1428                 if (pmp->iroot) {
1429 #if REPORT_REFS_ERRORS
1430                         if (pmp->iroot->refs != 1)
1431                                 kprintf("PMP->IROOT %p REFS WRONG %d\n",
1432                                         pmp->iroot, pmp->iroot->refs);
1433 #else
1434                         KKASSERT(pmp->iroot->refs == 1);
1435 #endif
1436                         /* ref for pmp->iroot */
1437                         hammer2_inode_drop(pmp->iroot);
1438                         pmp->iroot = NULL;
1439                 }
1440
1441                 rchain = pmp->cluster.chains[i];
1442                 if (rchain) {
1443                         atomic_clear_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);
1444 #if REPORT_REFS_ERRORS
1445                         if (rchain->refs != 1)
1446                                 kprintf("PMP->RCHAIN %p REFS WRONG %d\n",
1447                                         rchain, rchain->refs);
1448 #else
1449                         KKASSERT(rchain->refs == 1);
1450 #endif
1451                         hammer2_chain_drop(rchain);
1452                         pmp->cluster.chains[i] = NULL;
1453                 }
1454
1455                 /*
1456                  * If no PFS's left drop the master hammer2_mount for the
1457                  * device.
1458                  */
1459                 if (hmp->pmp_count == 0) {
1460                         if (hmp->sroot) {
1461                                 hammer2_inode_drop(hmp->sroot);
1462                                 hmp->sroot = NULL;
1463                         }
1464
1465                         /*
1466                          * Finish up with the device vnode
1467                          */
1468                         if ((devvp = hmp->devvp) != NULL) {
1469                                 vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
1470                                 hmp->devvp = NULL;
1471                                 VOP_CLOSE(devvp,
1472                                           (ronly ? FREAD : FREAD|FWRITE));
1473                                 vrele(devvp);
1474                                 devvp = NULL;
1475                         }
1476
1477                         /*
1478                          * Final drop of embedded freemap root chain to clean up
1479                          * fchain.core (fchain structure is not flagged ALLOCATED
1480                          * so it is cleaned out and then left to rot).
1481                          */
1482                         hammer2_chain_drop(&hmp->fchain);
1483
1484                         /*
1485                          * Final drop of embedded volume root chain to clean up
1486                          * vchain.core (vchain structure is not flagged ALLOCATED
1487                          * so it is cleaned out and then left to rot).
1488                          */
1489                         dumpcnt = 50;
1490                         hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt);
1491                         hammer2_mount_unlock(hmp);
1492                         hammer2_chain_drop(&hmp->vchain);
1493                 } else {
1494                         hammer2_mount_unlock(hmp);
1495                 }
1496                 if (hmp->pmp_count == 0) {
1497                         mtx_lock(&hmp->wthread_mtx);
1498                         hmp->wthread_destroy = 1;
1499                         wakeup(&hmp->wthread_bioq);
1500                         while (hmp->wthread_destroy != -1) {
1501                                 mtxsleep(&hmp->wthread_destroy,
1502                                         &hmp->wthread_mtx, 0,
1503                                         "umount-sleep", 0);
1504                         }
1505                         mtx_unlock(&hmp->wthread_mtx);
1506
1507                         TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
1508                         kmalloc_destroy(&hmp->mchain);
1509                         kfree(hmp, M_HAMMER2);
1510                 }
1511         }
1512
1513         pmp->mp = NULL;
1514         mp->mnt_data = NULL;
1515
1516         kmalloc_destroy(&pmp->mmsg);
1517         kmalloc_destroy(&pmp->minode);
1518
1519         kfree(pmp, M_HAMMER2);
1520         lockmgr(&hammer2_mntlk, LK_RELEASE);
1521
1522         return (error);
1523 }
1524
/*
 * VFS vget entry point.
 *
 * Lookup of a vnode by inode number is not implemented: the call is
 * logged and EOPNOTSUPP is returned.  None of the arguments are examined
 * and *vpp is not written.
 */
static int
hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
                 ino_t ino, struct vnode **vpp)
{
        kprintf("hammer2_vget\n");

        return (EOPNOTSUPP);
}
1533
1534 static
1535 int
1536 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
1537 {
1538         hammer2_pfsmount_t *pmp;
1539         hammer2_chain_t *parent;
1540         int error;
1541         struct vnode *vp;
1542
1543         pmp = MPTOPMP(mp);
1544         if (pmp->iroot == NULL) {
1545                 *vpp = NULL;
1546                 error = EINVAL;
1547         } else {
1548                 parent = hammer2_inode_lock_sh(pmp->iroot);
1549                 vp = hammer2_igetv(pmp->iroot, &error);
1550                 hammer2_inode_unlock_sh(pmp->iroot, parent);
1551                 *vpp = vp;
1552                 if (vp == NULL)
1553                         kprintf("vnodefail\n");
1554         }
1555
1556         return (error);
1557 }
1558
1559 /*
1560  * Filesystem status
1561  *
1562  * XXX incorporate ipdata->inode_quota and data_quota
1563  */
1564 static
1565 int
1566 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
1567 {
1568         hammer2_pfsmount_t *pmp;
1569         hammer2_mount_t *hmp;
1570
1571         pmp = MPTOPMP(mp);
1572         KKASSERT(pmp->cluster.nchains >= 1);
1573         hmp = pmp->cluster.chains[0]->hmp;      /* XXX */
1574
1575         mp->mnt_stat.f_files = pmp->inode_count;
1576         mp->mnt_stat.f_ffree = 0;
1577         mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1578         mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1579         mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
1580
1581         *sbp = mp->mnt_stat;
1582         return (0);
1583 }
1584
1585 static
1586 int
1587 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
1588 {
1589         hammer2_pfsmount_t *pmp;
1590         hammer2_mount_t *hmp;
1591
1592         pmp = MPTOPMP(mp);
1593         KKASSERT(pmp->cluster.nchains >= 1);
1594         hmp = pmp->cluster.chains[0]->hmp;      /* XXX */
1595
1596         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1597         mp->mnt_vstat.f_files = pmp->inode_count;
1598         mp->mnt_vstat.f_ffree = 0;
1599         mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1600         mp->mnt_vstat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1601         mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
1602
1603         *sbp = mp->mnt_vstat;
1604         return (0);
1605 }
1606
1607 /*
1608  * Sync the entire filesystem; this is called from the filesystem syncer
1609  * process periodically and whenever a user calls sync(1) on the hammer
1610  * mountpoint.
1611  *
1612  * Currently is actually called from the syncer! \o/
1613  *
1614  * This task will have to snapshot the state of the dirty inode chain.
1615  * From that, it will have to make sure all of the inodes on the dirty
1616  * chain have IO initiated. We make sure that io is initiated for the root
1617  * block.
1618  *
1619  * If waitfor is set, we wait for media to acknowledge the new rootblock.
1620  *
1621  * THINKS: side A vs side B, to have sync not stall all I/O?
1622  */
1623 static
1624 int
1625 hammer2_vfs_sync(struct mount *mp, int waitfor)
1626 {
1627         struct hammer2_sync_info info;
1628         hammer2_pfsmount_t *pmp;
1629         hammer2_mount_t *hmp;
1630         int flags;
1631         int error;
1632         int total_error;
1633         int i;
1634
1635         pmp = MPTOPMP(mp);
1636
1637         /*
1638          * We can't acquire locks on existing vnodes while in a transaction
1639          * without risking a deadlock.  This assumes that vfsync() can be
1640          * called without the vnode locked (which it can in DragonFly).
1641          * Otherwise we'd have to implement a multi-pass or flag the lock
1642          * failures and retry.
1643          */
1644         /*flags = VMSC_GETVP;*/
1645         flags = 0;
1646         if (waitfor & MNT_LAZY)
1647                 flags |= VMSC_ONEPASS;
1648
1649         hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH);
1650
1651         info.error = 0;
1652         info.waitfor = MNT_NOWAIT;
1653         vmntvnodescan(mp, flags | VMSC_NOWAIT,
1654                       hammer2_sync_scan1,
1655                       hammer2_sync_scan2, &info);
1656         if (info.error == 0 && (waitfor & MNT_WAIT)) {
1657                 info.waitfor = waitfor;
1658                     vmntvnodescan(mp, flags,
1659                                   hammer2_sync_scan1,
1660                                   hammer2_sync_scan2, &info);
1661
1662         }
1663 #if 0
1664         if (waitfor == MNT_WAIT) {
1665                 /* XXX */
1666         } else {
1667                 /* XXX */
1668         }
1669 #endif
1670
1671         total_error = 0;
1672         for (i = 0; i < pmp->cluster.nchains; ++i) {
1673                 hmp = pmp->cluster.chains[i]->hmp;
1674
1675                 hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
1676                 if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
1677                                           HAMMER2_CHAIN_SUBMODIFIED)) {
1678                         hammer2_chain_flush(&info.trans, &hmp->vchain);
1679                 }
1680                 hammer2_chain_unlock(&hmp->vchain);
1681
1682 #if 1
1683                 /*
1684                  * Rollup flush.  The fsyncs above basically just flushed
1685                  * data blocks.  The flush below gets all the meta-data.
1686                  */
1687                 hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
1688                 if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
1689                                          HAMMER2_CHAIN_SUBMODIFIED)) {
1690                         /* this will modify vchain as a side effect */
1691                         hammer2_chain_flush(&info.trans, &hmp->fchain);
1692                 }
1693                 hammer2_chain_unlock(&hmp->fchain);
1694 #endif
1695
1696                 error = 0;
1697
1698                 /*
1699                  * We can't safely flush the volume header until we have
1700                  * flushed any device buffers which have built up.
1701                  *
1702                  * XXX this isn't being incremental
1703                  */
1704                 vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
1705                 error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
1706                 vn_unlock(hmp->devvp);
1707
1708                 /*
1709                  * The flush code sets CHAIN_VOLUMESYNC to indicate that the
1710                  * volume header needs synchronization via hmp->volsync.
1711                  *
1712                  * XXX synchronize the flag & data with only this flush XXX
1713                  */
1714                 if (error == 0 &&
1715                     (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
1716                         struct buf *bp;
1717
1718                         /*
1719                          * Synchronize the disk before flushing the volume
1720                          * header.
1721                          */
1722                         bp = getpbuf(NULL);
1723                         bp->b_bio1.bio_offset = 0;
1724                         bp->b_bufsize = 0;
1725                         bp->b_bcount = 0;
1726                         bp->b_cmd = BUF_CMD_FLUSH;
1727                         bp->b_bio1.bio_done = biodone_sync;
1728                         bp->b_bio1.bio_flags |= BIO_SYNC;
1729                         vn_strategy(hmp->devvp, &bp->b_bio1);
1730                         biowait(&bp->b_bio1, "h2vol");
1731                         relpbuf(bp, NULL);
1732
1733                         /*
1734                          * Then we can safely flush the version of the
1735                          * volume header synchronized by the flush code.
1736                          */
1737                         i = hmp->volhdrno + 1;
1738                         if (i >= HAMMER2_NUM_VOLHDRS)
1739                                 i = 0;
1740                         if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
1741                             hmp->volsync.volu_size) {
1742                                 i = 0;
1743                         }
1744                         kprintf("sync volhdr %d %jd\n",
1745                                 i, (intmax_t)hmp->volsync.volu_size);
1746                         bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
1747                                     HAMMER2_PBUFSIZE, 0, 0);
1748                         atomic_clear_int(&hmp->vchain.flags,
1749                                          HAMMER2_CHAIN_VOLUMESYNC);
1750                         bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
1751                         bawrite(bp);
1752                         hmp->volhdrno = i;
1753                 }
1754                 if (error)
1755                         total_error = error;
1756         }
1757
1758         hammer2_trans_done(&info.trans);
1759         return (total_error);
1760 }
1761
1762 /*
1763  * Sync passes.
1764  *
1765  * NOTE: We don't test SUBMODIFIED or MOVED here because the fsync code
1766  *       won't flush on those flags.  The syncer code above will do a
1767  *       general meta-data flush globally that will catch these flags.
1768  */
1769 static int
1770 hammer2_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
1771 {
1772         hammer2_inode_t *ip;
1773
1774         ip = VTOI(vp);
1775         if (vp->v_type == VNON || ip == NULL ||
1776             ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
1777              RB_EMPTY(&vp->v_rbdirty_tree))) {
1778                 return(-1);
1779         }
1780         return(0);
1781 }
1782
1783 static int
1784 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1785 {
1786         struct hammer2_sync_info *info = data;
1787         hammer2_inode_t *ip;
1788         hammer2_chain_t *parent;
1789         int error;
1790
1791         ip = VTOI(vp);
1792         if (vp->v_type == VNON || vp->v_type == VBAD ||
1793             ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
1794              RB_EMPTY(&vp->v_rbdirty_tree))) {
1795                 return(0);
1796         }
1797
1798         /*
1799          * VOP_FSYNC will start a new transaction so replicate some code
1800          * here to do it inline (see hammer2_vop_fsync()).
1801          *
1802          * WARNING: The vfsync interacts with the buffer cache and might
1803          *          block, we can't hold the inode lock at that time.
1804          */
1805         atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1806         if (ip->vp)
1807                 vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
1808         parent = hammer2_inode_lock_ex(ip);
1809         hammer2_chain_flush(&info->trans, parent);
1810         hammer2_inode_unlock_ex(ip, parent);
1811         error = 0;
1812 #if 0
1813         error = VOP_FSYNC(vp, MNT_NOWAIT, 0);
1814 #endif
1815         if (error)
1816                 info->error = error;
1817         return(0);
1818 }
1819
/*
 * VFS vptofh entry point (vnode to file handle).
 *
 * Placeholder: returns 0 without reading vp or writing anything to *fhp.
 * NOTE(review): callers presumably treat 0 as success while the handle is
 * left untouched -- confirm this is intended.
 */
static int
hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
{
        return (0);
}
1826
/*
 * VFS fhtovp entry point (file handle to vnode).
 *
 * Placeholder: returns 0 without examining the handle and without
 * storing anything in *vpp.
 * NOTE(review): callers presumably treat 0 as success while *vpp is left
 * untouched -- confirm this is intended.
 */
static int
hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                   struct fid *fhp, struct vnode **vpp)
{
        return (0);
}
1834
/*
 * VFS checkexp entry point (export check).
 *
 * Placeholder: returns 0 without examining nam and without storing
 * anything in *exflagsp or *credanonp.
 * NOTE(review): callers presumably treat 0 as success while the outputs
 * are left untouched -- confirm this is intended.
 */
static int
hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                     int *exflagsp, struct ucred **credanonp)
{
        return (0);
}
1842
1843 /*
1844  * Support code for hammer2_mount().  Read, verify, and install the volume
1845  * header into the HMP
1846  *
1847  * XXX read four volhdrs and use the one with the highest TID whos CRC
1848  *     matches.
1849  *
1850  * XXX check iCRCs.
1851  *
1852  * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to
1853  *     nonexistant locations.
1854  *
1855  * XXX Record selected volhdr and ring updates to each of 4 volhdrs
1856  */
1857 static
1858 int
1859 hammer2_install_volume_header(hammer2_mount_t *hmp)
1860 {
1861         hammer2_volume_data_t *vd;
1862         struct buf *bp;
1863         hammer2_crc32_t crc0, crc, bcrc0, bcrc;
1864         int error_reported;
1865         int error;
1866         int valid;
1867         int i;
1868
1869         error_reported = 0;
1870         error = 0;
1871         valid = 0;
1872         bp = NULL;
1873
1874         /*
1875          * There are up to 4 copies of the volume header (syncs iterate
1876          * between them so there is no single master).  We don't trust the
1877          * volu_size field so we don't know precisely how large the filesystem
1878          * is, so depend on the OS to return an error if we go beyond the
1879          * block device's EOF.
1880          */
1881         for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
1882                 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
1883                               HAMMER2_VOLUME_BYTES, &bp);
1884                 if (error) {
1885                         brelse(bp);
1886                         bp = NULL;
1887                         continue;
1888                 }
1889
1890                 vd = (struct hammer2_volume_data *) bp->b_data;
1891                 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
1892                     (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
1893                         brelse(bp);
1894                         bp = NULL;
1895                         continue;
1896                 }
1897
1898                 if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
1899                         /* XXX: Reversed-endianness filesystem */
1900                         kprintf("hammer2: reverse-endian filesystem detected");
1901                         brelse(bp);
1902                         bp = NULL;
1903                         continue;
1904                 }
1905
1906                 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
1907                 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
1908                                       HAMMER2_VOLUME_ICRC0_SIZE);
1909                 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
1910                 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
1911                                        HAMMER2_VOLUME_ICRC1_SIZE);
1912                 if ((crc0 != crc) || (bcrc0 != bcrc)) {
1913                         kprintf("hammer2 volume header crc "
1914                                 "mismatch copy #%d %08x/%08x\n",
1915                                 i, crc0, crc);
1916                         error_reported = 1;
1917                         brelse(bp);
1918                         bp = NULL;
1919                         continue;
1920                 }
1921                 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
1922                         valid = 1;
1923                         hmp->voldata = *vd;
1924                         hmp->volhdrno = i;
1925                 }
1926                 brelse(bp);
1927                 bp = NULL;
1928         }
1929         if (valid) {
1930                 hmp->volsync = hmp->voldata;
1931                 error = 0;
1932                 if (error_reported || bootverbose || 1) { /* 1/DEBUG */
1933                         kprintf("hammer2: using volume header #%d\n",
1934                                 hmp->volhdrno);
1935                 }
1936         } else {
1937                 error = EINVAL;
1938                 kprintf("hammer2: no valid volume headers found!\n");
1939         }
1940         return (error);
1941 }
1942
1943 /*
1944  * Reconnect using the passed file pointer.  The caller must ref the
1945  * fp for us.
1946  */
1947 void
1948 hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
1949 {
1950         hammer2_inode_data_t *ipdata;
1951         hammer2_chain_t *parent;
1952         hammer2_mount_t *hmp;
1953         size_t name_len;
1954
1955         hmp = pmp->cluster.chains[0]->hmp;      /* XXX */
1956
1957         /*
1958          * Closes old comm descriptor, kills threads, cleans up
1959          * states, then installs the new descriptor and creates
1960          * new threads.
1961          */
1962         kdmsg_iocom_reconnect(&pmp->iocom, fp, "hammer2");
1963
1964         /*
1965          * Setup LNK_CONN fields for autoinitiated state machine
1966          */
1967         parent = hammer2_inode_lock_ex(pmp->iroot);
1968         ipdata = &parent->data->ipdata;
1969         pmp->iocom.auto_lnk_conn.pfs_clid = ipdata->pfs_clid;
1970         pmp->iocom.auto_lnk_conn.pfs_fsid = ipdata->pfs_fsid;
1971         pmp->iocom.auto_lnk_conn.pfs_type = ipdata->pfs_type;
1972         pmp->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
1973         pmp->iocom.auto_lnk_conn.peer_type = hmp->voldata.peer_type;
1974
1975         /*
1976          * Filter adjustment.  Clients do not need visibility into other
1977          * clients (otherwise millions of clients would present a serious
1978          * problem).  The fs_label also serves to restrict the namespace.
1979          */
1980         pmp->iocom.auto_lnk_conn.peer_mask = 1LLU << HAMMER2_PEER_HAMMER2;
1981         pmp->iocom.auto_lnk_conn.pfs_mask = (uint64_t)-1;
1982         switch (ipdata->pfs_type) {
1983         case DMSG_PFSTYPE_CLIENT:
1984                 pmp->iocom.auto_lnk_conn.peer_mask &=
1985                                 ~(1LLU << DMSG_PFSTYPE_CLIENT);
1986                 break;
1987         default:
1988                 break;
1989         }
1990
1991         name_len = ipdata->name_len;
1992         if (name_len >= sizeof(pmp->iocom.auto_lnk_conn.fs_label))
1993                 name_len = sizeof(pmp->iocom.auto_lnk_conn.fs_label) - 1;
1994         bcopy(ipdata->filename,
1995               pmp->iocom.auto_lnk_conn.fs_label,
1996               name_len);
1997         pmp->iocom.auto_lnk_conn.fs_label[name_len] = 0;
1998
1999         /*
2000          * Setup LNK_SPAN fields for autoinitiated state machine
2001          */
2002         pmp->iocom.auto_lnk_span.pfs_clid = ipdata->pfs_clid;
2003         pmp->iocom.auto_lnk_span.pfs_fsid = ipdata->pfs_fsid;
2004         pmp->iocom.auto_lnk_span.pfs_type = ipdata->pfs_type;
2005         pmp->iocom.auto_lnk_span.peer_type = hmp->voldata.peer_type;
2006         pmp->iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1;
2007         name_len = ipdata->name_len;
2008         if (name_len >= sizeof(pmp->iocom.auto_lnk_span.fs_label))
2009                 name_len = sizeof(pmp->iocom.auto_lnk_span.fs_label) - 1;
2010         bcopy(ipdata->filename,
2011               pmp->iocom.auto_lnk_span.fs_label,
2012               name_len);
2013         pmp->iocom.auto_lnk_span.fs_label[name_len] = 0;
2014         hammer2_inode_unlock_ex(pmp->iroot, parent);
2015
2016         kdmsg_iocom_autoinitiate(&pmp->iocom, hammer2_autodmsg);
2017 }
2018
2019 static int
2020 hammer2_rcvdmsg(kdmsg_msg_t *msg)
2021 {
2022         switch(msg->any.head.cmd & DMSGF_TRANSMASK) {
2023         case DMSG_DBG_SHELL:
2024                 /*
2025                  * (non-transaction)
2026                  * Execute shell command (not supported atm)
2027                  */
2028                 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2029                 break;
2030         case DMSG_DBG_SHELL | DMSGF_REPLY:
2031                 /*
2032                  * (non-transaction)
2033                  */
2034                 if (msg->aux_data) {
2035                         msg->aux_data[msg->aux_size - 1] = 0;
2036                         kprintf("HAMMER2 DBG: %s\n", msg->aux_data);
2037                 }
2038                 break;
2039         default:
2040                 /*
2041                  * Unsupported message received.  We only need to
2042                  * reply if it's a transaction in order to close our end.
2043                  * Ignore any one-way messages are any further messages
2044                  * associated with the transaction.
2045                  *
2046                  * NOTE: This case also includes DMSG_LNK_ERROR messages
2047                  *       which might be one-way, replying to those would
2048                  *       cause an infinite ping-pong.
2049                  */
2050                 if (msg->any.head.cmd & DMSGF_CREATE)
2051                         kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2052                 break;
2053         }
2054         return(0);
2055 }
2056
2057 /*
2058  * This function is called after KDMSG has automatically handled processing
2059  * of a LNK layer message (typically CONN, SPAN, or CIRC).
2060  *
2061  * We tag off the LNK_CONN to trigger our LNK_VOLCONF messages which
2062  * advertises all available hammer2 super-root volumes.
2063  */
2064 static void
2065 hammer2_autodmsg(kdmsg_msg_t *msg)
2066 {
2067         hammer2_pfsmount_t *pmp = msg->iocom->handle;
2068         hammer2_mount_t *hmp = pmp->cluster.chains[0]->hmp; /* XXX */
2069         int copyid;
2070
2071         /*
2072          * We only care about replies to our LNK_CONN auto-request.  kdmsg
2073          * has already processed the reply, we use this calback as a shim
2074          * to know when we can advertise available super-root volumes.
2075          */
2076         if ((msg->any.head.cmd & DMSGF_TRANSMASK) !=
2077             (DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_REPLY) ||
2078             msg->state == NULL) {
2079                 return;
2080         }
2081
2082         kprintf("LNK_CONN REPLY RECEIVED CMD %08x\n", msg->any.head.cmd);
2083
2084         if (msg->any.head.cmd & DMSGF_CREATE) {
2085                 kprintf("HAMMER2: VOLDATA DUMP\n");
2086
2087                 /*
2088                  * Dump the configuration stored in the volume header
2089                  */
2090                 hammer2_voldata_lock(hmp);
2091                 for (copyid = 0; copyid < HAMMER2_COPYID_COUNT; ++copyid) {
2092                         if (hmp->voldata.copyinfo[copyid].copyid == 0)
2093                                 continue;
2094                         hammer2_volconf_update(pmp, copyid);
2095                 }
2096                 hammer2_voldata_unlock(hmp, 0);
2097         }
2098         if ((msg->any.head.cmd & DMSGF_DELETE) &&
2099             msg->state && (msg->state->txcmd & DMSGF_DELETE) == 0) {
2100                 kprintf("HAMMER2: CONN WAS TERMINATED\n");
2101         }
2102 }
2103
2104 /*
2105  * Volume configuration updates are passed onto the userland service
2106  * daemon via the open LNK_CONN transaction.
2107  */
2108 void
2109 hammer2_volconf_update(hammer2_pfsmount_t *pmp, int index)
2110 {
2111         hammer2_mount_t *hmp = pmp->cluster.chains[0]->hmp;     /* XXX */
2112         kdmsg_msg_t *msg;
2113
2114         /* XXX interlock against connection state termination */
2115         kprintf("volconf update %p\n", pmp->iocom.conn_state);
2116         if (pmp->iocom.conn_state) {
2117                 kprintf("TRANSMIT VOLCONF VIA OPEN CONN TRANSACTION\n");
2118                 msg = kdmsg_msg_alloc_state(pmp->iocom.conn_state,
2119                                             DMSG_LNK_VOLCONF, NULL, NULL);
2120                 msg->any.lnk_volconf.copy = hmp->voldata.copyinfo[index];
2121                 msg->any.lnk_volconf.mediaid = hmp->voldata.fsid;
2122                 msg->any.lnk_volconf.index = index;
2123                 kdmsg_msg_write(msg);
2124         }
2125 }
2126
2127 void
2128 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp)
2129 {
2130         hammer2_chain_layer_t *layer;
2131         hammer2_chain_t *scan;
2132         hammer2_chain_t *first_parent;
2133
2134         --*countp;
2135         if (*countp == 0) {
2136                 kprintf("%*.*s...\n", tab, tab, "");
2137                 return;
2138         }
2139         if (*countp < 0)
2140                 return;
2141         first_parent = chain->core ? TAILQ_FIRST(&chain->core->ownerq) : NULL;
2142         kprintf("%*.*schain %p.%d [%08x][core=%p fp=%p] (%s) np=%p dt=%s refs=%d",
2143                 tab, tab, "",
2144                 chain, chain->bref.type, chain->flags,
2145                 chain->core,
2146                 first_parent,
2147                 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
2148                 chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
2149                 (first_parent ? TAILQ_NEXT(chain, core_entry) : NULL),
2150                 (chain->delete_tid == HAMMER2_MAX_TID ? "max" : "fls"),
2151                 chain->refs);
2152         if (first_parent)
2153                 kprintf(" [fpflags %08x fprefs %d\n",
2154                         first_parent->flags,
2155                         first_parent->refs);
2156         if (chain->core == NULL || TAILQ_EMPTY(&chain->core->layerq))
2157                 kprintf("\n");
2158         else
2159                 kprintf(" {\n");
2160         TAILQ_FOREACH(layer, &chain->core->layerq, entry) {
2161                 RB_FOREACH(scan, hammer2_chain_tree, &layer->rbtree) {
2162                         hammer2_dump_chain(scan, tab + 4, countp);
2163                 }
2164         }
2165         if (chain->core && !TAILQ_EMPTY(&chain->core->layerq)) {
2166                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
2167                         kprintf("%*.*s}(%s)\n", tab, tab, "",
2168                                 chain->data->ipdata.filename);
2169                 else
2170                         kprintf("%*.*s}\n", tab, tab, "");
2171         }
2172 }