hammer2 - GSOC cleanup pass
[dragonfly.git] / sys / vfs / hammer2 / hammer2_vfsops.c
1 /*-
2  * Copyright (c) 2011-2013 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  * by Daniel Flores (GSOC 2013 - mentored by Matthew Dillon, compression)
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  *
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in
16  *    the documentation and/or other materials provided with the
17  *    distribution.
18  * 3. Neither the name of The DragonFly Project nor the names of its
19  *    contributors may be used to endorse or promote products derived
20  *    from this software without specific, prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
26  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
28  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
30  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
31  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
32  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 #include <sys/param.h>
36 #include <sys/systm.h>
37 #include <sys/kernel.h>
38 #include <sys/nlookup.h>
39 #include <sys/vnode.h>
40 #include <sys/mount.h>
41 #include <sys/fcntl.h>
42 #include <sys/buf.h>
43 #include <sys/uuid.h>
44 #include <sys/vfsops.h>
45 #include <sys/sysctl.h>
46 #include <sys/socket.h>
47 #include <sys/objcache.h>
48
49 #include <sys/proc.h>
50 #include <sys/namei.h>
51 #include <sys/mountctl.h>
52 #include <sys/dirent.h>
53 #include <sys/uio.h>
54
55 #include <sys/mutex.h>
56 #include <sys/mutex2.h>
57
58 #include "hammer2.h"
59 #include "hammer2_disk.h"
60 #include "hammer2_mount.h"
61
62 #include "hammer2.h"
63 #include "hammer2_lz4.h"
64
65 #include "zlib/hammer2_zlib.h"
66
#define REPORT_REFS_ERRORS 1    /* XXX remove me */

/*
 * NOTE(review): M_OBJCACHE is normally defined by the kernel's objcache
 * subsystem itself; defining it again here looks like a duplicate --
 * verify this does not collide at link/registration time.
 */
MALLOC_DEFINE(M_OBJCACHE, "objcache", "Object Cache");

/*
 * Per-sync state passed through the vsyncscan() callbacks
 * (hammer2_sync_scan1/scan2) during hammer2_vfs_sync().
 */
struct hammer2_sync_info {
        hammer2_trans_t trans;  /* flush transaction shared by all vnodes */
        int error;              /* first error encountered during the scan */
        int waitfor;            /* MNT_WAIT / MNT_NOWAIT passed to VOP_FSYNC */
};
76
/*
 * Global list of device-level mounts (hammer2_mount_t), used to detect
 * second-and-subsequent PFS mounts from the same block device.  Protected
 * by hammer2_mntlk.
 */
TAILQ_HEAD(hammer2_mntlist, hammer2_mount);
static struct hammer2_mntlist hammer2_mntlist;
static struct lock hammer2_mntlk;

/*
 * Run-time tunables, exported below via sysctl vfs.hammer2.*.
 */
int hammer2_debug;
int hammer2_cluster_enable = 1;
int hammer2_hardlink_enable = 1;

/*
 * I/O statistics counters by data type (file data, meta-data, indirect
 * blocks, freemap, volume header).  "iod" counters are the device-level
 * I/O byte counts; "ioa" presumably accounts a second (logical/async?)
 * path -- nothing in this file updates them, so confirm against the
 * rest of the filesystem before relying on that reading.
 */
long hammer2_iod_file_read;
long hammer2_iod_meta_read;
long hammer2_iod_indr_read;
long hammer2_iod_fmap_read;
long hammer2_iod_volu_read;
long hammer2_iod_file_write;
long hammer2_iod_meta_write;
long hammer2_iod_indr_write;
long hammer2_iod_fmap_write;
long hammer2_iod_volu_write;
long hammer2_ioa_file_read;
long hammer2_ioa_meta_read;
long hammer2_ioa_indr_read;
long hammer2_ioa_fmap_read;
long hammer2_ioa_volu_read;
long hammer2_ioa_fmap_write;
long hammer2_ioa_file_write;
long hammer2_ioa_meta_write;
long hammer2_ioa_indr_write;
long hammer2_ioa_volu_write;
104
/*
 * Malloc types backing the objcache pools created in hammer2_vfs_init():
 * C_BUFFER feeds cache_buffer_write (compression scratch space) and
 * D_BUFFER feeds cache_buffer_read (decompression scratch space).
 */
MALLOC_DECLARE(C_BUFFER);
MALLOC_DEFINE(C_BUFFER, "compbuffer", "Buffer used for compression.");

MALLOC_DECLARE(D_BUFFER);
MALLOC_DEFINE(D_BUFFER, "decompbuffer", "Buffer used for decompression.");

/*
 * NOTE(review): W_BIOQUEUE and W_MTX are declared but nothing in this
 * file allocates with them (the write thread's bioq/mutex are embedded
 * in hammer2_mount_t) -- confirm they are used elsewhere or remove.
 */
MALLOC_DECLARE(W_BIOQUEUE);
MALLOC_DEFINE(W_BIOQUEUE, "wbioqueue", "Writing bio queue.");

MALLOC_DECLARE(W_MTX);
MALLOC_DEFINE(W_MTX, "wmutex", "Mutex for write thread.");

/*
 * sysctl vfs.hammer2.* -- debug/feature knobs followed by the I/O
 * statistics counters declared above.
 */
SYSCTL_NODE(_vfs, OID_AUTO, hammer2, CTLFLAG_RW, 0, "HAMMER2 filesystem");

SYSCTL_INT(_vfs_hammer2, OID_AUTO, debug, CTLFLAG_RW,
           &hammer2_debug, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, cluster_enable, CTLFLAG_RW,
           &hammer2_cluster_enable, 0, "");
SYSCTL_INT(_vfs_hammer2, OID_AUTO, hardlink_enable, CTLFLAG_RW,
           &hammer2_hardlink_enable, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_read, CTLFLAG_RW,
           &hammer2_iod_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_read, CTLFLAG_RW,
           &hammer2_iod_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_read, CTLFLAG_RW,
           &hammer2_iod_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_read, CTLFLAG_RW,
           &hammer2_iod_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_read, CTLFLAG_RW,
           &hammer2_iod_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_file_write, CTLFLAG_RW,
           &hammer2_iod_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_meta_write, CTLFLAG_RW,
           &hammer2_iod_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_indr_write, CTLFLAG_RW,
           &hammer2_iod_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_fmap_write, CTLFLAG_RW,
           &hammer2_iod_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, iod_volu_write, CTLFLAG_RW,
           &hammer2_iod_volu_write, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_read, CTLFLAG_RW,
           &hammer2_ioa_file_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_read, CTLFLAG_RW,
           &hammer2_ioa_meta_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_read, CTLFLAG_RW,
           &hammer2_ioa_indr_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_read, CTLFLAG_RW,
           &hammer2_ioa_fmap_read, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_read, CTLFLAG_RW,
           &hammer2_ioa_volu_read, 0, "");

SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_file_write, CTLFLAG_RW,
           &hammer2_ioa_file_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_meta_write, CTLFLAG_RW,
           &hammer2_ioa_meta_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_indr_write, CTLFLAG_RW,
           &hammer2_ioa_indr_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_fmap_write, CTLFLAG_RW,
           &hammer2_ioa_fmap_write, 0, "");
SYSCTL_LONG(_vfs_hammer2, OID_AUTO, ioa_volu_write, CTLFLAG_RW,
           &hammer2_ioa_volu_write, 0, "");
169
/*
 * Forward declarations: VFS operation entry points (installed in
 * hammer2_vfsops below).
 */
static int hammer2_vfs_init(struct vfsconf *conf);
static int hammer2_vfs_uninit(struct vfsconf *vfsp);
static int hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                                struct ucred *cred);
static int hammer2_remount(struct mount *, char *, struct vnode *,
                                struct ucred *);
static int hammer2_vfs_unmount(struct mount *mp, int mntflags);
static int hammer2_vfs_root(struct mount *mp, struct vnode **vpp);
static int hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp,
                                struct ucred *cred);
static int hammer2_vfs_sync(struct mount *mp, int waitfor);
static int hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
                                ino_t ino, struct vnode **vpp);
static int hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
                                struct fid *fhp, struct vnode **vpp);
static int hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp);
static int hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                                int *exflagsp, struct ucred **credanonp);

/*
 * Internal helpers: volume-header validation and the two sync-scan
 * callbacks used by hammer2_vfs_sync().
 */
static int hammer2_install_volume_header(hammer2_mount_t *hmp);
static int hammer2_sync_scan1(struct mount *mp, struct vnode *vp, void *data);
static int hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data);

/* Backend thread that drains the per-device strategy-write bioq. */
static void hammer2_write_thread(void *arg);

/* 
 * Functions for compression in threads,
 * from hammer2_vnops.c
 */
static void hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_inode_data_t *ipdata,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase, int ioflag, int pblksize,
                                int *errorp);
static void hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_inode_data_t *ipdata,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase, int ioflag,
                                int pblksize, int *errorp, int comp_method);
static void hammer2_zero_check_and_write(struct buf *bp,
                                hammer2_trans_t *trans, hammer2_inode_t *ip,
                                hammer2_inode_data_t *ipdata,
                                hammer2_chain_t **parentp,
                                hammer2_key_t lbase,
                                int ioflag, int pblksize, int *errorp);
static int test_block_not_zeros(char *buf, size_t bytes);
static void zero_write(struct buf *bp, hammer2_trans_t *trans,
                                hammer2_inode_t *ip,
                                hammer2_inode_data_t *ipdata,
                                hammer2_chain_t **parentp, 
                                hammer2_key_t lbase,
                                int *errorp);
static void hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp,
                                int ioflag, int pblksize, int *errorp);

/* Cluster messaging (kdmsg) receive and auto-reply handlers. */
static int hammer2_rcvdmsg(kdmsg_msg_t *msg);
static void hammer2_autodmsg(kdmsg_msg_t *msg);
232
233 /*
234  * HAMMER2 vfs operations.
235  */
236 static struct vfsops hammer2_vfsops = {
237         .vfs_init       = hammer2_vfs_init,
238         .vfs_uninit = hammer2_vfs_uninit,
239         .vfs_sync       = hammer2_vfs_sync,
240         .vfs_mount      = hammer2_vfs_mount,
241         .vfs_unmount    = hammer2_vfs_unmount,
242         .vfs_root       = hammer2_vfs_root,
243         .vfs_statfs     = hammer2_vfs_statfs,
244         .vfs_statvfs    = hammer2_vfs_statvfs,
245         .vfs_vget       = hammer2_vfs_vget,
246         .vfs_vptofh     = hammer2_vfs_vptofh,
247         .vfs_fhtovp     = hammer2_vfs_fhtovp,
248         .vfs_checkexp   = hammer2_vfs_checkexp
249 };
250
251 MALLOC_DEFINE(M_HAMMER2, "HAMMER2-mount", "");
252
253 VFS_SET(hammer2_vfsops, hammer2, 0);
254 MODULE_VERSION(hammer2, 1);
255
256 static
257 int
258 hammer2_vfs_init(struct vfsconf *conf)
259 {
260         static struct objcache_malloc_args margs_read;
261         static struct objcache_malloc_args margs_write;
262
263         int error;
264
265         error = 0;
266
267         if (HAMMER2_BLOCKREF_BYTES != sizeof(struct hammer2_blockref))
268                 error = EINVAL;
269         if (HAMMER2_INODE_BYTES != sizeof(struct hammer2_inode_data))
270                 error = EINVAL;
271         if (HAMMER2_VOLUME_BYTES != sizeof(struct hammer2_volume_data))
272                 error = EINVAL;
273
274         if (error)
275                 kprintf("HAMMER2 structure size mismatch; cannot continue.\n");
276         
277         margs_read.objsize = 65536;
278         margs_read.mtype = D_BUFFER;
279         
280         margs_write.objsize = 32768;
281         margs_write.mtype = C_BUFFER;
282         
283         cache_buffer_read = objcache_create(margs_read.mtype->ks_shortdesc,
284                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
285                                 objcache_malloc_free, &margs_read);
286         cache_buffer_write = objcache_create(margs_write.mtype->ks_shortdesc,
287                                 0, 1, NULL, NULL, NULL, objcache_malloc_alloc,
288                                 objcache_malloc_free, &margs_write);
289
290         lockinit(&hammer2_mntlk, "mntlk", 0, 0);
291         TAILQ_INIT(&hammer2_mntlist);
292
293         return (error);
294 }
295
296 static
297 int
298 hammer2_vfs_uninit(struct vfsconf *vfsp __unused)
299 {
300         objcache_destroy(cache_buffer_read);
301         objcache_destroy(cache_buffer_write);
302         return 0;
303 }
304
/*
 * Mount or remount HAMMER2 fileystem from physical media
 *
 *      mountroot
 *              mp              mount point structure
 *              path            NULL
 *              data            <unused>
 *              cred            <unused>
 *
 *      mount
 *              mp              mount point structure
 *              path            path to mount point
 *              data            pointer to argument structure in user space
 *                      volume  volume path (device@LABEL form)
 *                      hflags  user mount flags
 *              cred            user credentials
 *
 * RETURNS:     0       Success
 *              !0      error number
 */
static
int
hammer2_vfs_mount(struct mount *mp, char *path, caddr_t data,
                  struct ucred *cred)
{
        struct hammer2_mount_info info;
        hammer2_pfsmount_t *pmp;
        hammer2_mount_t *hmp;
        hammer2_key_t lhc;
        struct vnode *devvp;
        struct nlookupdata nd;
        hammer2_chain_t *parent;
        hammer2_chain_t *schain;
        hammer2_chain_t *rchain;
        struct file *fp;
        char devstr[MNAMELEN];
        size_t size;
        size_t done;
        char *dev;
        char *label;
        int ronly = 1;
        int error;

        hmp = NULL;
        pmp = NULL;
        dev = NULL;
        label = NULL;
        devvp = NULL;

        kprintf("hammer2_mount\n");

        if (path == NULL) {
                /*
                 * Root mount (not yet supported)
                 */
                bzero(&info, sizeof(info));
                info.cluster_fd = -1;
                return (EOPNOTSUPP);
        } else {
                /*
                 * Non-root mount or updating a mount
                 */
                error = copyin(data, &info, sizeof(info));
                if (error)
                        return (error);

                error = copyinstr(info.volume, devstr, MNAMELEN - 1, &done);
                if (error)
                        return (error);

                /*
                 * Extract device and label from the "device@LABEL"
                 * volume string; both halves must be non-empty.
                 */
                dev = devstr;
                label = strchr(devstr, '@');
                if (label == NULL ||
                    ((label + 1) - dev) > done) {
                        return (EINVAL);
                }
                *label = '\0';
                label++;
                if (*label == '\0')
                        return (EINVAL);

                if (mp->mnt_flag & MNT_UPDATE) {
                        /* Update mount */
                        /* HAMMER2 implements NFS export via mountctl */
                        hmp = MPTOHMP(mp);
                        devvp = hmp->devvp;
                        error = hammer2_remount(mp, path, devvp, cred);
                        return error;
                }
        }

        /*
         * PFS mount
         *
         * Lookup name and verify it refers to a block device.
         */
        error = nlookup_init(&nd, dev, UIO_SYSSPACE, NLC_FOLLOW);
        if (error == 0)
                error = nlookup(&nd);
        if (error == 0)
                error = cache_vref(&nd.nl_nch, nd.nl_cred, &devvp);
        nlookup_done(&nd);

        if (error == 0) {
                if (vn_isdisk(devvp, &error))
                        error = vfs_mountedon(devvp);
        }

        /*
         * Determine if the device has already been mounted.  After this
         * check hmp will be non-NULL if we are doing the second or more
         * hammer2 mounts from the same device.
         */
        lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
        TAILQ_FOREACH(hmp, &hammer2_mntlist, mntentry) {
                if (hmp->devvp == devvp)
                        break;
        }

        /*
         * Open the device if this isn't a secondary mount and construct
         * the H2 device mount (hmp).
         */
        if (hmp == NULL) {
                if (error == 0 && vcount(devvp) > 0)
                        error = EBUSY;

                /*
                 * Now open the device
                 */
                if (error == 0) {
                        ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
                        vn_lock(devvp, LK_EXCLUSIVE | LK_RETRY);
                        error = vinvalbuf(devvp, V_SAVE, 0, 0);
                        if (error == 0) {
                                error = VOP_OPEN(devvp,
                                                 ronly ? FREAD : FREAD | FWRITE,
                                                 FSCRED, NULL);
                        }
                        vn_unlock(devvp);
                }
                if (error && devvp) {
                        vrele(devvp);
                        devvp = NULL;
                }
                if (error) {
                        lockmgr(&hammer2_mntlk, LK_RELEASE);
                        return error;
                }
                hmp = kmalloc(sizeof(*hmp), M_HAMMER2, M_WAITOK | M_ZERO);
                hmp->ronly = ronly;
                hmp->devvp = devvp;
                kmalloc_create(&hmp->mchain, "HAMMER2-chains");
                TAILQ_INSERT_TAIL(&hammer2_mntlist, hmp, mntentry);

                lockinit(&hmp->alloclk, "h2alloc", 0, 0);
                lockinit(&hmp->voldatalk, "voldata", 0, LK_CANRECURSE);
                TAILQ_INIT(&hmp->transq);

                /*
                 * vchain setup. vchain.data is embedded.
                 * vchain.refs is initialized and will never drop to 0.
                 */
                hmp->vchain.hmp = hmp;
                hmp->vchain.refs = 1;
                hmp->vchain.data = (void *)&hmp->voldata;
                hmp->vchain.bref.type = HAMMER2_BREF_TYPE_VOLUME;
                hmp->vchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->vchain.delete_tid = HAMMER2_MAX_TID;
                hammer2_chain_core_alloc(&hmp->vchain, NULL);
                /* hmp->vchain.u.xxx is left NULL */

                /*
                 * fchain setup.  fchain.data is embedded.
                 * fchain.refs is initialized and will never drop to 0.
                 *
                 * The data is not used but needs to be initialized to
                 * pass assertion muster.  We use this chain primarily
                 * as a placeholder for the freemap's top-level RBTREE
                 * so it does not interfere with the volume's topology
                 * RBTREE.
                 */
                hmp->fchain.hmp = hmp;
                hmp->fchain.refs = 1;
                hmp->fchain.data = (void *)&hmp->voldata.freemap_blockset;
                hmp->fchain.bref.type = HAMMER2_BREF_TYPE_FREEMAP;
                hmp->fchain.bref.data_off = 0 | HAMMER2_PBUFRADIX;
                hmp->fchain.bref.methods =
                        HAMMER2_ENC_CHECK(HAMMER2_CHECK_FREEMAP) |
                        HAMMER2_ENC_COMP(HAMMER2_COMP_NONE);
                hmp->fchain.delete_tid = HAMMER2_MAX_TID;

                hammer2_chain_core_alloc(&hmp->fchain, NULL);
                /* hmp->fchain.u.xxx is left NULL */

                /*
                 * Install the volume header
                 */
                error = hammer2_install_volume_header(hmp);
                if (error) {
                        /*
                         * NOTE(review): hammer2_mntlk is still held
                         * exclusively here and this return does not
                         * release it; verify against the unmount path
                         * (lock leak or self-deadlock suspected).
                         */
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return error;
                }

                /*
                 * First locate the super-root inode, which is key 0
                 * relative to the volume header's blockset.
                 *
                 * Then locate the root inode by scanning the directory keyspace
                 * represented by the label.
                 */
                parent = hammer2_chain_lookup_init(&hmp->vchain, 0);
                schain = hammer2_chain_lookup(&parent,
                                      HAMMER2_SROOT_KEY, HAMMER2_SROOT_KEY, 0);
                hammer2_chain_lookup_done(parent);
                if (schain == NULL) {
                        kprintf("hammer2_mount: invalid super-root\n");
                        /*
                         * NOTE(review): same concern as above --
                         * hammer2_mntlk is not released on this return.
                         */
                        hammer2_vfs_unmount(mp, MNT_FORCE);
                        return EINVAL;
                }
                hammer2_chain_ref(schain);      /* for hmp->schain */
                hmp->schain = schain;           /* left locked for inode_get */
                hmp->sroot = hammer2_inode_get(NULL, NULL, schain);
                hammer2_inode_ref(hmp->sroot);  /* for hmp->sroot */
                hammer2_inode_unlock_ex(hmp->sroot, schain);
                schain = NULL;

                /* Strategy-write queue consumed by hammer2_write_thread. */
                mtx_init(&hmp->wthread_mtx);
                bioq_init(&hmp->wthread_bioq);
                hmp->wthread_destroy = 0;

                /*
                 * Launch threads.
                 */
                lwkt_create(hammer2_write_thread, hmp,
                                NULL, NULL, 0, -1, "hammer2-write");
        }

        /*
         * Block device opened successfully, finish initializing the
         * mount structure.
         *
         * From this point on we have to call hammer2_unmount() on failure.
         */
        pmp = kmalloc(sizeof(*pmp), M_HAMMER2, M_WAITOK | M_ZERO);
        pmp->mount_cluster = kmalloc(sizeof(hammer2_cluster_t), M_HAMMER2,
                                     M_WAITOK | M_ZERO);
        pmp->cluster = pmp->mount_cluster;

        kmalloc_create(&pmp->minode, "HAMMER2-inodes");
        kmalloc_create(&pmp->mmsg, "HAMMER2-pfsmsg");

        pmp->mount_cluster->hmp = hmp;
        spin_init(&pmp->inum_spin);
        RB_INIT(&pmp->inum_tree);

        /* Cluster messaging I/O com; messages arrive via hammer2_rcvdmsg. */
        kdmsg_iocom_init(&pmp->iocom, pmp,
                         KDMSG_IOCOMF_AUTOCONN |
                         KDMSG_IOCOMF_AUTOSPAN |
                         KDMSG_IOCOMF_AUTOCIRC,
                         pmp->mmsg, hammer2_rcvdmsg);

        ccms_domain_init(&pmp->ccms_dom);
        ++hmp->pmp_count;
        lockmgr(&hammer2_mntlk, LK_RELEASE);
        kprintf("hammer2_mount hmp=%p pmp=%p pmpcnt=%d\n",
                hmp, pmp, hmp->pmp_count);

        mp->mnt_flag = MNT_LOCAL;
        mp->mnt_kern_flag |= MNTK_ALL_MPSAFE;   /* all entry pts are SMP */

        /*
         * required mount structure initializations
         */
        mp->mnt_stat.f_iosize = HAMMER2_PBUFSIZE;
        mp->mnt_stat.f_bsize = HAMMER2_PBUFSIZE;

        mp->mnt_vstat.f_frsize = HAMMER2_PBUFSIZE;
        mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;

        /*
         * Optional fields
         */
        mp->mnt_iosize_max = MAXPHYS;
        mp->mnt_data = (qaddr_t)pmp;
        pmp->mp = mp;

        /*
         * schain only has 1 ref now for its hmp->schain assignment.
         * Setup for lookup (which will lock it).
         */
        parent = hammer2_chain_lookup_init(hmp->schain, 0);
        lhc = hammer2_dirhash(label, strlen(label));
        rchain = hammer2_chain_lookup(&parent,
                                      lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                      0);
        /*
         * The dirhash only narrows the keyspace; scan the bucket for an
         * inode whose filename matches the requested label exactly.
         */
        while (rchain) {
                if (rchain->bref.type == HAMMER2_BREF_TYPE_INODE &&
                    strcmp(label, rchain->data->ipdata.filename) == 0) {
                        break;
                }
                rchain = hammer2_chain_next(&parent, rchain,
                                            lhc, lhc + HAMMER2_DIRHASH_LOMASK,
                                            0);
        }
        hammer2_chain_lookup_done(parent);
        if (rchain == NULL) {
                kprintf("hammer2_mount: PFS label not found\n");
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EINVAL;
        }
        if (rchain->flags & HAMMER2_CHAIN_MOUNTED) {
                hammer2_chain_unlock(rchain);
                kprintf("hammer2_mount: PFS label already mounted!\n");
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EBUSY;
        }
        if (rchain->flags & HAMMER2_CHAIN_RECYCLE) {
                kprintf("hammer2_mount: PFS label currently recycling\n");
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EBUSY;
        }

        atomic_set_int(&rchain->flags, HAMMER2_CHAIN_MOUNTED);

        /*
         * NOTE: *_get() integrates chain's lock into the inode lock.
         */
        hammer2_chain_ref(rchain);              /* for pmp->rchain */
        pmp->mount_cluster->rchain = rchain;    /* left held & unlocked */
        pmp->iroot = hammer2_inode_get(pmp, NULL, rchain);
        hammer2_inode_ref(pmp->iroot);          /* ref for pmp->iroot */

        KKASSERT(rchain->pmp == NULL);          /* tracking pmp for rchain */
        rchain->pmp = pmp;
        atomic_add_long(&pmp->inmem_chains, 1);

        hammer2_inode_unlock_ex(pmp->iroot, rchain);

        kprintf("iroot %p\n", pmp->iroot);

        /*
         * Ref the cluster management messaging descriptor.  The mount
         * program deals with the other end of the communications pipe.
         */
        fp = holdfp(curproc->p_fd, info.cluster_fd, -1);
        if (fp == NULL) {
                kprintf("hammer2_mount: bad cluster_fd!\n");
                hammer2_vfs_unmount(mp, MNT_FORCE);
                return EBADF;
        }
        hammer2_cluster_reconnect(pmp, fp);

        /*
         * Finish setup
         */
        vfs_getnewfsid(mp);
        vfs_add_vnodeops(mp, &hammer2_vnode_vops, &mp->mnt_vn_norm_ops);
        vfs_add_vnodeops(mp, &hammer2_spec_vops, &mp->mnt_vn_spec_ops);
        vfs_add_vnodeops(mp, &hammer2_fifo_vops, &mp->mnt_vn_fifo_ops);

        /* Record mount-from/mount-on names for statfs reporting. */
        copyinstr(info.volume, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, &size);
        bzero(mp->mnt_stat.f_mntfromname + size, MNAMELEN - size);
        bzero(mp->mnt_stat.f_mntonname, sizeof(mp->mnt_stat.f_mntonname));
        copyinstr(path, mp->mnt_stat.f_mntonname,
                  sizeof(mp->mnt_stat.f_mntonname) - 1,
                  &size);

        /*
         * Initial statfs to prime mnt_stat.
         */
        hammer2_vfs_statfs(mp, &mp->mnt_stat, cred);

        return 0;
}
682
/*
 * Handle bioq for strategy write
 *
 * Per-device kernel thread (started from hammer2_vfs_mount) that drains
 * hmp->wthread_bioq.  Sleeps on the queue under wthread_mtx, then writes
 * each queued buffer through hammer2_write_file_core() inside a cached
 * BUFCACHE transaction.  Exits when wthread_destroy is set non-zero,
 * acknowledging by setting it to -1 and waking the waiter.
 */
static
void
hammer2_write_thread(void *arg)
{
        hammer2_mount_t* hmp;
        struct bio *bio;
        struct buf *bp;
        hammer2_trans_t trans;
        struct vnode *vp;
        hammer2_inode_t *last_ip;
        hammer2_inode_t *ip;
        hammer2_chain_t *parent;
        hammer2_chain_t **parentp;
        hammer2_inode_data_t *ipdata;
        hammer2_key_t lbase;
        int lblksize;
        int pblksize;
        int error;
        
        hmp = arg;
        
        mtx_lock(&hmp->wthread_mtx);
        while (hmp->wthread_destroy == 0) {
                /* Wait for work; producer wakes us via wthread_bioq. */
                if (bioq_first(&hmp->wthread_bioq) == NULL) {
                        mtxsleep(&hmp->wthread_bioq, &hmp->wthread_mtx,
                                 0, "h2bioqw", 0);
                }
                last_ip = NULL;
                parent = NULL;
                parentp = &parent;

                /*
                 * Drain the queue.  wthread_mtx is dropped while the
                 * buffer is being written and re-taken to dequeue the
                 * next bio.
                 */
                while ((bio = bioq_takefirst(&hmp->wthread_bioq)) != NULL) {
                        mtx_unlock(&hmp->wthread_mtx);
                        
                        error = 0;
                        bp = bio->bio_buf;
                        vp = bp->b_vp;
                        ip = VTOI(vp);

                        /*
                         * Cache transaction for multi-buffer flush efficiency.
                         * Lock the ip separately for each buffer to allow
                         * interleaving with frontend writes.
                         */
                        if (last_ip != ip) {
                                if (last_ip)
                                        hammer2_trans_done(&trans);
                                hammer2_trans_init(&trans, ip->pmp,
                                                   HAMMER2_TRANS_BUFCACHE);
                                last_ip = ip;
                        }
                        parent = hammer2_inode_lock_ex(ip);

                        /*
                         * Inode is modified, flush size and mtime changes
                         * to ensure that the file size remains consistent
                         * with the buffers being flushed.
                         */
                        if (ip->flags & (HAMMER2_INODE_RESIZED |
                                         HAMMER2_INODE_MTIME)) {
                                hammer2_inode_fsync(&trans, ip, parentp);
                        }
                        ipdata = hammer2_chain_modify_ip(&trans, ip,
                                                         parentp, 0);
                        /*
                         * NOTE(review): lblksize is computed but never
                         * used below -- only lbase and pblksize feed the
                         * write; confirm the call is needed for lbase.
                         */
                        lblksize = hammer2_calc_logical(ip, bio->bio_offset,
                                                        &lbase, NULL);
                        pblksize = hammer2_calc_physical(ip, lbase);
                        hammer2_write_file_core(bp, &trans, ip, ipdata,
                                                parentp,
                                                lbase, IO_ASYNC,
                                                pblksize, &error);
                        hammer2_inode_unlock_ex(ip, parent);
                        if (error) {
                                /* Flag the buffer; biodone() reports it. */
                                kprintf("hammer2: error in buffer write\n");
                                bp->b_flags |= B_ERROR;
                                bp->b_error = EIO;
                        }
                        biodone(bio);
                        mtx_lock(&hmp->wthread_mtx);
                }

                /*
                 * Clean out transaction cache
                 */
                if (last_ip)
                        hammer2_trans_done(&trans);
        }
        /* Acknowledge shutdown request and wake the unmount path. */
        hmp->wthread_destroy = -1;
        wakeup(&hmp->wthread_destroy);
        
        mtx_unlock(&hmp->wthread_mtx);
}
778
/*
 * From hammer2_vnops.c.
 * Physical block assignment function.
 *
 * Locate (or create) the chain backing logical offset lbase of inode ip
 * and make sure it has physical storage of pblksize bytes assigned.
 * Returns the chain locked, or NULL on create failure; *errorp is set
 * accordingly.  May replace *parentp when the resulting chain shares a
 * core with it (inode-embedded DIRECTDATA case) -- see cleanup below.
 */
static
hammer2_chain_t *
hammer2_assign_physical(hammer2_trans_t *trans,
			hammer2_inode_t *ip, hammer2_chain_t **parentp,
			hammer2_key_t lbase, int pblksize, int *errorp)
{
	hammer2_chain_t *parent;
	hammer2_chain_t *chain;
	hammer2_off_t pbase;
	int pradix = hammer2_getradix(pblksize);

	/*
	 * Locate the chain associated with lbase, return a locked chain.
	 * However, do not instantiate any data reference (which utilizes a
	 * device buffer) because we will be using direct IO via the
	 * logical buffer cache buffer.
	 */
	*errorp = 0;
retry:
	parent = *parentp;
	hammer2_chain_lock(parent, HAMMER2_RESOLVE_ALWAYS); /* extra lock */
	chain = hammer2_chain_lookup(&parent,
				     lbase, lbase,
				     HAMMER2_LOOKUP_NODATA);

	if (chain == NULL) {
		/*
		 * We found a hole, create a new chain entry.
		 *
		 * NOTE: DATA chains are created without device backing
		 *       store (nor do we want any).
		 */
		*errorp = hammer2_chain_create(trans, &parent, &chain,
					       lbase, HAMMER2_PBUFRADIX,
					       HAMMER2_BREF_TYPE_DATA,
					       pblksize);
		if (chain == NULL) {
			/*
			 * NOTE(review): the panic makes the retry loop
			 * unreachable; the goto appears to be vestigial
			 * from an earlier EAGAIN-style retry path.
			 */
			hammer2_chain_lookup_done(parent);
			panic("hammer2_chain_create: par=%p error=%d\n",
				parent, *errorp);
			goto retry;
		}

		pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
		/*ip->delta_dcount += pblksize;*/
	} else {
		switch (chain->bref.type) {
		case HAMMER2_BREF_TYPE_INODE:
			/*
			 * The data is embedded in the inode.  The
			 * caller is responsible for marking the inode
			 * modified and copying the data to the embedded
			 * area.
			 */
			pbase = NOOFFSET;
			break;
		case HAMMER2_BREF_TYPE_DATA:
			/*
			 * Existing data block.  Resize to the requested
			 * physical block size if necessary, then mark
			 * modified (OPTDATA avoids instantiating the
			 * device buffer).
			 */
			if (chain->bytes != pblksize) {
				hammer2_chain_resize(trans, ip,
						     parent, &chain,
						     pradix,
						     HAMMER2_MODIFY_OPTDATA);
			}
			hammer2_chain_modify(trans, &chain,
					     HAMMER2_MODIFY_OPTDATA);
			pbase = chain->bref.data_off & ~HAMMER2_OFF_MASK_RADIX;
			break;
		default:
			panic("hammer2_assign_physical: bad type");
			/* NOT REACHED */
			pbase = NOOFFSET;
			break;
		}
	}

	/*
	 * Cleanup.  If chain wound up being the inode (i.e. DIRECTDATA),
	 * we might have to replace *parentp.
	 */
	hammer2_chain_lookup_done(parent);
	if (chain) {
		if (*parentp != chain &&
		    (*parentp)->core == chain->core) {
			parent = *parentp;
			*parentp = chain;		/* eats lock */
			hammer2_chain_unlock(parent);
			hammer2_chain_lock(chain, 0);	/* need another */
		}
		/* else chain already locked for return */
	}
	return (chain);
}
875
876 /* 
877  * From hammer2_vnops.c.
878  * The core write function which determines which path to take
879  * depending on compression settings.
880  */
881 static
882 void
883 hammer2_write_file_core(struct buf *bp, hammer2_trans_t *trans,
884                         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
885                         hammer2_chain_t **parentp,
886                         hammer2_key_t lbase, int ioflag, int pblksize,
887                         int *errorp)
888 {
889         hammer2_chain_t *chain;
890         if (ipdata->comp_algo > HAMMER2_COMP_AUTOZERO) {
891                 hammer2_compress_and_write(bp, trans, ip,
892                                            ipdata, parentp,
893                                            lbase, ioflag,
894                                            pblksize, errorp,
895                                            ipdata->comp_algo);
896         } else if (ipdata->comp_algo == HAMMER2_COMP_AUTOZERO) {
897                 hammer2_zero_check_and_write(bp, trans, ip,
898                                     ipdata, parentp, lbase,
899                                     ioflag, pblksize, errorp);
900         } else {
901                 /*
902                  * We have to assign physical storage to the buffer
903                  * we intend to dirty or write now to avoid deadlocks
904                  * in the strategy code later.
905                  *
906                  * This can return NOOFFSET for inode-embedded data.
907                  * The strategy code will take care of it in that case.
908                  */
909                 chain = hammer2_assign_physical(trans, ip, parentp,
910                                                 lbase, pblksize,
911                                                 errorp);
912                 hammer2_write_bp(chain, bp, ioflag, pblksize, errorp);
913                 if (chain)
914                         hammer2_chain_unlock(chain);
915         }
916         ipdata = &ip->chain->data->ipdata;      /* reload */
917 }
918
/*
 * From hammer2_vnops.c
 * Generic function that will perform the compression in compression
 * write path. The compression algorithm is determined by the settings
 * obtained from inode.
 *
 * A per-inode heuristic (comp_heuristic) backs off the compression
 * attempt when recent blocks failed to compress.  All-zero blocks
 * bypass everything and become holes via zero_write().
 */
static
void
hammer2_compress_and_write(struct buf *bp, hammer2_trans_t *trans,
	hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
	hammer2_chain_t **parentp,
	hammer2_key_t lbase, int ioflag, int pblksize,
	int *errorp, int comp_method)
{
	hammer2_chain_t *chain;

	if (test_block_not_zeros(bp->b_data, pblksize)) {
		int comp_size = 0;	/* 0 == store uncompressed */
		int comp_block_size;
		char *comp_buffer;

		comp_buffer = NULL;

		/* compressed output must fit the half-size target buffer */
		KKASSERT(pblksize / 2 <= 32768);
		
		/*
		 * Only attempt compression when the heuristic allows it:
		 * always for the first 8 blocks, then 1 out of every 8.
		 */
		if (ip->comp_heuristic < 8 || (ip->comp_heuristic & 7) == 0) {
			if ((comp_method & 0x0F) == HAMMER2_COMP_LZ4) {
				comp_buffer = objcache_get(cache_buffer_write,
							   M_INTWAIT);
				comp_size = LZ4_compress_limitedOutput(
						bp->b_data,
						&comp_buffer[sizeof(int)],
						pblksize,
						pblksize / 2 - sizeof(int));
				/*
				 * We need to prefix with the size, LZ4
				 * doesn't do it for us.  Add the related
				 * overhead.
				 */
				*(int *)comp_buffer = comp_size;
				if (comp_size)
					comp_size += sizeof(int);
			} else if ((comp_method & 0x0F) == HAMMER2_COMP_ZLIB) {
				/* high nibble of comp_method = zlib level */
				int comp_level = (comp_method >> 4) & 0x0F;
				z_stream strm_compress;
				int ret;

				/*
				 * NOTE(review): strm_compress is not zeroed
				 * before deflateInit(); zlib documents that
				 * zalloc/zfree/opaque must be initialized
				 * (Z_NULL) first -- confirm the kernel zlib
				 * wrapper tolerates garbage here.  Also, on
				 * deflateInit failure we only kprintf and
				 * still fall through to deflate().
				 */
				ret = deflateInit(&strm_compress, comp_level);
				if (ret != Z_OK)
					kprintf("HAMMER2 ZLIB: fatal error "
						"on deflateInit.\n");
				
				comp_buffer = objcache_get(cache_buffer_write,
							   M_INTWAIT);
				strm_compress.next_in = bp->b_data;
				strm_compress.avail_in = pblksize;
				strm_compress.next_out = comp_buffer;
				strm_compress.avail_out = pblksize / 2;
				ret = deflate(&strm_compress, Z_FINISH);
				if (ret == Z_STREAM_END) {
					comp_size = pblksize / 2 -
						    strm_compress.avail_out;
				} else {
					/* didn't fit in half -- give up */
					comp_size = 0;
				}
				ret = deflateEnd(&strm_compress);
			} else {
				kprintf("Error: Unknown compression method.\n");
				kprintf("Comp_method = %d.\n", comp_method);
			}
		}

		if (comp_size == 0) {
			/*
			 * compression failed or turned off
			 */
			comp_block_size = pblksize;	/* safety */
			if (++ip->comp_heuristic > 128)
				ip->comp_heuristic = 8;
		} else {
			/*
			 * compression succeeded: round the stored block
			 * up to the next supported power-of-two size.
			 */
			ip->comp_heuristic = 0;
			if (comp_size <= 1024) {
				comp_block_size = 1024;
			} else if (comp_size <= 2048) {
				comp_block_size = 2048;
			} else if (comp_size <= 4096) {
				comp_block_size = 4096;
			} else if (comp_size <= 8192) {
				comp_block_size = 8192;
			} else if (comp_size <= 16384) {
				comp_block_size = 16384;
			} else if (comp_size <= 32768) {
				comp_block_size = 32768;
			} else {
				panic("hammer2: WRITE PATH: "
				      "Weird comp_size value.");
				/* NOT REACHED */
				comp_block_size = pblksize;
			}
		}

		chain = hammer2_assign_physical(trans, ip, parentp,
						lbase, comp_block_size,
						errorp);
		ipdata = &ip->chain->data->ipdata;	/* RELOAD */
			
		if (*errorp) {
			kprintf("WRITE PATH: An error occurred while "
				"assigning physical space.\n");
			KKASSERT(chain == NULL);
		} else {
			/* Get device offset */
			hammer2_off_t pbase;
			hammer2_off_t pmask;
			hammer2_off_t peof;
			size_t boff;
			size_t psize;
			struct buf *dbp;
			int temp_check;
			
			KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);
			
			switch(chain->bref.type) {
			case HAMMER2_BREF_TYPE_INODE:
				/* data is embedded directly in the inode */
				KKASSERT(chain->data->ipdata.op_flags &
					HAMMER2_OPFLAG_DIRECTDATA);
				KKASSERT(bp->b_loffset == 0);
				bcopy(bp->b_data, chain->data->ipdata.u.data,
					HAMMER2_EMBEDDED_BYTES);
				break;
			case HAMMER2_BREF_TYPE_DATA:				
				psize = hammer2_devblksize(chain->bytes);
				pmask = (hammer2_off_t)psize - 1;
				pbase = chain->bref.data_off & ~pmask;
				boff = chain->bref.data_off &
				       (HAMMER2_OFF_MASK & pmask);
				peof = (pbase + HAMMER2_SEGMASK64) &
				       ~HAMMER2_SEGMASK64;
				/* preserve the check method across re-encode */
				temp_check = HAMMER2_DEC_CHECK(
							chain->bref.methods);

				/*
				 * Optimize out the read-before-write
				 * if possible.
				 */
				if (comp_block_size == psize) {
					dbp = getblk(chain->hmp->devvp, pbase,
						     psize, 0, 0);
				} else {
					*errorp = bread(chain->hmp->devvp,
							pbase, psize, &dbp);
					if (*errorp) {
						kprintf("hammer2: WRITE PATH: "
							"dbp bread error\n");
						break;
					}
				}

				/*
				 * When loading the block make sure we don't
				 * leave garbage after the compressed data.
				 */
				if (comp_size) {
					chain->bref.methods =
						HAMMER2_ENC_COMP(comp_method) +
						HAMMER2_ENC_CHECK(temp_check);
					bcopy(comp_buffer, dbp->b_data + boff,
					      comp_size);
					if (comp_size != comp_block_size) {
						bzero(dbp->b_data + boff +
							comp_size,
						      comp_block_size -
							comp_size);
					}
				} else {
					chain->bref.methods =
						HAMMER2_ENC_COMP(
							HAMMER2_COMP_NONE) +
						HAMMER2_ENC_CHECK(temp_check);
					bcopy(bp->b_data, dbp->b_data + boff,
					      pblksize);
				}

				/*
				 * Device buffer is now valid, chain is no
				 * longer in the initial state.
				 */
				atomic_clear_int(&chain->flags,
						 HAMMER2_CHAIN_INITIAL);

				/* Now write the related bdp. */
				if (ioflag & IO_SYNC) {
					/*
					 * Synchronous I/O requested.
					 */
					bwrite(dbp);
				/*
				} else if ((ioflag & IO_DIRECT) &&
					   loff + n == pblksize) {
					bdwrite(dbp);
				*/
				} else if (ioflag & IO_ASYNC) {
					bawrite(dbp);
				} else if (hammer2_cluster_enable) {
					cluster_write(dbp, peof,
						      HAMMER2_PBUFSIZE,
						      4/*XXX*/);
				} else {
					bdwrite(dbp);
				}
				break;
			default:
				panic("hammer2_write_bp: bad chain type %d\n",
					chain->bref.type);
				/* NOT REACHED */
				break;
			}
			
			hammer2_chain_unlock(chain);
		}
		if (comp_buffer)
			objcache_put(cache_buffer_write, comp_buffer);
	} else {
		/* all-zero block: punch a hole instead of writing */
		zero_write(bp, trans, ip, ipdata, parentp, lbase, errorp);
	}
}
1148
1149 /*
1150  * Function that performs zero-checking and writing without compression,
1151  * it corresponds to default zero-checking path.
1152  */
1153 static
1154 void
1155 hammer2_zero_check_and_write(struct buf *bp, hammer2_trans_t *trans,
1156         hammer2_inode_t *ip, hammer2_inode_data_t *ipdata,
1157         hammer2_chain_t **parentp,
1158         hammer2_key_t lbase, int ioflag, int pblksize, int *errorp)
1159 {
1160         hammer2_chain_t *chain;
1161
1162         if (test_block_not_zeros(bp->b_data, pblksize)) {
1163                 chain = hammer2_assign_physical(trans, ip, parentp,
1164                                                 lbase, pblksize, errorp);
1165                 hammer2_write_bp(chain, bp, ioflag, pblksize, errorp);
1166                 if (chain)
1167                         hammer2_chain_unlock(chain);
1168         } else {
1169                 zero_write(bp, trans, ip, ipdata, parentp, lbase, errorp);
1170         }
1171 }
1172
/*
 * Test whether a block of data contains only zeros.
 *
 * Returns 0 when every long-sized word in the block is zero, 1 as soon
 * as a non-zero word is found.  The buffer is scanned a long at a time;
 * block sizes here are powers of two >= 1024, so buf is suitably
 * aligned and bytes is a multiple of sizeof(long).
 */
static
int
test_block_not_zeros(char *buf, size_t bytes)
{
	const char *pos = buf;
	const char *end = buf + bytes;

	while (pos < end) {
		if (*(const long *)pos != 0)
			return (1);
		pos += sizeof(long);
	}
	return (0);
}
1189
1190 /*
1191  * Function to "write" a block that contains only zeros.
1192  */
1193 static
1194 void
1195 zero_write(struct buf *bp, hammer2_trans_t *trans, hammer2_inode_t *ip,
1196         hammer2_inode_data_t *ipdata, hammer2_chain_t **parentp,
1197         hammer2_key_t lbase, int *errorp __unused)
1198 {
1199         hammer2_chain_t *parent;
1200         hammer2_chain_t *chain;
1201
1202         parent = hammer2_chain_lookup_init(*parentp, 0);
1203
1204         chain = hammer2_chain_lookup(&parent, lbase, lbase,
1205                                      HAMMER2_LOOKUP_NODATA);
1206         if (chain) {
1207                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
1208                         bzero(chain->data->ipdata.u.data,
1209                               HAMMER2_EMBEDDED_BYTES);
1210                 } else {
1211                         hammer2_chain_delete(trans, chain, 0);
1212                 }
1213                 hammer2_chain_unlock(chain);
1214         }
1215         hammer2_chain_lookup_done(parent);
1216 }
1217
/*
 * Function to write the data as it is, without performing any sort of
 * compression. This function is used in path without compression and
 * default zero-checking path.
 *
 * chain must be locked and already flagged MODIFIED with physical
 * storage assigned (see hammer2_assign_physical()).  Inode-embedded
 * data is copied directly; DATA chains are staged through a device
 * buffer and written according to ioflag.  *errorp is set on failure.
 */
static
void
hammer2_write_bp(hammer2_chain_t *chain, struct buf *bp, int ioflag,
				int pblksize, int *errorp)
{
	hammer2_off_t pbase;
	hammer2_off_t pmask;
	hammer2_off_t peof;
	struct buf *dbp;
	size_t boff;
	size_t psize;
	int error;
	/* preserve the existing check method when re-encoding bref.methods */
	int temp_check = HAMMER2_DEC_CHECK(chain->bref.methods);

	KKASSERT(chain->flags & HAMMER2_CHAIN_MODIFIED);

	switch(chain->bref.type) {
	case HAMMER2_BREF_TYPE_INODE:
		/* data embedded directly in the inode (DIRECTDATA) */
		KKASSERT(chain->data->ipdata.op_flags &
			 HAMMER2_OPFLAG_DIRECTDATA);
		KKASSERT(bp->b_loffset == 0);
		bcopy(bp->b_data, chain->data->ipdata.u.data,
		      HAMMER2_EMBEDDED_BYTES);
		error = 0;
		break;
	case HAMMER2_BREF_TYPE_DATA:
		/* translate chain offset to device block/offset-in-block */
		psize = hammer2_devblksize(chain->bytes);
		pmask = (hammer2_off_t)psize - 1;
		pbase = chain->bref.data_off & ~pmask;
		boff = chain->bref.data_off & (HAMMER2_OFF_MASK & pmask);
		peof = (pbase + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;

		/*
		 * Avoid the read-before-write when the logical block
		 * covers the whole device block.
		 *
		 * NOTE(review): on bread() failure we break immediately;
		 * verify DragonFly's bread releases the buffer itself on
		 * error, otherwise dbp would need a brelse here.
		 */
		if (psize == pblksize) {
			dbp = getblk(chain->hmp->devvp, pbase,
				     psize, 0, 0);
			error = 0;
		} else {
			error = bread(chain->hmp->devvp, pbase, psize, &dbp);
			if (error) {
				kprintf("hammer2: WRITE PATH: "
					"dbp bread error\n");
				break;
			}
		}

		chain->bref.methods = HAMMER2_ENC_COMP(HAMMER2_COMP_NONE) +
				      HAMMER2_ENC_CHECK(temp_check);
		bcopy(bp->b_data, dbp->b_data + boff, chain->bytes);
		
		/*
		 * Device buffer is now valid, chain is no
		 * longer in the initial state.
		 */
		atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);

		/* issue the device write per the caller's I/O mode */
		if (ioflag & IO_SYNC) {
			/*
			 * Synchronous I/O requested.
			 */
			bwrite(dbp);
		/*
		} else if ((ioflag & IO_DIRECT) && loff + n == pblksize) {
			bdwrite(dbp);
		*/
		} else if (ioflag & IO_ASYNC) {
			bawrite(dbp);
		} else if (hammer2_cluster_enable) {
			cluster_write(dbp, peof, HAMMER2_PBUFSIZE, 4/*XXX*/);
		} else {
			bdwrite(dbp);
		}
		break;
	default:
		panic("hammer2_write_bp: bad chain type %d\n",
		      chain->bref.type);
		/* NOT REACHED */
		error = 0;
		break;
	}
	*errorp = error;
}
1304
/*
 * Remount handler -- currently a stub that accepts any remount
 * request without applying it.
 * TODO(review): implement ro/rw transitions and option changes.
 */
static
int
hammer2_remount(struct mount *mp, char *path, struct vnode *devvp,
		struct ucred *cred)
{
	return (0);
}
1312
/*
 * Unmount a HAMMER2 PFS.  Flushes vnodes and any dirty topology, drops
 * the per-PFS structures, and -- when this was the last PFS on the
 * device (pmp_count reaches 0) -- tears down the master hammer2_mount:
 * super-root, device vnode, embedded fchain/vchain, and the write
 * thread.  Returns 0 or a vflush() error.
 */
static
int
hammer2_vfs_unmount(struct mount *mp, int mntflags)
{
	hammer2_pfsmount_t *pmp;
	hammer2_mount_t *hmp;
	hammer2_cluster_t *cluster;
	int flags;
	int error = 0;
	int ronly = ((mp->mnt_flag & MNT_RDONLY) != 0);
	int dumpcnt;
	struct vnode *devvp;

	pmp = MPTOPMP(mp);
	cluster = pmp->mount_cluster;
	hmp = cluster->hmp;
	flags = 0;

	if (mntflags & MNT_FORCE)
		flags |= FORCECLOSE;

	hammer2_mount_exlock(hmp);

	/*
	 * If mount initialization proceeded far enough we must flush
	 * its vnodes.
	 */
	if (pmp->iroot)
		error = vflush(mp, 0, flags);

	if (error) {
		hammer2_mount_unlock(hmp);
		return error;
	}

	lockmgr(&hammer2_mntlk, LK_EXCLUSIVE);
	--hmp->pmp_count;
	kprintf("hammer2_unmount hmp=%p pmpcnt=%d\n", hmp, hmp->pmp_count);

	/*
	 * Flush any left over chains.  The voldata lock is only used
	 * to synchronize against HAMMER2_CHAIN_MODIFIED_AUX.
	 *
	 * NOTE(review): hammer2_vfs_sync() is invoked twice here,
	 * presumably so the second pass picks up meta-data dirtied by
	 * the first -- confirm this is still required.
	 */
	hammer2_voldata_lock(hmp);
	if ((hmp->vchain.flags | hmp->fchain.flags) &
	    (HAMMER2_CHAIN_MODIFIED | HAMMER2_CHAIN_SUBMODIFIED)) {
		hammer2_voldata_unlock(hmp, 0);
		hammer2_vfs_sync(mp, MNT_WAIT);
		hammer2_vfs_sync(mp, MNT_WAIT);
	} else {
		hammer2_voldata_unlock(hmp, 0);
	}
	if (hmp->pmp_count == 0) {
		if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
					 HAMMER2_CHAIN_SUBMODIFIED)) {
			kprintf("hammer2_unmount: chains left over after "
				"final sync\n");
			if (hammer2_debug & 0x0010)
				Debugger("entered debugger");
		}
	}

	/*
	 * Cleanup the root and super-root chain elements (which should be
	 * clean).
	 */
	if (pmp->iroot) {
#if REPORT_REFS_ERRORS
		if (pmp->iroot->refs != 1)
			kprintf("PMP->IROOT %p REFS WRONG %d\n",
				pmp->iroot, pmp->iroot->refs);
#else
		KKASSERT(pmp->iroot->refs == 1);
#endif
		hammer2_inode_drop(pmp->iroot);     /* ref for pmp->iroot */
		pmp->iroot = NULL;
	}
	if (cluster->rchain) {
		atomic_clear_int(&cluster->rchain->flags,
				 HAMMER2_CHAIN_MOUNTED);
#if REPORT_REFS_ERRORS
		if (cluster->rchain->refs != 1)
			kprintf("PMP->RCHAIN %p REFS WRONG %d\n",
				cluster->rchain, cluster->rchain->refs);
#else
		KKASSERT(cluster->rchain->refs == 1);
#endif
		hammer2_chain_drop(cluster->rchain);
		cluster->rchain = NULL;
	}
	ccms_domain_uninit(&pmp->ccms_dom);

	/*
	 * Kill cluster controller
	 */
	kdmsg_iocom_uninit(&pmp->iocom);

	/*
	 * If no PFS's left drop the master hammer2_mount for the device.
	 */
	if (hmp->pmp_count == 0) {
		if (hmp->sroot) {
			hammer2_inode_drop(hmp->sroot);
			hmp->sroot = NULL;
		}
		if (hmp->schain) {
#if REPORT_REFS_ERRORS
			if (hmp->schain->refs != 1)
				kprintf("HMP->SCHAIN %p REFS WRONG %d\n",
					hmp->schain, hmp->schain->refs);
#else
			KKASSERT(hmp->schain->refs == 1);
#endif
			hammer2_chain_drop(hmp->schain);
			hmp->schain = NULL;
		}

		/*
		 * Finish up with the device vnode
		 */
		if ((devvp = hmp->devvp) != NULL) {
			vinvalbuf(devvp, (ronly ? 0 : V_SAVE), 0, 0);
			hmp->devvp = NULL;
			VOP_CLOSE(devvp, (ronly ? FREAD : FREAD|FWRITE));
			vrele(devvp);
			devvp = NULL;
		}

		/*
		 * Final drop of embedded freemap root chain to clean up
		 * fchain.core (fchain structure is not flagged ALLOCATED
		 * so it is cleaned out and then left to rot).
		 */
		hammer2_chain_drop(&hmp->fchain);

		/*
		 * Final drop of embedded volume root chain to clean up
		 * vchain.core (vchain structure is not flagged ALLOCATED
		 * so it is cleaned out and then left to rot).
		 *
		 * NOTE(review): the hammer2_dump_chain() call below looks
		 * like leftover debugging output on every final unmount.
		 */
		dumpcnt = 50;
		hammer2_dump_chain(&hmp->vchain, 0, &dumpcnt);
		hammer2_mount_unlock(hmp);
		hammer2_chain_drop(&hmp->vchain);
	} else {
		hammer2_mount_unlock(hmp);
	}

	pmp->mp = NULL;
	mp->mnt_data = NULL;

	pmp->mount_cluster = NULL;
	pmp->cluster = NULL;		/* XXX */

	kmalloc_destroy(&pmp->mmsg);
	kmalloc_destroy(&pmp->minode);

	cluster->hmp = NULL;

	kfree(cluster, M_HAMMER2);
	kfree(pmp, M_HAMMER2);
	if (hmp->pmp_count == 0) {
		/*
		 * Last PFS gone: shut down the write thread (it sets
		 * wthread_destroy to -1 when it exits) and free hmp.
		 */
		mtx_lock(&hmp->wthread_mtx);
		hmp->wthread_destroy = 1;
		wakeup(&hmp->wthread_bioq);
		while (hmp->wthread_destroy != -1) {
			mtxsleep(&hmp->wthread_destroy, &hmp->wthread_mtx, 0,
				"umount-sleep", 0);
		}
		mtx_unlock(&hmp->wthread_mtx);

		TAILQ_REMOVE(&hammer2_mntlist, hmp, mntentry);
		kmalloc_destroy(&hmp->mchain);
		kfree(hmp, M_HAMMER2);
	}
	lockmgr(&hammer2_mntlk, LK_RELEASE);

	return (error);
}
1492
/*
 * VFS VGET: look up a vnode by inode number.
 *
 * Not implemented for hammer2 yet; logs the call and fails with
 * EOPNOTSUPP.  *vpp is left untouched.
 */
static
int
hammer2_vfs_vget(struct mount *mp, struct vnode *dvp,
             ino_t ino, struct vnode **vpp)
{
        kprintf("hammer2_vget\n");
        return (EOPNOTSUPP);
}
1501
1502 static
1503 int
1504 hammer2_vfs_root(struct mount *mp, struct vnode **vpp)
1505 {
1506         hammer2_pfsmount_t *pmp;
1507         hammer2_chain_t *parent;
1508         int error;
1509         struct vnode *vp;
1510
1511         pmp = MPTOPMP(mp);
1512         if (pmp->iroot == NULL) {
1513                 *vpp = NULL;
1514                 error = EINVAL;
1515         } else {
1516                 parent = hammer2_inode_lock_sh(pmp->iroot);
1517                 vp = hammer2_igetv(pmp->iroot, &error);
1518                 hammer2_inode_unlock_sh(pmp->iroot, parent);
1519                 *vpp = vp;
1520                 if (vp == NULL)
1521                         kprintf("vnodefail\n");
1522         }
1523
1524         return (error);
1525 }
1526
1527 /*
1528  * Filesystem status
1529  *
1530  * XXX incorporate ipdata->inode_quota and data_quota
1531  */
1532 static
1533 int
1534 hammer2_vfs_statfs(struct mount *mp, struct statfs *sbp, struct ucred *cred)
1535 {
1536         hammer2_pfsmount_t *pmp;
1537         hammer2_mount_t *hmp;
1538
1539         pmp = MPTOPMP(mp);
1540         hmp = MPTOHMP(mp);
1541
1542         mp->mnt_stat.f_files = pmp->inode_count;
1543         mp->mnt_stat.f_ffree = 0;
1544         mp->mnt_stat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1545         mp->mnt_stat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1546         mp->mnt_stat.f_bavail = mp->mnt_stat.f_bfree;
1547
1548         *sbp = mp->mnt_stat;
1549         return (0);
1550 }
1551
1552 static
1553 int
1554 hammer2_vfs_statvfs(struct mount *mp, struct statvfs *sbp, struct ucred *cred)
1555 {
1556         hammer2_pfsmount_t *pmp;
1557         hammer2_mount_t *hmp;
1558
1559         pmp = MPTOPMP(mp);
1560         hmp = MPTOHMP(mp);
1561
1562         mp->mnt_vstat.f_bsize = HAMMER2_PBUFSIZE;
1563         mp->mnt_vstat.f_files = pmp->inode_count;
1564         mp->mnt_vstat.f_ffree = 0;
1565         mp->mnt_vstat.f_blocks = hmp->voldata.allocator_size / HAMMER2_PBUFSIZE;
1566         mp->mnt_vstat.f_bfree =  hmp->voldata.allocator_free / HAMMER2_PBUFSIZE;
1567         mp->mnt_vstat.f_bavail = mp->mnt_vstat.f_bfree;
1568
1569         *sbp = mp->mnt_vstat;
1570         return (0);
1571 }
1572
1573 /*
1574  * Sync the entire filesystem; this is called from the filesystem syncer
1575  * process periodically and whenever a user calls sync(1) on the hammer
1576  * mountpoint.
1577  *
1578  * Currently is actually called from the syncer! \o/
1579  *
1580  * This task will have to snapshot the state of the dirty inode chain.
1581  * From that, it will have to make sure all of the inodes on the dirty
1582  * chain have IO initiated. We make sure that io is initiated for the root
1583  * block.
1584  *
1585  * If waitfor is set, we wait for media to acknowledge the new rootblock.
1586  *
1587  * THINKS: side A vs side B, to have sync not stall all I/O?
1588  */
static
int
hammer2_vfs_sync(struct mount *mp, int waitfor)
{
        struct hammer2_sync_info info;
        hammer2_pfsmount_t *pmp;
        hammer2_cluster_t *cluster;
        hammer2_mount_t *hmp;
        int flags;
        int error;
        int i;

        pmp = MPTOPMP(mp);

        /*
         * We can't acquire locks on existing vnodes while in a transaction
         * without risking a deadlock.  This assumes that vfsync() can be
         * called without the vnode locked (which it can in DragonFly).
         * Otherwise we'd have to implement a multi-pass or flag the lock
         * failures and retry.
         */
        /*flags = VMSC_GETVP;*/
        flags = 0;
        if (waitfor & MNT_LAZY)
                flags |= VMSC_ONEPASS;

        /*
         * The entire sync runs inside a single flush transaction.
         */
        hammer2_trans_init(&info.trans, pmp, HAMMER2_TRANS_ISFLUSH);

        /*
         * First pass: non-blocking scan of all vnodes.  If that completes
         * without error and the caller wants synchronous semantics, do a
         * second, potentially blocking, pass.
         */
        info.error = 0;
        info.waitfor = MNT_NOWAIT;
        vmntvnodescan(mp, flags | VMSC_NOWAIT,
                      hammer2_sync_scan1,
                      hammer2_sync_scan2, &info);
        if (info.error == 0 && (waitfor & MNT_WAIT)) {
                info.waitfor = waitfor;
                    vmntvnodescan(mp, flags,
                                  hammer2_sync_scan1,
                                  hammer2_sync_scan2, &info);

        }
#if 0
        if (waitfor == MNT_WAIT) {
                /* XXX */
        } else {
                /* XXX */
        }
#endif

        cluster = pmp->cluster;
        hmp = cluster->hmp;

        /*
         * Flush the volume root chain topology if anything in it was
         * modified by the vnode scans above.
         */
        hammer2_chain_lock(&hmp->vchain, HAMMER2_RESOLVE_ALWAYS);
        if (hmp->vchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                  HAMMER2_CHAIN_SUBMODIFIED)) {
                hammer2_chain_flush(&info.trans, &hmp->vchain);
        }
        hammer2_chain_unlock(&hmp->vchain);

#if 1
        /*
         * Rollup flush.  The fsyncs above basically just flushed
         * data blocks.  The flush below gets all the meta-data.
         */
        hammer2_chain_lock(&hmp->fchain, HAMMER2_RESOLVE_ALWAYS);
        if (hmp->fchain.flags & (HAMMER2_CHAIN_MODIFIED |
                                 HAMMER2_CHAIN_SUBMODIFIED)) {
                /* this will modify vchain as a side effect */
                hammer2_chain_flush(&info.trans, &hmp->fchain);
        }
        hammer2_chain_unlock(&hmp->fchain);
#endif


        error = 0;

        /*
         * We can't safely flush the volume header until we have
         * flushed any device buffers which have built up.
         *
         * XXX this isn't being incremental
         */
        vn_lock(hmp->devvp, LK_EXCLUSIVE | LK_RETRY);
        error = VOP_FSYNC(hmp->devvp, MNT_WAIT, 0);
        vn_unlock(hmp->devvp);

        /*
         * The flush code sets CHAIN_VOLUMESYNC to indicate that the
         * volume header needs synchronization via hmp->volsync.
         *
         * XXX synchronize the flag & data with only this flush XXX
         */
        if (error == 0 && (hmp->vchain.flags & HAMMER2_CHAIN_VOLUMESYNC)) {
                struct buf *bp;

                /*
                 * Synchronize the disk before flushing the volume
                 * header.  BUF_CMD_FLUSH issues a media flush and
                 * biowait() blocks until it completes.
                 */
                bp = getpbuf(NULL);
                bp->b_bio1.bio_offset = 0;
                bp->b_bufsize = 0;
                bp->b_bcount = 0;
                bp->b_cmd = BUF_CMD_FLUSH;
                bp->b_bio1.bio_done = biodone_sync;
                bp->b_bio1.bio_flags |= BIO_SYNC;
                vn_strategy(hmp->devvp, &bp->b_bio1);
                biowait(&bp->b_bio1, "h2vol");
                relpbuf(bp, NULL);

                /*
                 * Then we can safely flush the version of the volume header
                 * synchronized by the flush code.  The headers are written
                 * round-robin: advance to the next slot, wrapping at
                 * HAMMER2_NUM_VOLHDRS and falling back to slot 0 when the
                 * slot would lie beyond the (small) volume's size.
                 */
                i = hmp->volhdrno + 1;
                if (i >= HAMMER2_NUM_VOLHDRS)
                        i = 0;
                if (i * HAMMER2_ZONE_BYTES64 + HAMMER2_SEGSIZE >
                    hmp->volsync.volu_size) {
                        i = 0;
                }
                kprintf("sync volhdr %d %jd\n",
                        i, (intmax_t)hmp->volsync.volu_size);
                bp = getblk(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
                            HAMMER2_PBUFSIZE, 0, 0);
                /* clear VOLUMESYNC before the write so a new flush re-arms */
                atomic_clear_int(&hmp->vchain.flags, HAMMER2_CHAIN_VOLUMESYNC);
                bcopy(&hmp->volsync, bp->b_data, HAMMER2_PBUFSIZE);
                bawrite(bp);
                hmp->volhdrno = i;
        }
        hammer2_trans_done(&info.trans);
        return (error);
}
1721
1722 /*
1723  * Sync passes.
1724  *
1725  * NOTE: We don't test SUBMODIFIED or MOVED here because the fsync code
1726  *       won't flush on those flags.  The syncer code above will do a
1727  *       general meta-data flush globally that will catch these flags.
1728  */
1729 static int
1730 hammer2_sync_scan1(struct mount *mp, struct vnode *vp, void *data)
1731 {
1732         hammer2_inode_t *ip;
1733
1734         ip = VTOI(vp);
1735         if (vp->v_type == VNON || ip == NULL ||
1736             ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
1737              RB_EMPTY(&vp->v_rbdirty_tree))) {
1738                 return(-1);
1739         }
1740         return(0);
1741 }
1742
1743 static int
1744 hammer2_sync_scan2(struct mount *mp, struct vnode *vp, void *data)
1745 {
1746         struct hammer2_sync_info *info = data;
1747         hammer2_inode_t *ip;
1748         hammer2_chain_t *parent;
1749         int error;
1750
1751         ip = VTOI(vp);
1752         if (vp->v_type == VNON || vp->v_type == VBAD ||
1753             ((ip->flags & HAMMER2_INODE_MODIFIED) == 0 &&
1754              RB_EMPTY(&vp->v_rbdirty_tree))) {
1755                 return(0);
1756         }
1757
1758         /*
1759          * VOP_FSYNC will start a new transaction so replicate some code
1760          * here to do it inline (see hammer2_vop_fsync()).
1761          *
1762          * WARNING: The vfsync interacts with the buffer cache and might
1763          *          block, we can't hold the inode lock at that time.
1764          */
1765         atomic_clear_int(&ip->flags, HAMMER2_INODE_MODIFIED);
1766         if (ip->vp)
1767                 vfsync(ip->vp, MNT_NOWAIT, 1, NULL, NULL);
1768         parent = hammer2_inode_lock_ex(ip);
1769         hammer2_chain_flush(&info->trans, parent);
1770         hammer2_inode_unlock_ex(ip, parent);
1771         error = 0;
1772 #if 0
1773         error = VOP_FSYNC(vp, MNT_NOWAIT, 0);
1774 #endif
1775         if (error)
1776                 info->error = error;
1777         return(0);
1778 }
1779
/*
 * Convert a vnode to an NFS file handle.
 *
 * Stub: returns success without filling in *fhp.  NFS export of
 * hammer2 is not actually implemented yet.
 */
static
int
hammer2_vfs_vptofh(struct vnode *vp, struct fid *fhp)
{
        return (0);
}
1786
/*
 * Convert an NFS file handle back to a vnode.
 *
 * Stub: returns success without setting *vpp.  NFS export of hammer2
 * is not actually implemented yet.
 */
static
int
hammer2_vfs_fhtovp(struct mount *mp, struct vnode *rootvp,
               struct fid *fhp, struct vnode **vpp)
{
        return (0);
}
1794
/*
 * Check whether an NFS client address is allowed to mount this
 * filesystem.
 *
 * Stub: returns success without setting *exflagsp or *credanonp.
 * Export checking is not actually implemented yet.
 */
static
int
hammer2_vfs_checkexp(struct mount *mp, struct sockaddr *nam,
                 int *exflagsp, struct ucred **credanonp)
{
        return (0);
}
1802
1803 /*
1804  * Support code for hammer2_mount().  Read, verify, and install the volume
1805  * header into the HMP
1806  *
1807  * XXX read four volhdrs and use the one with the highest TID whos CRC
1808  *     matches.
1809  *
1810  * XXX check iCRCs.
1811  *
1812  * XXX For filesystems w/ less than 4 volhdrs, make sure to not write to
1813  *     nonexistant locations.
1814  *
1815  * XXX Record selected volhdr and ring updates to each of 4 volhdrs
1816  */
1817 static
1818 int
1819 hammer2_install_volume_header(hammer2_mount_t *hmp)
1820 {
1821         hammer2_volume_data_t *vd;
1822         struct buf *bp;
1823         hammer2_crc32_t crc0, crc, bcrc0, bcrc;
1824         int error_reported;
1825         int error;
1826         int valid;
1827         int i;
1828
1829         error_reported = 0;
1830         error = 0;
1831         valid = 0;
1832         bp = NULL;
1833
1834         /*
1835          * There are up to 4 copies of the volume header (syncs iterate
1836          * between them so there is no single master).  We don't trust the
1837          * volu_size field so we don't know precisely how large the filesystem
1838          * is, so depend on the OS to return an error if we go beyond the
1839          * block device's EOF.
1840          */
1841         for (i = 0; i < HAMMER2_NUM_VOLHDRS; i++) {
1842                 error = bread(hmp->devvp, i * HAMMER2_ZONE_BYTES64,
1843                               HAMMER2_VOLUME_BYTES, &bp);
1844                 if (error) {
1845                         brelse(bp);
1846                         bp = NULL;
1847                         continue;
1848                 }
1849
1850                 vd = (struct hammer2_volume_data *) bp->b_data;
1851                 if ((vd->magic != HAMMER2_VOLUME_ID_HBO) &&
1852                     (vd->magic != HAMMER2_VOLUME_ID_ABO)) {
1853                         brelse(bp);
1854                         bp = NULL;
1855                         continue;
1856                 }
1857
1858                 if (vd->magic == HAMMER2_VOLUME_ID_ABO) {
1859                         /* XXX: Reversed-endianness filesystem */
1860                         kprintf("hammer2: reverse-endian filesystem detected");
1861                         brelse(bp);
1862                         bp = NULL;
1863                         continue;
1864                 }
1865
1866                 crc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT0];
1867                 crc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC0_OFF,
1868                                       HAMMER2_VOLUME_ICRC0_SIZE);
1869                 bcrc = vd->icrc_sects[HAMMER2_VOL_ICRC_SECT1];
1870                 bcrc0 = hammer2_icrc32(bp->b_data + HAMMER2_VOLUME_ICRC1_OFF,
1871                                        HAMMER2_VOLUME_ICRC1_SIZE);
1872                 if ((crc0 != crc) || (bcrc0 != bcrc)) {
1873                         kprintf("hammer2 volume header crc "
1874                                 "mismatch copy #%d %08x/%08x\n",
1875                                 i, crc0, crc);
1876                         error_reported = 1;
1877                         brelse(bp);
1878                         bp = NULL;
1879                         continue;
1880                 }
1881                 if (valid == 0 || hmp->voldata.mirror_tid < vd->mirror_tid) {
1882                         valid = 1;
1883                         hmp->voldata = *vd;
1884                         hmp->volhdrno = i;
1885                 }
1886                 brelse(bp);
1887                 bp = NULL;
1888         }
1889         if (valid) {
1890                 hmp->volsync = hmp->voldata;
1891                 error = 0;
1892                 if (error_reported || bootverbose || 1) { /* 1/DEBUG */
1893                         kprintf("hammer2: using volume header #%d\n",
1894                                 hmp->volhdrno);
1895                 }
1896         } else {
1897                 error = EINVAL;
1898                 kprintf("hammer2: no valid volume headers found!\n");
1899         }
1900         return (error);
1901 }
1902
1903 /*
1904  * Reconnect using the passed file pointer.  The caller must ref the
1905  * fp for us.
1906  */
1907 void
1908 hammer2_cluster_reconnect(hammer2_pfsmount_t *pmp, struct file *fp)
1909 {
1910         hammer2_inode_data_t *ipdata;
1911         hammer2_chain_t *parent;
1912         hammer2_mount_t *hmp;
1913         size_t name_len;
1914
1915         hmp = pmp->mount_cluster->hmp;
1916
1917         /*
1918          * Closes old comm descriptor, kills threads, cleans up
1919          * states, then installs the new descriptor and creates
1920          * new threads.
1921          */
1922         kdmsg_iocom_reconnect(&pmp->iocom, fp, "hammer2");
1923
1924         /*
1925          * Setup LNK_CONN fields for autoinitiated state machine
1926          */
1927         parent = hammer2_inode_lock_ex(pmp->iroot);
1928         ipdata = &parent->data->ipdata;
1929         pmp->iocom.auto_lnk_conn.pfs_clid = ipdata->pfs_clid;
1930         pmp->iocom.auto_lnk_conn.pfs_fsid = ipdata->pfs_fsid;
1931         pmp->iocom.auto_lnk_conn.pfs_type = ipdata->pfs_type;
1932         pmp->iocom.auto_lnk_conn.proto_version = DMSG_SPAN_PROTO_1;
1933         pmp->iocom.auto_lnk_conn.peer_type = hmp->voldata.peer_type;
1934
1935         /*
1936          * Filter adjustment.  Clients do not need visibility into other
1937          * clients (otherwise millions of clients would present a serious
1938          * problem).  The fs_label also serves to restrict the namespace.
1939          */
1940         pmp->iocom.auto_lnk_conn.peer_mask = 1LLU << HAMMER2_PEER_HAMMER2;
1941         pmp->iocom.auto_lnk_conn.pfs_mask = (uint64_t)-1;
1942         switch (ipdata->pfs_type) {
1943         case DMSG_PFSTYPE_CLIENT:
1944                 pmp->iocom.auto_lnk_conn.peer_mask &=
1945                                 ~(1LLU << DMSG_PFSTYPE_CLIENT);
1946                 break;
1947         default:
1948                 break;
1949         }
1950
1951         name_len = ipdata->name_len;
1952         if (name_len >= sizeof(pmp->iocom.auto_lnk_conn.fs_label))
1953                 name_len = sizeof(pmp->iocom.auto_lnk_conn.fs_label) - 1;
1954         bcopy(ipdata->filename,
1955               pmp->iocom.auto_lnk_conn.fs_label,
1956               name_len);
1957         pmp->iocom.auto_lnk_conn.fs_label[name_len] = 0;
1958
1959         /*
1960          * Setup LNK_SPAN fields for autoinitiated state machine
1961          */
1962         pmp->iocom.auto_lnk_span.pfs_clid = ipdata->pfs_clid;
1963         pmp->iocom.auto_lnk_span.pfs_fsid = ipdata->pfs_fsid;
1964         pmp->iocom.auto_lnk_span.pfs_type = ipdata->pfs_type;
1965         pmp->iocom.auto_lnk_span.peer_type = hmp->voldata.peer_type;
1966         pmp->iocom.auto_lnk_span.proto_version = DMSG_SPAN_PROTO_1;
1967         name_len = ipdata->name_len;
1968         if (name_len >= sizeof(pmp->iocom.auto_lnk_span.fs_label))
1969                 name_len = sizeof(pmp->iocom.auto_lnk_span.fs_label) - 1;
1970         bcopy(ipdata->filename,
1971               pmp->iocom.auto_lnk_span.fs_label,
1972               name_len);
1973         pmp->iocom.auto_lnk_span.fs_label[name_len] = 0;
1974         hammer2_inode_unlock_ex(pmp->iroot, parent);
1975
1976         kdmsg_iocom_autoinitiate(&pmp->iocom, hammer2_autodmsg);
1977 }
1978
1979 static int
1980 hammer2_rcvdmsg(kdmsg_msg_t *msg)
1981 {
1982         switch(msg->any.head.cmd & DMSGF_TRANSMASK) {
1983         case DMSG_DBG_SHELL:
1984                 /*
1985                  * (non-transaction)
1986                  * Execute shell command (not supported atm)
1987                  */
1988                 kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
1989                 break;
1990         case DMSG_DBG_SHELL | DMSGF_REPLY:
1991                 /*
1992                  * (non-transaction)
1993                  */
1994                 if (msg->aux_data) {
1995                         msg->aux_data[msg->aux_size - 1] = 0;
1996                         kprintf("HAMMER2 DBG: %s\n", msg->aux_data);
1997                 }
1998                 break;
1999         default:
2000                 /*
2001                  * Unsupported message received.  We only need to
2002                  * reply if it's a transaction in order to close our end.
2003                  * Ignore any one-way messages are any further messages
2004                  * associated with the transaction.
2005                  *
2006                  * NOTE: This case also includes DMSG_LNK_ERROR messages
2007                  *       which might be one-way, replying to those would
2008                  *       cause an infinite ping-pong.
2009                  */
2010                 if (msg->any.head.cmd & DMSGF_CREATE)
2011                         kdmsg_msg_reply(msg, DMSG_ERR_NOSUPP);
2012                 break;
2013         }
2014         return(0);
2015 }
2016
2017 /*
2018  * This function is called after KDMSG has automatically handled processing
2019  * of a LNK layer message (typically CONN, SPAN, or CIRC).
2020  *
2021  * We tag off the LNK_CONN to trigger our LNK_VOLCONF messages which
2022  * advertises all available hammer2 super-root volumes.
2023  */
static void
hammer2_autodmsg(kdmsg_msg_t *msg)
{
        hammer2_pfsmount_t *pmp = msg->iocom->handle;
        hammer2_mount_t *hmp = pmp->mount_cluster->hmp;
        int copyid;

        /*
         * We only care about replies to our LNK_CONN auto-request.  kdmsg
         * has already processed the reply, we use this callback as a shim
         * to know when we can advertise available super-root volumes.
         */
        if ((msg->any.head.cmd & DMSGF_TRANSMASK) !=
            (DMSG_LNK_CONN | DMSGF_CREATE | DMSGF_REPLY) ||
            msg->state == NULL) {
                return;
        }

        kprintf("LNK_CONN REPLY RECEIVED CMD %08x\n", msg->any.head.cmd);

        if (msg->any.head.cmd & DMSGF_CREATE) {
                kprintf("HAMMER2: VOLDATA DUMP\n");

                /*
                 * Dump the configuration stored in the volume header.
                 * Only slots with a non-zero copyid are advertised.
                 */
                hammer2_voldata_lock(hmp);
                for (copyid = 0; copyid < HAMMER2_COPYID_COUNT; ++copyid) {
                        if (hmp->voldata.copyinfo[copyid].copyid == 0)
                                continue;
                        hammer2_volconf_update(pmp, copyid);
                }
                hammer2_voldata_unlock(hmp, 0);
        }
        /* remote half of the transaction was deleted but ours wasn't */
        if ((msg->any.head.cmd & DMSGF_DELETE) &&
            msg->state && (msg->state->txcmd & DMSGF_DELETE) == 0) {
                kprintf("HAMMER2: CONN WAS TERMINATED\n");
        }
}
2063
2064 /*
2065  * Volume configuration updates are passed onto the userland service
2066  * daemon via the open LNK_CONN transaction.
2067  */
2068 void
2069 hammer2_volconf_update(hammer2_pfsmount_t *pmp, int index)
2070 {
2071         hammer2_mount_t *hmp = pmp->mount_cluster->hmp;
2072         kdmsg_msg_t *msg;
2073
2074         /* XXX interlock against connection state termination */
2075         kprintf("volconf update %p\n", pmp->iocom.conn_state);
2076         if (pmp->iocom.conn_state) {
2077                 kprintf("TRANSMIT VOLCONF VIA OPEN CONN TRANSACTION\n");
2078                 msg = kdmsg_msg_alloc_state(pmp->iocom.conn_state,
2079                                             DMSG_LNK_VOLCONF, NULL, NULL);
2080                 msg->any.lnk_volconf.copy = hmp->voldata.copyinfo[index];
2081                 msg->any.lnk_volconf.mediaid = hmp->voldata.fsid;
2082                 msg->any.lnk_volconf.index = index;
2083                 kdmsg_msg_write(msg);
2084         }
2085 }
2086
2087 void
2088 hammer2_dump_chain(hammer2_chain_t *chain, int tab, int *countp)
2089 {
2090         hammer2_chain_t *scan;
2091
2092         --*countp;
2093         if (*countp == 0) {
2094                 kprintf("%*.*s...\n", tab, tab, "");
2095                 return;
2096         }
2097         if (*countp < 0)
2098                 return;
2099         kprintf("%*.*schain[%d] %p.%d [%08x][core=%p] (%s) dl=%p dt=%s refs=%d",
2100                 tab, tab, "",
2101                 chain->index, chain, chain->bref.type, chain->flags,
2102                 chain->core,
2103                 ((chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
2104                 chain->data) ?  (char *)chain->data->ipdata.filename : "?"),
2105                 chain->next_parent,
2106                 (chain->delete_tid == HAMMER2_MAX_TID ? "max" : "fls"),
2107                 chain->refs);
2108         if (chain->core == NULL || RB_EMPTY(&chain->core->rbtree))
2109                 kprintf("\n");
2110         else
2111                 kprintf(" {\n");
2112         RB_FOREACH(scan, hammer2_chain_tree, &chain->core->rbtree) {
2113                 hammer2_dump_chain(scan, tab + 4, countp);
2114         }
2115         if (chain->core && !RB_EMPTY(&chain->core->rbtree)) {
2116                 if (chain->bref.type == HAMMER2_BREF_TYPE_INODE && chain->data)
2117                         kprintf("%*.*s}(%s)\n", tab, tab, "",
2118                                 chain->data->ipdata.filename);
2119                 else
2120                         kprintf("%*.*s}\n", tab, tab, "");
2121         }
2122 }