Merge branches 'hammer2' and 'master' of ssh://crater.dragonflybsd.org/repository...
author Matthew Dillon <dillon@apollo.backplane.com>
Wed, 11 Apr 2012 18:30:08 +0000 (11:30 -0700)
committer Matthew Dillon <dillon@apollo.backplane.com>
Wed, 11 Apr 2012 18:30:08 +0000 (11:30 -0700)
36 files changed:
lib/libstand/hammer2.c [new file with mode: 0644]
sbin/hammer2/Makefile [new file with mode: 0644]
sbin/hammer2/cmd_helper.c [new file with mode: 0644]
sbin/hammer2/cmd_pfs.c [new file with mode: 0644]
sbin/hammer2/cmd_remote.c [new file with mode: 0644]
sbin/hammer2/cmd_snapshot.c [new file with mode: 0644]
sbin/hammer2/hammer2.h [new file with mode: 0644]
sbin/hammer2/main.c [new file with mode: 0644]
sbin/hammer2/subs.c [new file with mode: 0644]
sbin/mount_hammer2/Makefile [new file with mode: 0644]
sbin/mount_hammer2/mount_hammer2.c [new file with mode: 0644]
sbin/newfs_hammer2/Makefile [new file with mode: 0644]
sbin/newfs_hammer2/newfs_hammer2.8 [new file with mode: 0644]
sbin/newfs_hammer2/newfs_hammer2.c [new file with mode: 0644]
sys/vfs/hammer2/CHANGES [new file with mode: 0644]
sys/vfs/hammer2/DESIGN [new file with mode: 0644]
sys/vfs/hammer2/Makefile [new file with mode: 0644]
sys/vfs/hammer2/TODO [new file with mode: 0644]
sys/vfs/hammer2/donew [new file with mode: 0755]
sys/vfs/hammer2/donew2 [new file with mode: 0755]
sys/vfs/hammer2/dossd [new file with mode: 0755]
sys/vfs/hammer2/dossd2 [new file with mode: 0755]
sys/vfs/hammer2/dotest [new file with mode: 0755]
sys/vfs/hammer2/hammer2.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_chain.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_disk.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_freemap.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_icrc.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_inode.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_ioctl.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_ioctl.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_mount.h [new file with mode: 0644]
sys/vfs/hammer2/hammer2_subr.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_vfsops.c [new file with mode: 0644]
sys/vfs/hammer2/hammer2_vnops.c [new file with mode: 0644]
sys/vfs/hammer2/mkvntest [new file with mode: 0755]

diff --git a/lib/libstand/hammer2.c b/lib/libstand/hammer2.c
new file mode 100644 (file)
index 0000000..0d4cf4e
--- /dev/null
@@ -0,0 +1,177 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/uuid.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <unistd.h>
+#include <fcntl.h>
+#include <string.h>
+#include <strings.h>
+#include <errno.h>
+
+#include <hammer2/hammer2_disk.h>
+
+/*
+ * Per-filesystem state for this reader: the open device descriptor and
+ * the super-root blockref that hinit() copies out of the volume header.
+ */
+struct hammer2 {
+       int                             fd;     /* Device fd */
+       struct hammer2_blockref         sroot;  /* Superroot blockref */
+};
+
+/*
+ * An inode as handled by this reader: the raw inode data plus a slot for
+ * the inode's disk offset.  NOTE(review): nothing in this file assigns
+ * doff yet.
+ */
+struct inode {
+       struct hammer2_inode_data       dat;    /* raw inode data */
+       off_t                           doff;   /* disk inode offset */
+};
+
+/*
+ * Presumably meant to translate a blockref into a disk offset (going by
+ * the name and signature only -- TODO confirm).
+ *
+ * NOTE(review): unimplemented stub.  The body is empty, so control falls
+ * off the end without returning a value; the result must not be used.
+ * Nothing in this file calls it.
+ */
+off_t blockoff(ref)
+       struct hammer2_blockref ref;
+{
+
+}
+
+/*
+ * Read the volume header at offset 0 of hfs->fd and validate its magic.
+ *
+ * On success the super-root blockref is copied into hfs->sroot and 0 is
+ * returned.  Returns -1 if the header cannot be read in full or if the
+ * magic does not match HAMMER2_VOLUME_ID_HBO.
+ */
+int hinit(hfs)
+       struct hammer2 *hfs;
+{
+       struct hammer2_volume_data volhdr;
+       ssize_t rc;
+
+       /*
+        * A failed or short read leaves volhdr (partly) uninitialized, so
+        * the magic must not be examined in that case.
+        */
+       rc = pread(hfs->fd, &volhdr, HAMMER2_VOLUME_SIZE, 0);
+       if (rc != (ssize_t)HAMMER2_VOLUME_SIZE)
+               return (-1);
+       if (volhdr.magic != HAMMER2_VOLUME_ID_HBO)
+               return (-1);
+       printf("Valid HAMMER2 filesystem\n");
+
+       hfs->sroot = volhdr.sroot_blockref;
+       return (0);
+}
+
+/*
+ * Read [off, off+len) from inode ino rather than from disk
+ * offsets; correctly decodes blockrefs/indirs/...
+ *
+ * Not implemented yet.  hlookup1() compares the result against the
+ * number of bytes it asked for, so report failure explicitly with -1
+ * instead of falling off the end and handing back an indeterminate
+ * value.
+ */
+int shread(hfs, ino, buf, off, len)
+       struct hammer2 *hfs;
+       struct inode *ino;
+       char *buf;
+       off_t off;
+       size_t len;
+{
+       (void)hfs;
+       (void)ino;
+       (void)buf;
+       (void)off;
+       (void)len;
+
+       return (-1);
+}
+
+/*
+ * Scan directory inode ino for an entry whose filename equals name.
+ *
+ * The directory is walked as consecutive hammer2_inode_data records read
+ * through shread(); a record that cannot be read in full is skipped.
+ *
+ * Returns a pointer to a function-static struct inode holding the match
+ * (overwritten by the next call, so not reentrant), or NULL when no
+ * record matches.
+ */
+struct inode *hlookup1(hfs, ino, name)
+       struct hammer2 *hfs;
+       struct inode *ino;
+       char *name;
+{
+       static struct inode filino;
+       off_t off;
+       int rc;
+
+       bzero(&filino, sizeof(struct inode));
+
+       for (off = 0;
+            off < ino->dat.size;
+            off += sizeof(struct hammer2_inode_data))
+       {
+               /* shread() takes a char *, not a struct pointer */
+               rc = shread(hfs, ino, (char *)&filino.dat, off,
+                           sizeof(struct hammer2_inode_data));
+               if (rc != sizeof(struct hammer2_inode_data))
+                       continue;
+               /*
+                * Pass the array itself (decays to a pointer to its first
+                * byte) rather than a pointer-to-array, which is not a
+                * type strcmp() accepts.
+                */
+               if (strcmp(name, (const char *)filino.dat.filename) == 0)
+                       return (&filino);
+       }
+
+       return (NULL);
+}
+
+/*
+ * Resolve a full path to an inode.
+ * Name is of form /SUPERROOT/a/b/c/file
+ *
+ * Not implemented yet.  The caller tests the result against NULL, so
+ * report "not found" explicitly rather than falling off the end and
+ * returning an indeterminate pointer.
+ */
+struct inode *hlookup(hfs, name)
+       struct hammer2 *hfs;
+       char *name;
+{
+       (void)hfs;
+       (void)name;
+
+       return (NULL);
+}
+
+/*
+ * Fill *sb with stat information for ino.
+ *
+ * Only st_size is populated so far (from the inode's size field); every
+ * other field is zeroed so that callers never read indeterminate values.
+ * hfs is currently unused.
+ */
+void hstat(hfs, ino, sb)
+       struct hammer2 *hfs;
+       struct inode *ino;
+       struct stat *sb;
+{
+       (void)hfs;
+
+       bzero(sb, sizeof(*sb));
+       sb->st_size = (off_t)ino->dat.size;
+}
+
+/*
+ * Test driver: open the device named by argv[1], validate it with
+ * hinit(), then look up every remaining argument with hlookup() and
+ * print its name and the size reported by hstat(), one per line.
+ *
+ * Returns 0 on completion.  Exits with status 1 on a usage error, if the
+ * device cannot be opened, or if hinit() rejects it.
+ */
+int main(argc, argv)
+       int argc;
+       char *argv[];
+{
+       struct hammer2 hammer2;
+       struct inode *ino;
+       struct stat sb;
+       int i;
+
+       if (argc < 2) {
+               fprintf(stderr, "usage: hammer2 <dev>\n");
+               exit(1);
+       }
+
+       hammer2.fd = open(argv[1], O_RDONLY);
+       if (hammer2.fd < 0) {
+               fprintf(stderr, "unable to open %s\n", argv[1]);
+               exit(1);
+       }
+
+       if (hinit(&hammer2)) {
+               fprintf(stderr, "invalid fs\n");
+               close(hammer2.fd);
+               exit(1);
+       }
+
+       for (i = 2; i < argc; i++) {
+               ino = hlookup(&hammer2, argv[i]);
+               if (ino == NULL) {
+                       fprintf(stderr, "hlookup %s\n", argv[i]);
+                       continue;
+               }
+
+               /* never print fields hstat() did not fill in */
+               bzero(&sb, sizeof(sb));
+               hstat(&hammer2, ino, &sb);
+
+               /*
+                * off_t is not guaranteed to be long long, so cast to
+                * match the conversion; end each record with a newline
+                * so successive entries do not run together.
+                */
+               printf("%s %lld\n", argv[i], (long long)sb.st_size);
+       }
+
+       close(hammer2.fd);
+       return (0);
+}
diff --git a/sbin/hammer2/Makefile b/sbin/hammer2/Makefile
new file mode 100644 (file)
index 0000000..742a93c
--- /dev/null
@@ -0,0 +1,15 @@
+PROG=  hammer2
+SRCS=  main.c subs.c
+SRCS+= cmd_remote.c cmd_snapshot.c cmd_pfs.c cmd_helper.c
+#MAN=  hammer2.8
+NOMAN= TRUE
+
+CFLAGS+= -I${.CURDIR}/../../sys
+CFLAGS+= -pthread
+LDADD= -lm -lutil -lmd
+DPADD= ${LIBM} ${LIBUTIL} ${LIBMD}
+
+#.PATH: ${.CURDIR}/../../sys/libkern
+#SRCS+= crc32.c
+
+.include <bsd.prog.mk>
diff --git a/sbin/hammer2/cmd_helper.c b/sbin/hammer2/cmd_helper.c
new file mode 100644 (file)
index 0000000..e870c26
--- /dev/null
@@ -0,0 +1,194 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static void helper_master_listen(void);
+static void *helper_master_accept(void *data);
+static void *helper_master_service(void *data);
+
+/*
+ * The first hammer2 helper will also fork off a daemon to listen on
+ * and accept connections from the machine interconnect socket.  This
+ * helper operates across all HAMMER2 mounts.
+ *
+ * An additional independent multi-threaded helper daemon is run for
+ * each HAMMER2 PFS mount.  This helper connects to the master helper
+ * and registers the PFSID for each mount, allowing the master helper
+ * to forward accepted descriptors to the per-PFS helpers after handling
+ * authentication and accepting the PFSID.
+ *
+ * The per-mount helper daemon will then install relay pipe descriptors
+ * into the kernel VFS so the HAMMER2 filesystem can issue requests / accept
+ * commands as needed.  Note that the HAMMER2 filesystem will also track
+ * the cache state and will generally be able to bypass talking to the helper
+ * threads when local media is available and determined to contain the
+ * required data.
+ *
+ * WARNING!  Except for sel_path, we avoid accessing the filesystem.  In
+ *          a fully remote root mount scenario the administrative root
+ *          will be mounted before the helper is started up.
+ */
+int
+cmd_helper(const char *sel_path)
+{
+       int ecode = 0;
+       int fd;
+
+       /*
+        * Install the master server if it is not already running.
+        */
+       helper_master_listen();
+
+       /*
+        * Acquire a handle for ioctls, which will also extract the PFSID
+        * for the mounted PFS.  If sel_path is NULL we just start the
+        * master listener and do not go any further.
+        */
+       if (sel_path == NULL)
+               return(0);
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+
+       /*
+        * Connect to the master to register the PFSID and start the
+        * per-PFS helper if we succeed, otherwise a helper is already
+        * running and registered.
+        *
+        * That step is not implemented yet and nothing uses the ioctl
+        * handle so far, so release it instead of leaking the descriptor.
+        */
+       close(fd);
+
+       return ecode;
+}
+
+/*
+ * Create the master listen socket on HAMMER2_LISTEN_PORT and hand it to
+ * helper_master_accept() via hammer2_disconnect().
+ *
+ * Returns silently if the port cannot be bound (taken to mean a master
+ * listener is already running).  socket() and listen() failures are
+ * reported on stderr.
+ */
+static
+void
+helper_master_listen(void)
+{
+       struct sockaddr_in lsin;
+       int on;
+       int lfd;
+
+       /*
+        * Acquire socket and set options
+        */
+       if ((lfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
+               fprintf(stderr, "helper_master_listen: socket(): %s\n",
+                       strerror(errno));
+               return;
+       }
+       on = 1;
+       setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+       /*
+        * Setup listen port and try to bind.  If the bind fails we assume
+        * that a master listener process is already running.
+        *
+        * The address family must be set explicitly; leaving it zeroed
+        * (AF_UNSPEC) only works on stacks that tolerate it.
+        */
+       bzero(&lsin, sizeof(lsin));
+       lsin.sin_family = AF_INET;
+       lsin.sin_addr.s_addr = INADDR_ANY;
+       lsin.sin_port = htons(HAMMER2_LISTEN_PORT);
+       if (bind(lfd, (struct sockaddr *)&lsin, sizeof(lsin)) < 0) {
+               close(lfd);
+               return;
+       }
+       if (listen(lfd, 50) < 0) {
+               fprintf(stderr, "helper_master_listen: listen(): %s\n",
+                       strerror(errno));
+               close(lfd);
+               return;
+       }
+
+       /*
+        * Fork and disconnect the controlling terminal and parent process,
+        * executing the specified function as a pthread.
+        *
+        * Returns to the original process which can then continue running.
+        * In debug mode this call will create the pthread without forking
+        * and set NormalExit to 0.
+        */
+       hammer2_disconnect(helper_master_accept, (void *)(intptr_t)lfd);
+       if (NormalExit)
+               close(lfd);
+}
+
+/*
+ * pthread to accept connections on the master socket.
+ *
+ * data carries the listen descriptor (cast through intptr_t).  Each
+ * accepted connection is handed to its own helper_master_service()
+ * thread.  The loop ends, and NULL is returned, when accept() fails
+ * with anything other than EINTR.
+ */
+static
+void *
+helper_master_accept(void *data)
+{
+       struct sockaddr_in asin;
+       socklen_t alen;
+       pthread_t thread;
+       int lfd = (int)(intptr_t)data;
+       int fd;
+
+       /*
+        * Nobody waits for us
+        */
+       setproctitle("hammer2 master listen");
+       pthread_detach(pthread_self());
+
+       /*
+        * Accept connections and create pthreads to handle them.
+        * (Validation of the peer IP is not implemented yet.)
+        */
+       for (;;) {
+               alen = sizeof(asin);
+               fd = accept(lfd, (struct sockaddr *)&asin, &alen);
+               if (fd < 0) {
+                       if (errno == EINTR)
+                               continue;
+                       break;
+               }
+
+               /*
+                * If no thread could be started nobody owns the
+                * connection, so drop it here instead of leaking it.
+                */
+               if (pthread_create(&thread, NULL, helper_master_service,
+                                  (void *)(intptr_t)fd) != 0) {
+                       close(fd);
+                       continue;
+               }
+
+               /*
+                * Service threads are never joined; detach them so their
+                * resources are reclaimed when they exit.
+                */
+               pthread_detach(thread);
+       }
+       return (NULL);
+}
+
+/*
+ * pthread for each connection.
+ *
+ * data carries the connected descriptor (cast through intptr_t).  For
+ * now this simply echoes everything it reads back to the peer, then
+ * closes the descriptor when the peer closes, a read fails, or a write
+ * fails.  Always returns NULL.
+ */
+static
+void *
+helper_master_service(void *data)
+{
+       char buf[256];
+       ssize_t len;
+       ssize_t off;
+       ssize_t n;
+       int fd = (int)(intptr_t)data;
+
+       while ((len = read(fd, buf, sizeof(buf))) > 0) {
+               /*
+                * write() may accept fewer bytes than requested; keep
+                * going until the whole chunk has been echoed.
+                */
+               for (off = 0; off < len; off += n) {
+                       n = write(fd, buf + off, len - off);
+                       if (n < 0 && errno == EINTR) {
+                               n = 0;          /* interrupted, retry */
+                               continue;
+                       }
+                       if (n <= 0)
+                               goto done;      /* peer gone or error */
+               }
+       }
+done:
+       close(fd);
+
+       return (NULL);
+}
diff --git a/sbin/hammer2/cmd_pfs.c b/sbin/hammer2/cmd_pfs.c
new file mode 100644 (file)
index 0000000..c830bfa
--- /dev/null
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Print a table (type, pfs_id, label) of the PFSs reported by the
+ * filesystem selected by sel_path, iterating with HAMMER2IOC_PFS_GET
+ * until the kernel hands back a name_next of (hammer2_key_t)-1.
+ *
+ * Returns 0 on success, 1 if the ioctl handle cannot be obtained or an
+ * ioctl fails part way through.
+ */
+int
+cmd_pfs_list(const char *sel_path)
+{
+       hammer2_ioc_pfs_t pfs;
+       const char *type_str;
+       char *pfs_id_str;
+       uint32_t status;
+       int ecode = 0;
+       int count;
+       int fd;
+
+       fd = hammer2_ioctl_handle(sel_path);
+       if (fd < 0)
+               return(1);
+       bzero(&pfs, sizeof(pfs));
+
+       for (count = 0; ; ++count) {
+               /* advance to the key the previous ioctl told us about */
+               pfs.name_key = pfs.name_next;
+               if (pfs.name_key == (hammer2_key_t)-1)
+                       break;
+               if (ioctl(fd, HAMMER2IOC_PFS_GET, &pfs) < 0) {
+                       perror("ioctl");
+                       ecode = 1;
+                       break;
+               }
+
+               /* column header goes out ahead of the first row only */
+               if (count == 0) {
+                       printf("Type        "
+                              "Pfs_id                               "
+                              "Label\n");
+               }
+
+               /* known types print a fixed-width name, others as hex */
+               type_str = NULL;
+               switch(pfs.pfs_type) {
+               case HAMMER2_PFSTYPE_NONE:
+                       type_str = "NONE        ";
+                       break;
+               case HAMMER2_PFSTYPE_ADMIN:
+                       type_str = "ADMIN       ";
+                       break;
+               case HAMMER2_PFSTYPE_CACHE:
+                       type_str = "CACHE       ";
+                       break;
+               case HAMMER2_PFSTYPE_COPY:
+                       type_str = "COPY        ";
+                       break;
+               case HAMMER2_PFSTYPE_SLAVE:
+                       type_str = "SLAVE       ";
+                       break;
+               case HAMMER2_PFSTYPE_SOFT_SLAVE:
+                       type_str = "SOFT_SLAVE  ";
+                       break;
+               case HAMMER2_PFSTYPE_SOFT_MASTER:
+                       type_str = "SOFT_MASTER ";
+                       break;
+               case HAMMER2_PFSTYPE_MASTER:
+                       type_str = "MASTER      ";
+                       break;
+               default:
+                       printf("%02x          ", pfs.pfs_type);
+                       break;
+               }
+               if (type_str != NULL)
+                       fputs(type_str, stdout);
+
+               pfs_id_str = NULL;
+               uuid_to_string(&pfs.pfs_id, &pfs_id_str, &status);
+               printf("%s ", pfs_id_str);
+               free(pfs_id_str);
+               printf("%s\n", pfs.name);
+       }
+       close(fd);
+
+       return (ecode);
+}
+
+/*
+ * Create a PFS called name, of type pfs_type, on the filesystem selected
+ * by sel_path using HAMMER2IOC_PFS_CREATE.
+ *
+ * The pfs_id is parsed from uuid_str when one is supplied and generated
+ * otherwise; the pfs_fsid is always generated.
+ *
+ * Returns 0 on success, 1 if pfs_type is HAMMER2_PFSTYPE_NONE, the ioctl
+ * handle cannot be obtained, a uuid step fails, or the ioctl fails.
+ */
+int
+cmd_pfs_create(const char *sel_path, const char *name,
+              uint8_t pfs_type, const char *uuid_str)
+{
+       hammer2_ioc_pfs_t pfs;
+       uint32_t status;
+       int ecode;
+       int fd;
+
+       if (pfs_type == HAMMER2_PFSTYPE_NONE) {
+               fprintf(stderr, "hammer2: pfs_create: requires -t pfs_type\n");
+               return(1);
+       }
+
+       fd = hammer2_ioctl_handle(sel_path);
+       if (fd < 0)
+               return(1);
+
+       bzero(&pfs, sizeof(pfs));
+       snprintf(pfs.name, sizeof(pfs.name), "%s", name);
+       pfs.pfs_type = pfs_type;
+
+       /* caller-supplied id if there is one, otherwise a fresh one */
+       if (uuid_str != NULL)
+               uuid_from_string(uuid_str, &pfs.pfs_id, &status);
+       else
+               uuid_create(&pfs.pfs_id, &status);
+       if (status == uuid_s_ok)
+               uuid_create(&pfs.pfs_fsid, &status);
+
+       if (status != uuid_s_ok) {
+               fprintf(stderr, "hammer2: pfs_create: badly formed uuid\n");
+               ecode = 1;
+       } else if (ioctl(fd, HAMMER2IOC_PFS_CREATE, &pfs) < 0) {
+               perror("ioctl");
+               ecode = 1;
+       } else {
+               ecode = 0;
+       }
+       close(fd);
+       return (ecode);
+}
+
+/*
+ * Delete the PFS called name from the filesystem selected by sel_path.
+ *
+ * Returns 0 on success, 1 if the ioctl handle cannot be obtained or the
+ * ioctl fails (the reason is printed on stderr).
+ */
+int
+cmd_pfs_delete(const char *sel_path, const char *name)
+{
+       hammer2_ioc_pfs_t pfs;
+       int ecode = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&pfs, sizeof(pfs));
+       snprintf(pfs.name, sizeof(pfs.name), "%s", name);
+
+       /*
+        * This must be the delete request; issuing HAMMER2IOC_PFS_CREATE
+        * here would try to create the PFS instead of removing it.
+        * NOTE(review): assumes HAMMER2IOC_PFS_DELETE is declared in
+        * hammer2_ioctl.h -- confirm.
+        */
+       if (ioctl(fd, HAMMER2IOC_PFS_DELETE, &pfs) < 0) {
+               fprintf(stderr, "hammer2: pfs_delete(%s): %s\n",
+                       name, strerror(errno));
+               ecode = 1;
+       }
+       close(fd);
+
+       return (ecode);
+}
diff --git a/sbin/hammer2/cmd_remote.c b/sbin/hammer2/cmd_remote.c
new file mode 100644 (file)
index 0000000..8300e09
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Add a remote linkage for url to the filesystem selected by sel_path
+ * using HAMMER2IOC_REMOTE_ADD.
+ *
+ * Returns 0 on success, 1 if the ioctl handle cannot be obtained, url
+ * does not fit in remote.copy1.path, or the ioctl fails.
+ */
+int
+cmd_remote_connect(const char *sel_path, const char *url)
+{
+       hammer2_ioc_remote_t remote;
+       int ecode = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&remote, sizeof(remote));
+       remote.copyid = -1;
+       remote.fd = -1;
+       if (strlen(url) >= sizeof(remote.copy1.path)) {
+               fprintf(stderr, "hammer2: connect: Path too long\n");
+               close(fd);
+               return(1);
+       }
+       snprintf(remote.copy1.path, sizeof(remote.copy1.path), "%s", url);
+       if (ioctl(fd, HAMMER2IOC_REMOTE_ADD, &remote) < 0) {
+               perror("ioctl");
+               ecode = 1;
+       }
+       close(fd);
+
+       /* propagate an ioctl failure instead of always reporting success */
+       return (ecode);
+}
+
+/*
+ * Remove the remote linkage for url from the filesystem selected by
+ * sel_path using HAMMER2IOC_REMOTE_DEL.
+ *
+ * Returns 0 on success, 1 if the ioctl handle cannot be obtained, url
+ * does not fit in remote.copy1.path, or the ioctl fails.
+ */
+int
+cmd_remote_disconnect(const char *sel_path, const char *url)
+{
+       hammer2_ioc_remote_t remote;
+       int ecode = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&remote, sizeof(remote));
+       remote.copyid = -1;
+       remote.fd = -1;
+       if (strlen(url) >= sizeof(remote.copy1.path)) {
+               fprintf(stderr, "hammer2: disconnect: Path too long\n");
+               close(fd);
+               return(1);
+       }
+       snprintf(remote.copy1.path, sizeof(remote.copy1.path), "%s", url);
+       if (ioctl(fd, HAMMER2IOC_REMOTE_DEL, &remote) < 0) {
+               perror("ioctl");
+               ecode = 1;
+       }
+       close(fd);
+
+       /* propagate an ioctl failure instead of always reporting success */
+       return (ecode);
+}
+
+/*
+ * List the remote linkages configured on the filesystem selected by
+ * sel_path, iterating with HAMMER2IOC_REMOTE_GET until the returned
+ * nextid goes negative.  Entries whose copy1.copyid is 0 are skipped.
+ * all_opt is currently unused.
+ *
+ * Returns 0 on success, 1 if the ioctl handle cannot be obtained or an
+ * ioctl fails part way through.
+ */
+int
+cmd_remote_status(const char *sel_path, int all_opt __unused)
+{
+       hammer2_ioc_remote_t remote;
+       int ecode = 0;
+       int count = 0;
+       int fd;
+
+       if ((fd = hammer2_ioctl_handle(sel_path)) < 0)
+               return(1);
+       bzero(&remote, sizeof(remote));
+
+       while ((remote.copyid = remote.nextid) >= 0) {
+               if (ioctl(fd, HAMMER2IOC_REMOTE_GET, &remote) < 0) {
+                       perror("ioctl");
+                       ecode = 1;
+                       break;
+               }
+               if (remote.copy1.copyid == 0)
+                       continue;
+               if (count == 0)
+                       printf("CPYID LABEL           STATUS PATH\n");
+               printf("%5d %-15s %c%c%c.%02x %s\n",
+                       remote.copy1.copyid,
+                       remote.copy1.label,
+                       '-', '-', '-',
+                       remote.copy1.priority,
+                       remote.copy1.path);
+               ++count;
+       }
+       if (count == 0)
+               printf("No linkages found\n");
+
+       /* release the ioctl handle, as the other remote commands do */
+       close(fd);
+       return (ecode);
+}
diff --git a/sbin/hammer2/cmd_snapshot.c b/sbin/hammer2/cmd_snapshot.c
new file mode 100644 (file)
index 0000000..2d46b9e
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * The snapshot is named <PFSNAME>_<YYYYMMDD.HHMMSS.TRANSID> unless
+ * overridden by a label.
+ *
+ * When local non-cache media is involved the media is
+ * first synchronized and the snapshot is then based on
+ * the media.
+ *
+ * If the media is remote the snapshot is created on the remote
+ * end (if you have sufficient administrative rights) and a local
+ * ADMIN or CACHE PFS is created with a connection to the snapshot
+ * on the remote.
+ *
+ * If the client has snapshot rights to multiple remotes then TBD.
+ */
diff --git a/sbin/hammer2/hammer2.h b/sbin/hammer2/hammer2.h
new file mode 100644 (file)
index 0000000..7f451e5
--- /dev/null
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Rollup headers for hammer2 utility
+ */
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <sys/file.h>
+#include <sys/socket.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/tty.h>
+
+#include <netinet/in.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <netdb.h>
+
+#include <vfs/hammer2/hammer2_disk.h>
+#include <vfs/hammer2/hammer2_mount.h>
+#include <vfs/hammer2/hammer2_ioctl.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stddef.h>
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <uuid.h>
+
+/*
+ * Globals defined in main.c.  NormalExit is cleared when main() has to
+ * finish with pthread_exit() rather than a normal exit.
+ */
+extern int DebugOpt;
+extern int NormalExit;
+
+/*
+ * Shared support routines.  hammer2_ioctl_handle() yields a descriptor
+ * for ioctls on sel_path and a negative value on failure (callers test
+ * `< 0` and close() the result).  hammer2_disconnect() runs func(arg) as
+ * a pthread in a disconnected process.
+ * NOTE(review): definitions are not in this header; presumably subs.c.
+ */
+int hammer2_ioctl_handle(const char *sel_path);
+void hammer2_disconnect(void *(*func)(void *), void *arg);
+
+/* Remote linkage commands (cmd_remote.c); each returns an exit code. */
+int cmd_remote_connect(const char *sel_path, const char *url);
+int cmd_remote_disconnect(const char *sel_path, const char *url);
+int cmd_remote_status(const char *sel_path, int all_opt);
+
+/* PFS management commands (cmd_pfs.c); each returns an exit code. */
+int cmd_pfs_list(const char *sel_path);
+int cmd_pfs_create(const char *sel_path, const char *name,
+                       uint8_t pfs_type, const char *uuid_str);
+int cmd_pfs_delete(const char *sel_path, const char *name);
+
+/* Helper daemon entry point (cmd_helper.c); returns an exit code. */
+int cmd_helper(const char *sel_path);
diff --git a/sbin/hammer2/main.c b/sbin/hammer2/main.c
new file mode 100644 (file)
index 0000000..81ef955
--- /dev/null
@@ -0,0 +1,209 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+static void usage(int code);
+
+int DebugOpt;
+int NormalExit = 1;    /* if set to 0 main() has to pthread_exit() */
+
+/*
+ * hammer2 utility entry point.
+ *
+ * Parses the core options and then dispatches on the first remaining
+ * argument (the command name) to the matching cmd_*() function.  The
+ * value returned by the command becomes the process exit code, unless
+ * NormalExit has been cleared, in which case the main thread finishes
+ * with pthread_exit() so that threads started by the command keep
+ * running.
+ *
+ * Usage errors are reported through usage(), which does not return.
+ */
+int
+main(int ac, char **av)
+{
+       const char *sel_path = NULL;
+       const char *uuid_str = NULL;
+       int pfs_type = HAMMER2_PFSTYPE_NONE;
+       int quick_opt = 0;      /* NOTE(review): set by -q, not used yet */
+       int all_opt = 0;
+       int ecode = 0;
+       int ch;
+
+       /*
+        * Core options.  'd' must be listed in the option string or
+        * getopt() rejects -d and the debug case below is unreachable.
+        */
+       while ((ch = getopt(ac, av, "adqs:t:u:")) != -1) {
+               switch(ch) {
+               case 'a':
+                       all_opt = 1;
+                       break;
+               case 'q':
+                       /*
+                        * Quick mode - do not block verifying certain
+                        * operations such as (connect).
+                        */
+                       quick_opt = 1;
+                       break;
+               case 's':
+                       sel_path = optarg;
+                       break;
+               case 't':
+                       /*
+                        * set node type for mkpfs
+                        */
+                       if (strcasecmp(optarg, "ADMIN") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_ADMIN;
+                       } else if (strcasecmp(optarg, "CACHE") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_CACHE;
+                       } else if (strcasecmp(optarg, "COPY") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_COPY;
+                       } else if (strcasecmp(optarg, "SLAVE") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_SLAVE;
+                       } else if (strcasecmp(optarg, "SOFT_SLAVE") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_SOFT_SLAVE;
+                       } else if (strcasecmp(optarg, "SOFT_MASTER") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_SOFT_MASTER;
+                       } else if (strcasecmp(optarg, "MASTER") == 0) {
+                               pfs_type = HAMMER2_PFSTYPE_MASTER;
+                       } else {
+                               fprintf(stderr, "-t: Unrecognized node type\n");
+                               usage(1);
+                       }
+                       break;
+               case 'u':
+                       /*
+                        * set uuid for mkpfs, else one will be generated
+                        * (required for all except the MASTER node_type)
+                        */
+                       uuid_str = optarg;
+                       break;
+               case 'd':
+                       /*
+                        * Debug mode - hammer2_disconnect() keeps its
+                        * thread in this process instead of forking.
+                        */
+                       DebugOpt = 1;
+                       break;
+               default:
+                       fprintf(stderr, "Unknown option: %c\n", ch);
+                       usage(1);
+                       /* not reached */
+                       break;
+               }
+       }
+
+       /*
+        * Adjust, then process the command
+        */
+       ac -= optind;
+       av += optind;
+       if (ac < 1) {
+               fprintf(stderr, "Missing command\n");
+               usage(1);
+               /* not reached */
+       }
+
+       if (strcmp(av[0], "connect") == 0) {
+               /*
+                * Add cluster connection
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "connect: missing argument\n");
+                       usage(1);
+               }
+               ecode = cmd_remote_connect(sel_path, av[1]);
+       } else if (strcmp(av[0], "disconnect") == 0) {
+               /*
+                * Remove cluster connection
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "disconnect: missing argument\n");
+                       usage(1);
+               }
+               ecode = cmd_remote_disconnect(sel_path, av[1]);
+       } else if (strcmp(av[0], "status") == 0) {
+               /*
+                * Get status of PFS and its connections (-a for all PFSs)
+                */
+               ecode = cmd_remote_status(sel_path, all_opt);
+       } else if (strcmp(av[0], "pfs_list") == 0) {
+               /*
+                * List all PFSs
+                */
+               ecode = cmd_pfs_list(sel_path);
+       } else if (strcmp(av[0], "pfs_create") == 0) {
+               /*
+                * Create new PFS using pfs_type
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "pfs_create: requires name\n");
+                       usage(1);
+               }
+               ecode = cmd_pfs_create(sel_path, av[1], pfs_type, uuid_str);
+       } else if (strcmp(av[0], "pfs_delete") == 0) {
+               /*
+                * Delete a PFS by name
+                */
+               if (ac < 2) {
+                       fprintf(stderr, "pfs_delete: requires name\n");
+                       usage(1);
+               }
+               ecode = cmd_pfs_delete(sel_path, av[1]);
+       } else if (strcmp(av[0], "snapshot") == 0) {
+               /*
+                * Create snapshot with optional pfs_type and optional
+                * label override.
+                *
+                * NOTE(review): no command is dispatched here yet; the
+                * branch is accepted and leaves ecode at 0.
+                */
+       } else if (strcmp(av[0], "helper") == 0) {
+               /*
+                * Typically run as a daemon, this multi-threaded helper
+                * subsystem manages socket communications for the
+                * filesystem.
+                */
+               ecode = cmd_helper(sel_path);
+       } else {
+               fprintf(stderr, "Unrecognized command: %s\n", av[0]);
+               usage(1);
+       }
+
+       /*
+        * In DebugMode we may wind up starting several pthreads in the
+        * original process, in which case we have to let them run and
+        * not actually exit.
+        */
+       if (NormalExit) {
+               return (ecode);
+       } else {
+               pthread_exit(NULL);
+               _exit(2);       /* NOT REACHED */
+       }
+}
+
+/*
+ * Write the command synopsis to stderr and terminate the process with
+ * the supplied exit code.  Does not return.
+ */
+static void
+usage(int code)
+{
+       static const char synopsis[] =
+               "hammer2 [-s path] command...\n"
+               "    -s path            Select filesystem\n";
+
+       fputs(synopsis, stderr);
+       exit(code);
+}
diff --git a/sbin/hammer2/subs.c b/sbin/hammer2/subs.c
new file mode 100644 (file)
index 0000000..751bf8a
--- /dev/null
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "hammer2.h"
+
+/*
+ * Open sel_path (the current directory when sel_path is NULL) read-only
+ * and confirm that it answers the HAMMER2IOC_VERSION_GET ioctl.
+ *
+ * Returns the open descriptor for the caller to issue ioctl()'s on, or
+ * -1 after printing a diagnostic to stderr.  The descriptor is closed
+ * again if the version ioctl fails.
+ */
+int
+hammer2_ioctl_handle(const char *sel_path)
+{
+       const char *path = (sel_path != NULL) ? sel_path : ".";
+       struct hammer2_ioc_version vers;
+       int handle;
+
+       handle = open(path, O_RDONLY, 0);
+       if (handle < 0) {
+               fprintf(stderr, "hammer2: Unable to open %s: %s\n",
+                       path, strerror(errno));
+               return (-1);
+       }
+
+       /* Success path first: the version ioctl worked */
+       if (ioctl(handle, HAMMER2IOC_VERSION_GET, &vers) >= 0)
+               return (handle);
+
+       fprintf(stderr, "hammer2: '%s' is not a hammer2 filesystem\n",
+               path);
+       close(handle);
+       return (-1);
+}
+
+/*
+ * Start func(arg) in a new thread, detached from the invoking process.
+ *
+ * In debug mode (DebugOpt set) the thread is created in the calling
+ * process, NormalExit is cleared so that main() finishes with
+ * pthread_exit(), and the function returns.
+ *
+ * Otherwise the process double-forks.  The original process waits for
+ * the first child to exit and then returns to the caller.  The first
+ * child redirects descriptors 0-2 to /dev/null, gives up the TTY,
+ * starts a new session and forks again before exiting.  The second
+ * child creates the thread and never returns from this function.
+ *
+ * NOTE(review): the return value of pthread_create() is not checked on
+ * either path.
+ */
+void
+hammer2_disconnect(void *(*func)(void *), void *arg)
+{
+       pthread_t thread = NULL;
+       pid_t pid;
+       int ttyfd;
+
+       /*
+        * Do not disconnect in debug mode
+        */
+       if (DebugOpt) {
+                pthread_create(&thread, NULL, func, arg);
+               NormalExit = 0;
+               return;
+       }
+
+       /*
+        * Otherwise disconnect us.  Double-fork to get rid of the ppid
+        * association and disconnect the TTY.
+        */
+       if ((pid = fork()) < 0) {
+               fprintf(stderr, "hammer2: fork(): %s\n", strerror(errno));
+               exit(1);
+       }
+       if (pid > 0) {
+               /*
+                * Original process: reap the first child, retrying until
+                * waitpid() reports that pid.
+                * NOTE(review): any other return value, including -1,
+                * is retried.
+                */
+               while (waitpid(pid, NULL, 0) != pid)
+                       ;
+               return;         /* parent returns */
+       }
+
+       /*
+        * Get rid of the TTY/session before double-forking to finish off
+        * the ppid.
+        *
+        * Point descriptors 0, 1 and 2 at /dev/null; the extra descriptor
+        * is closed only when it is not one of those three.
+        */
+       ttyfd = open("/dev/null", O_RDWR);
+       if (ttyfd >= 0) {
+               if (ttyfd != 0)
+                       dup2(ttyfd, 0);
+               if (ttyfd != 1)
+                       dup2(ttyfd, 1);
+               if (ttyfd != 2)
+                       dup2(ttyfd, 2);
+               if (ttyfd > 2)
+                       close(ttyfd);
+       }
+
+       /*
+        * Give up the TTY, if /dev/tty can be opened, then start a new
+        * session.
+        */
+       ttyfd = open("/dev/tty", O_RDWR);
+       if (ttyfd >= 0) {
+               ioctl(ttyfd, TIOCNOTTY, 0);
+               close(ttyfd);
+       }
+       setsid();
+
+       /*
+        * Second fork to disconnect ppid (the original parent waits for
+        * us to exit).
+        */
+       if ((pid = fork()) < 0) {
+               _exit(2);
+       }
+       if (pid > 0)
+               _exit(0);
+
+       /*
+        * The double child
+        *
+        * Start the worker thread and retire the main thread with
+        * pthread_exit() rather than returning to the caller.
+        */
+       setsid();
+       pthread_create(&thread, NULL, func, arg);
+       pthread_exit(NULL);
+       _exit(2);       /* NOT REACHED */
+}
diff --git a/sbin/mount_hammer2/Makefile b/sbin/mount_hammer2/Makefile
new file mode 100644 (file)
index 0000000..f3f4fc8
--- /dev/null
@@ -0,0 +1,7 @@
+PROG=  mount_hammer2
+SRCS=  mount_hammer2.c
+MAN=
+
+CFLAGS+= -I${.CURDIR}/..
+
+.include <bsd.prog.mk>
diff --git a/sbin/mount_hammer2/mount_hammer2.c b/sbin/mount_hammer2/mount_hammer2.c
new file mode 100644 (file)
index 0000000..7a2688a
--- /dev/null
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/types.h>
+#include <sys/mount.h>
+#include <vfs/hammer2/hammer2_mount.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+
+/*
+ * Usage: mount_hammer2 volume mtpt
+ *
+ * Mount the HAMMER2 volume named by argv[1] on the mount point argv[2].
+ *
+ * Returns 0 on success.  Exits with status 1 when too few arguments are
+ * given, when getvfsbyname() cannot find the hammer2 VFS, or when
+ * mount(2) fails; a diagnostic is printed to stderr in each case.
+ */
+int
+main(int argc, char *argv[])
+{
+       struct hammer2_mount_info info;
+       struct vfsconf vfc;
+       char *mountpt;
+       int error;
+       int mount_flags;
+
+       bzero(&info, sizeof(info));
+       mount_flags = 0;
+
+       if (argc < 3) {
+               fprintf(stderr, "usage: mount_hammer2 volume mtpt\n");
+               exit(1);
+       }
+
+       error = getvfsbyname("hammer2", &vfc);
+       if (error) {
+               fprintf(stderr, "hammer2 vfs not loaded\n");
+               exit(1);
+       }
+
+       info.volume = argv[1];
+       info.hflags = 0;
+       mountpt = argv[2];
+
+       /*
+        * Report a failed mount through the exit status as well as the
+        * message; falling off the end of main() would exit with 0.
+        */
+       error = mount(vfc.vfc_name, mountpt, mount_flags, &info);
+       if (error) {
+               perror("mount");
+               exit(1);
+       }
+       return (0);
+}
diff --git a/sbin/newfs_hammer2/Makefile b/sbin/newfs_hammer2/Makefile
new file mode 100644 (file)
index 0000000..f306feb
--- /dev/null
@@ -0,0 +1,12 @@
+#
+#
+PROG=  newfs_hammer2
+MAN=   newfs_hammer2.8
+CFLAGS+= -I${.CURDIR}/../../sys -I${.CURDIR}/../hammer2
+SRCS= newfs_hammer2.c hammer2_icrc.c
+
+.PATH: ${.CURDIR}/../../sys/libkern
+.PATH: ${.CURDIR}/../../sys/vfs/hammer2
+SRCS+= crc32.c
+
+.include <bsd.prog.mk>
diff --git a/sbin/newfs_hammer2/newfs_hammer2.8 b/sbin/newfs_hammer2/newfs_hammer2.8
new file mode 100644 (file)
index 0000000..c42516b
--- /dev/null
@@ -0,0 +1,180 @@
+.\" Copyright (c) 2011 The DragonFly Project.  All rights reserved.
+.\"
+.\" This code is derived from software contributed to The DragonFly Project
+.\" by Matthew Dillon <dillon@backplane.com>
+.\"
+.\" Redistribution and use in source and binary forms, with or without
+.\" modification, are permitted provided that the following conditions
+.\" are met:
+.\"
+.\" 1. Redistributions of source code must retain the above copyright
+.\"    notice, this list of conditions and the following disclaimer.
+.\" 2. Redistributions in binary form must reproduce the above copyright
+.\"    notice, this list of conditions and the following disclaimer in
+.\"    the documentation and/or other materials provided with the
+.\"    distribution.
+.\" 3. Neither the name of The DragonFly Project nor the names of its
+.\"    contributors may be used to endorse or promote products derived
+.\"    from this software without specific, prior written permission.
+.\"
+.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+.\" ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+.\" FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+.\" COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+.\" INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+.\" BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+.\" LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+.\" AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+.\" OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+.\" OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+.\" SUCH DAMAGE.
+.\"
+.Dd May 23, 2011
+.Dt NEWFS_HAMMER2 8
+.Os
+.Sh NAME
+.Nm newfs_hammer2
+.Nd construct a new HAMMER2 file system
+.Sh SYNOPSIS
+.Nm
+.Fl L Ar label
+.Op Fl f
+.Op Fl b Ar bootsize
+.Op Fl r Ar redosize
+.Op Fl V Ar version
+.Ar special ...
+.Sh DESCRIPTION
+The
+.Nm
+utility creates a
+.Nm HAMMER2
+file system on device(s)
+.Ar special .
+If multiple devices are specified a single
+.Nm HAMMER2
+file system is created
+which spans all of them.
+Each
+.Ar special
+will constitute a volume which the
+.Nm HAMMER2
+file system is built on.
+.Nm HAMMER2
+file systems are sector-size agnostic, however the
+.Dx
+implementation requires the sector size to be no larger than 16K.
+.Nm HAMMER2
+file systems start at a relative offset of 0 and may only be created
+under out-of-band disk labels
+.Po
+.Xr disklabel64 5
+or
+.Xr gpt 8
+labels
+.Pc ,
+or in
+.Xr disklabel32 5
+partitions which do not overlap the label area (have a starting sector
+greater than 16).
+.Pp
+.Nm HAMMER2
+file systems are designed for large storage systems, up to 1 Exabyte, and
+will not operate efficiently on small storage systems.
+The minimum recommended file system size is 50GB.
+.Nm HAMMER2
+must reserve 500MB to 1GB of its storage for reblocking and UNDO/REDO.
+In addition,
+.Nm HAMMER2
+file systems operating normally, with full history
+retention and daily snapshots, do not immediately reclaim space when
+files are deleted.
+A regular system maintenance job is run once a day by
+.Xr periodic 8
+to handle reclamation.
+.Pp
+.Nm HAMMER2
+works best when the machine's normal workload would not otherwise fill
+the file system up in the course of 60 days of operation.
+.Pp
+The options are as follows:
+.Bl -tag -width indent
+.It Fl L Ar label
+All
+.Nm HAMMER2
+file systems must be named and names should be unique on a
+per-machine basis.
+.It Fl b Ar bootsize
+Specify a fixed area in which a boot related kernel and data can be stored.
+The
+.Ar bootsize
+is specified in bytes.
+By default a boot area of approximately 4MB will be created.
+.It Fl f
+Force operation.
+This is needed for the creation of a
+.Nm HAMMER2
+file system less than 10GB in size or
+with less than 500MB UNDO/REDO FIFO.
+This should not be used under normal circumstances.
+.It Fl r Ar redosize
+Specify the size of the fixed REDO FIFO.
+The
+.Ar redosize
+is specified in bytes.
+By default 0.1% of the root
+volume's size is used, with a reasonable minimum and a reasonable cap.
+The UNDO/REDO FIFO is used to sequence meta-data out to the media for
+instant crash recovery.
+.It Fl V Ar version
+Specify the
+.Nm HAMMER2
+file system version to format.
+By default
+.Nm
+formats the file system using the highest production version number
+supported by the
+.Nm HAMMER2
+VFS by checking the
+.Va vfs.hammer2.supported_version
+sysctl.
+If you need to maintain compatibility with an older version of
+.Nm HAMMER2
+you may specify the version with this option.
+.El
+.Pp
+The
+.Ar bootsize
+and
+.Ar redosize
+must be given with a suffix of
+.Cm K , M , G
+or
+.Cm T
+meaning kilobyte, megabyte, gigabyte and terabyte.
+Lowercase suffixes are also accepted.
+.Sh EXAMPLES
+.Bd -literal -offset indent
+newfs_hammer2 -L Home /dev/ad0s1d
+.Ed
+.Pp
+Create a file system named
+.Sq Home
+on
+.Pa /dev/ad0s1d .
+.Sh DIAGNOSTICS
+Exit status is 0 on success and 1 on error.
+.Sh SEE ALSO
+.Xr disklabel32 5 ,
+.Xr disklabel64 5 ,
+.Xr HAMMER2 5 ,
+.Xr fdisk 8 ,
+.Xr gpt 8 ,
+.Xr newfs 8
+.Sh HISTORY
+The
+.Nm
+utility first appeared in
+.Dx 1.11 .
+.Sh AUTHORS
+.An Matthew Dillon Aq dillon@backplane.com
diff --git a/sbin/newfs_hammer2/newfs_hammer2.c b/sbin/newfs_hammer2/newfs_hammer2.c
new file mode 100644 (file)
index 0000000..1be34e6
--- /dev/null
@@ -0,0 +1,763 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/diskslice.h>
+#include <sys/diskmbr.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/sysctl.h>
+#include <vfs/hammer2/hammer2_disk.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <stddef.h>
+#include <unistd.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <assert.h>
+#include <err.h>
+#include <uuid.h>
+
+static hammer2_off_t check_volume(const char *path, int *fdp);
+static int64_t getsize(const char *str, int64_t minval, int64_t maxval, int pw);
+static const char *sizetostr(hammer2_off_t size);
+static uint64_t nowtime(void);
+static void usage(void);
+
+static void format_hammer2(int fd, hammer2_off_t total_space,
+                               hammer2_off_t free_space);
+static void alloc_direct(hammer2_off_t *basep, hammer2_blockref_t *bref,
+                               size_t bytes);
+static hammer2_key_t dirhash(const unsigned char *name, size_t len);
+
+static int Hammer2Version = -1;
+static int ForceOpt = 0;
+static uuid_t Hammer2_FSType;  /* static filesystem type id for HAMMER2 */
+static uuid_t Hammer2_FSId;    /* unique filesystem id in volu header */
+static uuid_t Hammer2_SPFSId;  /* PFS id in super-root inode */
+static uuid_t Hammer2_RPFSId;  /* PFS id in root inode */
+static const char *Label = "ROOT";
+static hammer2_off_t BootAreaSize;
+static hammer2_off_t AuxAreaSize;
+
+#define GIG    ((hammer2_off_t)1024*1024*1024)
+
+/*
+ * newfs_hammer2 entry point.
+ *
+ * Generates the filesystem and PFS uuids, parses the command line,
+ * determines the on-media version to format, sizes the boot and aux
+ * areas, formats the single volume named on the command line via
+ * format_hammer2() and prints a summary of the result.
+ *
+ * Returns 0 on success.  Errors are reported with errx()/usage() or an
+ * explicit exit(1).
+ *
+ * NOTE(review): the getopt string accepts "m:" but there is no case for
+ * it, so -m falls through to usage().
+ */
+int
+main(int ac, char **av)
+{
+       uint32_t status;
+       hammer2_off_t total_space;
+       hammer2_off_t free_space;
+       hammer2_off_t reserved_space;
+       int ch;
+       int fd = -1;
+       char *fsidstr;
+       char *spfsidstr;
+       char *rpfsidstr;
+
+       /*
+        * Sanity check basic filesystem structures.  No cookies for us
+        * if it gets broken!
+        */
+       assert(sizeof(hammer2_volume_data_t) == HAMMER2_VOLUME_BYTES);
+       assert(sizeof(hammer2_inode_data_t) == HAMMER2_INODE_BYTES);
+       assert(sizeof(hammer2_blockref_t) == HAMMER2_BLOCKREF_BYTES);
+
+       /*
+        * Generate a filesystem id and lookup the filesystem type
+        */
+       srandomdev();
+       uuidgen(&Hammer2_FSId, 1);
+       uuidgen(&Hammer2_SPFSId, 1);
+       uuidgen(&Hammer2_RPFSId, 1);
+       uuid_from_string(HAMMER2_UUID_STRING, &Hammer2_FSType, &status);
+       /*uuid_name_lookup(&Hammer2_FSType, "DragonFly HAMMER2", &status);*/
+       if (status != uuid_s_ok) {
+               errx(1, "uuids file does not have the DragonFly "
+                       "HAMMER filesystem type");
+       }
+
+       /*
+        * Parse arguments.  errx() appends its own newline, so the
+        * message strings below must not end in one.
+        */
+       while ((ch = getopt(ac, av, "fL:b:m:r:V:")) != -1) {
+               switch(ch) {
+               case 'f':
+                       ForceOpt = 1;
+                       break;
+               case 'L':
+                       Label = optarg;
+                       if (strlen(Label) > HAMMER2_INODE_MAXNAME) {
+                               errx(1, "Root directory label too long "
+                                       "(64 chars max)");
+                       }
+                       break;
+               case 'b':
+                       BootAreaSize = getsize(optarg,
+                                        HAMMER2_NEWFS_ALIGN,
+                                        HAMMER2_BOOT_MAX_BYTES, 2);
+                       break;
+               case 'r':
+                       AuxAreaSize = getsize(optarg,
+                                        HAMMER2_NEWFS_ALIGN,
+                                        HAMMER2_REDO_MAX_BYTES, 2);
+                       break;
+               case 'V':
+                       Hammer2Version = strtol(optarg, NULL, 0);
+                       if (Hammer2Version < HAMMER2_VOL_VERSION_MIN ||
+                           Hammer2Version >= HAMMER2_VOL_VERSION_WIP) {
+                               errx(1,
+                                    "I don't understand how to format "
+                                    "HAMMER2 version %d",
+                                    Hammer2Version);
+                       }
+                       break;
+               default:
+                       usage();
+                       break;
+               }
+       }
+
+       /*
+        * No -V given: ask the loaded VFS which version it supports,
+        * falling back to the compiled-in default when the sysctl is
+        * not available.
+        */
+       if (Hammer2Version < 0) {
+               size_t olen = sizeof(Hammer2Version);
+               Hammer2Version = HAMMER2_VOL_VERSION_DEFAULT;
+               if (sysctlbyname("vfs.hammer2.supported_version",
+                                &Hammer2Version, &olen, NULL, 0) == 0) {
+                       if (Hammer2Version >= HAMMER2_VOL_VERSION_WIP) {
+                               Hammer2Version = HAMMER2_VOL_VERSION_WIP - 1;
+                               fprintf(stderr,
+                                       "newfs_hammer2: WARNING: HAMMER2 VFS "
+                                       "supports higher version than I "
+                                       "understand,\n"
+                                       "using version %d\n",
+                                       Hammer2Version);
+                       }
+               } else {
+                       fprintf(stderr,
+                               "newfs_hammer2: WARNING: HAMMER2 VFS not "
+                               "loaded, cannot get version info.\n"
+                               "Using version %d\n",
+                               HAMMER2_VOL_VERSION_DEFAULT);
+               }
+       }
+
+       /*
+        * Collect volume information.
+        */
+       ac -= optind;
+       av += optind;
+
+       if (ac != 1) {
+               fprintf(stderr, "Exactly one disk device must be specified\n");
+               exit(1);
+       }
+       total_space = check_volume(av[0], &fd);
+
+       /*
+        * ~typically 8MB alignment to avoid edge cases for reserved blocks
+        * and so raid stripes (if any) operate efficiently.
+        */
+       total_space &= ~HAMMER2_VOLUME_ALIGNMASK64;
+
+       /*
+        * Calculate defaults for the boot area size and round to the
+        * volume alignment boundary.
+        */
+       if (BootAreaSize == 0) {
+               BootAreaSize = HAMMER2_BOOT_NOM_BYTES;
+               while (BootAreaSize > total_space / 20)
+                       BootAreaSize >>= 1;
+               if (BootAreaSize < HAMMER2_BOOT_MIN_BYTES)
+                       BootAreaSize = HAMMER2_BOOT_MIN_BYTES;
+       } else if (BootAreaSize < HAMMER2_BOOT_MIN_BYTES) {
+               BootAreaSize = HAMMER2_BOOT_MIN_BYTES;
+       }
+       BootAreaSize = (BootAreaSize + HAMMER2_VOLUME_ALIGNMASK64) &
+                      ~HAMMER2_VOLUME_ALIGNMASK64;
+
+       /*
+        * Calculate defaults for the redo area size and round to the
+        * volume alignment boundary.
+        */
+       if (AuxAreaSize == 0) {
+               AuxAreaSize = HAMMER2_REDO_NOM_BYTES;
+               while (AuxAreaSize > total_space / 20)
+                       AuxAreaSize >>= 1;
+               if (AuxAreaSize < HAMMER2_REDO_MIN_BYTES)
+                       AuxAreaSize = HAMMER2_REDO_MIN_BYTES;
+       } else if (AuxAreaSize < HAMMER2_REDO_MIN_BYTES) {
+               AuxAreaSize = HAMMER2_REDO_MIN_BYTES;
+       }
+       AuxAreaSize = (AuxAreaSize + HAMMER2_VOLUME_ALIGNMASK64) &
+                      ~HAMMER2_VOLUME_ALIGNMASK64;
+
+       /*
+        * We'll need to stuff this in the volume header soon.
+        */
+       uuid_to_string(&Hammer2_FSId, &fsidstr, &status);
+       uuid_to_string(&Hammer2_SPFSId, &spfsidstr, &status);
+       uuid_to_string(&Hammer2_RPFSId, &rpfsidstr, &status);
+
+       /*
+        * Calculate the amount of reserved space.  HAMMER2_RESERVE_SEG (4MB)
+        * is reserved at the beginning of every 2GB of storage, rounded up.
+        * Thus a 200MB filesystem will still have a 4MB reserve area.
+        *
+        * We also include the boot and redo areas in the reserve.  The
+        * reserve is used to help 'df' calculate the amount of available
+        * space.
+        */
+       reserved_space = ((total_space + HAMMER2_RESERVE_MASK64) /
+                         HAMMER2_RESERVE_BYTES64) * HAMMER2_RESERVE_SEG64;
+
+       /*
+        * Refuse to format a volume that cannot hold its own reserved
+        * areas; the subtraction below would otherwise produce a bogus
+        * free space value.
+        */
+       if (reserved_space + BootAreaSize + AuxAreaSize > total_space) {
+               errx(1, "%s: volume is too small for the reserved, "
+                       "boot and aux areas", av[0]);
+       }
+
+       free_space = total_space - reserved_space -
+                    BootAreaSize - AuxAreaSize;
+
+       format_hammer2(fd, total_space, free_space);
+       fsync(fd);
+       close(fd);
+
+       printf("---------------------------------------------\n");
+       printf("total-size:       %s (%jd bytes)\n",
+              sizetostr(total_space),
+              (intmax_t)total_space);
+       printf("root-label:       %s\n", Label);
+       printf("version:            %d\n", Hammer2Version);
+       printf("boot-area-size:   %s\n", sizetostr(BootAreaSize));
+       printf("aux-area-size:    %s\n", sizetostr(AuxAreaSize));
+       printf("topo-reserved:    %s\n", sizetostr(reserved_space));
+       printf("free-space:       %s\n", sizetostr(free_space));
+       printf("fsid:             %s\n", fsidstr);
+       printf("supr-pfsid:       %s\n", spfsidstr);
+       printf("root-pfsid:       %s\n", rpfsidstr);
+       printf("\n");
+
+       return(0);
+}
+
+/*
+ * Print the command synopsis to stderr and terminate with exit status 1.
+ * Never returns.
+ */
+static
+void
+usage(void)
+{
+       /* The utility built from this file is newfs_hammer2. */
+       fprintf(stderr,
+               "usage: newfs_hammer2 -L label [-f] [-b bootsize] "
+               "[-r redosize] [-V version] special ...\n"
+       );
+       exit(1);
+}
+
+/*
+ * Format a byte count for display, scaled to bytes, KB, MB, GB or TB.
+ *
+ * A unit is used as long as the value is below half of the next unit, so
+ * the printed number stays under 512 for everything except TB, which is
+ * the catch-all for larger sizes.
+ *
+ * The string is built in a static buffer: each call overwrites the result
+ * of the previous one, so consume it before calling again.
+ */
+static
+const char *
+sizetostr(hammer2_off_t size)
+{
+       static char buf[32];
+       hammer2_off_t limit = 1024 / 2;
+       double scaled = (double)size;
+       int unit = 0;
+
+       /*
+        * Step up one unit at a time until the value fits under the
+        * half-unit threshold or we run out of units.  Dividing by 1024
+        * is exact in binary floating point.
+        */
+       while (unit < 4 && size >= limit) {
+               limit *= 1024;
+               scaled /= 1024;
+               ++unit;
+       }
+
+       switch (unit) {
+       case 0:
+               snprintf(buf, sizeof(buf), "%6.2f", scaled);
+               break;
+       case 1:
+               snprintf(buf, sizeof(buf), "%6.2fKB", scaled);
+               break;
+       case 2:
+               snprintf(buf, sizeof(buf), "%6.2fMB", scaled);
+               break;
+       case 3:
+               snprintf(buf, sizeof(buf), "%6.2fGB", scaled);
+               break;
+       default:
+               snprintf(buf, sizeof(buf), "%6.2fTB", scaled);
+               break;
+       }
+       return(buf);
+}
+
+/*
+ * Parse a size argument of the form <number><suffix> into a signed 64 bit
+ * byte count.
+ *
+ * The number is converted with strtoll() using base 0, so the usual C
+ * prefixes for octal and hex are honored.  The suffix is mandatory and
+ * must be exactly one of k, m, g or t (either case), scaling the number
+ * by 1024 once, twice, three or four times respectively.
+ *
+ * The scaled value must lie within [minval, maxval].  powerof2 is a bit
+ * mask of additional requirements:
+ *
+ *     bit 0 (1) - value must be a power of 2
+ *     bit 1 (2) - value must be a multiple of HAMMER2_NEWFS_ALIGN
+ *
+ * Any violation is fatal; the function only returns for acceptable input.
+ */
+static int64_t
+getsize(const char *str, int64_t minval, int64_t maxval, int powerof2)
+{
+       int64_t val;
+       char *ptr;
+
+       val = strtoll(str, &ptr, 0);
+       switch(*ptr) {
+       case 't':
+       case 'T':
+               val *= 1024;
+               /* fall through */
+       case 'g':
+       case 'G':
+               val *= 1024;
+               /* fall through */
+       case 'm':
+       case 'M':
+               val *= 1024;
+               /* fall through */
+       case 'k':
+       case 'K':
+               val *= 1024;
+               break;
+       default:
+               /* errx() supplies the trailing newline itself */
+               errx(1, "Unknown suffix in number '%s'", str);
+               /* not reached */
+       }
+       if (ptr[1]) {
+               errx(1, "Unknown suffix in number '%s'", str);
+               /* not reached */
+       }
+       if (val < minval) {
+               errx(1, "Value too small: %s, min is %s",
+                    str, sizetostr(minval));
+               /* not reached */
+       }
+       if (val > maxval) {
+               errx(1, "Value too large: %s, max is %s",
+                    str, sizetostr(maxval));
+               /* not reached */
+       }
+
+       /*
+        * A power of 2 has a single bit set, so clearing the lowest set
+        * bit must leave nothing behind.  This form needs no left shift
+        * and therefore cannot overflow for large values.
+        */
+       if ((powerof2 & 1) && (val & (val - 1)) != 0) {
+               errx(1, "Value not power of 2: %s", str);
+               /* not reached */
+       }
+       if ((powerof2 & 2) && (val & HAMMER2_NEWFS_ALIGNMASK)) {
+               errx(1, "Value not an integral multiple of %dK: %s",
+                    HAMMER2_NEWFS_ALIGN / 1024, str);
+               /* not reached */
+       }
+       return(val);
+}
+
+/*
+ * Return the current time of day from gettimeofday() as a single count
+ * of microseconds.
+ */
+static uint64_t
+nowtime(void)
+{
+       struct timeval now;
+
+       gettimeofday(&now, NULL);
+       return(now.tv_sec * 1000000LL + now.tv_usec);
+}
+
+/*
+ * Open the volume and work out how large it is.
+ *
+ * path is opened read-write and the descriptor is handed back through
+ * *fdp; it is left open for the caller.  The size in bytes is returned
+ * and also reported on stdout.
+ *
+ * Failure to open or stat the path, or a partition that does not meet
+ * the checks below, is fatal.
+ */
+static
+hammer2_off_t
+check_volume(const char *path, int *fdp)
+{
+       struct partinfo part;
+       struct stat sb;
+       hammer2_off_t bytes;
+       int fd;
+
+       fd = open(path, O_RDWR);
+       *fdp = fd;
+       if (fd < 0)
+               err(1, "Unable to open %s R+W", path);
+
+       if (ioctl(fd, DIOCGPART, &part) >= 0) {
+               /*
+                * The kernel returned partition information.  Refuse
+                * partitions reporting reserved blocks, and require a
+                * media sector size that is no larger than
+                * HAMMER2_PBUFSIZE and divides it evenly.
+                */
+               if (part.reserved_blocks) {
+                       errx(1, "HAMMER cannot be placed in a partition "
+                               "which overlaps the disklabel or MBR");
+               }
+               if (part.media_blksize > HAMMER2_PBUFSIZE ||
+                   HAMMER2_PBUFSIZE % part.media_blksize) {
+                       errx(1, "A media sector size of %d is not supported",
+                            part.media_blksize);
+               }
+               bytes = part.media_size;
+       } else {
+               /*
+                * No partition information is available.  Fall back to
+                * the size reported by fstat(), which is what allows a
+                * regular file to be formatted as a HAMMER2 volume.
+                */
+               if (fstat(fd, &sb) < 0)
+                       err(1, "Unable to stat %s", path);
+               bytes = sb.st_size;
+       }
+
+       printf("Volume %-15s size %s\n", path, sizetostr(bytes));
+       return (bytes);
+}
+
+/*
+ * Create the volume header, the super-root directory inode, and
+ * the writable snapshot subdirectory (named via the label) which
+ * is to be the initial mount point, or at least the first mount point.
+ *
+ * [----reserved_area----][boot_area][aux_area]
+ * [[vol_hdr]...         ]                      [sroot][root]
+ *
+ * The sroot and root inodes eat 512 bytes each.  newfs labels can only be
+ * 64 bytes so the root (snapshot) inode does not need to extend past 512
+ * bytes.  We use the correct hash slot but note that because
+ * directory hashes are chained 16x, any slot in the inode will work.
+ *
+ * Also format the allocation map.
+ *
+ * fd is the open volume, total_space its size in bytes and free_space the
+ * value recorded as both the allocator size and the allocator free count.
+ * Any allocation or write failure is fatal.
+ *
+ * NOTE: The passed total_space is 8MB-aligned to avoid edge cases.
+ */
+static
+void
+format_hammer2(int fd, hammer2_off_t total_space, hammer2_off_t free_space)
+{
+       char *buf = malloc(HAMMER2_PBUFSIZE);
+       hammer2_volume_data_t *vol;
+       hammer2_inode_data_t *rawip;
+       hammer2_blockref_t sroot_blockref;
+       hammer2_blockref_t root_blockref;
+       uint64_t now;
+       hammer2_off_t volu_base = 0;
+       hammer2_off_t boot_base = HAMMER2_RESERVE_SEG;
+       hammer2_off_t aux_base = boot_base + BootAreaSize;
+       hammer2_off_t alloc_base = aux_base + AuxAreaSize;
+       hammer2_off_t tmp_base;
+       size_t n;
+       int i;
+
+       /*
+        * Everything below is staged in this scratch block, so bail out
+        * cleanly instead of dereferencing NULL if it could not be
+        * allocated.
+        */
+       if (buf == NULL)
+               err(1, "malloc");
+
+       /*
+        * Clear the entire reserve for the first 2G segment and
+        * make sure we can write to the last block.
+        */
+       bzero(buf, HAMMER2_PBUFSIZE);
+       tmp_base = volu_base;
+       for (i = 0; i < HAMMER2_RESERVE_BLOCKS; ++i) {
+               n = pwrite(fd, buf, HAMMER2_PBUFSIZE, tmp_base);
+               if (n != HAMMER2_PBUFSIZE) {
+                       perror("write");
+                       exit(1);
+               }
+               tmp_base += HAMMER2_PBUFSIZE;
+       }
+
+       n = pwrite(fd, buf, HAMMER2_PBUFSIZE,
+                  volu_base + total_space - HAMMER2_PBUFSIZE);
+       if (n != HAMMER2_PBUFSIZE) {
+               perror("write (at-end-of-volume)");
+               exit(1);
+       }
+
+       /*
+        * Reserve space for the super-root inode and the root inode.
+        * Put them in the same 64K block.
+        */
+       assert((alloc_base & HAMMER2_PBUFMASK) == 0);
+
+       alloc_base &= ~HAMMER2_PBUFMASK64;
+       alloc_direct(&alloc_base, &sroot_blockref, HAMMER2_INODE_BYTES);
+       alloc_direct(&alloc_base, &root_blockref, HAMMER2_INODE_BYTES);
+       assert(((sroot_blockref.data_off ^ root_blockref.data_off) &
+               HAMMER2_OFF_MASK_HI) == 0);
+
+       bzero(buf, HAMMER2_PBUFSIZE);
+       now = nowtime();
+
+       /*
+        * Format the root directory inode, which is left empty.
+        */
+       rawip = (void *)(buf + (HAMMER2_OFF_MASK_LO & root_blockref.data_off));
+       rawip->version = HAMMER2_INODE_VERSION_ONE;
+       rawip->ctime = now;
+       rawip->mtime = now;
+       /* rawip->atime = now; NOT IMPL MUST BE ZERO */
+       rawip->btime = now;
+       rawip->type = HAMMER2_OBJTYPE_DIRECTORY;
+       rawip->mode = 0755;
+       rawip->inum = 1;                /* root inode, inumber 1 */
+       rawip->nlinks = 1;              /* directory link count compat */
+
+       rawip->name_len = strlen(Label);
+       bcopy(Label, rawip->filename, rawip->name_len);
+       rawip->name_key = dirhash(rawip->filename, rawip->name_len);
+
+       /*
+        * Compression mode and supported copyids.
+        */
+       rawip->comp_algo = HAMMER2_COMP_AUTOZERO;
+
+       rawip->pfs_id = Hammer2_RPFSId;
+       rawip->pfs_type = HAMMER2_PFSTYPE_MASTER;
+       rawip->op_flags |= HAMMER2_OPFLAG_PFSROOT;
+
+       /* rawip->u.blockset is left empty */
+
+       /*
+        * The root blockref will be stored in the super-root inode as
+        * the only directory entry.  The copyid here is the actual copyid
+        * of the storage ref.
+        *
+        * The key field for a directory entry's blockref is essentially
+        * the name key for the entry.
+        */
+       root_blockref.key = rawip->name_key;
+       root_blockref.copyid = HAMMER2_COPYID_LOCAL;
+       root_blockref.keybits = 0;
+       root_blockref.check.iscsi32.value =
+                                       hammer2_icrc32(rawip, sizeof(*rawip));
+       root_blockref.type = HAMMER2_BREF_TYPE_INODE;
+       root_blockref.methods = HAMMER2_ENC_CHECKMETHOD(HAMMER2_CHECK_ICRC) |
+                               HAMMER2_ENC_COMPMETHOD(HAMMER2_COMP_AUTOZERO);
+
+       /*
+        * Format the super-root directory inode, giving it one directory
+        * entry (root_blockref) and fixup the icrc method.
+        *
+        * The superroot contains one directory entry pointing at the root
+        * inode (named via the label).  Inodes contain one blockset which
+        * is fully associative so we can put the entry anywhere without
+        * having to worry about the hash.  Use index 0.
+        */
+       rawip = (void *)(buf + (HAMMER2_OFF_MASK_LO & sroot_blockref.data_off));
+       rawip->version = HAMMER2_INODE_VERSION_ONE;
+       rawip->ctime = now;
+       rawip->mtime = now;
+       /* rawip->atime = now; NOT IMPL MUST BE ZERO */
+       rawip->btime = now;
+       rawip->type = HAMMER2_OBJTYPE_DIRECTORY;
+       rawip->mode = 0700;             /* super-root - root only */
+       rawip->inum = 0;                /* super root inode, inumber 0 */
+       rawip->nlinks = 2;              /* directory link count compat */
+
+       rawip->name_len = 0;            /* super-root is unnamed */
+       rawip->name_key = 0;
+
+       rawip->comp_algo = HAMMER2_COMP_AUTOZERO;
+
+       /*
+        * The super-root is flagged as a PFS and typically given its own
+        * random FSID, making it possible to mirror an entire HAMMER2 disk
+        * snapshots and all if desired.  PFS ids are used to match up
+        * mirror sources and targets and cluster copy sources and targets.
+        */
+       rawip->pfs_id = Hammer2_SPFSId;
+       rawip->pfs_type = HAMMER2_PFSTYPE_MASTER;
+       rawip->op_flags |= HAMMER2_OPFLAG_PFSROOT;
+
+       /*
+        * The super-root has one directory entry pointing at the named
+        * root inode.
+        */
+       rawip->u.blockset.blockref[0] = root_blockref;
+
+       /*
+        * The sroot blockref will be stored in the volume header.
+        */
+       sroot_blockref.copyid = HAMMER2_COPYID_LOCAL;
+       sroot_blockref.keybits = 0;
+       sroot_blockref.check.iscsi32.value =
+                                       hammer2_icrc32(rawip, sizeof(*rawip));
+       sroot_blockref.type = HAMMER2_BREF_TYPE_INODE;
+       sroot_blockref.methods = HAMMER2_ENC_CHECKMETHOD(HAMMER2_CHECK_ICRC) |
+                                HAMMER2_ENC_COMPMETHOD(HAMMER2_COMP_AUTOZERO);
+
+       /*
+        * Write out the 64K HAMMER2 block containing the root and sroot.
+        */
+       n = pwrite(fd, buf, HAMMER2_PBUFSIZE,
+                  root_blockref.data_off & HAMMER2_OFF_MASK_HI);
+       if (n != HAMMER2_PBUFSIZE) {
+               perror("write");
+               exit(1);
+       }
+
+       /*
+        * Format the volume header.
+        *
+        * The volume header points to sroot_blockref.  Also be absolutely
+        * sure that allocator_beg is set.
+        */
+       bzero(buf, HAMMER2_PBUFSIZE);
+       vol = (void *)buf;
+
+       vol->magic = HAMMER2_VOLUME_ID_HBO;
+       vol->boot_beg = boot_base;
+       vol->boot_end = boot_base + BootAreaSize;
+       vol->aux_beg = aux_base;
+       vol->aux_end = aux_base + AuxAreaSize;
+       vol->volu_size = total_space;
+       vol->version = Hammer2Version;
+       vol->flags = 0;
+
+       vol->fsid = Hammer2_FSId;
+       vol->fstype = Hammer2_FSType;
+
+       vol->allocator_size = free_space;
+       vol->allocator_free = free_space;
+       vol->allocator_beg = alloc_base;
+
+       vol->sroot_blockset.blockref[0] = sroot_blockref;
+       vol->last_tid = 0;
+       vol->alloc_tid = 16;
+       vol->icrc_sects[HAMMER2_VOL_ICRC_SECT1] =
+                       hammer2_icrc32((char *)vol + HAMMER2_VOLUME_ICRC1_OFF,
+                                      HAMMER2_VOLUME_ICRC1_SIZE);
+
+       /*
+        * Set ICRC_SECT0 after all remaining elements of sect0 have been
+        * populated in the volume header.  Note that ICRC_SECT* (except for
+        * SECT0) are part of sect0.
+        */
+       vol->icrc_sects[HAMMER2_VOL_ICRC_SECT0] =
+                       hammer2_icrc32((char *)vol + HAMMER2_VOLUME_ICRC0_OFF,
+                                      HAMMER2_VOLUME_ICRC0_SIZE);
+       vol->icrc_volheader =
+                       hammer2_icrc32((char *)vol + HAMMER2_VOLUME_ICRCVH_OFF,
+                                      HAMMER2_VOLUME_ICRCVH_SIZE);
+
+       /*
+        * Write the volume header and all alternates.
+        */
+       for (i = 0; i < HAMMER2_NUM_VOLHDRS; ++i) {
+               if (i * HAMMER2_RESERVE_BYTES64 >= total_space)
+                       break;
+               n = pwrite(fd, buf, HAMMER2_PBUFSIZE,
+                          volu_base + i * HAMMER2_RESERVE_BYTES64);
+               if (n != HAMMER2_PBUFSIZE) {
+                       perror("write");
+                       exit(1);
+               }
+       }
+
+       /*
+        * Cleanup
+        */
+       free(buf);
+}
+
+/*
+ * Carve a power-of-2 sized allocation out of the linear space at *basep
+ * and describe it in *bref.
+ *
+ * bytes must be a non-zero power of 2 (asserted).  Its log2 becomes the
+ * radix, raised to HAMMER2_MIN_RADIX when smaller.  *bref is zeroed, its
+ * data_off is set to the current base with the radix or'd in, and its
+ * vradix records the radix.  *basep is then advanced by (1 << radix)
+ * bytes, so the caller can chain allocations back to back.
+ */
+static void
+alloc_direct(hammer2_off_t *basep, hammer2_blockref_t *bref, size_t bytes)
+{
+       int radix;
+
+       radix = 0;
+       assert(bytes);
+       while ((bytes & 1) == 0) {
+               bytes >>= 1;
+               ++radix;
+       }
+       assert(bytes == 1);
+       if (radix < HAMMER2_MIN_RADIX)
+               radix = HAMMER2_MIN_RADIX;
+
+       bzero(bref, sizeof(*bref));
+       bref->data_off = *basep | radix;
+       bref->vradix = radix;
+
+       /*
+        * Shift in the width of the offset type.  A plain unsigned int
+        * shift is undefined once radix reaches 32, which a size_t
+        * argument can produce.
+        */
+       *basep += (hammer2_off_t)1 << radix;
+}
+
+/*
+ * Compute the 64-bit directory key for a name.
+ *
+ * This borrows HAMMER1's directory hash algorithm #1 with a few
+ * modifications: the name is split into fields which are hashed
+ * separately and added together, bit 63 is forced to 1 (HAMMER1 sets it
+ * to 0) because bit63=0 is used for hidden hardlinked inodes, and the
+ * iscsi crc code is used instead of the old crc32 code.
+ *
+ * Key layout as assembled below:
+ *
+ *     bits 63-32      sum of the per-field crcs, with bit 63 forced on
+ *     bits 31-16      crc of the whole name, low half folded into high
+ *     bit  15         always set
+ *     bits 14-0       zero
+ */
+static hammer2_key_t
+dirhash(const unsigned char *name, size_t len)
+{
+       uint64_t hash;
+       uint32_t field_sum;
+       uint32_t whole;
+       size_t start;
+       size_t pos;
+
+       /*
+        * Upper 32 bits: walk the name once, treating '.', '-', '_' and
+        * '~' as field separators and the end of the name as the final
+        * separator.  Every non-empty field contributes its crc.
+        */
+       field_sum = 0;
+       start = 0;
+       for (pos = 0; pos <= len; ++pos) {
+               if (pos < len) {
+                       switch (name[pos]) {
+                       case '.':
+                       case '-':
+                       case '_':
+                       case '~':
+                               break;
+                       default:
+                               continue;
+                       }
+               }
+               if (pos != start)
+                       field_sum += hammer2_icrc32(name + start, pos - start);
+               start = pos + 1;
+       }
+
+       /*
+        * Bit 63 of the key must be set to 1.
+        */
+       hash = (uint64_t)(field_sum | 0x80000000U) << 32;
+
+       /*
+        * Bits 31-16: crc of the entire filename, which reduces degenerate
+        * hash collision conditions.
+        */
+       whole = hammer2_icrc32(name, len);
+       whole ^= whole << 16;
+       hash |= whole & 0xFFFF0000U;
+
+       /*
+        * Set bit 15.  This allows readdir to strip bit 63 so a positive
+        * 64-bit cookie/offset can always be returned, and still guarantee
+        * that the values 0x0000-0x7FFF are available for artificial
+        * entries ('.' and '..').
+        */
+       hash |= 0x8000U;
+
+       return (hash);
+}
diff --git a/sys/vfs/hammer2/CHANGES b/sys/vfs/hammer2/CHANGES
new file mode 100644 (file)
index 0000000..e9243b0
--- /dev/null
@@ -0,0 +1,10 @@
+
+                           DESIGN CHANGES & ISSUES
+
+* Indirect blocks have to be fully associative (all 1024 entries) for now,
+  I haven't figured out a way to break it down into smaller associative
+  blocks without breaking copies.
+
+* (temporary) all data blocks are 64K at the moment.
+
+* currently directory cookies are non-linear.
diff --git a/sys/vfs/hammer2/DESIGN b/sys/vfs/hammer2/DESIGN
new file mode 100644 (file)
index 0000000..5e87f52
--- /dev/null
@@ -0,0 +1,350 @@
+
+                           HAMMER2 DESIGN DOCUMENT
+
+                               Matthew Dillon
+                                08-Feb-2012
+                            dillon@backplane.com
+
+* These features have been speced in the media structures.
+
+* Implementation work has begun.
+
+* A working filesystem with some features implemented is expected by July 2012.
+
+* A fully functional filesystem with most (but not all) features is expected
+  by the end of 2012.
+
+* All elements of the filesystem have been designed except for the freemap
+  (which isn't needed for initial work).  8MB per 2GB of filesystem
+  storage has been reserved for the freemap.  The design of the freemap
+  is expected to be completely speced by mid-year.
+
+* This is my only project this year.  I'm not going to be doing any major
+  kernel bug hunting this year.
+
+                               Feature List
+
+* Multiple roots (allowing snapshots to be mounted).  This is implemented
+  via the super-root concept.  When mounting a HAMMER2 filesystem you specify
+  a device path and a directory name in the super-root.
+
+* HAMMER1 had PFS's.  HAMMER2 does not.  Instead, in HAMMER2 any directory
+  in the tree can be configured as a PFS, causing all elements recursively
+  underneath that directory to become a part of that PFS.
+
+* Writable snapshots.  Any subdirectory tree can be snapshotted.  Snapshots
+  show up in the super-root.  It is possible to snapshot a subdirectory
+  and then later snapshot a parent of that subdirectory... really there are
+  no limitations here.
+
+* Directory sub-hierarchy based quotas and space and inode usage tracking.
+  Any directory sub-tree, whether at a mount point or not, tracks aggregate
+  inode use and data space use.  This is stored in the directory inode all
+  the way up the chain.
+
+* Incremental queueless mirroring / mirroring-streams.  Because HAMMER2 is
+  block-oriented and copy-on-write each blockref tracks both direct
+  modifications to the referenced data via (modify_tid) and indirect
+  modifications to the referenced data or any sub-tree via (mirror_tid).
+  This makes it possible to do an incremental scan of meta-data that covers
+  only changes made since the mirror_tid recorded in a prior-run.
+
+  This feature is also intended to be used to locate recently allocated
+  blocks and thus be able to fixup the freemap after a crash.
+
+  HAMMER2 mirroring works a bit differently than HAMMER1 mirroring in
+  that HAMMER2 does not keep track of 'deleted' records.  Instead any
+  recursion by the mirroring code which finds that (modify_tid) has
+  been updated must also send the direct block table or indirect block
+  table state it winds up recursing through so the target can check
+  similar key ranges and locate elements to be deleted.  This can be
+  avoided if the mirroring stream is mostly caught up in that very recent
+  deletions will be cached in memory and can be queried, allowing shorter
+  record deletions to be passed in the stream instead.
+
+* Will support multiple compression algorithms configured on subdirectory
+  tree basis and on a file basis.  Up to 64K block compression will be used.
+  Only compression ratios near powers of 2 that are at least 2:1 (e.g. 2:1,
+  4:1, 8:1, etc) will work in this scheme because physical block allocations
+  in HAMMER2 are always power-of-2.
+
+  Compression algorithm #0 will mean no compression and no zero-checking.
+  Compression algorithm #1 will mean zero-checking but no other compression.
+  Real compression will be supported starting with algorithm 2.
+
+* Zero detection on write (writing all-zeros), which requires the data
+  buffer to be scanned, will be supported as compression algorithm #1.
+  This allows the writing of 0's to create holes and will be the default
+  compression algorithm for HAMMER2.
+
+* Copies support for redundancy.  The media blockref structure would
+  have become too bloated but I found a clean way to do copies using the
+  blockset structure (which is a set of 8 fully associative blockref's).
+
+  The design is such that the filesystem should be able to function at
+  full speed even if disks are pulled or inserted, as long as at least one
+  good copy is present.  A background task will be needed to resynchronize
+  missing copies (or remove excessive copies in the case where the copies
+  value is reduced on a live filesystem).
+
+* Clusterable with MESI cache coherency and dynamic granularity.
+  The media format for HAMMER1 was less conducive to logical clustering
+  than I had hoped so I was never able to get that aspect of my personal goals
+  working with HAMMER1.  HAMMER2 effectively solves the issues that cropped
+  up with HAMMER1 (mainly that HAMMER1's B-Tree did not reflect the logical
+  file/directory hierarchy, making cache coherency very difficult).
+
+* Hardlinks will be supported.  All other standard features will be supported
+  too of course.  Hardlinks in this sort of filesystem require significant
+  work.
+
+* The media blockref structure is now large enough to support up to a 192-bit
+  check value, which would typically be a cryptographic hash of some sort.
+  Multiple check value algorithms will be supported with the default being
+  a simple 32-bit iSCSI CRC.
+
+* Fully verified deduplication will be supported and automatic (and
+  necessary in many respects).
+
+* Non-verified de-duplication will be supported as a configurable option on
+  a file or subdirectory tree.  Non-verified deduplication would use the
+  largest available check code (192 bits) and not bother to verify data
+  matches during the dedup pass, which is necessary on extremely large
+  filesystems with a great deal of deduplicable data (as otherwise a large
+  chunk of the media would have to be read to implement the dedup).
+
+  This feature is intended only for those files where occasional corruption
+  is ok, such as in a large data store of farmed web content.
+
+                               GENERAL DESIGN
+
+HAMMER2 generally implements a copy-on-write block design for the filesystem,
+which is very different from HAMMER1's B-Tree design.  Because the design
+is copy-on-write it can be trivially snapshotted simply by referencing an
+existing block, and because the media structures logically match a standard
+filesystem directory/file hierarchy snapshots and other similar operations
+can be trivially performed on an entire subdirectory tree at any level in
+the filesystem.
+
+The copy-on-write nature of the filesystem implies that any modification
+whatsoever will have to eventually synchronize new disk blocks all the way
+to the super-root of the filesystem and the volume header itself.  This forms
+the basis for crash recovery.  All disk writes are to new blocks except for
+the volume header, thus allowing all writes to run concurrently except for
+the volume header update at the end.
+
+Clearly this method requires intermediate modifications to the chain to be
+cached so multiple modifications can be aggregated prior to being
+synchronized.  One advantage, however, is that the cache can be flushed at
+any time WITHOUT having to allocate yet another new block when further
+modifications are made as long as the volume header has not yet been flushed.
+This means that buffer cache overhead is very well bounded and can handle
+filesystem operations of any complexity even on boxes with very small amounts
+of physical memory.
+
+I intend to implement a shortcut to make fsync()'s run fast, and that is to
+allow deep updates to blockrefs to shortcut to auxiliary space in the
+volume header to satisfy the fsync requirement.  The related blockref is
+then recorded when the filesystem is mounted after a crash and the update
+chain is reconstituted when a matching blockref is encountered again during
+normal operation of the filesystem.
+
+Basically this means that no real work needs to be done at mount-time
+even after a crash.
+
+Directories are hashed, and another major design element is that directory
+entries ARE INODES.  They are one and the same.  In addition to directory
+entries being inodes the data for very small files (512 bytes or smaller)
+can be directly embedded in the inode (overloaded onto the same space that
+the direct blockref array uses).  This should result in very high
+performance.
+
+Inode numbers are not spatially referenced, which complicates NFS servers
+but doesn't complicate anything else.  The inode number is stored in the
+inode itself, an absolutely necessary feature in order to support the
+hugely flexible snapshots that we want to have in HAMMER2.
+
+                                 HARDLINKS
+
+Hardlinks are a particularly sticky problem for HAMMER2 due to the lack of
+a spatial reference to the inode number.  We do not want to have to have
+an index of inode numbers for any basic HAMMER2 feature if we can help it.
+
+Hardlinks are handled by placing the inode for a multiply-hardlinked file
+in the closest common parent directory.  If "a/x" and "a/y" are hardlinked
+the inode for the hardlinked file will be placed in directory "a", e.g.
+"a/3239944", but it will be invisible and will be in an out-of-band namespace.
+The directory entries "a/x" and "a/y" will be given the same inode number
+but in fact just be placemarks that cause HAMMER2 to recurse upwards through
+the directory tree to find the invisible inode number.
+
+Because directories are hashed and a different namespace (hash key range)
+is used for hardlinked inodes, standard directory scans are able to trivially
+skip this invisible namespace and inode-specific lookups can restrict their
+lookup to within this space.
+
+The nature of snapshotting makes handling link-count 2->1 and 1->2 cases
+trivial.  Basically the inode media structure is copied as needed to break-up
+or re-form the standard directory entry/inode.  There are no backpointers in
+HAMMER2 and no reference counts on the blocks (see FREEMAP NOTES below), so
+it is an utterly trivial operation.
+
+                               FREEMAP NOTES
+
+In order to implement fast snapshots (and writable snapshots for that
+matter), HAMMER2 does NOT ref-count allocations.  The freemap which
+is still under design just won't do that.  All the freemap does is
+keep track of 100% free blocks.
+
+This not only trivializes all the snapshot features it also trivializes
+hardlink handling and solves the problem of keeping the freemap synchronized
+in the event of a crash.  Now all we have to do after a crash is make
+sure blocks allocated before the freemap was flushed are properly
+marked as allocated in the allocmap.  This is a trivial exercise using the
+same algorithm the mirror streaming code uses (which is very similar to
+HAMMER1)... an incremental meta-data scan that covers only the blocks that
+might have been allocated between the last allocation map sync and now.
+
+Thus the freemap does not have to be synchronized during a fsync().
+
+The complexity is in figuring out what can be freed... that is, when one
+can mark blocks in the freemap as being free.  HAMMER2 implements this as
+a background task which essentially must scan available meta-data to
+determine which blocks are not being referenced.
+
+Part of the ongoing design work is finding ways to reduce the scope of this
+meta-data scan so the entire filesystem's meta-data does not need to be
+scanned (though in tests with HAMMER1, even full meta-data scans have
+turned out to be fairly low cost).  In other words, it's an area that we
+can continue to improve on as the filesystem matures.  Not only that, but
+we can completely change the freemap algorithms without creating
+incompatibilities (at worst simply having to require that a R+W mount do
+a full meta-data scan when upgrading or downgrading the freemap algorithm).
+
+                                 CLUSTERING
+
+Clustering, as always, is the most difficult bit but we have some advantages
+with HAMMER2 that we did not have with HAMMER1.  First, HAMMER2's media
+structures generally follow the kernel's filesystem hierarchy.  Second,
+HAMMER2's writable snapshots make it possible to implement several forms
+of multi-master clustering.
+
+This is important: The mount device path you specify serves to bootstrap
+your entry into the cluster, but your mount will make active connections
+to ALL copy elements in the hammer2_copy_data[] array (stored in the volume
+header) which match the PFSID of the directory in the super-root that you
+specified.  The local media path does not have to be mentioned in this
+array but becomes part of the cluster based on its type and access
+rights.  ALL ELEMENTS ARE TREATED ACCORDING TO TYPE NO MATTER WHICH ONE
+YOU MOUNT FROM.
+
+The actual cluster may be far larger than the elements you list in the
+hammer2_copy_data[] array.  You list only the elements you wish to
+directly connect to and you are able to access the rest of the cluster
+indirectly through those connections.
+
+All nodes in the cluster may act as administrative proxies.  All nodes
+in the cluster, including your mount point, are classified as one of the
+following as specified in the inode's structure:
+
+    ADMIN      - Media does not participate, administrative proxy only
+    CACHE      - Media only acts as a persistent cache
+    COPY       - Media only acts as a local copy
+    SLAVE      - Media is a RO slave that can be mounted RW
+
+    SOFT_SLAVE - This is a SLAVE which can become writable when
+                 the quorum is not available, but is not guaranteed
+                 to be able to be merged back when the quorum becomes
+                 available again.  Elements which cannot be merged
+                 back remain localized and writable until manual
+                 or scripted intervention recombines them.
+
+    SOFT_MASTER        - Similar to the above but can form a sub-cluster
+                 and run the quorum protocol within the sub-cluster
+                 to serve machines that connect to the sub-cluster
+                 when the master cluster is not available.
+
+                 The SOFT_MASTER nodes in a sub-cluster must be
+                 fully interconnected with each other.
+
+    MASTER     - This is a MASTER node in the quorum protocol.
+
+                 The MASTER nodes in a cluster must be fully
+                 interconnected with each other.
+
+There are four major protocols:
+
+    Quorum protocol
+
+       This protocol is used between MASTER nodes to vote on operations
+       and resolve deadlocks.
+
+       This protocol is used between SOFT_MASTER nodes in a sub-cluster
+       to vote on operations, resolve deadlocks, determine what the latest
+       transaction id for an element is, and to perform commits.
+
+    Cache sub-protocol
+
+       This is the MESI sub-protocol which runs under the Quorum
+       protocol.  This protocol is used to maintain cache state for
+       sub-trees to ensure that operations remain cache coherent.
+
+       Depending on administrative rights this protocol may or may
+       not allow a leaf node in the cluster to hold a cache element
+       indefinitely.  The administrative controller may preemptively
+       downgrade a leaf with insufficient administrative rights
+       without giving it a chance to synchronize any modified state
+       back to the cluster.
+
+    Proxy protocol
+
+       The Quorum and Cache protocols only operate between MASTER
+       and SOFT_MASTER nodes.  All other node types must use the
+       Proxy protocol to perform similar actions.  This protocol
+       differs in that proxy requests are typically sent to just
+       one adjacent node and that node then maintains state and
+       forwards the request or performs the required operation.
+       When the link is lost to the proxy, the proxy automatically
+       forwards a deletion of the state to the other nodes based on
+       what it has recorded.
+
+       If a leaf has insufficient administrative rights it may not
+       be allowed to actually initiate a quorum operation and may only
+       be allowed to maintain partial MESI cache state or perhaps none
+       at all (since cache state can block other machines in the
+       cluster).  Instead a leaf with insufficient rights will have to
+       make do with a preemptive loss of cache state and any allowed
+       modifying operations will have to be forwarded to the proxy which
+       continues forwarding it until a node with sufficient administrative
+       rights is encountered.
+
+       To reduce issues and give the cluster more breadth, sub-clusters
+       made up of SOFT_MASTERs can be formed in order to provide full
+       cache coherency within a subset of machines and yet still tie them
+       into a greater cluster that they normally would not have such
+       access to.  This effectively makes it possible to create a two
+       or three-tier fan-out of groups of machines which are cache-coherent
+       within the group, but perhaps not between groups, and use other
+       means to synchronize between the groups.
+
+    Media protocol
+
+       This is basically the physical media protocol.
+
+There are lots of ways to implement multi-master environments using the
+above core features but the implementation is going to be fairly complex
+even with HAMMER2's feature set.
+
+Keep in mind that modifications propagate all the way to the super-root
+and volume header, so in any clustered arrangement the use of (modify_tid)
+and (mirror_tid) is critical in determining the synchronization state of
+portion(s) of the filesystem.
+
+Specifically, since any modification propagates to the root the (mirror_tid)
+in higher level directories is going to be in a constant state of flux.  This
+state of flux DOES NOT invalidate the cache state for these higher levels
+of directories.  Instead, the (modify_tid) is used on a node-by-node basis
+to determine cache state at any given level, and (mirror_tid) is used to
+determine whether any recursively underlying state is desynchronized.
+The inode structure also has two additional transaction ids used to optimize
+path lookups, stat, and directory lookup/scan operations.
diff --git a/sys/vfs/hammer2/Makefile b/sys/vfs/hammer2/Makefile
new file mode 100644 (file)
index 0000000..f0b2ffe
--- /dev/null
@@ -0,0 +1,12 @@
+# Makefile for hammer2 vfs
+#
+# Builds the hammer2 kernel module (KMOD) via <bsd.kmod.mk>; -DINVARIANTS is added to CFLAGS.
+.PATH: ${.CURDIR}
+
+CFLAGS+= -DINVARIANTS
+KMOD=  hammer2
+SRCS=  hammer2_vfsops.c hammer2_vnops.c hammer2_inode.c
+SRCS+= hammer2_chain.c hammer2_freemap.c hammer2_subr.c hammer2_icrc.c
+SRCS+= hammer2_ioctl.c
+
+.include <bsd.kmod.mk>
diff --git a/sys/vfs/hammer2/TODO b/sys/vfs/hammer2/TODO
new file mode 100644 (file)
index 0000000..df6ae9d
--- /dev/null
@@ -0,0 +1,56 @@
+* Nesting problems in the flusher.
+
+* Inefficient vfsync due to thousands of file buffers, one per-vnode.
+  (need to aggregate using a device buffer?)
+
+* Adjust the flusher to unlock the parent after the child is locked,
+  then restart if the parent changed out from under us.  This will
+  greatly reduce namecache contention.
+
+* Use bp->b_dep to interlock the buffer with the chain structure so the
+  strategy code can calculate the crc and assert that the chain is marked
+  modified (not yet flushed).
+
+* Deleted inode not reachable via tree for volume flush but still reachable
+  via fsync/inactive/reclaim.  Its tree can be destroyed at that point.
+
+* The direct write code needs to invalidate any underlying physical buffers.
+  Direct write needs to be implemented.
+
+* Make sure a resized block (hammer2_chain_resize()) calculates a new
+  hash code in the parent bref
+
+* The freemap allocator needs to getblk/clrbuf/bdwrite any partial
+  block allocations (less than 64KB) that allocate out of a new 64K
+  block, to avoid causing a read-before-write I/O.
+
+* Check flush race upward recursion setting SUBMODIFIED vs downward
+  recursion checking SUBMODIFIED then locking (must clear before the
+  recursion and might need additional synchronization)
+
+* There is definitely a flush race in the hardlink implementation between
+  the forwarding entries and the actual (hidden) hardlink inode.
+
+  This will require us to associate a small hard-link-adjust structure
+  with the chain whenever we create or delete hardlinks, on top of
+  adjusting the hardlink inode itself.  Any actual flush to the media
+  has to synchronize the correct nlinks value based on whether related
+  created or deleted hardlinks were also flushed.
+
+* When a directory entry is created and also if an indirect block is
+  created and entries moved into it, the directory seek position can
+  potentially become incorrect during a scan.
+
+* When a directory entry is deleted a directory seek position depending
+  on that key can cause readdir to skip entries.
+
+* TWO PHASE COMMIT - store two data offsets in the chain, and
+  hammer2_chain_delete() needs to leave the chain intact if MODIFIED2 is
+  set on its buffer until the flusher gets to it?
+
+
+                               OPTIMIZATIONS
+
+* If a file is unlinked but its descriptor is left open and used, we
+  should allow data blocks on-media to be reused since there is no
+  topology left to point at them.
diff --git a/sys/vfs/hammer2/donew b/sys/vfs/hammer2/donew
new file mode 100755 (executable)
index 0000000..db573ea
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/csh
+# Developer helper: unmount /mnt, then run newfs_hammer2 -L ROOT on /dev/da0s1d.
+
+umount /mnt
+newfs_hammer2 -L ROOT /dev/da0s1d
diff --git a/sys/vfs/hammer2/donew2 b/sys/vfs/hammer2/donew2
new file mode 100755 (executable)
index 0000000..d98c5a2
--- /dev/null
@@ -0,0 +1,5 @@
+#!/bin/csh
+# Developer helper: unmount /mnt, then run newfs_hammer2 -L ROOT on /dev/da0s1b.
+
+umount /mnt
+newfs_hammer2 -L ROOT /dev/da0s1b
diff --git a/sys/vfs/hammer2/dossd b/sys/vfs/hammer2/dossd
new file mode 100755 (executable)
index 0000000..946bf7b
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/csh
+# Developer helper: unmount /mnt, unload hammer2.ko (loading the /usr/obj build if kldstat no longer lists it), mount /dev/da0s1d@ROOT on /mnt, set vfs.hammer2.debug=0.
+
+umount /mnt >& /dev/null
+kldunload hammer2.ko >& /dev/null
+kldstat | fgrep hammer2.ko >& /dev/null
+if ( $status > 0 ) then
+    kldload /usr/obj/usr/src/sys/vfs/hammer2/hammer2.ko
+endif
+mount_hammer2 /dev/da0s1d@ROOT /mnt
+sysctl vfs.hammer2.debug=0
diff --git a/sys/vfs/hammer2/dossd2 b/sys/vfs/hammer2/dossd2
new file mode 100755 (executable)
index 0000000..124869c
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/csh
+# Developer helper: unmount /mnt, unload hammer2.ko (loading the /usr/obj build if kldstat no longer lists it), mount /dev/da0s1b@ROOT on /mnt, set vfs.hammer2.debug=0.
+
+umount /mnt >& /dev/null
+kldunload hammer2.ko >& /dev/null
+kldstat | fgrep hammer2.ko >& /dev/null
+if ( $status > 0 ) then
+    kldload /usr/obj/usr/src/sys/vfs/hammer2/hammer2.ko
+endif
+mount_hammer2 /dev/da0s1b@ROOT /mnt
+sysctl vfs.hammer2.debug=0
diff --git a/sys/vfs/hammer2/dotest b/sys/vfs/hammer2/dotest
new file mode 100755 (executable)
index 0000000..803de0c
--- /dev/null
@@ -0,0 +1,11 @@
+#!/bin/csh
+# Developer helper: unmount /mnt, unload hammer2.ko (loading the /usr/obj build if kldstat no longer lists it), then mount /dev/vn0@ROOT on /mnt.
+
+# ./mkvntest
+umount /mnt >& /dev/null
+kldunload hammer2.ko >& /dev/null
+kldstat | fgrep hammer2.ko >& /dev/null
+if ( $status > 0 ) then
+    kldload /usr/obj/usr/src/sys/vfs/hammer2/hammer2.ko
+endif
+mount_hammer2 /dev/vn0@ROOT /mnt
diff --git a/sys/vfs/hammer2/hammer2.h b/sys/vfs/hammer2/hammer2.h
new file mode 100644 (file)
index 0000000..cb4b071
--- /dev/null
@@ -0,0 +1,428 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * This header file contains structures used internally by the HAMMER2
+ * implementation.  See hammer2_disk.h for on-disk structures.
+ */
+
+#ifndef _VFS_HAMMER2_HAMMER2_H_
+#define _VFS_HAMMER2_HAMMER2_H_
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/kernel.h>
+#include <sys/conf.h>
+#include <sys/systm.h>
+#include <sys/tree.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/proc.h>
+#include <sys/mountctl.h>
+#include <sys/priv.h>
+#include <sys/stat.h>
+#include <sys/globaldata.h>
+#include <sys/lockf.h>
+#include <sys/buf.h>
+#include <sys/queue.h>
+#include <sys/limits.h>
+#include <sys/buf2.h>
+#include <sys/signal2.h>
+#include <sys/tree.h>
+
+#include "hammer2_disk.h"
+#include "hammer2_mount.h"
+#include "hammer2_ioctl.h"
+
+struct hammer2_chain;
+struct hammer2_inode;
+struct hammer2_mount;
+struct hammer2_pfsmount;
+
+/*
+ * The chain structure tracks blockref recursions all the way to
+ * the root volume.  These consist of indirect blocks, inodes,
+ * and eventually the volume header.
+ *
+ * The chain structure is embedded in the hammer2_mount, hammer2_inode,
+ * and other system memory structures.  The chain structure typically
+ * implements the reference count and busy flag for the larger structure.
+ *
+ * It is always possible to track a chain element all the way back to the
+ * root by following the (parent) links.  (index) is a type-dependent index
+ * in the parent indicating where in the parent the chain element resides.
+ *
+ * When a blockref is added or deleted the related chain element is marked
+ * modified and all of its parents are marked SUBMODIFIED (the parent
+ * recursion can stop once we hit a node that is already marked SUBMODIFIED).
+ * A deleted chain element must remain intact until synchronized against
+ * its parent.
+ *
+ * The blockref at (parent, index) is not adjusted until the modified chain
+ * element is flushed and unmarked.  Until then the child's blockref may
+ * not match the blockref at (parent, index).
+ */
+SPLAY_HEAD(hammer2_chain_splay, hammer2_chain);
+
+struct hammer2_chain {
+       struct hammer2_blockref bref;           /* may not match (parent, index) until flushed */
+       struct hammer2_chain *parent;           /* return chain to root */
+       struct hammer2_chain_splay shead;       /* hammer2_chain_splay tree head */
+       SPLAY_ENTRY(hammer2_chain) snode;       /* hammer2_chain_splay linkage */
+       TAILQ_ENTRY(hammer2_chain) flush_node;  /* flush deferral list */
+       union {
+               struct hammer2_inode *ip;
+               struct hammer2_indblock *np;
+               struct hammer2_data *dp;
+               void *mem;
+       } u;                            /* NOTE(review): presumably the structure embedding this chain - confirm */
+
+       struct buf      *bp;            /* buffer cache (ro) */
+       hammer2_media_data_t *data;     /* modified copy of data (rw) */
+       u_int           bytes;          /* physical size of data */
+       struct lock     lk;             /* lockmgr lock */
+       int             index;          /* index in parent */
+       u_int           refs;           /* reference count */
+       u_int           busy;           /* soft-busy */
+       u_int           flags;          /* HAMMER2_CHAIN_* flags */
+};
+
+typedef struct hammer2_chain hammer2_chain_t;
+
+int hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2);
+SPLAY_PROTOTYPE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
+
+#define HAMMER2_CHAIN_MODIFIED         0x00000001      /* active mods */
+#define HAMMER2_CHAIN_DIRTYEMBED       0x00000002      /* inode embedded */
+#define HAMMER2_CHAIN_DIRTYBP          0x00000004      /* dirty on unlock */
+#define HAMMER2_CHAIN_SUBMODIFIED      0x00000008      /* 1+ subs modified */
+#define HAMMER2_CHAIN_DELETED          0x00000010
+#define HAMMER2_CHAIN_INITIAL          0x00000020      /* initial create */
+#define HAMMER2_CHAIN_FLUSHED          0x00000040      /* flush on unlock */
+#define HAMMER2_CHAIN_MOVED            0x00000080      /* moved */
+#define HAMMER2_CHAIN_IOFLUSH          0x00000100      /* bawrite on put */
+#define HAMMER2_CHAIN_DEFERRED         0x00000200      /* on a deferral list*/
+#define HAMMER2_CHAIN_DESTROYED                0x00000400      /* destroying */
+#define HAMMER2_CHAIN_MODIFIED_AUX     0x00000800      /* hmp->vchain only */
+#define HAMMER2_CHAIN_MOUNTED          0x00001000      /* PFS is mounted */
+
+/*
+ * Flags passed to hammer2_chain_lookup() and hammer2_chain_next()
+ */
+#define HAMMER2_LOOKUP_NOLOCK          0x00000001      /* ref only */
+#define HAMMER2_LOOKUP_NODATA          0x00000002      /* data left NULL */
+
+/*
+ * Flags passed to hammer2_chain_modify() and hammer2_chain_resize()
+ *
+ * NOTE: OPTDATA allows us to avoid instantiating buffers for INDIRECT
+ *      blocks in the INITIAL-create state.
+ */
+#define HAMMER2_MODIFY_NOSUB           0x00000001      /* do not set SUBMOD */
+#define HAMMER2_MODIFY_OPTDATA         0x00000002      /* data can be NULL */
+
+/*
+ * Flags passed to hammer2_chain_lock()
+ */
+#define HAMMER2_RESOLVE_NEVER          1
+#define HAMMER2_RESOLVE_MAYBE          2
+#define HAMMER2_RESOLVE_ALWAYS         3
+
+/*
+ * Cluster different types of storage together for allocations
+ */
+#define HAMMER2_FREECACHE_INODE                0
+#define HAMMER2_FREECACHE_INDIR                1
+#define HAMMER2_FREECACHE_DATA         2
+#define HAMMER2_FREECACHE_UNUSED3      3
+#define HAMMER2_FREECACHE_TYPES                4
+
+/*
+ * BMAP read-ahead maximum parameters
+ */
+#define HAMMER2_BMAP_COUNT             16      /* max bmap read-ahead */
+#define HAMMER2_BMAP_BYTES             (HAMMER2_PBUFSIZE * HAMMER2_BMAP_COUNT)
+
+/*
+ * Misc
+ */
+#define HAMMER2_FLUSH_DEPTH_LIMIT      40      /* stack recursion limit */
+
+/*
+ * HAMMER2 IN-MEMORY CACHE OF MEDIA STRUCTURES
+ *
+ * There is an in-memory representation of all on-media data structures.
+ *
+ * When accessed read-only the data will be mapped to the related buffer
+ * cache buffer.
+ *
+ * When accessed read-write (marked modified) a kmalloc()'d copy of the
+ * data is created which can then be modified.  The copy is destroyed when a
+ * filesystem block is allocated to replace it.
+ *
+ * Active inodes (those with vnodes attached) will maintain the kmalloc()'d
+ * copy for both the read-only and the read-write case.  The combination of
+ * (bp) and (data) determines whether (data) was allocated or not.
+ *
+ * The in-memory representation may remain cached (for example in order to
+ * placemark clustering locks) even after the related data has been
+ * detached.
+ */
+
+/*
+ * A hammer2 inode.
+ */
+struct hammer2_inode {
+       struct hammer2_mount    *hmp;           /* Global mount */
+       struct hammer2_pfsmount *pmp;           /* PFS mount */
+       struct hammer2_inode    *pip;           /* parent inode */
+       struct vnode            *vp;            /* vnode, see ITOV() */
+       hammer2_chain_t         chain;          /* embedded chain */
+       struct hammer2_inode_data ip_data;      /* embedded inode data */
+       struct lockf            advlock;
+};
+
+typedef struct hammer2_inode hammer2_inode_t;
+
+/*
+ * A hammer2 indirect block
+ */
+struct hammer2_indblock {
+       hammer2_chain_t         chain;          /* embedded chain */
+};
+
+typedef struct hammer2_indblock hammer2_indblock_t;
+
+/*
+ * A hammer2 data block
+ */
+struct hammer2_data {
+       hammer2_chain_t         chain;          /* embedded chain */
+};
+
+typedef struct hammer2_data hammer2_data_t;
+
+/*
+ * Global (per device) mount structure for device (aka vp->v_mount->hmp)
+ */
+struct hammer2_mount {
+       struct vnode    *devvp;         /* device vnode */
+       int             ronly;          /* read-only mount */
+       int             pmp_count;      /* PFS mounts backed by us */
+       TAILQ_ENTRY(hammer2_mount) mntentry; /* hammer2_mntlist */
+
+       struct malloc_type *minode;     /* NOTE(review): presumably the inode malloc type - confirm */
+       int             ninodes;
+       int             maxinodes;
+
+       struct malloc_type *mchain;     /* NOTE(review): presumably the chain malloc type - confirm */
+       int             nipstacks;
+       int             maxipstacks;
+       hammer2_chain_t vchain;         /* anchor chain */
+       hammer2_chain_t *schain;        /* super-root */
+       struct lock     alloclk;        /* lockmgr lock */
+       struct lock     voldatalk;      /* lockmgr lock */
+
+       hammer2_volume_data_t voldata;  /* NOTE(review): presumably guarded by voldatalk - confirm */
+       hammer2_off_t   freecache[HAMMER2_FREECACHE_TYPES][HAMMER2_MAX_RADIX+1]; /* [HAMMER2_FREECACHE_* type][radix] */
+};
+
+typedef struct hammer2_mount hammer2_mount_t;
+
+/*
+ * Per-PFS mount structure for device (aka vp->v_mount)
+ */
+struct hammer2_pfsmount {              /* what mp->mnt_data is cast to, see MPTOPMP() */
+       struct mount            *mp;            /* kernel mount */
+       struct hammer2_mount    *hmp;           /* device global mount */
+       hammer2_chain_t         *rchain;        /* PFS root chain */
+       hammer2_inode_t         *iroot;         /* PFS root inode */
+       struct netexport        export;         /* nfs export */
+       int                     ronly;          /* read-only mount */
+};
+
+typedef struct hammer2_pfsmount hammer2_pfsmount_t;
+
+#if defined(_KERNEL)
+
+MALLOC_DECLARE(M_HAMMER2);
+
+#define VTOI(vp)       ((hammer2_inode_t *)(vp)->v_data)
+#define ITOV(ip)       ((ip)->vp)
+
+static __inline
+hammer2_pfsmount_t *
+MPTOPMP(struct mount *mp)              /* kernel mount -> per-PFS mount (mp->mnt_data) */
+{
+       return ((hammer2_pfsmount_t *)mp->mnt_data);
+}
+
+static __inline
+hammer2_mount_t *
+MPTOHMP(struct mount *mp)              /* kernel mount -> device global mount (pmp->hmp) */
+{
+       return (((hammer2_pfsmount_t *)mp->mnt_data)->hmp);
+}
+
+extern struct vop_ops hammer2_vnode_vops;
+extern struct vop_ops hammer2_spec_vops;
+extern struct vop_ops hammer2_fifo_vops;
+
+extern int hammer2_debug;
+extern int hammer2_cluster_enable;
+extern long hammer2_iod_file_read;
+extern long hammer2_iod_meta_read;
+extern long hammer2_iod_indr_read;
+extern long hammer2_iod_file_write;
+extern long hammer2_iod_meta_write;
+extern long hammer2_iod_indr_write;
+extern long hammer2_iod_volu_write;
+extern long hammer2_ioa_file_read;
+extern long hammer2_ioa_meta_read;
+extern long hammer2_ioa_indr_read;
+extern long hammer2_ioa_file_write;
+extern long hammer2_ioa_meta_write;
+extern long hammer2_ioa_indr_write;
+extern long hammer2_ioa_volu_write;
+
+/*
+ * hammer2_subr.c
+ */
+void hammer2_inode_lock_ex(hammer2_inode_t *ip);
+void hammer2_inode_unlock_ex(hammer2_inode_t *ip);
+void hammer2_inode_lock_sh(hammer2_inode_t *ip);
+void hammer2_inode_unlock_sh(hammer2_inode_t *ip);
+void hammer2_inode_busy(hammer2_inode_t *ip);
+void hammer2_inode_unbusy(hammer2_inode_t *ip);
+void hammer2_voldata_lock(hammer2_mount_t *hmp);
+void hammer2_voldata_unlock(hammer2_mount_t *hmp);
+
+void hammer2_mount_exlock(hammer2_mount_t *hmp);
+void hammer2_mount_shlock(hammer2_mount_t *hmp);
+void hammer2_mount_unlock(hammer2_mount_t *hmp);
+
+int hammer2_get_dtype(hammer2_inode_t *ip);
+int hammer2_get_vtype(hammer2_inode_t *ip);
+u_int8_t hammer2_get_obj_type(enum vtype vtype);
+void hammer2_time_to_timespec(u_int64_t xtime, struct timespec *ts);
+u_int32_t hammer2_to_unix_xid(uuid_t *uuid);
+
+hammer2_key_t hammer2_dirhash(const unsigned char *name, size_t len);
+int hammer2_bytes_to_radix(size_t bytes);
+
+int hammer2_calc_logical(hammer2_inode_t *ip, hammer2_off_t uoff,
+                        hammer2_key_t *lbasep, hammer2_key_t *leofp);
+
+/*
+ * hammer2_inode.c
+ */
+struct vnode *hammer2_igetv(hammer2_inode_t *ip, int *errorp);
+
+hammer2_inode_t *hammer2_inode_alloc(hammer2_pfsmount_t *pmp, void *data);
+void hammer2_inode_free(hammer2_inode_t *ip);
+void hammer2_inode_ref(hammer2_inode_t *ip);
+void hammer2_inode_drop(hammer2_inode_t *ip);
+int hammer2_inode_calc_alloc(hammer2_key_t filesize);
+
+int hammer2_inode_create(hammer2_inode_t *dip,
+                        struct vattr *vap, struct ucred *cred,
+                        const uint8_t *name, size_t name_len,
+                        hammer2_inode_t **nipp);
+
+int hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *nip,
+                       const uint8_t *name, size_t name_len);
+
+int hammer2_hardlink_create(hammer2_inode_t *ip, hammer2_inode_t *dip,
+                       const uint8_t *name, size_t name_len);
+
+int hammer2_unlink_file(hammer2_inode_t *dip,
+                       const uint8_t *name, size_t name_len,
+                       int isdir, int adjlinks);
+
+/*
+ * hammer2_chain.c
+ */
+void hammer2_modify_volume(hammer2_mount_t *hmp);
+hammer2_chain_t *hammer2_chain_alloc(hammer2_mount_t *hmp,
+                               hammer2_blockref_t *bref);
+void hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+void hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+void hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+int hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how);
+void hammer2_chain_moved(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+void hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain,
+                               int flags);
+void hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain,
+                               int nradix, int flags);
+void hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+hammer2_chain_t *hammer2_chain_find(hammer2_mount_t *hmp,
+                               hammer2_chain_t *parent, int index);
+hammer2_chain_t *hammer2_chain_get(hammer2_mount_t *hmp,
+                               hammer2_chain_t *parent,
+                               int index, int flags);
+hammer2_chain_t *hammer2_chain_lookup(hammer2_mount_t *hmp,
+                               hammer2_chain_t **parentp,
+                               hammer2_key_t key_beg, hammer2_key_t key_end,
+                               int flags);
+hammer2_chain_t *hammer2_chain_next(hammer2_mount_t *hmp,
+                               hammer2_chain_t **parentp,
+                               hammer2_chain_t *chain,
+                               hammer2_key_t key_beg, hammer2_key_t key_end,
+                               int flags);
+hammer2_chain_t *hammer2_chain_create(hammer2_mount_t *hmp,
+                               hammer2_chain_t *parent,
+                               hammer2_chain_t *chain,
+                               hammer2_key_t key, int keybits,
+                               int type, size_t bytes);
+void hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                               hammer2_chain_t *chain);
+void hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+void hammer2_chain_commit(hammer2_mount_t *hmp, hammer2_chain_t *chain);
+
+/*
+ * hammer2_ioctl.c
+ */
+int hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data,
+                               int fflag, struct ucred *cred);
+
+/*
+ * hammer2_freemap.c
+ */
+hammer2_off_t hammer2_freemap_alloc(hammer2_mount_t *hmp,
+                               int type, size_t bytes);
+
+#endif /* _KERNEL */
+#endif /* !_VFS_HAMMER2_HAMMER2_H_ */
diff --git a/sys/vfs/hammer2/hammer2_chain.c b/sys/vfs/hammer2/hammer2_chain.c
new file mode 100644 (file)
index 0000000..579efcc
--- /dev/null
@@ -0,0 +1,2586 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This subsystem handles direct and indirect block searches, recursions,
+ * creation, and deletion.  Chains of blockrefs are tracked and modifications
+ * are flagged for propagation... eventually all the way back to the volume
+ * header.
+ */
+
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/lock.h>
+#include <sys/uuid.h>
+
+#include "hammer2.h"
+
+/*
+ * File-local integer, zero on startup by virtue of being static.
+ * Presumably intended as a tunable; it is not yet exported through
+ * sysctl (see XXX).
+ */
+static int hammer2_indirect_optimize;  /* XXX SYSCTL */
+
+/*
+ * Forward declaration of a file-local helper.
+ */
+static hammer2_chain_t *hammer2_chain_create_indirect(
+                       hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                       hammer2_key_t key, int keybits);
+
+/*
+ * Splay tree support.  The children of a chain are kept in the parent's
+ * splay tree (parent->shead), linked through the snode field and keyed
+ * by chain->index.
+ */
+SPLAY_GENERATE(hammer2_chain_splay, hammer2_chain, snode, hammer2_chain_cmp);
+
+/*
+ * Ordering function for the splay tree.  Only the index field of either
+ * argument is examined, so a stack dummy with just the index filled in
+ * may be used as a search key.
+ *
+ * Returns (chain2->index - chain1->index): zero when both indices match,
+ * positive when chain1's index is the smaller one, negative otherwise.
+ */
+int
+hammer2_chain_cmp(hammer2_chain_t *chain1, hammer2_chain_t *chain2)
+{
+       int lhs = chain1->index;
+       int rhs = chain2->index;
+
+       return (rhs - lhs);
+}
+
+/*
+ * Flag the ancestors of a chain SUBMODIFIED so a flush can locate the
+ * modified elements underneath them.
+ *
+ * The walk proceeds upward through chain->parent and stops at the first
+ * ancestor which already has SUBMODIFIED set (or at the top).  Nothing
+ * is done at all when the chain itself is flagged SUBMODIFIED, see the
+ * NOTE below.  hmp is currently unused.
+ *
+ * NOTE: The flush code will modify a SUBMODIFIED-flagged chain
+ *      during the flush recursion after clearing the parent's
+ *      SUBMODIFIED bit.  We don't want to re-set the parent's
+ *      SUBMODIFIED bit in this case!
+ *
+ * XXX rename of parent can create a SMP race
+ */
+static void
+hammer2_chain_parent_setsubmod(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       hammer2_chain_t *scan;
+
+       if (chain->flags & HAMMER2_CHAIN_SUBMODIFIED)
+               return;
+
+       for (scan = chain->parent; scan != NULL; scan = scan->parent) {
+               if (scan->flags & HAMMER2_CHAIN_SUBMODIFIED)
+                       break;
+               atomic_set_int(&scan->flags, HAMMER2_CHAIN_SUBMODIFIED);
+       }
+}
+
+/*
+ * Allocate the in-memory structure for a media element described by
+ * bref (inode, indirect block or data block) and return its embedded
+ * chain.
+ *
+ * The returned chain is disconnected (index is -1, no parent assigned),
+ * holds a copy of *bref, has refs set to 1 and is locked exclusively.
+ * Its size in bytes is derived from the radix stored in the low bits
+ * of bref->data_off.
+ *
+ * Inodes are allocated from hmp->minode, all other types from
+ * hmp->mchain.  Volume and unrecognized bref types panic.
+ */
+hammer2_chain_t *
+hammer2_chain_alloc(hammer2_mount_t *hmp, hammer2_blockref_t *bref)
+{
+       hammer2_chain_t *chain;
+       hammer2_inode_t *ip;
+       hammer2_indblock_t *np;
+       hammer2_data_t *dp;
+       int radix;
+
+       radix = (int)(bref->data_off & HAMMER2_OFF_MASK_RADIX);
+
+       /*
+        * Each media type is wrapped by its own zeroed system structure
+        * which embeds the chain.
+        */
+       switch(bref->type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               ip = kmalloc(sizeof(*ip), hmp->minode, M_WAITOK | M_ZERO);
+               ip->hmp = hmp;
+               chain = &ip->chain;
+               chain->u.ip = ip;
+               lockinit(&chain->lk, "inode", 0, LK_CANRECURSE);
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               np = kmalloc(sizeof(*np), hmp->mchain, M_WAITOK | M_ZERO);
+               chain = &np->chain;
+               chain->u.np = np;
+               lockinit(&chain->lk, "iblk", 0, LK_CANRECURSE);
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+               dp = kmalloc(sizeof(*dp), hmp->mchain, M_WAITOK | M_ZERO);
+               chain = &dp->chain;
+               chain->u.dp = dp;
+               lockinit(&chain->lk, "dblk", 0, LK_CANRECURSE);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               chain = NULL;
+               panic("hammer2_chain_alloc volume type illegal for op");
+       default:
+               chain = NULL;
+               panic("hammer2_chain_alloc: unrecognized blockref type: %d",
+                     bref->type);
+       }
+
+       /*
+        * Common initialization, then hand the chain back locked.
+        */
+       chain->bref = *bref;
+       chain->bytes = 1U << radix;
+       chain->refs = 1;
+       chain->index = -1;      /* not yet assigned */
+       lockmgr(&chain->lk, LK_EXCLUSIVE);
+
+       return (chain);
+}
+
+/*
+ * Release the memory backing a disconnected chain element.
+ *
+ * The chain must not have a device buffer attached and, for inodes, must
+ * not have an associated vnode.  Inode and volume chains have their data
+ * pointer cleared here before that is asserted; every other type must
+ * already have a NULL data pointer.
+ *
+ * The containing structure (chain->u.mem) is returned to hmp->minode for
+ * inodes and to hmp->mchain otherwise.  Nothing is freed if u.mem is
+ * already NULL.
+ */
+void
+hammer2_chain_free(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       void *mem;
+
+       switch(chain->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+       case HAMMER2_BREF_TYPE_VOLUME:
+               chain->data = NULL;
+               break;
+       default:
+               break;
+       }
+
+       KKASSERT(chain->bp == NULL);
+       KKASSERT(chain->data == NULL);
+       KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
+                chain->u.ip->vp == NULL);
+
+       mem = chain->u.mem;
+       if (mem == NULL)
+               return;
+
+       /*
+        * Detach before freeing, the chain lives inside the memory
+        * being released.
+        */
+       chain->u.mem = NULL;
+       if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
+               kfree(mem, hmp->minode);
+       else
+               kfree(mem, hmp->mchain);
+}
+
+/*
+ * Add a reference to a chain element (for shared access).  The chain
+ * element must already have at least 1 ref controlled by the caller.
+ *
+ * The pre-existing ref is asserted and the count is then bumped
+ * atomically.  No lock is acquired here and hmp is unused.
+ */
+void
+hammer2_chain_ref(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       KKASSERT(chain->refs > 0);
+       atomic_add_int(&chain->refs, 1);
+}
+
+/*
+ * Drop the callers reference to the chain element.  If the ref count
+ * reaches zero the chain element and its related structure (typically an
+ * inode or indirect block) will be freed and the parent will be
+ * recursively dropped.
+ *
+ * Modified elements hold an additional reference so it should not be
+ * possible for the count on a modified element to drop to 0.
+ *
+ * The chain element must NOT be locked by the caller.
+ *
+ * The parent might or might not be locked by the caller but if so it
+ * will also be referenced so we shouldn't recurse upward.
+ *
+ * A chain without a parent (a disconnected element which was never
+ * inserted into a splay tree) may also be dropped to zero, in which
+ * case it is simply freed.  The volume chain (hmp->vchain) must never
+ * reach a ref count of zero.
+ */
+void
+hammer2_chain_drop(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       hammer2_chain_t *parent;
+       u_int refs;
+
+       while (chain) {
+               /*
+                * Sample refs once, all transitions below are made with
+                * cmpset against the sampled value and retried on failure.
+                */
+               refs = chain->refs;
+               cpu_ccfence();
+               KKASSERT(refs > 0);
+               if (refs == 1) {
+                       /*
+                        * Possible 1->0 transition.  The parent is locked
+                        * first because the chain has to be removed from
+                        * the parent's splay tree.
+                        */
+                       KKASSERT(chain != &hmp->vchain);
+                       parent = chain->parent;
+                       if (parent)
+                               lockmgr(&parent->lk, LK_EXCLUSIVE);
+                       if (atomic_cmpset_int(&chain->refs, 1, 0)) {
+                               /*
+                                * Succeeded, recurse and drop parent.
+                                *
+                                * Only a chain which has a parent can be
+                                * present in a splay tree, do not touch
+                                * parent->shead for a disconnected chain.
+                                */
+                               if (parent != NULL &&
+                                   !(chain->flags & HAMMER2_CHAIN_DELETED)) {
+                                       SPLAY_REMOVE(hammer2_chain_splay,
+                                                    &parent->shead, chain);
+                                       atomic_set_int(&chain->flags,
+                                                      HAMMER2_CHAIN_DELETED);
+                                       /* parent refs dropped via recursion */
+                               }
+                               chain->parent = NULL;
+                               if (parent)
+                                       lockmgr(&parent->lk, LK_RELEASE);
+                               hammer2_chain_free(hmp, chain);
+                               chain = parent;
+                               /* recurse on parent */
+                       } else {
+                               if (parent)
+                                       lockmgr(&parent->lk, LK_RELEASE);
+                               /* retry the same chain */
+                       }
+               } else {
+                       if (atomic_cmpset_int(&chain->refs, refs, refs - 1)) {
+                               /*
+                                * Succeeded, count did not reach zero so
+                                * cut out of the loop.
+                                */
+                               break;
+                       }
+                       /* retry the same chain */
+               }
+       }
+}
+
+/*
+ * Ref and lock a chain element, acquiring its data with I/O if necessary,
+ * and specify how you would like the data to be resolved.
+ *
+ * Returns 0 on success or an error code if the data could not be acquired.
+ * The chain element is locked either way.
+ *
+ * The lock is allowed to recurse, multiple locking ops will aggregate
+ * the requested resolve types.  Once data is assigned it will not be
+ * removed until the last unlock.
+ *
+ * HAMMER2_RESOLVE_NEVER - Do not resolve the data element.
+ *                        (typically used to avoid device/logical buffer
+ *                         aliasing for data)
+ *
+ * HAMMER2_RESOLVE_MAYBE - Do not resolve data elements for chains in
+ *                        the INITIAL-create state (indirect blocks only).
+ *
+ *                        Do not resolve data elements for DATA chains.
+ *                        (typically used to avoid device/logical buffer
+ *                         aliasing for data)
+ *
+ * HAMMER2_RESOLVE_ALWAYS- Always resolve the data element.
+ *
+ * Any other value of (how) is treated like HAMMER2_RESOLVE_ALWAYS.
+ *
+ * NOTE: Embedded elements (volume header, inodes) are always resolved
+ *      regardless.
+ *
+ * NOTE: Specifying HAMMER2_RESOLVE_ALWAYS on a newly-created non-embedded
+ *      element will instantiate and zero its buffer, and flush it on
+ *      release.
+ *
+ * NOTE: (data) elements are normally locked RESOLVE_NEVER or RESOLVE_MAYBE
+ *      so as not to instantiate a device buffer, which could alias against
+ *      a logical file buffer.  However, if ALWAYS is specified the
+ *      device buffer will be instantiated anyway.
+ */
+int
+hammer2_chain_lock(hammer2_mount_t *hmp, hammer2_chain_t *chain, int how)
+{
+       hammer2_blockref_t *bref;
+       hammer2_off_t pbase;
+       hammer2_off_t peof;
+       size_t boff;
+       size_t bbytes;
+       int error;
+       char *bdata;
+
+       /*
+        * Lock the element.  Under certain conditions this might end up
+        * being a recursive lock.
+        */
+       KKASSERT(chain->refs > 0);
+       atomic_add_int(&chain->refs, 1);
+       lockmgr(&chain->lk, LK_EXCLUSIVE);
+
+       /*
+        * If we already have a valid data pointer no further action is
+        * necessary.
+        */
+       if (chain->data)
+               return (0);
+
+       /*
+        * Do we have to resolve the data?
+        */
+       switch(how) {
+       case HAMMER2_RESOLVE_NEVER:
+               return(0);
+       case HAMMER2_RESOLVE_MAYBE:
+               if (chain->flags & HAMMER2_CHAIN_INITIAL)
+                       return(0);
+               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
+                       return(0);
+               /* fall through */
+       case HAMMER2_RESOLVE_ALWAYS:
+               break;
+       default:
+               /* unknown resolve mode, resolve the data */
+               break;
+       }
+
+       /*
+        * We must resolve to a device buffer, either by issuing I/O or
+        * by creating a zero-fill element.  We do not mark the buffer
+        * dirty when creating a zero-fill element (the hammer2_chain_modify()
+        * API must still be used to do that).
+        *
+        * The device buffer is variable-sized in powers of 2 down
+        * to HAMMER2_MINALLOCSIZE (typically 1K).  A 64K physical storage
+        * chunk always contains buffers of the same size. (XXX)
+        *
+        * The minimum physical IO size may be larger than the variable
+        * block size.
+        */
+       bref = &chain->bref;
+
+       /*
+        * pbase is the device offset of the buffer containing the element,
+        * boff the offset of the element within that buffer and peof the
+        * end of the physical chunk, used to bound the cluster read.
+        */
+       if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
+               bbytes = HAMMER2_MINIOSIZE;
+       pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
+       peof = (pbase + HAMMER2_PBUFSIZE64) & ~HAMMER2_PBUFMASK64;
+       boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+       KKASSERT(pbase != 0);
+
+       /*
+        * The getblk() optimization can only be used on newly created
+        * elements if the physical block size matches the request.
+        */
+       if ((chain->flags & HAMMER2_CHAIN_INITIAL) &&
+           chain->bytes == bbytes) {
+               chain->bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+               error = 0;
+       } else if (hammer2_cluster_enable) {
+               error = cluster_read(hmp->devvp, peof, pbase, bbytes,
+                                    HAMMER2_PBUFSIZE, HAMMER2_PBUFSIZE,
+                                    &chain->bp);
+       } else {
+               error = bread(hmp->devvp, pbase, bbytes, &chain->bp);
+       }
+
+       /*
+        * On failure the buffer is released and the error returned, the
+        * chain remains locked and referenced with no data assigned.
+        */
+       if (error) {
+               kprintf("hammer2_chain_lock: I/O error %016jx: %d\n",
+                       (intmax_t)pbase, error);
+               bqrelse(chain->bp);
+               chain->bp = NULL;
+               return (error);
+       }
+
+       /*
+        * Zero the data area if the chain is in the INITIAL-create state
+        */
+       bdata = (char *)chain->bp->b_data + boff;
+       if (chain->flags & HAMMER2_CHAIN_INITIAL)
+               bzero(bdata, chain->bytes);
+
+       /*
+        * Setup the data pointer, either pointing it to an embedded data
+        * structure and copying the data from the buffer, or pointing it
+        * into the buffer.
+        *
+        * The buffer is not retained when copying to an embedded data
+        * structure in order to avoid potential deadlocks or recursions
+        * on the same physical buffer.
+        */
+       switch (bref->type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+               /*
+                * Copy data from bp to embedded buffer
+                */
+               panic("hammer2_chain_lock: called on unresolved volume header");
+#if 0
+               /* NOT YET */
+               KKASSERT(pbase == 0);
+               KKASSERT(chain->bytes == HAMMER2_PBUFSIZE);
+               bcopy(bdata, &hmp->voldata, chain->bytes);
+               chain->data = (void *)&hmp->voldata;
+               bqrelse(chain->bp);
+               chain->bp = NULL;
+#endif
+               break;
+       case HAMMER2_BREF_TYPE_INODE:
+               /*
+                * Copy data from bp to embedded buffer, do not retain the
+                * device buffer.
+                */
+               bcopy(bdata, &chain->u.ip->ip_data, chain->bytes);
+               chain->data = (void *)&chain->u.ip->ip_data;
+               bqrelse(chain->bp);
+               chain->bp = NULL;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+       case HAMMER2_BREF_TYPE_DATA:
+       default:
+               /*
+                * Point data at the device buffer and leave bp intact.
+                */
+               chain->data = (void *)bdata;
+               break;
+       }
+       return (0);
+}
+
+/*
+ * Release one lock on a chain element and drop the reference that was
+ * acquired along with it.
+ *
+ * A recursive lock is simply unwound.  On the final release any device
+ * buffer still attached to the chain (chain->bp) is disposed of
+ * according to the DIRTYBP and IOFLUSH flags, both of which are cleared,
+ * and the data pointer into that buffer is cleared.  Embedded data
+ * (e.g. an inode), which has no bp, is left alone.
+ */
+void
+hammer2_chain_unlock(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       struct buf *bp;
+       long *counterp;
+       int ioflush;
+       int dirty;
+
+       /*
+        * Not the last lock, just undo one level of recursion.
+        */
+       if (lockcountnb(&chain->lk) > 1) {
+               KKASSERT(chain->refs > 1);
+               atomic_add_int(&chain->refs, -1);
+               lockmgr(&chain->lk, LK_RELEASE);
+               return;
+       }
+
+       /*
+        * No device buffer means the data is either embedded or was never
+        * resolved.  Do NOT null-out pointers to embedded data.
+        */
+       bp = chain->bp;
+       if (bp == NULL) {
+               lockmgr(&chain->lk, LK_RELEASE);
+               hammer2_chain_drop(hmp, chain);
+               return;
+       }
+
+       /*
+        * Statistics.  Only dirty buffers are counted.  The ioa counters
+        * cover buffers written with IOFLUSH set, the iod counters cover
+        * the delayed-write case.
+        */
+       if (chain->flags & HAMMER2_CHAIN_DIRTYBP) {
+               ioflush = (chain->flags & HAMMER2_CHAIN_IOFLUSH) != 0;
+               switch(chain->bref.type) {
+               case HAMMER2_BREF_TYPE_DATA:
+                       counterp = ioflush ? &hammer2_ioa_file_write :
+                                            &hammer2_iod_file_write;
+                       break;
+               case HAMMER2_BREF_TYPE_INODE:
+                       counterp = ioflush ? &hammer2_ioa_meta_write :
+                                            &hammer2_iod_meta_write;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+                       counterp = ioflush ? &hammer2_ioa_indr_write :
+                                            &hammer2_iod_indr_write;
+                       break;
+               default:
+                       counterp = ioflush ? &hammer2_ioa_volu_write :
+                                            &hammer2_iod_volu_write;
+                       break;
+               }
+               ++*counterp;
+       }
+
+       /*
+        * A device buffer used for file data is destroyed on release to
+        * avoid aliases (XXX what about the underlying VM pages?).
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_DATA)
+               bp->b_flags |= B_RELBUF;
+       chain->data = NULL;
+
+       /*
+        * Dispose of the buffer:
+        *
+        *      dirty + IOFLUSH         async write, buffer released
+        *      dirty                   delayed write, clusterable
+        *      clean + IOFLUSH         buffer released
+        *      clean                   buffer requeued
+        */
+       dirty = (chain->flags & HAMMER2_CHAIN_DIRTYBP) != 0;
+       if (dirty)
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+
+       if (chain->flags & HAMMER2_CHAIN_IOFLUSH) {
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_IOFLUSH);
+               bp->b_flags |= B_RELBUF;
+               if (dirty)
+                       cluster_awrite(bp);
+               else
+                       brelse(bp);
+       } else if (dirty) {
+               bp->b_flags |= B_CLUSTEROK;
+               bdwrite(bp);
+       } else {
+               /* bp might still be dirty */
+               bqrelse(bp);
+       }
+
+       chain->bp = NULL;
+       lockmgr(&chain->lk, LK_RELEASE);
+       hammer2_chain_drop(hmp, chain);
+}
+
+/*
+ * Resize the chain's physical storage allocation to (1 << nradix) bytes.
+ * New storage is always allocated from the freemap unless the size does
+ * not change, in which case nothing is done at all.
+ *
+ * Must be passed a locked chain.  The chain is flagged MODIFIED (gaining
+ * a ref if it was not already) and its parents are flagged SUBMODIFIED.
+ *
+ * If you want the resize to copy the data you should lock the chain with
+ * RESOLVE_MAYBE or RESOLVE_ALWAYS, otherwise the resize operation will
+ * not copy the data.  When growing, the extension is zero-filled.
+ *
+ * This function is mostly used with DATA blocks locked RESOLVE_NEVER in
+ * order to avoid instantiating a device buffer that conflicts with the
+ * vnode data buffer.
+ *
+ * XXX flags currently ignored, uses chain->bp to detect data/no-data.
+ */
+void
+hammer2_chain_resize(hammer2_mount_t *hmp, hammer2_chain_t *chain,
+                    int nradix, int flags)
+{
+       struct buf *nbp;
+       hammer2_off_t pbase;
+       size_t obytes;
+       size_t nbytes;
+       size_t bbytes;
+       size_t ncopy;
+       int boff;
+       char *bdata;
+       int error;
+
+       /*
+        * Only data and indirect blocks can be resized for now
+        */
+       KKASSERT(chain != &hmp->vchain);
+       KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_DATA ||
+                chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT);
+
+       /*
+        * Already the requested size?
+        */
+       obytes = chain->bytes;
+       nbytes = 1U << nradix;
+       if (nbytes == obytes)
+               return;
+
+       /*
+        * The MODIFIED flag carries a chain ref which prevents
+        * destruction.  Both modified flags share the same ref.
+        */
+       if (!(chain->flags & HAMMER2_CHAIN_MODIFIED)) {
+               atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+               hammer2_chain_ref(hmp, chain);
+       }
+
+       /*
+        * The block is relocated even when shrinking because different
+        * block sizes may be in different regions.
+        */
+       chain->bref.data_off = hammer2_freemap_alloc(hmp, chain->bref.type,
+                                                    nbytes);
+       chain->bytes = nbytes;
+
+       /*
+        * Locate the new storage.  The device buffer may be larger than
+        * the allocation size.
+        */
+       bbytes = chain->bytes;
+       if (bbytes < HAMMER2_MINIOSIZE)
+               bbytes = HAMMER2_MINIOSIZE;
+       pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
+       boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+
+       /*
+        * Data is only carried over if it was resolved, otherwise the
+        * caller is responsible.
+        */
+       if (chain->bp) {
+               KKASSERT(chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT ||
+                        chain->bref.type == HAMMER2_BREF_TYPE_DATA);
+               KKASSERT(chain != &hmp->vchain);        /* safety */
+
+               /*
+                * getblk() avoids the read but can only be used if the
+                * physical block size matches the request.
+                */
+               error = 0;
+               if (nbytes == bbytes)
+                       nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+               else
+                       error = bread(hmp->devvp, pbase, bbytes, &nbp);
+               KKASSERT(error == 0);
+               bdata = (char *)nbp->b_data + boff;
+
+               /*
+                * Copy whatever fits, zero-fill the extension if any.
+                */
+               ncopy = (nbytes < obytes) ? nbytes : obytes;
+               bcopy(chain->data, bdata, ncopy);
+               if (nbytes > obytes)
+                       bzero(bdata + obytes, nbytes - obytes);
+
+               /*
+                * Swap buffers, the old one is thrown away.
+                *
+                * NOTE: The INITIAL state of the chain is left intact.
+                *
+                * NOTE: Because of the reallocation we have to set DIRTYBP
+                *       if INITIAL is not set.
+                */
+               chain->bp->b_flags |= B_RELBUF;
+               brelse(chain->bp);
+               chain->bp = nbp;
+               chain->data = (void *)bdata;
+               if (!(chain->flags & HAMMER2_CHAIN_INITIAL))
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+       }
+       hammer2_chain_parent_setsubmod(hmp, chain);
+}
+
+/*
+ * Convert a locked chain that was retrieved read-only to read-write.
+ *
+ * If not already marked modified a new physical block will be allocated
+ * and assigned to the bref.
+ *
+ * Non-data blocks - The chain should be locked to at least the RESOLVE_MAYBE
+ *                  level or the COW operation will not work.
+ *
+ * Data blocks    - The chain is usually locked RESOLVE_NEVER so as not to
+ *                  run the data through the device buffers.
+ *
+ * flags:
+ *
+ * HAMMER2_MODIFY_OPTDATA - Do not instantiate a device buffer if the chain
+ *                         does not currently have one (chain->bp == NULL).
+ *
+ * HAMMER2_MODIFY_NOSUB   - Do not propagate SUBMODIFIED to the parents.
+ */
+void
+hammer2_chain_modify(hammer2_mount_t *hmp, hammer2_chain_t *chain, int flags)
+{
+       struct buf *nbp;
+       int error;
+       hammer2_off_t pbase;
+       size_t bbytes;
+       size_t boff;
+       void *bdata;
+
+       /*
+        * If the chain is already marked MODIFIED no new block has to be
+        * allocated and we can usually just return.  The exception is a
+        * chain without a device buffer when the caller did not specify
+        * OPTDATA, in which case we still have to run the buffer
+        * instantiation code at skip1.
+        */
+       if (chain->flags & HAMMER2_CHAIN_MODIFIED) {
+               if ((flags & HAMMER2_MODIFY_OPTDATA) == 0 &&
+                   chain->bp == NULL) {
+                       goto skip1;
+               }
+               return;
+       }
+
+       /*
+        * Set MODIFIED and add a chain ref to prevent destruction.  Both
+        * modified flags share the same ref.
+        */
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MODIFIED);
+       hammer2_chain_ref(hmp, chain);
+
+       /*
+        * We must allocate the copy-on-write block.
+        *
+        * If the data is embedded no other action is required.
+        *
+        * If the data is not embedded we acquire and clear the
+        * new block.  If chain->data is not NULL we then do the
+        * copy-on-write.  chain->data will then be repointed to the new
+        * buffer and the old buffer will be released.
+        *
+        * For newly created elements with no prior allocation we go
+        * through the copy-on-write steps except without the copying part.
+        *
+        * The volume chain is the only chain which does not get a new
+        * block assigned here.
+        */
+       if (chain != &hmp->vchain) {
+               if ((hammer2_debug & 0x0001) &&
+                   (chain->bref.data_off & HAMMER2_OFF_MASK)) {
+                       kprintf("Replace %d\n", chain->bytes);
+               }
+               chain->bref.data_off =
+                       hammer2_freemap_alloc(hmp, chain->bref.type,
+                                             chain->bytes);
+               /* XXX failed allocation */
+       }
+
+       /*
+        * If data instantiation is optional and the chain has no current
+        * data association (typical for DATA and newly-created INDIRECT
+        * elements), don't instantiate the buffer now.
+        */
+       if ((flags & HAMMER2_MODIFY_OPTDATA) && chain->bp == NULL)
+               goto skip2;
+
+skip1:
+       /*
+        * Setting the DIRTYBP flag will cause the buffer to be dirtied or
+        * written-out on unlock.  This bit is independent of the MODIFIED
+        * bit because the chain may still need meta-data adjustments done
+        * by virtue of MODIFIED for its parent, and the buffer can be
+        * flushed out (possibly multiple times) by the OS before that.
+        *
+        * Clearing the INITIAL flag (for indirect blocks) indicates that
+        * a zero-fill buffer has been instantiated.
+        */
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DIRTYBP);
+       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_INITIAL);
+
+       /*
+        * We currently should never instantiate a device buffer for a
+        * data chain.  (This makes the DATA case in the switch below
+        * unreachable for now.)
+        */
+       KKASSERT(chain->bref.type != HAMMER2_BREF_TYPE_DATA);
+
+       /*
+        * Execute COW operation
+        */
+       switch(chain->bref.type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+       case HAMMER2_BREF_TYPE_INODE:
+               /*
+                * The data is embedded, no copy-on-write operation is
+                * needed.
+                */
+               KKASSERT(chain->bp == NULL);
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               /*
+                * Perform the copy-on-write operation
+                */
+               KKASSERT(chain != &hmp->vchain);        /* safety */
+               /*
+                * The device buffer may be larger than the allocation size.
+                * pbase is the device offset of the buffer and boff the
+                * offset of the element within it.
+                */
+               if ((bbytes = chain->bytes) < HAMMER2_MINIOSIZE)
+                       bbytes = HAMMER2_MINIOSIZE;
+               pbase = chain->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
+               boff = chain->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+
+               /*
+                * The getblk() optimization can only be used if the
+                * physical block size matches the request.  Otherwise
+                * the buffer is read, a read error is fatal.
+                */
+               if (chain->bytes == bbytes) {
+                       nbp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+                       error = 0;
+               } else {
+                       error = bread(hmp->devvp, pbase, bbytes, &nbp);
+                       KKASSERT(error == 0);
+               }
+               bdata = (char *)nbp->b_data + boff;
+
+               /*
+                * Copy or zero-fill on write depending on whether
+                * chain->data exists or not.
+                */
+               if (chain->data) {
+                       bcopy(chain->data, bdata, chain->bytes);
+                       KKASSERT(chain->bp != NULL);
+               } else {
+                       bzero(bdata, chain->bytes);
+               }
+               /*
+                * Throw away the old buffer, if any, and switch the chain
+                * over to the new one.
+                */
+               if (chain->bp) {
+                       chain->bp->b_flags |= B_RELBUF;
+                       brelse(chain->bp);
+               }
+               chain->bp = nbp;
+               chain->data = bdata;
+               break;
+       default:
+               panic("hammer2_chain_modify: illegal non-embedded type %d",
+                     chain->bref.type);
+               break;
+
+       }
+skip2:
+       /*
+        * Flag the parents so a flush can find this chain, unless the
+        * caller asked us not to.
+        */
+       if ((flags & HAMMER2_MODIFY_NOSUB) == 0)
+               hammer2_chain_parent_setsubmod(hmp, chain);
+}
+
+/*
+ * Mark the volume as having been modified.  This short-cut version
+ * does not have to lock the volume's chain, which allows the ioctl
+ * code to make adjustments to connections without deadlocking.
+ *
+ * The flag set on hmp->vchain is MODIFIED_AUX rather than MODIFIED, and
+ * it is set while holding the voldata lock.  No chain ref is added here.
+ */
+void
+hammer2_modify_volume(hammer2_mount_t *hmp)
+{
+       hammer2_voldata_lock(hmp);
+       atomic_set_int(&hmp->vchain.flags, HAMMER2_CHAIN_MODIFIED_AUX);
+       hammer2_voldata_unlock(hmp);
+}
+
+/*
+ * Look up the in-memory chain at (parent, index) in the parent's splay
+ * tree.  The parent must be locked.  Returns the chain, or NULL if no
+ * in-memory chain is present.  The chain is returned as-is, it is
+ * neither referenced nor locked by this function.
+ *
+ * NOTE: A chain on-media might exist for this index when NULL is returned.
+ */
+hammer2_chain_t *
+hammer2_chain_find(hammer2_mount_t *hmp, hammer2_chain_t *parent, int index)
+{
+       hammer2_chain_t key;
+
+       /* only the index field is used as the search key */
+       key.index = index;
+       return (SPLAY_FIND(hammer2_chain_splay, &parent->shead, &key));
+}
+
+/*
+ * Return the chain for (parent, index), instantiating it from the
+ * parent's blockref array when it is not already cached in-memory.
+ *
+ * Caller must lock the parent on call.  The returned child is locked
+ * unless HAMMER2_LOOKUP_NOLOCK is specified, in which case it is only
+ * referenced.  HAMMER2_LOOKUP_NODATA or HAMMER2_LOOKUP_NOLOCK selects
+ * HAMMER2_RESOLVE_NEVER, otherwise HAMMER2_RESOLVE_MAYBE is used.
+ *
+ * This function always succeeds: it panics if the parent is still in
+ * the INITIAL state or if the blockref at (index) is empty.
+ */
+hammer2_chain_t *
+hammer2_chain_get(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                 int index, int flags)
+{
+       hammer2_blockref_t *bref;
+       hammer2_chain_t *chain;
+       hammer2_chain_t *scan;
+       int how;
+
+       /*
+        * Figure out how to lock.  MAYBE can be used to optimize
+        * the initial-create state for indirect blocks.
+        */
+       if (flags & (HAMMER2_LOOKUP_NODATA | HAMMER2_LOOKUP_NOLOCK))
+               how = HAMMER2_RESOLVE_NEVER;
+       else
+               how = HAMMER2_RESOLVE_MAYBE;
+
+       /*
+        * First see if we have a (possibly modified) chain element cached
+        * for this (parent, index).  Acquire the data if necessary.
+        *
+        * If chain->data is non-NULL the chain should already be marked
+        * modified.
+        */
+       chain = hammer2_chain_find(hmp, parent, index);
+       if (chain) {
+               if (flags & HAMMER2_LOOKUP_NOLOCK)
+                       hammer2_chain_ref(hmp, chain);
+               else
+                       hammer2_chain_lock(hmp, chain, how);
+               return(chain);
+       }
+
+       /*
+        * the get function must always succeed, panic if there's no
+        * data to index.
+        */
+       if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+               panic("hammer2_chain_get: Missing bref(1)");
+               /* NOT REACHED */
+       }
+
+       /*
+        * Otherwise lookup the bref and issue I/O (switch on the parent)
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               KKASSERT(parent->data != NULL);
+               KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
+               bref = &parent->data->ipdata.u.blockset.blockref[index];
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               KKASSERT(parent->data != NULL);
+               KKASSERT(index >= 0 &&
+                        index < parent->bytes / sizeof(hammer2_blockref_t));
+               bref = &parent->data->npdata.blockref[index];
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               KKASSERT(index >= 0 && index < HAMMER2_SET_COUNT);
+               bref = &hmp->voldata.sroot_blockset.blockref[index];
+               break;
+       default:
+               bref = NULL;
+               panic("hammer2_chain_get: unrecognized blockref type: %d",
+                     parent->bref.type);
+       }
+       if (bref->type == 0) {
+               panic("hammer2_chain_get: Missing bref(2)");
+               /* NOT REACHED */
+       }
+
+       /*
+        * Allocate a chain structure representing the existing media
+        * entry.
+        *
+        * The locking operation we do later will issue I/O to read it.
+        */
+       chain = hammer2_chain_alloc(hmp, bref);
+
+       /*
+        * Link the chain into its parent.  Caller is expected to hold an
+        * exclusive lock on the parent.
+        */
+       chain->parent = parent;
+       chain->index = index;
+       if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
+               panic("hammer2_chain_link: collision");
+       KKASSERT(parent->refs > 0);
+       atomic_add_int(&parent->refs, 1);       /* for splay entry */
+
+       /*
+        * Additional linkage for inodes.  Walk up through any indirect
+        * blocks to find the parent directory, using a scratch pointer
+        * so the caller-supplied parent is left intact.
+        */
+       if (bref->type == HAMMER2_BREF_TYPE_INODE) {
+               scan = parent;
+               while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
+                       scan = scan->parent;
+               if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
+                       chain->u.ip->pip = scan->u.ip;
+                       chain->u.ip->pmp = scan->u.ip->pmp;
+               }
+       }
+
+       /*
+        * Our new chain structure has already been referenced and locked
+        * but the lock code handles the I/O so call it to resolve the data.
+        * Then release one of our two exclusive locks.
+        *
+        * If NOLOCK is set the release will release the one-and-only lock.
+        */
+       if ((flags & HAMMER2_LOOKUP_NOLOCK) == 0) {
+               hammer2_chain_lock(hmp, chain, how);    /* recursive lock */
+               hammer2_chain_drop(hmp, chain);         /* excess ref */
+       }
+       lockmgr(&chain->lk, LK_RELEASE);                /* from alloc */
+
+       return (chain);
+}
+
+/*
+ * Locate any key between key_beg and key_end inclusive.  (*parentp)
+ * typically points to an inode but can also point to a related indirect
+ * block and this function will recurse upwards and find the inode again.
+ *
+ * WARNING!  THIS DOES NOT RETURN KEYS IN LOGICAL KEY ORDER!  ANY KEY
+ *          WITHIN THE RANGE CAN BE RETURNED.  HOWEVER, AN ITERATION
+ *          WHICH PICKS UP WHERE WE LEFT OFF WILL CONTINUE THE SCAN.
+ *
+ * (*parentp) must be exclusively locked and referenced and can be an inode
+ * or an existing indirect block within the inode.
+ *
+ * On return (*parentp) will be modified to point at the deepest parent chain
+ * element encountered during the search, as a helper for an insertion or
+ * deletion.   The new (*parentp) will be locked and referenced and the old
+ * will be unlocked and dereferenced (no change if they are both the same).
+ *
+ * The matching chain will be returned exclusively locked and referenced,
+ * except when HAMMER2_LOOKUP_NOLOCK is passed in flags, in which case the
+ * chain is returned referenced but not locked.
+ *
+ * NULL is returned if no match was found, but (*parentp) will still
+ * potentially be adjusted.
+ *
+ * This function will also recurse up the chain if the key is not within the
+ * current parent's range.  (*parentp) can never be set to NULL.  An iteration
+ * can simply allow (*parentp) to float inside the loop.
+ */
+hammer2_chain_t *
+hammer2_chain_lookup(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
+                    hammer2_key_t key_beg, hammer2_key_t key_end,
+                    int flags)
+{
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
+       hammer2_chain_t *tmp;
+       hammer2_blockref_t *base;
+       hammer2_blockref_t *bref;
+       hammer2_key_t scan_beg;
+       hammer2_key_t scan_end;
+       int count = 0;
+       int i;
+
+       /*
+        * Recurse (*parentp) upward if necessary until the parent completely
+        * encloses the key range or we hit the inode.
+        *
+        * The old parent is ref'd before it is unlocked and that ref is
+        * only dropped after the new parent has been locked.
+        */
+       parent = *parentp;
+       while (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
+               scan_beg = parent->bref.key;
+               scan_end = scan_beg +
+                          ((hammer2_key_t)1 << parent->bref.keybits) - 1;
+               if (key_beg >= scan_beg && key_end <= scan_end)
+                       break;
+               hammer2_chain_ref(hmp, parent);         /* ref old parent */
+               hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
+               parent = parent->parent;
+                                                       /* lock new parent */
+               hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
+               hammer2_chain_drop(hmp, *parentp);      /* drop old parent */
+               *parentp = parent;                      /* new parent */
+       }
+
+again:
+       /*
+        * Locate the blockref array.  Currently we do a fully associative
+        * search through the array.
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               /*
+                * Special shortcut for embedded data returns the inode
+                * itself.  Callers must detect this condition and access
+                * the embedded data (the strategy code does this for us).
+                *
+                * This is only applicable to regular files and softlinks.
+                *
+                * The parent is ref'd (NOLOCK) or locked an additional
+                * time here so the returned chain can be released
+                * independently of (*parentp).
+                */
+               if (parent->data->ipdata.op_flags & HAMMER2_OPFLAG_DIRECTDATA) {
+                       if (flags & HAMMER2_LOOKUP_NOLOCK)
+                               hammer2_chain_ref(hmp, parent);
+                       else
+                               hammer2_chain_lock(hmp, parent,
+                                                  HAMMER2_RESOLVE_ALWAYS);
+                       return (parent);
+               }
+               base = &parent->data->ipdata.u.blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               /*
+                * Optimize indirect blocks in the INITIAL state to avoid
+                * I/O.  With base set to NULL only in-memory chains are
+                * considered by the scan below.
+                */
+               if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+                       base = NULL;
+               } else {
+                       if (parent->data == NULL)
+                               panic("parent->data is NULL");
+                       base = &parent->data->npdata.blockref[0];
+               }
+               count = parent->bytes / sizeof(hammer2_blockref_t);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               base = &hmp->voldata.sroot_blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       default:
+               panic("hammer2_chain_lookup: unrecognized blockref type: %d",
+                     parent->bref.type);
+               base = NULL;    /* safety */
+               count = 0;      /* safety */
+       }
+
+       /*
+        * If the element and key overlap we use the element.
+        *
+        * An in-memory chain found for an index takes precedence over
+        * the blockref stored in the parent's array for that index.
+        * Empty slots (no in-memory chain and no media bref) are skipped.
+        */
+       bref = NULL;
+       for (i = 0; i < count; ++i) {
+               tmp = hammer2_chain_find(hmp, parent, i);
+               if (tmp) {
+                       bref = &tmp->bref;
+                       KKASSERT(bref->type != 0);
+               } else if (base == NULL || base[i].type == 0) {
+                       continue;
+               } else {
+                       bref = &base[i];
+               }
+               scan_beg = bref->key;
+               scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
+               if (key_beg <= scan_end && key_end >= scan_beg)
+                       break;
+       }
+       if (i == count) {
+               if (key_beg == key_end)
+                       return (NULL);
+               return (hammer2_chain_next(hmp, parentp, NULL,
+                                          key_beg, key_end, flags));
+       }
+
+       /*
+        * Acquire the new chain element.  If the chain element is an
+        * indirect block we must search recursively.
+        *
+        * NOTE: hammer2_chain_get() panics rather than failing, so the
+        *       NULL check below is purely defensive.
+        */
+       chain = hammer2_chain_get(hmp, parent, i, flags);
+       if (chain == NULL)
+               return (NULL);
+
+       /*
+        * If the chain element is an indirect block it becomes the new
+        * parent and we loop on it.
+        *
+        * The parent always has to be locked with at least RESOLVE_MAYBE,
+        * so it might need a fixup if the caller passed incompatible flags.
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
+               hammer2_chain_unlock(hmp, parent);
+               *parentp = parent = chain;
+               if (flags & HAMMER2_LOOKUP_NOLOCK) {
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_drop(hmp, chain); /* excess ref */
+               } else if (flags & HAMMER2_LOOKUP_NODATA) {
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_unlock(hmp, chain);
+               }
+               goto again;
+       }
+
+       /*
+        * All done, return chain
+        */
+       return (chain);
+}
+
+/*
+ * After having issued a lookup we can iterate all matching keys.
+ *
+ * If chain is non-NULL we continue the iteration from just after its index.
+ * The passed-in chain is released by this function (dropped if
+ * HAMMER2_LOOKUP_NOLOCK is set, otherwise unlocked).
+ *
+ * If chain is NULL we assume the parent was exhausted and continue the
+ * iteration at the next parent.
+ *
+ * (*parentp) must be locked on entry.  It may be replaced by a different
+ * locked parent as the iteration moves up to the parent's parent or down
+ * into an indirect block; the previous parent is unlocked when that
+ * happens.  chain's lock status must match flags.
+ *
+ * Returns the next chain overlapping [key_beg, key_end], or NULL when the
+ * iteration is finished.
+ */
+hammer2_chain_t *
+hammer2_chain_next(hammer2_mount_t *hmp, hammer2_chain_t **parentp,
+                  hammer2_chain_t *chain,
+                  hammer2_key_t key_beg, hammer2_key_t key_end,
+                  int flags)
+{
+       hammer2_chain_t *parent;
+       hammer2_chain_t *tmp;
+       hammer2_blockref_t *base;
+       hammer2_blockref_t *bref;
+       hammer2_key_t scan_beg;
+       hammer2_key_t scan_end;
+       int i;
+       int count;
+
+       parent = *parentp;
+
+again:
+       /*
+        * Calculate the next index and recalculate the parent if necessary.
+        */
+       if (chain) {
+               /*
+                * Continue iteration within current parent.  If not NULL
+                * the passed-in chain may or may not be locked, based on
+                * the LOOKUP_NOLOCK flag (passed in as returned from lookup
+                * or a prior next).
+                */
+               i = chain->index + 1;
+               if (flags & HAMMER2_LOOKUP_NOLOCK)
+                       hammer2_chain_drop(hmp, chain);
+               else
+                       hammer2_chain_unlock(hmp, chain);
+
+               /*
+                * Any scan where the lookup returned degenerate data embedded
+                * in the inode has an invalid index and must terminate.
+                */
+               if (chain == parent)
+                       return(NULL);
+               chain = NULL;
+       } else if (parent->bref.type != HAMMER2_BREF_TYPE_INDIRECT) {
+               /*
+                * We reached the end of the iteration.
+                */
+               return (NULL);
+       } else {
+               /*
+                * Continue iteration with next parent unless the current
+                * parent covers the range.
+                *
+                * The new parent is ref'd before the old parent is
+                * unlocked; that extra ref is dropped once the new
+                * parent has been locked.
+                */
+               hammer2_chain_t *nparent;
+
+               scan_beg = parent->bref.key;
+               scan_end = scan_beg +
+                           ((hammer2_key_t)1 << parent->bref.keybits) - 1;
+               if (key_beg >= scan_beg && key_end <= scan_end)
+                       return (NULL);
+
+               i = parent->index + 1;
+               nparent = parent->parent;
+               hammer2_chain_ref(hmp, nparent);        /* ref new parent */
+               hammer2_chain_unlock(hmp, parent);      /* unlock old parent */
+                                                       /* lock new parent */
+               hammer2_chain_lock(hmp, nparent, HAMMER2_RESOLVE_MAYBE);
+               hammer2_chain_drop(hmp, nparent);       /* drop excess ref */
+               *parentp = parent = nparent;
+       }
+
+again2:
+       /*
+        * Locate the blockref array.  Currently we do a fully associative
+        * search through the array.
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               base = &parent->data->ipdata.u.blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+                       base = NULL;
+               } else {
+                       KKASSERT(parent->data != NULL);
+                       base = &parent->data->npdata.blockref[0];
+               }
+               count = parent->bytes / sizeof(hammer2_blockref_t);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               base = &hmp->voldata.sroot_blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       default:
+               panic("hammer2_chain_next: unrecognized blockref type: %d",
+                     parent->bref.type);
+               base = NULL;    /* safety */
+               count = 0;      /* safety */
+               break;
+       }
+       KKASSERT(i <= count);
+
+       /*
+        * Look for the next element overlapping the key range, starting
+        * at index i.  An in-memory chain found for an index takes
+        * precedence over the blockref in the parent's array; slots with
+        * neither are skipped.
+        */
+       bref = NULL;
+       while (i < count) {
+               tmp = hammer2_chain_find(hmp, parent, i);
+               if (tmp) {
+                       bref = &tmp->bref;
+               } else if (base == NULL || base[i].type == 0) {
+                       ++i;
+                       continue;
+               } else {
+                       bref = &base[i];
+               }
+               scan_beg = bref->key;
+               scan_end = scan_beg + ((hammer2_key_t)1 << bref->keybits) - 1;
+               if (key_beg <= scan_end && key_end >= scan_beg)
+                       break;
+               ++i;
+       }
+
+       /*
+        * If we couldn't find a match recurse up a parent to continue the
+        * search.  chain is NULL at this point, so the code at the again
+        * label either terminates the iteration or moves to the parent's
+        * parent.
+        */
+       if (i == count)
+               goto again;
+
+       /*
+        * Acquire the new chain element.  If the chain element is an
+        * indirect block we must search recursively.
+        */
+       chain = hammer2_chain_get(hmp, parent, i, flags);
+       if (chain == NULL)
+               return (NULL);
+
+       /*
+        * If the chain element is an indirect block it becomes the new
+        * parent and we loop on it.
+        *
+        * The parent always has to be locked with at least RESOLVE_MAYBE,
+        * so it might need a fixup if the caller passed incompatible flags.
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
+               hammer2_chain_unlock(hmp, parent);
+               *parentp = parent = chain;
+               chain = NULL;
+               if (flags & HAMMER2_LOOKUP_NOLOCK) {
+                       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_drop(hmp, parent);        /* excess ref */
+               } else if (flags & HAMMER2_LOOKUP_NODATA) {
+                       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_unlock(hmp, parent);
+               }
+               i = 0;
+               goto again2;
+       }
+
+       /*
+        * All done, return chain
+        */
+       return (chain);
+}
+
+/*
+ * Create and return a new hammer2 system memory structure of the specified
+ * key, type and size and insert it RELATIVE TO (PARENT).
+ *
+ * (parent) is typically either an inode or an indirect block, acquired
+ * as a side effect of issuing a prior failed lookup.  parent must be
+ * locked and held.  Do not pass the inode chain to this function
+ * unless that is the chain returned by the failed lookup.
+ *
+ * If (chain) is NULL a new chain is allocated from key/keybits/type/bytes.
+ * If (chain) is non-NULL it is an existing, currently unconnected chain
+ * which is (re)connected under the parent with its key/keybits updated;
+ * type and bytes are not used in that case.
+ *
+ * Non-indirect types will automatically allocate indirect blocks as required
+ * if the new item does not fit in the current (parent).
+ *
+ * Indirect types will move a portion of the existing blockref array in
+ * (parent) into the new indirect type and then use one of the free slots
+ * to emplace the new indirect type.
+ *
+ * A new locked, referenced chain element is returned of the specified type.
+ * NULL is returned if an indirect block was needed and could not be
+ * created.  The element may or may not have a data area associated with it:
+ *
+ *     VOLUME          not allowed here
+ *     INODE           embedded data area will be set-up
+ *     INDIRECT        not allowed here
+ *     DATA            no data area will be set-up (caller is expected
+ *                     to have logical buffers, we don't want to alias
+ *                     the data onto device buffers!).
+ */
+hammer2_chain_t *
+hammer2_chain_create(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                    hammer2_chain_t *chain,
+                    hammer2_key_t key, int keybits, int type, size_t bytes)
+{
+       hammer2_blockref_t dummy;
+       hammer2_blockref_t *base;
+       int unlock_parent = 0;
+       int allocated = 0;
+       int count;
+       int i;
+
+       if (chain == NULL) {
+               /*
+                * First allocate media space and construct the dummy bref,
+                * then allocate the in-memory chain structure.
+                */
+               bzero(&dummy, sizeof(dummy));
+               dummy.type = type;
+               dummy.key = key;
+               dummy.keybits = keybits;
+               dummy.data_off = hammer2_bytes_to_radix(bytes);
+               chain = hammer2_chain_alloc(hmp, &dummy);
+               allocated = 1;
+
+               /*
+                * We do NOT set INITIAL here (yet).  INITIAL is only
+                * used for indirect blocks.
+                *
+                * Recalculate bytes to reflect the actual media block
+                * allocation.
+                */
+               bytes = (hammer2_off_t)1 <<
+                       (int)(chain->bref.data_off & HAMMER2_OFF_MASK_RADIX);
+               chain->bytes = bytes;
+
+               switch(type) {
+               case HAMMER2_BREF_TYPE_VOLUME:
+                       panic("hammer2_chain_create: called with volume type");
+                       break;
+               case HAMMER2_BREF_TYPE_INODE:
+                       KKASSERT(bytes == HAMMER2_INODE_BYTES);
+                       chain->data = (void *)&chain->u.ip->ip_data;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+                       panic("hammer2_chain_create: cannot be used to "
+                             "create indirect block");
+                       break;
+               case HAMMER2_BREF_TYPE_DATA:
+               default:
+                       /* leave chain->data NULL */
+                       KKASSERT(chain->data == NULL);
+                       break;
+               }
+       } else {
+               /*
+                * Potentially update the chain's key/keybits.
+                */
+               chain->bref.key = key;
+               chain->bref.keybits = keybits;
+       }
+
+again:
+       /*
+        * Locate the parent's blockref array.  base is NULL for an
+        * indirect block still in the INITIAL state, in which case only
+        * in-memory chains can occupy slots.
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               KKASSERT(parent->data != NULL);
+               base = &parent->data->ipdata.u.blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+                       base = NULL;
+               } else {
+                       KKASSERT(parent->data != NULL);
+                       base = &parent->data->npdata.blockref[0];
+               }
+               count = parent->bytes / sizeof(hammer2_blockref_t);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               KKASSERT(parent->data != NULL);
+               base = &hmp->voldata.sroot_blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       default:
+               panic("hammer2_chain_create: unrecognized blockref type: %d",
+                     parent->bref.type);
+               base = NULL;    /* safety */
+               count = 0;      /* safety */
+               break;
+       }
+
+       /*
+        * Scan for an unallocated bref, also skipping any slots occupied
+        * by in-memory chain elements that may not yet have been updated
+        * in the parent's bref array.
+        */
+       for (i = 0; i < count; ++i) {
+               if ((base == NULL || base[i].type == 0) &&
+                   hammer2_chain_find(hmp, parent, i) == NULL) {
+                       break;
+               }
+       }
+
+       /*
+        * If no free blockref could be found we must create an indirect
+        * block and move a number of blockrefs into it.  With the parent
+        * locked we can safely lock each child in order to move it without
+        * causing a deadlock.
+        *
+        * This may return the new indirect block or the old parent depending
+        * on where the key falls.
+        */
+       if (i == count) {
+               hammer2_chain_t *nparent;
+
+               nparent = hammer2_chain_create_indirect(hmp, parent,
+                                                       key, keybits);
+               if (nparent == NULL) {
+                       if (allocated)
+                               hammer2_chain_free(hmp, chain);
+                       chain = NULL;
+                       goto done;
+               }
+               if (parent != nparent) {
+                       /*
+                        * Only parents we acquired ourselves (indirect
+                        * blocks returned above) are unlocked here; the
+                        * caller's original parent is left alone.
+                        */
+                       if (unlock_parent)
+                               hammer2_chain_unlock(hmp, parent);
+                       parent = nparent;
+                       unlock_parent = 1;
+               }
+               goto again;
+       }
+
+       /*
+        * Link the chain into its parent.
+        */
+       if (chain->parent != NULL)
+               panic("hammer2: hammer2_chain_create: chain already connected");
+       chain->parent = parent;
+       chain->index = i;
+       if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, chain))
+               panic("hammer2_chain_link: collision");
+       atomic_clear_int(&chain->flags, HAMMER2_CHAIN_DELETED);
+       KKASSERT(parent->refs > 0);
+       atomic_add_int(&parent->refs, 1);       /* for splay entry */
+
+       /*
+        * Additional linkage for inodes.  Walk up through any indirect
+        * blocks to find the parent directory.
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_INODE) {
+               hammer2_chain_t *scan = parent;
+               while (scan->bref.type == HAMMER2_BREF_TYPE_INDIRECT)
+                       scan = scan->parent;
+               if (scan->bref.type == HAMMER2_BREF_TYPE_INODE) {
+                       chain->u.ip->pip = scan->u.ip;
+                       chain->u.ip->pmp = scan->u.ip->pmp;
+               }
+       }
+
+       /*
+        * (allocated) indicates that this is a newly-created chain element
+        * rather than a renamed chain element.  In this situation we want
+        * to place the chain element in the MODIFIED state.
+        *
+        * The data area will be set up as follows:
+        *
+        *      VOLUME          not allowed here.
+        *
+        *      INODE           embedded data area will be set-up.
+        *
+        *      INDIRECT        not allowed here.
+        *
+        *      DATA            no data area will be set-up (caller is expected
+        *                      to have logical buffers, we don't want to alias
+        *                      the data onto device buffers!).
+        */
+       if (allocated) {
+               if (chain->bref.type == HAMMER2_BREF_TYPE_DATA) {
+                       hammer2_chain_modify(hmp, chain,
+                                            HAMMER2_MODIFY_OPTDATA);
+               } else if (chain->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
+                       /* not supported in this function */
+                       panic("hammer2_chain_create: bad type");
+               } else {
+                       hammer2_chain_modify(hmp, chain, 0);
+               }
+       } else {
+               /*
+                * When reconnecting inodes we have to call setsubmod()
+                * to ensure that its state propagates up the newly
+                * connected parent.
+                *
+                * We cannot depend on the chain being in a MODIFIED
+                * state, or it might already be in that state, so
+                * even if the parent calls hammer2_chain_modify()
+                * MOVED might not get set.  Thus we have to set it
+                * here, too.
+                */
+               if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+                       hammer2_chain_ref(hmp, chain);
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
+               }
+               hammer2_chain_parent_setsubmod(hmp, chain);
+       }
+
+done:
+       if (unlock_parent)
+               hammer2_chain_unlock(hmp, parent);
+       return (chain);
+}
+
+/*
+ * Create an indirect block that covers one or more of the elements in the
+ * current parent.  Either returns the existing parent with no locking or
+ * ref changes or returns the new indirect block locked and referenced,
+ * depending on what the specified key falls into.
+ *
+ * The key/keybits for the indirect block only need to follow these rules:
+ *
+ * (1) That all elements underneath it fit within its key space and
+ *
+ * (2) That all elements outside it are outside its key space.
+ *
+ * (3) When creating the new indirect block any elements in the current
+ *     parent that fit within the new indirect block's keyspace must be
+ *     moved into the new indirect block.
+ *
+ * (4) The keyspace chosen for the inserted indirect block CAN cover a wider
+ *     keyspace than the current parent, but lookup/iteration rules will
+ *     ensure (and must ensure) that rule (2) for all parents leading up
+ *     to the nearest inode or the root volume header is adhered to.  This
+ *     is accomplished by always recursing through matching keyspaces in
+ *     the hammer2_chain_lookup() and hammer2_chain_next() API.
+ *
+ * The current implementation calculates the current worst-case keyspace by
+ * iterating the current parent and then divides it into two halves, choosing
+ * whichever half has the most elements (not necessarily the half containing
+ * the requested key).
+ *
+ * We can also opt to use the half with the least number of elements.  This
+ * causes lower-numbered keys (aka logical file offsets) to recurse through
+ * fewer indirect blocks and higher-numbered keys to recurse through more.
+ * This also has the risk of not moving enough elements to the new indirect
+ * block and being forced to create several indirect blocks before the element
+ * can be inserted.
+ */
+static
+hammer2_chain_t *
+hammer2_chain_create_indirect(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                             hammer2_key_t create_key, int create_bits)
+{
+       hammer2_blockref_t *base;
+       hammer2_blockref_t *bref;
+       hammer2_chain_t *chain;
+       hammer2_chain_t *ichain;
+       hammer2_chain_t dummy;
+       hammer2_key_t key = create_key;
+       int keybits = create_bits;
+       int locount = 0;
+       int hicount = 0;
+       int count;
+       int nbytes;
+       int i;
+
+       /*
+        * Calculate the base blockref pointer or NULL if the chain
+        * is known to be empty.
+        */
+       hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
+       if (parent->flags & HAMMER2_CHAIN_INITIAL) {
+               base = NULL;
+
+               /*
+                * We still need to calculate the count for SPLAY lookups
+                */
+               switch(parent->bref.type) {
+               case HAMMER2_BREF_TYPE_INODE:
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+                       count = parent->bytes / sizeof(hammer2_blockref_t);
+                       break;
+               case HAMMER2_BREF_TYPE_VOLUME:
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               default:
+                       panic("hammer2_chain_create_indirect: "
+                             "unrecognized blockref type: %d",
+                             parent->bref.type);
+                       count = 0;
+                       break;
+               }
+       } else {
+               /*
+                * Locate a free blockref in the parent's array
+                */
+               switch(parent->bref.type) {
+               case HAMMER2_BREF_TYPE_INODE:
+                       base = &parent->data->ipdata.u.blockset.blockref[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               case HAMMER2_BREF_TYPE_INDIRECT:
+                       base = &parent->data->npdata.blockref[0];
+                       count = parent->bytes / sizeof(hammer2_blockref_t);
+                       break;
+               case HAMMER2_BREF_TYPE_VOLUME:
+                       base = &hmp->voldata.sroot_blockset.blockref[0];
+                       count = HAMMER2_SET_COUNT;
+                       break;
+               default:
+                       panic("hammer2_chain_create_indirect: "
+                             "unrecognized blockref type: %d",
+                             parent->bref.type);
+                       count = 0;
+                       break;
+               }
+       }
+
+       /*
+        * Scan for an unallocated bref, also skipping any slots occupied
+        * by in-memory chain elements that may not yet have been updated
+        * in the parent's bref array.
+        */
+       bzero(&dummy, sizeof(dummy));
+       for (i = 0; i < count; ++i) {
+               int nkeybits;
+
+               /*
+                * Optimize the case where the parent is still in its
+                * initially created state.
+                */
+               if (base == NULL || base[i].type == 0) {
+                       dummy.index = i;
+                       chain = SPLAY_FIND(hammer2_chain_splay,
+                                          &parent->shead, &dummy);
+                       if (chain == NULL)
+                               continue;
+                       bref = &chain->bref;
+               } else {
+                       bref = &base[i];
+               }
+
+               /*
+                * Expand our calculated key range (key, keybits) to fit
+                * the scanned key.  nkeybits represents the full range
+                * that we will later cut in half (two halves @ nkeybits - 1).
+                */
+               nkeybits = keybits;
+               if (nkeybits < bref->keybits)
+                       nkeybits = bref->keybits;
+               while ((~(((hammer2_key_t)1 << nkeybits) - 1) &
+                       (key ^ bref->key)) != 0) {
+                       ++nkeybits;
+               }
+
+               /*
+                * If the new key range is larger we have to determine
+                * which side of the new key range the existing keys fall
+                * under by checking the high bit, then collapsing the
+                * locount into the hicount or vice-versa.
+                */
+               if (keybits != nkeybits) {
+                       if (((hammer2_key_t)1 << (nkeybits - 1)) & key) {
+                               hicount += locount;
+                               locount = 0;
+                       } else {
+                               locount += hicount;
+                               hicount = 0;
+                       }
+                       keybits = nkeybits;
+               }
+
+               /*
+                * The newly scanned key will be in the lower half or the
+                * higher half of the (new) key range.
+                */
+               if (((hammer2_key_t)1 << (nkeybits - 1)) & bref->key)
+                       ++hicount;
+               else
+                       ++locount;
+       }
+
+       /*
+        * Adjust keybits to represent half of the full range calculated
+        * above.
+        */
+       --keybits;
+
+       /*
+        * Select whichever half contains the most elements.  Theoretically
+        * we can select either side as long as it contains at least one
+        * element (in order to ensure that a free slot is present to hold
+        * the indirect block).
+        */
+       key &= ~(((hammer2_key_t)1 << keybits) - 1);
+       if (hammer2_indirect_optimize) {
+               /*
+                * Insert node for least number of keys, this will arrange
+                * the first few blocks of a large file or the first few
+                * inodes in a directory with fewer indirect blocks when
+                * created linearly.
+                */
+               if (hicount < locount && hicount != 0)
+                       key |= (hammer2_key_t)1 << keybits;
+               else
+                       key &= ~(hammer2_key_t)1 << keybits;
+       } else {
+               /*
+                * Insert node for most number of keys, best for heavily
+                * fragmented files.
+                */
+               if (hicount > locount)
+                       key |= (hammer2_key_t)1 << keybits;
+               else
+                       key &= ~(hammer2_key_t)1 << keybits;
+       }
+
+       /*
+        * How big should our new indirect block be?  It has to be at least
+        * as large as its parent.
+        */
+       if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
+               nbytes = HAMMER2_IND_BYTES_MIN;
+       else
+               nbytes = HAMMER2_IND_BYTES_MAX;
+       if (nbytes < count * sizeof(hammer2_blockref_t))
+               nbytes = count * sizeof(hammer2_blockref_t);
+
+       /*
+        * Ok, create our new indirect block
+        */
+       dummy.bref.type = HAMMER2_BREF_TYPE_INDIRECT;
+       dummy.bref.key = key;
+       dummy.bref.keybits = keybits;
+       dummy.bref.data_off = hammer2_bytes_to_radix(nbytes);
+       ichain = hammer2_chain_alloc(hmp, &dummy.bref);
+       atomic_set_int(&ichain->flags, HAMMER2_CHAIN_INITIAL);
+
+       /*
+        * Iterate the original parent and move the matching brefs into
+        * the new indirect block.
+        */
+       for (i = 0; i < count; ++i) {
+               /*
+                * For keying purposes access the bref from the media or
+                * from our in-memory cache.  In cases where the in-memory
+                * cache overrides the media the keyrefs will be the same
+                * anyway so we can avoid checking the cache when the media
+                * has a key.
+                */
+               if (base == NULL || base[i].type == 0) {
+                       dummy.index = i;
+                       chain = SPLAY_FIND(hammer2_chain_splay,
+                                          &parent->shead, &dummy);
+                       if (chain == NULL) {
+                               /*
+                                * Select the index the indirect block is placed in
+                                */
+                               if (ichain->index < 0)
+                                       ichain->index = i;
+                               continue;
+                       }
+                       bref = &chain->bref;
+               } else {
+                       bref = &base[i];
+               }
+
+               /*
+                * Skip keys not in the chosen half (low or high), only bit
+                * (keybits - 1) needs to be compared but for safety we
+                * will compare all msb bits plus that bit again.
+                */
+               if ((~(((hammer2_key_t)1 << keybits) - 1) &
+                   (key ^ bref->key)) != 0) {
+                       continue;
+               }
+
+               /*
+                * This element is being moved, its slot is available
+                * for our indirect block.
+                */
+               if (ichain->index < 0)
+                       ichain->index = i;
+
+               /*
+                * Load the new indirect block by acquiring or allocating
+                * the related chain entries, then simply move it to the
+                * new parent (ichain).
+                *
+                * Flagging the new chain entry MOVED will cause a flush
+                * to synchronize its block into the new indirect block.
+                * The chain is unlocked after being moved but needs to
+                * retain a reference for the MOVED state
+                *
+                * We must still set SUBMODIFIED in the parent but we do
+                * that after the loop.
+                *
+                * XXX we really need a lock here but we don't need the
+                *     data.  NODATA feature needed.
+                */
+               chain = hammer2_chain_get(hmp, parent, i,
+                                         HAMMER2_LOOKUP_NODATA);
+               SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
+               if (SPLAY_INSERT(hammer2_chain_splay, &ichain->shead, chain))
+                       panic("hammer2_chain_create_indirect: collision");
+               chain->parent = ichain;
+               if (base)
+                       bzero(&base[i], sizeof(base[i]));
+               atomic_add_int(&parent->refs, -1);
+               atomic_add_int(&ichain->refs, 1);
+               if ((chain->flags & HAMMER2_CHAIN_MOVED) == 0) {
+                       hammer2_chain_ref(hmp, chain);
+                       atomic_set_int(&chain->flags, HAMMER2_CHAIN_MOVED);
+               }
+               hammer2_chain_unlock(hmp, chain);
+               KKASSERT(parent->refs > 0);
+               chain = NULL;
+       }
+
+       /*
+        * Insert the new indirect block into the parent now that we've
+        * cleared out some entries in the parent.  We calculated a good
+        * insertion index in the loop above (ichain->index).
+        */
+       KKASSERT(ichain->index >= 0);
+       if (SPLAY_INSERT(hammer2_chain_splay, &parent->shead, ichain))
+               panic("hammer2_chain_create_indirect: ichain insertion");
+       ichain->parent = parent;
+       atomic_add_int(&parent->refs, 1);
+
+       /*
+        * Mark the new indirect block modified after insertion, which
+        * will propagate up through parent all the way to the root and
+        * also allocate the physical block in ichain for our caller,
+        * and assign ichain->data to a pre-zero'd space (because there
+        * is not prior data to copy into it).
+        *
+        * We have to set SUBMODIFIED in ichain's flags manually so the
+        * flusher knows it has to recurse through it to get to all of
+        * our moved blocks, then call setsubmod() to set the bit
+        * recursively.
+        */
+       hammer2_chain_modify(hmp, ichain, HAMMER2_MODIFY_OPTDATA);
+       atomic_set_int(&ichain->flags, HAMMER2_CHAIN_SUBMODIFIED);
+       hammer2_chain_parent_setsubmod(hmp, ichain);
+
+       /*
+        * Figure out what to return.
+        */
+       if (create_bits >= keybits) {
+               /*
+                * Key being created is way outside the key range,
+                * return the original parent.
+                */
+               hammer2_chain_unlock(hmp, ichain);
+       } else if (~(((hammer2_key_t)1 << keybits) - 1) &
+                  (create_key ^ key)) {
+               /*
+                * Key being created is outside the key range,
+                * return the original parent.
+                */
+               hammer2_chain_unlock(hmp, ichain);
+       } else {
+               /*
+                * Otherwise its in the range, return the new parent.
+                */
+               parent = ichain;
+       }
+
+       return(parent);
+}
+
+/*
+ * Detach (chain) from (parent) and flag it as deleted.
+ *
+ * The chain's slot in the parent's blockref array is zeroed (when the
+ * parent currently has a blockref array to zero), the chain is removed
+ * from the parent's in-memory splay tree, and the parent reference that
+ * was held on behalf of that splay entry is released.  The chain
+ * structure itself is not freed here; it is only disconnected.
+ *
+ * (chain) must be a direct child of (parent), the function panics if
+ * it is not.  The caller is expected to hold (parent) locked and
+ * referenced.
+ *
+ * As with other filesystems, inodes with open descriptors should not be
+ * deleted until the last open descriptor is closed.
+ */
+void
+hammer2_chain_delete(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                    hammer2_chain_t *chain)
+{
+       hammer2_blockref_t *base;
+       int count;
+
+       if (parent != chain->parent)
+               panic("hammer2_chain_delete: parent mismatch");
+
+       /*
+        * The parent is marked modified before its blockref array is
+        * located so the base[] pointer remains valid while the entry
+        * is cleared.  An indirect block which is still in its INITIAL
+        * state has no blockref array to update, base is NULL for it.
+        */
+       if (parent->bref.type == HAMMER2_BREF_TYPE_INODE) {
+               hammer2_chain_modify(hmp, parent, 0);
+               base = &parent->data->ipdata.u.blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+       } else if (parent->bref.type == HAMMER2_BREF_TYPE_INDIRECT) {
+               hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_OPTDATA);
+               base = (parent->flags & HAMMER2_CHAIN_INITIAL) ?
+                       NULL : &parent->data->npdata.blockref[0];
+               count = parent->bytes / sizeof(hammer2_blockref_t);
+       } else if (parent->bref.type == HAMMER2_BREF_TYPE_VOLUME) {
+               hammer2_chain_modify(hmp, parent, 0);
+               base = &hmp->voldata.sroot_blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+       } else {
+               panic("hammer2_chain_delete: unrecognized blockref type: %d",
+                     parent->bref.type);
+               count = 0;
+       }
+
+       /*
+        * Wipe the on-media blockref slot (if there is one), then pull
+        * the chain out of the parent's splay tree and sever the
+        * in-memory linkage in both directions.
+        */
+       KKASSERT(chain->index >= 0 && chain->index < count);
+       if (base != NULL)
+               bzero(&base[chain->index], sizeof(base[0]));
+
+       SPLAY_REMOVE(hammer2_chain_splay, &parent->shead, chain);
+       atomic_set_int(&chain->flags, HAMMER2_CHAIN_DELETED);
+       atomic_add_int(&parent->refs, -1);      /* ref held by splay entry */
+       chain->index = -1;
+       chain->parent = NULL;
+
+       /*
+        * An inode additionally loses its parent-inode pointer.
+        */
+       if (chain->bref.type == HAMMER2_BREF_TYPE_INODE)
+               chain->u.ip->pip = NULL;
+
+       /*
+        * Nothing further is done here.  The chain is likely still
+        * referenced, possibly even by a vnode (if an inode), so any
+        * remaining cleanup is deferred until the chain gets dropped.
+        */
+}
+
+/*
+ * Recursively flush the specified chain.  The chain is locked and
+ * referenced by the caller and will remain so on return.  The chain
+ * will remain referenced throughout but can temporarily lose its
+ * lock during the recursion to avoid unnecessarily stalling user
+ * processes.
+ *
+ *
+ */
+/*
+ * List head type for chains whose flush was deferred.  Entries are
+ * linked through the flush_node field of struct hammer2_chain.
+ */
+TAILQ_HEAD(flush_deferral_list, hammer2_chain);
+
+/*
+ * State carried through one recursive flush.
+ */
+typedef struct hammer2_flush_info {
+       struct flush_deferral_list flush_list;  /* chains deferred at limit */
+       int             depth;                  /* current recursion depth */
+} hammer2_flush_info_t;
+
+static void
+hammer2_chain_flush_pass1(hammer2_mount_t *hmp, hammer2_chain_t *parent,
+                         hammer2_flush_info_t *info)
+{
+       hammer2_blockref_t *bref;
+       hammer2_off_t pbase;
+       size_t bbytes;
+       size_t boff;
+       char *bdata;
+       struct buf *bp;
+       int error;
+
+       /*
+        * If we hit the stack recursion depth limit defer the operation.
+        * The controller of the info structure will execute the deferral
+        * list and then retry.
+        *
+        * This is only applicable if SUBMODIFIED is set.  After a reflush
+        * SUBMODIFIED will probably be cleared and we want to drop through
+        * to finish processing the current element so our direct parent
+        * can process the results.
+        */
+       if (info->depth == HAMMER2_FLUSH_DEPTH_LIMIT &&
+           (parent->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
+               if ((parent->flags & HAMMER2_CHAIN_DEFERRED) == 0 &&
+                   ((parent->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                                      HAMMER2_CHAIN_MODIFIED |
+                                      HAMMER2_CHAIN_MODIFIED_AUX |
+                                      HAMMER2_CHAIN_MOVED)) != 0)) {
+                       hammer2_chain_ref(hmp, parent);
+                       TAILQ_INSERT_TAIL(&info->flush_list,
+                                         parent, flush_node);
+                       atomic_set_int(&parent->flags, HAMMER2_CHAIN_DEFERRED);
+               }
+               return;
+       }
+
+       if (hammer2_debug & 0x0008)
+               kprintf("%*.*sCHAIN type=%d@%08jx %p/%d %04x {\n",
+                       info->depth, info->depth, "",
+                       parent->bref.type, parent->bref.data_off,
+                       parent, parent->refs, parent->flags);
+
+       /*
+        * Flush any children of this parent.
+        *
+        * NOTE: If we use a while() here an active filesystem can
+        *       prevent the flush from ever finishing.
+        */
+       if (parent->flags & HAMMER2_CHAIN_SUBMODIFIED) {
+               hammer2_blockref_t *base;
+               hammer2_chain_t *chain;
+               hammer2_chain_t *next;
+               int count;
+               int submodified = 0;
+               int submoved = 0;
+
+               /*
+                * Clear SUBMODIFIED now.  Flag any races during the flush
+                * with the (submodified) local variable and re-arm it
+                * as necessary after the loop is done.
+                *
+                * Delaying the setting of the parent to MODIFIED can reduce
+                * unnecessary I/O.
+                *
+                * Modifications to the children will propagate up, forcing
+                * us to become modified and copy-on-write too.  Be sure
+                * to modify parent (as a side effect of the recursive
+                * flush) ONLY if it is actually being modified by the
+                * recursive flush.
+                */
+               atomic_clear_int(&parent->flags, HAMMER2_CHAIN_SUBMODIFIED);
+
+               /*
+                * Flush the children and update the blockrefs in the parent.
+                * Be careful of ripouts during the loop.
+                */
+               next = SPLAY_MIN(hammer2_chain_splay, &parent->shead);
+               while ((chain = next) != NULL) {
+                       next = SPLAY_NEXT(hammer2_chain_splay,
+                                         &parent->shead, chain);
+                       /*
+                        * We only recurse if SUBMODIFIED (internal node)
+                        * or MODIFIED (internal node or leaf) is set.
+                        * However, we must still track whether any MOVED
+                        * entries are present to determine if the parent's
+                        * blockref's need updating or not.
+                        */
+                       if (chain->flags & HAMMER2_CHAIN_MOVED)
+                               submoved = 1;
+                       if ((chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                                            HAMMER2_CHAIN_MODIFIED |
+                                           HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
+                               continue;
+                       }
+
+                       /*
+                        * Propagate the DESTROYED flag if found set, then
+                        * recurse the flush.
+                        */
+                       hammer2_chain_lock(hmp, chain, HAMMER2_RESOLVE_MAYBE);
+                       if ((parent->flags & HAMMER2_CHAIN_DESTROYED) &&
+                           (chain->flags & HAMMER2_CHAIN_DESTROYED) == 0) {
+                               atomic_set_int(&chain->flags,
+                                              HAMMER2_CHAIN_DESTROYED |
+                                              HAMMER2_CHAIN_SUBMODIFIED);
+                       }
+                       ++info->depth;
+                       hammer2_chain_flush_pass1(hmp, chain, info);
+                       --info->depth;
+
+                       /*
+                        * No point loading blockrefs yet if the
+                        * child (recursively) is still dirty.
+                        */
+                       if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                                          HAMMER2_CHAIN_MODIFIED |
+                                          HAMMER2_CHAIN_MODIFIED_AUX)) {
+                               submodified = 1;
+                               if (hammer2_debug & 0x0008)
+                                       kprintf("s");
+                       }
+                       if (chain->flags & HAMMER2_CHAIN_MOVED) {
+                               if (hammer2_debug & 0x0008)
+                                       kprintf("m");
+                               submoved = 1;
+                       }
+                       if (hammer2_debug & 0x0008)
+                               kprintf("\n");
+                       hammer2_chain_unlock(hmp, chain);
+               }
+
+               if (submodified ||
+                   (parent->flags & HAMMER2_CHAIN_SUBMODIFIED)) {
+                       /*
+                        * No point loading up the blockrefs if submodified
+                        * got re-set.
+                        *
+                        * NOTE: Even though we cleared the SUBMODIFIED flag
+                        *       it can still get re-set by operations
+                        *       occuring under our chain, so check both.
+                        */
+                       atomic_set_int(&parent->flags,
+                                      HAMMER2_CHAIN_SUBMODIFIED);
+               } else if (submoved) {
+                       /*
+                        * Ok, we can modify the blockrefs in this parent
+                        * entry.  Mark it modified.  Calculate the
+                        * blockref array after marking it modified (since
+                        * that may change the underlying data ptr).
+                        *
+                        * NOTE: We only do this if submoved != 0, otherwise
+                        *       there may not be any changes and setting
+                        *       the parent modified will re-arm the MOVED
+                        *       bit recursively, resulting in O(N^2)
+                        *       flushes.
+                        *
+                        * NOTE: We don't want hammer2_chain_modify() to
+                        *       recursively set the SUBMODIFIED flag
+                        *       upward in this case!
+                        */
+                       hammer2_chain_modify(hmp, parent, HAMMER2_MODIFY_NOSUB);
+
+                       switch(parent->bref.type) {
+                       case HAMMER2_BREF_TYPE_INODE:
+                               base = &parent->data->ipdata.u.blockset.
+                                       blockref[0];
+                               count = HAMMER2_SET_COUNT;
+                               break;
+                       case HAMMER2_BREF_TYPE_INDIRECT:
+                               base = &parent->data->npdata.blockref[0];
+                               count = parent->bytes /
+                                       sizeof(hammer2_blockref_t);
+                               break;
+                       case HAMMER2_BREF_TYPE_VOLUME:
+                               base = &hmp->voldata.sroot_blockset.blockref[0];
+                               count = HAMMER2_SET_COUNT;
+                               break;
+                       default:
+                               base = NULL;
+                               panic("hammer2_chain_get: "
+                                     "unrecognized blockref type: %d",
+                                     parent->bref.type);
+                       }
+
+                       /*
+                        * Update the blockrefs.
+                        */
+                       next = SPLAY_MIN(hammer2_chain_splay, &parent->shead);
+                       while ((chain = next) != NULL) {
+                               next = SPLAY_NEXT(hammer2_chain_splay,
+                                                 &parent->shead, chain);
+                               KKASSERT(chain->index >= 0 &&
+                                        chain->index < count);
+                               hammer2_chain_lock(hmp, chain,
+                                                  HAMMER2_RESOLVE_NEVER);
+                               base[chain->index] = chain->bref;
+                               if (chain->flags & HAMMER2_CHAIN_MOVED) {
+                                       atomic_clear_int(&chain->flags,
+                                                HAMMER2_CHAIN_MOVED);
+                                       hammer2_chain_drop(hmp, chain);
+                               }
+                               hammer2_chain_unlock(hmp, chain);
+                       }
+               }
+       }
+
+       /*
+        * If destroying the object we unconditonally clear the MODIFIED
+        * and MOVED bits, and we destroy the buffer without writing it
+        * out.
+        *
+        * We don't bother updating the hash/crc or the parent bref.
+        *
+        * XXX allocations for unflushed data can be returned to the
+        *     free pool.
+        */
+       if (parent->flags & HAMMER2_CHAIN_DESTROYED) {
+               if (parent->flags & HAMMER2_CHAIN_MODIFIED) {
+                       if (parent->bp) {
+                               parent->bp->b_flags |= B_INVAL|B_RELBUF;
+                       }
+                       atomic_clear_int(&parent->flags,
+                                        HAMMER2_CHAIN_MODIFIED);
+                       hammer2_chain_drop(hmp, parent);
+               }
+               if (parent->flags & HAMMER2_CHAIN_MODIFIED_AUX) {
+                       atomic_clear_int(&parent->flags,
+                                        HAMMER2_CHAIN_MODIFIED_AUX);
+               }
+               if (parent->flags & HAMMER2_CHAIN_MOVED) {
+                       atomic_clear_int(&parent->flags,
+                                        HAMMER2_CHAIN_MOVED);
+                       hammer2_chain_drop(hmp, parent);
+               }
+               return;
+       }
+
+       /*
+        * Flush this chain entry only if it is marked modified.
+        */
+       if ((parent->flags & (HAMMER2_CHAIN_MODIFIED |
+                             HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
+               goto done;
+       }
+
+       /*
+        * Clear MODIFIED and set HAMMER2_CHAIN_MOVED.  The caller
+        * will re-test the MOVED bit.
+        *
+        * bits own a single parent ref and the MOVED bit owns its own
+        * parent ref.
+        */
+       if (parent->flags & HAMMER2_CHAIN_MODIFIED) {
+               atomic_clear_int(&parent->flags, HAMMER2_CHAIN_MODIFIED);
+               if (parent->flags & HAMMER2_CHAIN_MOVED) {
+                       hammer2_chain_drop(hmp, parent);
+               } else {
+                       /* inherit ref from the MODIFIED we cleared */
+                       atomic_set_int(&parent->flags, HAMMER2_CHAIN_MOVED);
+               }
+       }
+       atomic_clear_int(&parent->flags, HAMMER2_CHAIN_MODIFIED_AUX);
+
+       /*
+        * If this is part of a recursive flush we can go ahead and write
+        * out the buffer cache buffer and pass a new bref back up the chain.
+        *
+        * This will never be a volume header.
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+               /*
+                * The volume header is flushed manually by the syncer, not
+                * here.
+                */
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+               /*
+                * Data elements have already been flushed via the logical
+                * file buffer cache.  Their hash was set in the bref by
+                * the vop_write code.
+                *
+                * Make sure the buffer(s) have been flushed out here.
+                */
+#if 1
+               bbytes = parent->bytes;
+               pbase = parent->bref.data_off & ~(hammer2_off_t)(bbytes - 1);
+               boff = parent->bref.data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+
+               bp = getblk(hmp->devvp, pbase, bbytes, GETBLK_NOWAIT, 0);
+               if (bp) {
+                       if ((bp->b_flags & (B_CACHE | B_DIRTY)) ==
+                           (B_CACHE | B_DIRTY)) {
+                               kprintf("x");
+                               cluster_awrite(bp);
+                       } else {
+                               bp->b_flags |= B_RELBUF;
+                               brelse(bp);
+                       }
+               }
+#endif
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               /*
+                * Indirect blocks may be in an INITIAL state.
+                */
+               break;
+       default:
+               /*
+                * Embedded elements have to be flushed out.
+                */
+               KKASSERT(parent->data != NULL);
+               bref = &parent->bref;
+
+               KKASSERT((bref->data_off & HAMMER2_OFF_MASK) != 0);
+
+               if (parent->bp == NULL) {
+                       /*
+                        * The data is embedded, we have to acquire the
+                        * buffer cache buffer and copy the data into it.
+                        */
+                       if ((bbytes = parent->bytes) < HAMMER2_MINIOSIZE)
+                               bbytes = HAMMER2_MINIOSIZE;
+                       pbase = bref->data_off & ~(hammer2_off_t)(bbytes - 1);
+                       boff = bref->data_off & HAMMER2_OFF_MASK & (bbytes - 1);
+
+                       /*
+                        * The getblk() optimization can only be used if the
+                        * physical block size matches the request.
+                        */
+                       if (parent->bytes == bbytes) {
+                               bp = getblk(hmp->devvp, pbase, bbytes, 0, 0);
+                               error = 0;
+                       } else {
+                               error = bread(hmp->devvp, pbase, bbytes, &bp);
+                               KKASSERT(error == 0);
+                       }
+                       bdata = (char *)bp->b_data + boff;
+
+                       /*
+                        * Copy the data to the buffer, mark the buffer
+                        * dirty, and convert the parent to unmodified.
+                        */
+                       bcopy(parent->data, bdata, parent->bytes);
+                       bp->b_flags |= B_CLUSTEROK;
+                       bdwrite(bp);
+                       bp = NULL;
+                       parent->bref.check.iscsi32.value =
+                               hammer2_icrc32(parent->data, parent->bytes);
+                       if (parent->bref.type == HAMMER2_BREF_TYPE_INODE)
+                               ++hammer2_iod_meta_write;
+                       else
+                               ++hammer2_iod_indr_write;
+               } else {
+                       parent->bref.check.iscsi32.value =
+                               hammer2_icrc32(parent->data, parent->bytes);
+               }
+       }
+
+       /*
+        * Special handling
+        */
+       bref = &parent->bref;
+
+       switch(bref->type) {
+       case HAMMER2_BREF_TYPE_VOLUME:
+               KKASSERT(parent->data != NULL);
+               KKASSERT(parent->bp == NULL);
+
+               hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT1]=
+                       hammer2_icrc32(
+                               (char *)&hmp->voldata +
+                                HAMMER2_VOLUME_ICRC1_OFF,
+                               HAMMER2_VOLUME_ICRC1_SIZE);
+               hmp->voldata.icrc_sects[HAMMER2_VOL_ICRC_SECT0]=
+                       hammer2_icrc32(
+                               (char *)&hmp->voldata +
+                                HAMMER2_VOLUME_ICRC0_OFF,
+                               HAMMER2_VOLUME_ICRC0_SIZE);
+               hmp->voldata.icrc_volheader =
+                       hammer2_icrc32(
+                               (char *)&hmp->voldata +
+                                HAMMER2_VOLUME_ICRCVH_OFF,
+                               HAMMER2_VOLUME_ICRCVH_SIZE);
+               break;
+       }
+done:
+       if (hammer2_debug & 0x0008) {
+               kprintf("%*.*s} %p/%d %04x ",
+                       info->depth, info->depth, "",
+                       parent, parent->refs, parent->flags);
+       }
+}
+
+#if 0
+/*
+ * PASS2 - not yet implemented (should be called only with the root chain?)
+ *
+ * The stub is compiled out by the surrounding #if 0 and its body is empty.
+ */
+static void
+hammer2_chain_flush_pass2(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+}
+#endif
+
+/*
+ * Stand-alone flush.  If the chain is unable to completely flush we have
+ * to be sure that SUBMODIFIED propagates up the parent chain.
+ *
+ * This routine can be called from several places but the most important
+ * is from the hammer2_vop_reclaim() function.  We want to try to completely
+ * clean out the inode structure to prevent disconnected inodes from
+ * building up and blowing out the kmalloc pool.
+ *
+ * The flush proceeds in three steps:
+ *
+ * (1) hammer2_chain_flush_pass1() is run on (chain) and every element
+ *     found on info.flush_list afterwards is flushed by calling this
+ *     function again on it.  Pass1 is repeated for as long as at least
+ *     one deferred element came back completely clean.
+ *
+ * (2) If (chain) still has SUBMODIFIED, MODIFIED, MODIFIED_AUX or MOVED
+ *     set, hammer2_chain_parent_setsubmod() is called on it.
+ *
+ * (3) If (chain) is an inode with a parent and MOVED is the only one of
+ *     those flags left, the chain's bref is copied into the parent's
+ *     blockref array, provided the parent can be locked without blocking.
+ *
+ * NOTE(review): the deferred path locks each element before recursing
+ *              into this function, which suggests (chain) is expected
+ *              to be locked by the caller - confirm against callers.
+ */
+void
+hammer2_chain_flush(hammer2_mount_t *hmp, hammer2_chain_t *chain)
+{
+       hammer2_chain_t *parent;
+       hammer2_chain_t *scan;
+       hammer2_blockref_t *base;
+       hammer2_flush_info_t info;
+       int count;
+       int reflush;
+
+       /*
+        * Execute the recursive flush and handle deferrals.
+        *
+        * Chains can be ridiculously long (thousands deep), so to
+        * avoid blowing out the kernel stack the recursive flush has a
+        * depth limit.  Elements at the limit are placed on a list
+        * for re-execution after the stack has been popped.
+        */
+       bzero(&info, sizeof(info));
+       TAILQ_INIT(&info.flush_list);
+       reflush = 1;
+
+       while (reflush) {
+               /*
+                * Primary recursion
+                */
+               hammer2_chain_flush_pass1(hmp, chain, &info);
+               reflush = 0;
+
+               while ((scan = TAILQ_FIRST(&info.flush_list)) != NULL) {
+                       /*
+                        * Secondary recursion.  Note that a reference is
+                        * retained from the element's presence on the
+                        * deferral list.
+                        */
+                       KKASSERT(scan->flags & HAMMER2_CHAIN_DEFERRED);
+                       TAILQ_REMOVE(&info.flush_list, scan, flush_node);
+                       atomic_clear_int(&scan->flags, HAMMER2_CHAIN_DEFERRED);
+
+                       /*
+                        * Now that we've popped back up we can do a secondary
+                        * recursion on the deferred elements.
+                        */
+                       if (hammer2_debug & 0x0040)
+                               kprintf("defered flush %p\n", scan);
+                       hammer2_chain_lock(hmp, scan, HAMMER2_RESOLVE_MAYBE);
+                       hammer2_chain_flush(hmp, scan);
+                       hammer2_chain_unlock(hmp, scan);
+
+                       /*
+                        * Only flag a reflush if the element came back
+                        * completely clean (none of SUBMODIFIED, MODIFIED
+                        * or MODIFIED_AUX remain set).  If the element is
+                        * still dirty it will just wind up on our
+                        * flush_list again.
+                        */
+                       if ((scan->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                                           HAMMER2_CHAIN_MODIFIED |
+                                           HAMMER2_CHAIN_MODIFIED_AUX)) == 0) {
+                               reflush = 1;
+                       }
+                       /* release the reference held via the deferral list */
+                       hammer2_chain_drop(hmp, scan);
+               }
+               if ((hammer2_debug & 0x0040) && reflush)
+                       kprintf("reflush %p\n", chain);
+       }
+
+       /*
+        * The SUBMODIFIED bit must propagate upward if the chain could not
+        * be completely flushed.
+        */
+       if (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                           HAMMER2_CHAIN_MODIFIED |
+                           HAMMER2_CHAIN_MODIFIED_AUX |
+                           HAMMER2_CHAIN_MOVED)) {
+               hammer2_chain_parent_setsubmod(hmp, chain);
+       }
+
+       /*
+        * If the only thing left is a simple bref update try to
+        * pro-actively update the parent, otherwise return early.
+        *
+        * "Simple" means: the chain has a parent, is an inode, and of the
+        * four flags tested only MOVED is still set.
+        */
+       parent = chain->parent;
+       if (parent == NULL ||
+           chain->bref.type != HAMMER2_BREF_TYPE_INODE ||
+           (chain->flags & (HAMMER2_CHAIN_SUBMODIFIED |
+                            HAMMER2_CHAIN_MODIFIED |
+                            HAMMER2_CHAIN_MODIFIED_AUX |
+                            HAMMER2_CHAIN_MOVED)) != HAMMER2_CHAIN_MOVED) {
+               return;
+       }
+
+       /*
+        * We are locking backwards so allow the lock to fail.  On failure
+        * the bref update is simply skipped and MOVED stays set.
+        */
+       if (lockmgr(&parent->lk, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
+               return;
+       }
+
+       /*
+        * We are updating brefs but our caller is not a recursive flush,
+        * so hammer2_chain_modify() is called with flags 0 (the original
+        * note here described this as "setsubmod = TRUE").
+        *
+        * NOTE(review): presumably flags 0 means SUBMODIFIED propagation
+        *               is not suppressed - confirm against
+        *               hammer2_chain_modify().
+        *
+        * The regular chain lock is taken on top of the manual lockmgr()
+        * acquisition above; both are released at the end.
+        */
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_MAYBE);
+       hammer2_chain_modify(hmp, parent, 0);
+
+       /*
+        * Locate the parent's blockref array and its element count.
+        */
+       switch(parent->bref.type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               base = &parent->data->ipdata.u.blockset.
+                       blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               base = &parent->data->npdata.blockref[0];
+               count = parent->bytes /
+                       sizeof(hammer2_blockref_t);
+               break;
+       case HAMMER2_BREF_TYPE_VOLUME:
+               base = &hmp->voldata.sroot_blockset.blockref[0];
+               count = HAMMER2_SET_COUNT;
+               break;
+       default:
+               base = NULL;
+               panic("hammer2_chain_flush: "
+                     "unrecognized blockref type: %d",
+                     parent->bref.type);
+       }
+
+       /*
+        * Update the blockref in the parent.  Clearing MOVED is paired
+        * with a chain drop, i.e. the flag apparently carries its own
+        * reference on the chain.
+        */
+       KKASSERT(chain->index >= 0 &&
+                chain->index < count);
+       base[chain->index] = chain->bref;
+       if (chain->flags & HAMMER2_CHAIN_MOVED) {
+               atomic_clear_int(&chain->flags, HAMMER2_CHAIN_MOVED);
+               hammer2_chain_drop(hmp, chain);
+       }
+
+       lockmgr(&parent->lk, LK_RELEASE);       /* release manual lockmgr op */
+       hammer2_chain_unlock(hmp, parent);
+}
diff --git a/sys/vfs/hammer2/hammer2_disk.h b/sys/vfs/hammer2/hammer2_disk.h
new file mode 100644 (file)
index 0000000..c3f5f98
--- /dev/null
@@ -0,0 +1,841 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef VFS_HAMMER2_DISK_H_
+#define VFS_HAMMER2_DISK_H_
+
+#ifndef _SYS_UUID_H_
+#include <sys/uuid.h>
+#endif
+
+/*
+ * The structures below represent the on-disk media structures for the HAMMER2
+ * filesystem.  Note that all fields for on-disk structures are naturally
+ * aligned.  The host endian format is typically used - compatibility is
+ * possible if the implementation detects reversed endian and adjusts accesses
+ * accordingly.
+ *
+ * HAMMER2 primarily revolves around the directory topology:  inodes,
+ * directory entries, and block tables.  Block device buffer cache buffers
+ * are always 64KB.  Logical file buffers are typically 16KB.  All data
+ * references utilize 64-bit byte offsets.
+ *
+ * Free block management is handled independently using blocks reserved by
+ * the media topology.
+ */
+
+/*
+ * The data at the end of a file or directory may be a fragment in order
+ * to optimize storage efficiency.  The minimum fragment size is 64 bytes.
+ * Since allocations are in powers of 2 fragments must also be sized in
+ * powers of 2 (64, 128, 256, ... 65536).
+ *
+ * For the moment the maximum allocation size is HAMMER2_PBUFSIZE (64K),
+ * which is 2^16.  Larger extents may be supported in the future.
+ *
+ * A full indirect block supports 1024 x 64-byte blockrefs.
+ *
+ * A maximally sized file (2^64-1 bytes) requires 5 indirect block levels.
+ * The hammer2_blockset in the volume header or file inode has another 8
+ * entries, giving us 66+3 = 69 bits of address space.  However, some bits
+ * are taken up by (potentially) requests for redundant copies.  HAMMER2
+ * currently supports up to 8 copies, which brings the address space down
+ * to 66 bits and gives us 2 bits of leeway.
+ */
+#define HAMMER2_MIN_RADIX      6       /* log2 of the smallest allocation */
+#define HAMMER2_MAX_RADIX      16      /* log2 of the largest allocation */
+#define HAMMER2_MIN_ALLOC      64      /* smallest allocation in bytes */
+#define HAMMER2_KEY_RADIX      64      /* width of a key in bits */
+
+/*
+ * MINALLOCSIZE                - The minimum allocation size.  This can be smaller
+ *                       or larger than the minimum physical IO size.
+ *
+ *                       NOTE: Should not be larger than 1K since inodes
+ *                             are 1K.
+ *
+ * MINIOSIZE           - The minimum IO size.  This must be less than
+ *                       or equal to HAMMER2_PBUFSIZE.
+ *
+ *                       XXX currently must be set to MINALLOCSIZE until/if
+ *                           we deal with recursive buffer cache locks.
+ *
+ * HAMMER2_PBUFSIZE    - Topological block size used by files for all
+ *                       blocks except the block straddling EOF.
+ *
+ * HAMMER2_SEGSIZE     - Allocation map segment size, currently 512KB
+ */
+
+#define HAMMER2_SEGSIZE                (65536 * 8)
+
+#define HAMMER2_PBUFRADIX      16      /* log2 of the physical buf size */
+#define HAMMER2_PBUFSIZE       65536   /* physical buf size in bytes */
+#define HAMMER2_LBUFRADIX      14      /* log2 of the logical buf size */
+#define HAMMER2_LBUFSIZE       16384   /* logical buf size in bytes */
+
+#define HAMMER2_MINALLOCRADIX  10      /* log2 of min block allocation */
+#define HAMMER2_MINALLOCSIZE   1024    /* min block allocation in bytes */
+
+/*
+ * The minimum IO size currently aliases the minimum allocation size.  The
+ * disabled definitions record the alternative of a full 64KB buffer.
+ */
+#if 0
+#define HAMMER2_MINIORADIX     16      /* minimum physical IO size */
+#define HAMMER2_MINIOSIZE      65536
+#endif
+#define HAMMER2_MINIORADIX     HAMMER2_MINALLOCRADIX
+#define HAMMER2_MINIOSIZE      HAMMER2_MINALLOCSIZE
+
+/*
+ * Indirect block size limits in bytes and in blockrefs.
+ */
+#define HAMMER2_IND_BYTES_MIN  4096    /* first indirect layer only */
+#define HAMMER2_IND_BYTES_MAX  HAMMER2_PBUFSIZE
+#define HAMMER2_IND_COUNT_MIN  (HAMMER2_IND_BYTES_MIN / \
+                                sizeof(hammer2_blockref_t))
+#define HAMMER2_IND_COUNT_MAX  (HAMMER2_IND_BYTES_MAX / \
+                                sizeof(hammer2_blockref_t))
+
+/*
+ * HAMMER2 processes blockrefs in sets of 8.  The set is fully associative,
+ * is not sorted, and may contain holes.
+ *
+ * A full indirect block supports 1024 blockrefs.
+ *
+ * An inode embeds one set of blockrefs but may also use the data area for
+ * up to 512 bytes of direct data.
+ */
+#define HAMMER2_SET_RADIX      3       /* log2(HAMMER2_SET_COUNT) */
+#define HAMMER2_SET_COUNT      8       /* direct entries & associativity */
+#define HAMMER2_EMBEDDED_RADIX 9       /* log2(HAMMER2_EMBEDDED_BYTES) */
+#define HAMMER2_EMBEDDED_BYTES 512     /* size of the embedded data area */
+
+/*
+ * Low-bit masks derived from the buffer and segment sizes.
+ */
+#define HAMMER2_SEGMASK                (HAMMER2_SEGSIZE - 1)
+#define HAMMER2_PBUFMASK       (HAMMER2_PBUFSIZE - 1)
+#define HAMMER2_LBUFMASK       (HAMMER2_LBUFSIZE - 1)
+
+/*
+ * The same sizes and masks cast to hammer2_off_t for 64-bit offset
+ * arithmetic.
+ */
+#define HAMMER2_SEGSIZE64      ((hammer2_off_t)HAMMER2_SEGSIZE)
+#define HAMMER2_SEGMASK64      ((hammer2_off_t)HAMMER2_SEGMASK)
+#define HAMMER2_PBUFSIZE64     ((hammer2_off_t)HAMMER2_PBUFSIZE)
+#define HAMMER2_PBUFMASK64     ((hammer2_off_t)HAMMER2_PBUFMASK)
+#define HAMMER2_LBUFMASK64     ((hammer2_off_t)HAMMER2_LBUFMASK)
+
+#define HAMMER2_UUID_STRING    "5cbb9ad1-862d-11dc-a94d-01301bb8a9f5"
+
+/*
+ * A HAMMER2 filesystem is always sized in multiples of 8MB.
+ *
+ * A 4MB segment is reserved at the beginning of each 2GB zone.  This segment
+ * contains the volume header, the free block table, and possibly other
+ * information in the future.  4MB = 64 x 64K blocks.
+ */
+/* Volume size granularity (8MB) and its mask, as int and as hammer2_off_t */
+#define HAMMER2_VOLUME_ALIGN           (8 * 1024 * 1024)
+#define HAMMER2_VOLUME_ALIGNMASK       (HAMMER2_VOLUME_ALIGN - 1)
+#define HAMMER2_VOLUME_ALIGN64         ((hammer2_off_t)HAMMER2_VOLUME_ALIGN)
+#define HAMMER2_VOLUME_ALIGNMASK64     ((hammer2_off_t)HAMMER2_VOLUME_ALIGNMASK)
+
+/* The NEWFS alignment constants alias the volume alignment */
+#define HAMMER2_NEWFS_ALIGN            (HAMMER2_VOLUME_ALIGN)
+#define HAMMER2_NEWFS_ALIGNMASK                (HAMMER2_VOLUME_ALIGN - 1)
+#define HAMMER2_NEWFS_ALIGN64          ((hammer2_off_t)HAMMER2_VOLUME_ALIGN)
+#define HAMMER2_NEWFS_ALIGNMASK64      ((hammer2_off_t)HAMMER2_NEWFS_ALIGNMASK)
+
+/* Zone size (2GB) and the segment reserved at the start of each zone (4MB) */
+#define HAMMER2_RESERVE_BYTES64                (2LLU * 1024 * 1024 * 1024)
+#define HAMMER2_RESERVE_MASK64         (HAMMER2_RESERVE_BYTES64 - 1)
+#define HAMMER2_RESERVE_SEG            (4 * 1024 * 1024)
+#define HAMMER2_RESERVE_SEG64          ((hammer2_off_t)HAMMER2_RESERVE_SEG)
+#define HAMMER2_RESERVE_BLOCKS         (HAMMER2_RESERVE_SEG / HAMMER2_PBUFSIZE)
+
+/*
+ * Two linear areas can be reserved after the initial 4MB segment in the base
+ * zone (the one starting at offset 0).  These areas are NOT managed by the
+ * block allocator and do not fall under HAMMER2 crc checking rules based
+ * at the volume header (but can be self-CRCd internally, depending).
+ */
+/* Boot area sizing: MIN, NOM (presumably nominal) and MAX bytes */
+#define HAMMER2_BOOT_MIN_BYTES         HAMMER2_VOLUME_ALIGN
+#define HAMMER2_BOOT_NOM_BYTES         (64 * 1024 * 1024)
+#define HAMMER2_BOOT_MAX_BYTES         (256 * 1024 * 1024)
+
+/* Redo area sizing: MIN, NOM (presumably nominal) and MAX bytes */
+#define HAMMER2_REDO_MIN_BYTES         HAMMER2_VOLUME_ALIGN
+#define HAMMER2_REDO_NOM_BYTES         (256 * 1024 * 1024)
+#define HAMMER2_REDO_MAX_BYTES         (1024 * 1024 * 1024)
+
+/*
+ * Most HAMMER2 types are implemented as unsigned 64-bit integers.
+ * Transaction ids are monotonic.
+ *
+ * We utilize 32-bit iSCSI CRCs.
+ */
+typedef uint64_t hammer2_tid_t;                /* transaction id */
+typedef uint64_t hammer2_off_t;                /* 64-bit byte offset */
+typedef uint64_t hammer2_key_t;                /* 64-bit key */
+typedef uint32_t hammer2_crc32_t;      /* 32-bit CRC value */
+
+/*
+ * Miscellaneous ranges (all are unsigned).
+ */
+#define HAMMER2_MIN_TID                1ULL    /* tid range starts at 1, not 0 */
+#define HAMMER2_MAX_TID                0xFFFFFFFFFFFFFFFFULL
+#define HAMMER2_MIN_KEY                0ULL    /* key range covers all 64 bits */
+#define HAMMER2_MAX_KEY                0xFFFFFFFFFFFFFFFFULL
+#define HAMMER2_MIN_OFFSET     0ULL    /* offset range covers all 64 bits */
+#define HAMMER2_MAX_OFFSET     0xFFFFFFFFFFFFFFFFULL
+
+/*
+ * HAMMER2 data offset special cases and masking.
+ *
+ * All HAMMER2 data offsets have to be broken down into a 64K buffer base
+ * offset (HAMMER2_OFF_MASK_HI) and a 64K buffer index (HAMMER2_OFF_MASK_LO).
+ *
+ * Indexes into physical buffers are always 64-byte aligned.  The low 6 bits
+ * of the data offset field specify the size of the data chunk being pointed
+ * to as a power of 2.  This value typically ranges from HAMMER2_MIN_RADIX
+ * to HAMMER2_MAX_RADIX (6-16).  Larger values may be supported in the future
+ * to support file extents.
+ */
+/*
+ * OFF_BAD        - all bits set
+ * OFF_MASK       - data offset with the low 6 (radix) bits cleared
+ * OFF_MASK_LO    - the bits of OFF_MASK that index within a 64K buffer
+ * OFF_MASK_HI    - the bits that select the 64K buffer base
+ * OFF_MASK_RADIX - the low 6 (radix) bits
+ *
+ * NOTE(review): HAMMER2_MAX_COPIES is 6 although comments elsewhere in
+ *              this file speak of up to 8 copies - confirm which is meant.
+ */
+#define HAMMER2_OFF_BAD                ((hammer2_off_t)-1)
+#define HAMMER2_OFF_MASK       0xFFFFFFFFFFFFFFC0ULL
+#define HAMMER2_OFF_MASK_LO    (HAMMER2_OFF_MASK & HAMMER2_PBUFMASK64)
+#define HAMMER2_OFF_MASK_HI    (~HAMMER2_PBUFMASK64)
+#define HAMMER2_OFF_MASK_RADIX 0x000000000000003FULL
+#define HAMMER2_MAX_COPIES     6
+
+/*
+ * HAMMER2 directory support and pre-defined keys
+ */
+/*
+ * DIRHASH_VISIBLE - bit 63
+ * DIRHASH_USERMSK - the low 63 bits (everything except bit 63)
+ * DIRHASH_LOMASK  - the low 15 bits
+ * DIRHASH_HIMASK  - the upper 48 bits
+ * DIRHASH_FORCED  - bit 15, which lies between LOMASK and HIMASK
+ */
+#define HAMMER2_DIRHASH_VISIBLE        0x8000000000000000ULL
+#define HAMMER2_DIRHASH_USERMSK        0x7FFFFFFFFFFFFFFFULL
+#define HAMMER2_DIRHASH_LOMASK 0x0000000000007FFFULL
+#define HAMMER2_DIRHASH_HIMASK 0xFFFFFFFFFFFF0000ULL
+#define HAMMER2_DIRHASH_FORCED 0x0000000000008000ULL   /* bit forced on */
+
+#define HAMMER2_SROOT_KEY      0x0000000000000000ULL   /* volume to sroot */
+
+/*
+ * The media block reference structure.  This forms the core of the HAMMER2
+ * media topology recursion.  This 64-byte data structure is embedded in the
+ * volume header, in inodes (which are also directory entries), and in
+ * indirect blocks.
+ *
+ * A blockref references a single media item, which typically can be a
+ * directory entry (aka inode), indirect block, or data block.
+ *
+ * The primary feature a blockref represents is the ability to validate
+ * the entire tree underneath it via its check code.  Any modification to
+ * anything propagates up the blockref tree all the way to the root, replacing
+ * the related blocks.  Propagations can shortcut to the volume root to
+ * implement the 'fast syncing' feature but this only delays the eventual
+ * propagation.
+ *
+ * The check code can be a simple 32-bit iscsi code, a 64-bit crc,
+ * or as complex as a 192 bit cryptographic hash.  192 bits is the maximum
+ * supported check code size, which is not sufficient for unverified dedup
+ * UNLESS one doesn't mind once-in-a-blue-moon data corruption (such as when
+ * farming web data).  HAMMER2 has an unverified dedup feature for just this
+ * purpose.
+ */
+/*
+ * The leading number in each field comment is the field's byte offset
+ * (hex), assuming the natural alignment stated at the top of this file:
+ * 8 single-byte fields, four 64-bit fields, then a 24-byte check area.
+ */
+struct hammer2_blockref {              /* MUST BE EXACTLY 64 BYTES */
+       uint8_t         type;           /* 00 type of underlying item */
+       uint8_t         methods;        /* 01 check method & compression method */
+       uint8_t         copyid;         /* 02 specify which copy this is */
+       uint8_t         keybits;        /* 03 #of keybits masked off 0=leaf */
+       uint8_t         vradix;         /* 04 virtual data/meta-data size */
+       uint8_t         flags;          /* 05 blockref flags */
+       uint8_t         reserved06;     /* 06 */
+       uint8_t         reserved07;     /* 07 */
+       hammer2_key_t   key;            /* 08 key specification */
+       hammer2_tid_t   mirror_tid;     /* 10 propagate for mirror scan */
+       hammer2_tid_t   modify_tid;     /* 18 modifications sans propagation */
+       hammer2_off_t   data_off;       /* 20 low 6 bits is phys size (radix)*/
+       union {                         /* 28-3F check info (24 bytes) */
+               char    buf[24];        /* raw view of the check area */
+               struct {                /* 32-bit iscsi crc in first word */
+                       uint32_t value;
+                       uint32_t unused[5];
+               } iscsi32;
+               struct {                /* 64-bit crc in first quadword */
+                       uint64_t value;
+                       uint64_t unused[2];
+               } crc64;
+               struct {                /* 192-bit hash fills the area */
+                       char data[24];
+               } sha192;
+       } check;
+};
+
+typedef struct hammer2_blockref hammer2_blockref_t;
+
+/*
+ * Blockref flag bits (presumably for hammer2_blockref.flags - confirm
+ * against users).
+ */
+#define HAMMER2_BREF_SYNC1             0x01    /* modification synchronized */
+#define HAMMER2_BREF_SYNC2             0x02    /* modification committed */
+#define HAMMER2_BREF_DESYNCCHLD                0x04    /* desynchronize children */
+#define HAMMER2_BREF_DELETED           0x80    /* indicates a deletion */
+
+#define HAMMER2_BLOCKREF_BYTES         64      /* blockref struct in bytes */
+
+/*
+ * Values for hammer2_blockref.type
+ */
+#define HAMMER2_BREF_TYPE_EMPTY                0
+#define HAMMER2_BREF_TYPE_INODE                1
+#define HAMMER2_BREF_TYPE_INDIRECT     2
+#define HAMMER2_BREF_TYPE_DATA         3
+#define HAMMER2_BREF_TYPE_VOLUME       255     /* pseudo-type */
+
+/*
+ * Pack/unpack the two 4-bit method codes sharing one byte: the compression
+ * method occupies the low nibble and the check method the high nibble.
+ *
+ * The encoders mask their argument to 4 bits, mirroring the decoders, so
+ * that an out-of-range code cannot spill into the other nibble or beyond
+ * the 8-bit field.  In-range codes (0-15) encode exactly as before.
+ */
+#define HAMMER2_ENC_COMPMETHOD(n)      ((n) & 15)
+#define HAMMER2_ENC_CHECKMETHOD(n)     (((n) & 15) << 4)
+#define HAMMER2_DEC_COMPMETHOD(n)      ((n) & 15)
+#define HAMMER2_DEC_CHECKMETHOD(n)     (((n) >> 4) & 15)
+
+/*
+ * HAMMER2 block references are collected into sets of 8 blockrefs.  These
+ * sets are fully associative, meaning the elements making up a set are
+ * not sorted in any way and may contain duplicate entries, holes, or
+ * entries which shortcut multiple levels of indirection.  Sets are used
+ * in various ways:
+ *
+ * (1) When redundancy is desired a set may contain several duplicate
+ *     entries pointing to different copies of the same data.  Up to 8 copies
+ *     are supported but the set structure becomes a bit inefficient once
+ *     you go over 4.
+ *
+ * (2) The blockrefs in a set can shortcut multiple levels of indirections
+ *     within the bounds imposed by the parent of the set.
+ *
+ * When a set fills up another level of indirection is inserted, moving
+ * some or all of the set's contents into indirect blocks placed under the
+ * set.  This is a top-down approach in that indirect blocks are not created
+ * until the set actually becomes full (that is, the entries in the set can
+ * shortcut the indirect blocks when the set is not full).  Depending on how
+ * things are filled multiple indirect blocks will eventually be created.
+ */
+/*
+ * A blockset is simply an array of HAMMER2_SET_COUNT blockrefs.
+ */
+struct hammer2_blockset {
+       hammer2_blockref_t      blockref[HAMMER2_SET_COUNT];
+};
+
+typedef struct hammer2_blockset hammer2_blockset_t;
+
+/*
+ * Catch programmer snafus
+ */
+/* each size constant must equal 2^(its radix constant) */
+#if HAMMER2_SET_COUNT != (1 << HAMMER2_SET_RADIX)
+#error "hammer2 direct radix is incorrect"
+#endif
+#if HAMMER2_PBUFSIZE != (1 << HAMMER2_PBUFRADIX)
+#error "HAMMER2_PBUFRADIX and HAMMER2_PBUFSIZE are inconsistent"
+#endif
+#if HAMMER2_MIN_ALLOC != (1 << HAMMER2_MIN_RADIX)
+#error "HAMMER2_MIN_RADIX and HAMMER2_MIN_ALLOC are inconsistent"
+#endif
+
+/*
+ * The media indirect block structure.
+ */
+/*
+ * Declared at the maximum indirect block size (HAMMER2_IND_COUNT_MAX
+ * blockrefs).
+ */
+struct hammer2_indblock_data {
+       hammer2_blockref_t blockref[HAMMER2_IND_COUNT_MAX];
+};
+
+typedef struct hammer2_indblock_data hammer2_indblock_data_t;
+
+/*
+ * In HAMMER2 inodes ARE directory entries, with a special exception for
+ * hardlinks.  The inode number is stored in the inode rather than being
+ * based on the location of the inode (since the location moves every time
+ * the inode or anything underneath the inode is modified).
+ *
+ * The inode is 1024 bytes, made up of 256 bytes of meta-data, 256 bytes
+ * for the filename, and 512 bytes worth of direct file data OR an embedded
+ * blockset.
+ *
+ * Directories represent one inode per blockref.  Inodes are not laid out
+ * as a file but instead are represented by the related blockrefs.  The
+ * blockrefs, in turn, are indexed by the 64-bit directory hash key.  Remember
+ * that blocksets are fully associative, so a certain degree of efficiency is
+ * achieved just from that.
+ *
+ * Up to 512 bytes of direct data can be embedded in an inode, and since
+ * inodes are essentially directory entries this also means that small data
+ * files end up simply being laid out linearly in the directory, resulting
+ * in fewer seeks and highly optimal access.
+ *
+ * The compression mode can be changed at any time in the inode and is
+ * recorded on a blockref-by-blockref basis.
+ *
+ * Hardlinks are supported via the inode map.  Essentially the way a hardlink
+ * works is that all individual directory entries representing the same file
+ * are special cased and specify the same inode number.  The actual file
+ * is placed in the nearest parent directory that is parent to all instances
+ * of the hardlink.  If all hardlinks to a file are in the same directory
+ * the actual file will also be placed in that directory.  This file uses
+ * the inode number as the directory entry key and is invisible to normal
+ * directory scans.  Real directory entry keys are differentiated from the
+ * inode number key via bit 63.  Access to the hardlink silently looks up
+ * the real file and forwards all operations to that file.  Removal of the
+ * last hardlink also removes the real file.
+ *
+ * (attr_tid) is only updated when the inode's specific attributes or regular
+ * file size has changed, and affects path lookups and stat.  (attr_tid)
+ * represents a special cache coherency lock under the inode.  The inode
+ * blockref's modify_tid will always cover it.
+ *
+ * (dirent_tid) is only updated when an entry under a directory inode has
+ * been created, deleted, renamed, or had its attributes change, and affects
+ * directory lookups and scans.  (dirent_tid) represents another special cache
+ * coherency lock under the inode.  The inode blockref's modify_tid will
+ * always cover it.
+ */
+#define HAMMER2_INODE_BYTES            1024    /* (asserted by code) */
+#define HAMMER2_INODE_MAXNAME          256     /* maximum name in bytes */
+#define HAMMER2_INODE_VERSION_ONE      1       /* presumably for .version */
+
+/*
+ * On-media inode.  The leading number in each field comment is the field's
+ * byte offset (hex) within the structure:
+ *
+ *     0000-00FF       meta-data
+ *     0100-01FF       filename
+ *     0200-03FF       embedded blockset OR direct data
+ */
+struct hammer2_inode_data {
+       uint16_t        version;        /* 0000 inode data version */
+       uint16_t        reserved02;     /* 0002 */
+
+       /*
+        * core inode attributes, inode type, misc flags
+        */
+       uint32_t        uflags;         /* 0004 chflags */
+       uint32_t        rmajor;         /* 0008 available for device nodes */
+       uint32_t        rminor;         /* 000C available for device nodes */
+       uint64_t        ctime;          /* 0010 inode change time */
+       uint64_t        mtime;          /* 0018 modified time */
+       uint64_t        atime;          /* 0020 access time (unsupported) */
+       uint64_t        btime;          /* 0028 birth time */
+       uuid_t          uid;            /* 0030 uid / degenerate unix uid */
+       uuid_t          gid;            /* 0040 gid / degenerate unix gid */
+
+       uint8_t         type;           /* 0050 object type */
+       uint8_t         op_flags;       /* 0051 operational flags */
+       uint16_t        cap_flags;      /* 0052 capability flags */
+       uint32_t        mode;           /* 0054 unix modes (typ low 16 bits) */
+
+       /*
+        * inode size, identification, localized recursive configuration
+        * for compression and backup copies.
+        */
+       hammer2_tid_t   inum;           /* 0058 inode number */
+       hammer2_off_t   size;           /* 0060 size of file */
+       uint64_t        nlinks;         /* 0068 hard links (typ only dirs) */
+       hammer2_tid_t   iparent;        /* 0070 parent inum (recovery only) */
+       hammer2_key_t   name_key;       /* 0078 full filename key */
+       uint16_t        name_len;       /* 0080 filename length */
+       uint8_t         ncopies;        /* 0082 ncopies to local media */
+       uint8_t         comp_algo;      /* 0083 compression request & algo */
+
+       /*
+        * These fields are currently only applicable to PFSROOTs.
+        *
+        * NOTE: We can't use {volume_data->fsid, pfs_id} to uniquely
+        *       identify an instance of a PFS in the cluster because
+        *       a mount may contain more than one copy of the PFS as
+        *       a separate node.  {pfs_fsid, pfs_id} must be used for
+        *       registration in the cluster.
+        */
+       uint8_t         reserved84;     /* 0084 */
+       uint8_t         reserved85;     /* 0085 */
+       uint8_t         reserved86;     /* 0086 */
+       uint8_t         pfs_type;       /* 0087 (if PFSROOT) node type */
+       uint64_t        pfs_inum;       /* 0088 (if PFSROOT) inum allocator */
+       uuid_t          pfs_id;         /* 0090 (if PFSROOT) pfs uuid */
+       uuid_t          pfs_fsid;       /* 00A0 (if PFSROOT) unique pfs uuid */
+
+       /*
+        * Quotas and cumulative sub-tree counters.
+        */
+       hammer2_off_t   data_quota;     /* 00B0 subtree quota in bytes */
+       hammer2_off_t   data_count;     /* 00B8 subtree byte count */
+       hammer2_off_t   inode_quota;    /* 00C0 subtree quota inode count */
+       hammer2_off_t   inode_count;    /* 00C8 subtree inode count */
+       hammer2_tid_t   attr_tid;       /* 00D0 attributes changed */
+       hammer2_tid_t   dirent_tid;     /* 00D8 directory/attr changed */
+       uint64_t        reservedE0;     /* 00E0 */
+       uint64_t        reservedE8;     /* 00E8 */
+       uint64_t        reservedF0;     /* 00F0 */
+       uint64_t        reservedF8;     /* 00F8 */
+
+       unsigned char   filename[HAMMER2_INODE_MAXNAME];
+                                       /* 0100-01FF (256 char, unterminated) */
+
+       /*
+        * The final 512 bytes hold either an embedded blockset or raw
+        * bytes; both views occupy the same storage.
+        */
+       union {                         /* 0200-03FF (64x8 = 512 bytes) */
+               struct hammer2_blockset blockset;
+               char data[HAMMER2_EMBEDDED_BYTES];
+       } u;
+};
+
+typedef struct hammer2_inode_data hammer2_inode_data_t;
+
+/*
+ * Operational flag bits (presumably for hammer2_inode_data.op_flags -
+ * confirm against users).
+ */
+#define HAMMER2_OPFLAG_DIRECTDATA      0x01
+#define HAMMER2_OPFLAG_PFSROOT         0x02
+#define HAMMER2_OPFLAG_COPYIDS         0x04    /* copyids override parent */
+
+/*
+ * Object types (presumably for hammer2_inode_data.type).  Note that the
+ * value 3 is not assigned.
+ */
+#define HAMMER2_OBJTYPE_UNKNOWN                0
+#define HAMMER2_OBJTYPE_DIRECTORY      1
+#define HAMMER2_OBJTYPE_REGFILE                2
+#define HAMMER2_OBJTYPE_FIFO           4
+#define HAMMER2_OBJTYPE_CDEV           5
+#define HAMMER2_OBJTYPE_BDEV           6
+#define HAMMER2_OBJTYPE_SOFTLINK       7
+#define HAMMER2_OBJTYPE_HARDLINK       8       /* dummy entry for hardlink */
+#define HAMMER2_OBJTYPE_SOCKET         9
+#define HAMMER2_OBJTYPE_WHITEOUT       10
+
+/* Copy ids; LOCAL evaluates to 255 (all bits set in a uint8_t) */
+#define HAMMER2_COPYID_NONE            0
+#define HAMMER2_COPYID_LOCAL           ((uint8_t)-1)
+
+/* Compression method codes */
+#define HAMMER2_COMP_NONE              0
+#define HAMMER2_COMP_AUTOZERO          1
+
+/* Check method codes */
+#define HAMMER2_CHECK_NONE             0
+#define HAMMER2_CHECK_ICRC             1
+
+/* PFS node types (presumably for hammer2_inode_data.pfs_type) */
+#define HAMMER2_PFSTYPE_NONE           0
+#define HAMMER2_PFSTYPE_ADMIN          1
+#define HAMMER2_PFSTYPE_CACHE          2
+#define HAMMER2_PFSTYPE_COPY           3
+#define HAMMER2_PFSTYPE_SLAVE          4
+#define HAMMER2_PFSTYPE_SOFT_SLAVE     5
+#define HAMMER2_PFSTYPE_SOFT_MASTER    6
+#define HAMMER2_PFSTYPE_MASTER         7
+
+/*
+ * The allocref structure represents the allocation table.  One 64K block
+ * is broken down into 4096 x 16 byte entries.  Each indirect block chops
+ * 12 bits off the 64-bit storage space, with leaf entries representing
+ * 64KB blocks.  So:  (12, 12, 12, 12, 16) = 64 bit storage space.
+ *
+ * Each 64K freemap block breaks the 4096 entries into a 64x64 tree with
+ * big_hint1 representing the top level every 64th entry and big_hint2
+ * representing the lower level in each entry.  These fields specify the
+ * largest contiguous radix (1-63) available for allocation in the related
+ * sub-tree.  The largest contiguous radix available for the entire block
+ * is saved in the parent (for the root this will be alloc_blockref in the
+ * volume header).  The hints may be larger than actual and will be corrected
+ * on the fly but must not be smaller.  The allocator uses the hints to
+ * very quickly locate nearby blocks of the desired size.
+ *
+ * In indirect blocks the 64-bit free[_or_mask] field stores the total free
+ * space for each of the 4096 sub-nodes in bytes.  The total free space
+ * represented by the indirect block is stored in its parent.
+ *
+ * Each leaf element represents a 64K block.  A bitmap replaces the free space
+ * count, giving us a 1KB allocation resolution.  A micro-allocation append
+ * offset replaces the icrc field.  The micro-allocation feature is not
+ * currently implemented and the field will be set to 65536.
+ *
+ * The allocation map uses reserved blocks so no data block reference is
+ * required, only a bit in the flags field to specify which of two possible
+ * reserved blocks to use.  This allows the allocation map to be flushed to
+ * disk with minimal synchronization.
+ */
+struct hammer2_allocref {
+       uint32_t        icrc_or_app;    /* node: icrc, leaf: append offset */
+       uint16_t        flags;          /* HAMMER2_ALLOCREF_* (ALTx, LEAF) */
+       uint8_t         big_hint1;      /* upper level hint */
+       uint8_t         big_hint2;      /* lower level hint */
+       uint64_t        free_or_mask;   /* node: free bytes, leaf: bitmask */
+};                                     /* 4+2+1+1+8 = 16 bytes per entry */
+
+typedef struct hammer2_allocref hammer2_allocref_t;
+
+/*
+ * WARNING - allocref size x entries must equate to the hammer buffer size,
+ *          and 12 bits per recursion is assumed by the allocator.
+ *
+ * ALTA-D      Since no data_offset is specified flags are needed to select
+ *             which sub-block to recurse down into for root & internal nodes.
+ *             (only ALTA and ALTB are currently supported).
+ *
+ * LEAF                Terminal entry, always set for leafs.  May be used to support
+ *             4MB extent allocations and early termination in the future.
+ *             (not required to shortcut allocation scans as the big_hint1/2
+ *             fields are used for this).
+ */
+#define HAMMER2_ALLOCREF_BYTES         16      /* structure size */
+#define HAMMER2_ALLOCREF_ENTRIES       4096    /* entries */
+#define HAMMER2_ALLOCREF_RADIX         12      /* log2(entries) */
+
+#if (HAMMER2_ALLOCREF_BYTES * HAMMER2_ALLOCREF_ENTRIES) != HAMMER2_PBUFSIZE
+#error "allocref parameters do not fit in hammer buffer"
+#endif
+#if (1 << HAMMER2_ALLOCREF_RADIX) != HAMMER2_ALLOCREF_ENTRIES
+#error "allocref parameters are inconsistent"
+#endif
+
+#define HAMMER2_ALLOCREF_ALTMASK       0x0003  /* select block for recurse */
+#define HAMMER2_ALLOCREF_ALTA          0x0000
+#define HAMMER2_ALLOCREF_ALTB          0x0001
+#define HAMMER2_ALLOCREF_ALTC          0x0002  /* unsupported */
+#define HAMMER2_ALLOCREF_ALTD          0x0003  /* unsupported */
+#define HAMMER2_ALLOCREF_LEAF          0x0004
+
+/*
+ * All HAMMER2 directories directly under the super-root on your local
+ * media can be mounted separately, even if they share the same physical
+ * device.
+ *
+ * When you do a HAMMER2 mount you are effectively tying into a HAMMER2
+ * cluster via local media.  The local media does not have to participate
+ * in the cluster, other than to provide the hammer2_copy_data[] array and
+ * root inode for the mount.
+ *
+ * This is important: The mount device path you specify serves to bootstrap
+ * your entry into the cluster, but your mount will make active connections
+ * to ALL copy elements in the hammer2_copy_data[] array which match the
+ * PFSID of the directory in the super-root that you specified.  The local
+ * media path does not have to be mentioned in this array but becomes part
+ * of the cluster based on its type and access rights.  ALL ELEMENTS ARE
+ * TREATED ACCORDING TO TYPE NO MATTER WHICH ONE YOU MOUNT FROM.
+ *
+ * The actual cluster may be far larger than the elements you list in the
+ * hammer2_copy_data[] array.  You list only the elements you wish to
+ * directly connect to and you are able to access the rest of the cluster
+ * indirectly through those connections.
+ *
+ * This structure must be exactly 128 bytes long.
+ */
+struct hammer2_copy_data {
+       uint8_t copyid;         /* 00    copyid 0-255 (must match slot) */
+       uint8_t inprog;         /* 01    operation in progress, or 0 */
+       uint8_t chain_to;       /* 02    operation chaining to, or 0 */
+       uint8_t chain_from;     /* 03    operation chaining from, or 0 */
+       uint16_t flags;         /* 04-05 flags field */
+       uint8_t error;          /* 06    last operational error */
+       uint8_t priority;       /* 07    priority and round-robin flag */
+       uint8_t remote_pfstype; /* 08    probed direct remote PFS type */
+       uint8_t reserved08[23]; /* 09-1F (starts at 09 despite the name) */
+       uuid_t  pfs_id;         /* 20-2F copy target must match this uuid */
+       uint8_t label[16];      /* 30-3F import/export label */
+       uint8_t path[64];       /* 40-7F target specification string or key */
+};                             /* 0x80 = 128 bytes total */
+
+typedef struct hammer2_copy_data hammer2_copy_data_t;
+
+#define COPYDATAF_ENABLED      0x0001
+#define COPYDATAF_INPROG       0x0002
+#define COPYDATAF_CONN_RR      0x80    /* round-robin at same priority */
+#define COPYDATAF_CONN_EF      0x40    /* media errors flagged */
+#define COPYDATAF_CONN_PRI     0x0F    /* select priority 0-15 (15=best) */
+
+/*
+ * The volume header eats a 64K block.  There is currently an issue where
+ * we want to try to fit all nominal filesystem updates in a 512-byte section
+ * but it may be a lost cause due to the need for a blockset.
+ *
+ * All information is stored in host byte order.  The volume header's magic
+ * number may be checked to determine the byte order.  If you wish to mount
+ * between machines w/ different endian modes you'll need filesystem code
+ * which acts on the media data consistently (either all one way or all the
+ * other).  Our code currently does not do that.
+ *
+ * A read-write mount may have to recover missing allocations by doing an
+ * incremental mirror scan looking for modifications made after alloc_tid.
+ * If alloc_tid == last_tid then no recovery operation is needed.  Recovery
+ * operations are usually very, very fast.
+ *
+ * Read-only mounts do not need to do any recovery, access to the filesystem
+ * topology is always consistent after a crash (is always consistent, period).
+ * However, there may be shortcutted blockref updates present from deep in
+ * the tree which are stored in the volume header and must be tracked on
+ * the fly.
+ *
+ * COPIES: Multiple copies may be specified on the mount line AND/OR you
+ *        just specify one and the mount code tries to pick up the others
+ *        from copyinfo[].  The copyid field in the volume header along
+ *        with the fsid validates the copies.
+ *
+ * NOTE: root_blockref points to the super-root directory, not the root
+ *      directory.  The root directory will be a subdirectory under the
+ *      super-root.
+ *
+ *      The super-root directory contains all root directories and all
+ *      snapshots (readonly or writable).  It is possible to do a
+ *      null-mount of the super-root using special path constructions
+ *      relative to your mounted root.
+ *
+ * NOTE: HAMMER2 allows any subdirectory tree to be managed as if it were
+ *      a PFS, including mirroring and storage quota operations, and this is
+ *      preferred over creating discrete PFSs in the super-root.  Instead
+ *      the super-root is most typically used to create writable snapshots,
+ *      alternative roots, and so forth.  The super-root is also used by
+ *      the automatic snapshotting mechanism.
+ */
+#define HAMMER2_VOLUME_ID_HBO  0x48414d3205172011LLU
+#define HAMMER2_VOLUME_ID_ABO  0x11201705324d4148LLU
+
+#define HAMMER2_COPYID_COUNT   256
+
+struct hammer2_volume_data {
+       /*
+        * sector #0 - 512 bytes
+        */
+       uint64_t        magic;                  /* 0000 Signature */
+       hammer2_off_t   boot_beg;               /* 0008 Boot area (future) */
+       hammer2_off_t   boot_end;               /* 0010 (size = end - beg) */
+       hammer2_off_t   aux_beg;                /* 0018 Aux area (future) */
+       hammer2_off_t   aux_end;                /* 0020 (size = end - beg) */
+       hammer2_off_t   volu_size;              /* 0028 Volume size, bytes */
+
+       uint32_t        version;                /* 0030 */
+       uint32_t        flags;                  /* 0034 */
+       uint8_t         copyid;                 /* 0038 copyid of phys vol */
+       uint8_t         freemap_version;        /* 0039 freemap algorithm */
+       uint8_t         reserved003A;           /* 003A */
+       uint8_t         reserved003B;           /* 003B */
+       uint32_t        reserved003C;           /* 003C */
+
+       uuid_t          fsid;                   /* 0040 */
+       uuid_t          fstype;                 /* 0050 */
+
+       /*
+        * allocator_size is precalculated at newfs time and does not include
+        * reserved blocks, boot, or redo areas.
+        *
+        * Initial non-reserved-area allocations do not use the allocation
+        * map but instead adjust alloc_iterator.  Dynamic allocations take
+        * over starting at (allocator_beg).  This makes newfs_hammer2's
+        * job a lot easier and can also serve as a testing jig.
+        *
+        * NOTE(review): allocator_beg is declared hammer2_tid_t but
+        * hammer2_freemap_alloc() treats it as a media byte offset;
+        * confirm whether hammer2_off_t was intended.
+        */
+       hammer2_off_t   allocator_size;         /* 0060 Total data space */
+       hammer2_off_t   allocator_free;         /* 0068 Free space */
+       hammer2_tid_t   allocator_beg;          /* 0070 Initial allocations */
+       hammer2_tid_t   last_tid;               /* 0078 Last transaction id */
+       hammer2_tid_t   alloc_tid;              /* 0080 Alloctable modify tid */
+       hammer2_blockref_t alloc_blockref;      /* 0088-00C7 */
+
+       /*
+        * Copyids are allocated dynamically from the copyexists bitmap.
+        * An id from the active copies set (up to 8, see copyinfo later on)
+        * may still exist after the copy set has been removed from the
+        * volume header and its bit will remain active in the bitmap and
+        * cannot be reused until it is 100% removed from the hierarchy.
+        */
+       uint32_t        copyexists[8];          /* 00C8-00E7 copy exists bmap */
+                                               /* (8 x 32 = 256 bits) */
+       char            reserved0140[248];      /* 00E8-01DF (name does not */
+                                               /* match the offset) */
+
+       /*
+        * 32 bit CRC array at the end of the first 512 byte sector.
+        *
+        * icrc_sects[7] - First 512-4 bytes of volume header (including all
+        *                 the other icrc's except the last one).
+        *
+        * icrc_sects[6] - Second 512 bytes of volume header (the whole
+        *                 sector, see HAMMER2_VOLUME_ICRC1_SIZE), which is
+        *                 the blockset for the super-root.
+        */
+       hammer2_crc32_t icrc_sects[8];          /* 01E0-01FF */
+
+       /*
+        * sector #1 - 512 bytes
+        *
+        * The entire sector is used by a blockset.
+        */
+       hammer2_blockset_t sroot_blockset;      /* 0200-03FF Superroot dir */
+
+       /*
+        * sector #2-7
+        */
+       char    sector2[512];                   /* 0400-05FF reserved */
+       char    sector3[512];                   /* 0600-07FF reserved */
+       char    sector4[512];                   /* 0800-09FF reserved */
+       char    sector5[512];                   /* 0A00-0BFF reserved */
+       char    sector6[512];                   /* 0C00-0DFF reserved */
+       char    sector7[512];                   /* 0E00-0FFF reserved */
+
+       /*
+        * sector #8-71 - 32768 bytes
+        *
+        * Contains the configuration for up to 256 copyinfo targets.  These
+        * specify local and remote copies operating as masters or slaves.
+        * copyid's 0 and 255 are reserved (0 indicates an empty slot and 255
+        * indicates the local media).
+        *
+        * Each inode contains a set of up to 8 copyids, either inherited
+        * from its parent or explicitly specified in the inode, which
+        * indexes into this array.
+        */
+                                               /* 1000-8FFF copyinfo config */
+       struct hammer2_copy_data copyinfo[HAMMER2_COPYID_COUNT];
+
+       /*
+        * Remaining sections are reserved for future use.  (The field name
+        * does not match its offset; it pads the header out to FFFB.)
+        */
+       char            reserved0400[0x6FFC];   /* 9000-FFFB reserved */
+
+       /*
+        * icrc on entire volume header
+        */
+       hammer2_crc32_t icrc_volheader;         /* FFFC-FFFF full volume icrc*/
+};
+
+typedef struct hammer2_volume_data hammer2_volume_data_t;
+
+/*
+ * Various parts of the volume header have their own iCRCs.
+ *
+ * The first 512 bytes has its own iCRC stored at the end of the 512 bytes
+ * and is not included in the icrc calculation.
+ *
+ * The second 512 bytes also has its own iCRC but it is stored in the first
+ * 512 bytes so it covers the entire second 512 bytes.
+ *
+ * The whole volume block (64KB) has an iCRC covering all but the last 4 bytes,
+ * which is where the iCRC for the whole volume is stored.  This is currently
+ * a catch-all for anything not individually iCRCd.
+ */
+#define HAMMER2_VOL_ICRC_SECT0         7
+#define HAMMER2_VOL_ICRC_SECT1         6
+
+#define HAMMER2_VOLUME_BYTES           65536
+
+#define HAMMER2_VOLUME_ICRC0_OFF       0
+#define HAMMER2_VOLUME_ICRC1_OFF       512
+#define HAMMER2_VOLUME_ICRCVH_OFF      0
+
+#define HAMMER2_VOLUME_ICRC0_SIZE      (512 - 4)
+#define HAMMER2_VOLUME_ICRC1_SIZE      (512)
+#define HAMMER2_VOLUME_ICRCVH_SIZE     (65536 - 4)
+
+#define HAMMER2_VOL_VERSION_MIN                1
+#define HAMMER2_VOL_VERSION_DEFAULT    1
+#define HAMMER2_VOL_VERSION_WIP        2
+
+#define HAMMER2_NUM_VOLHDRS            4
+
+/*
+ * Union overlaying an inode (ipdata), an indirect block (npdata) and a
+ * raw HAMMER2_PBUFSIZE-byte buffer (buf) on the same storage.
+ */
+union hammer2_media_data {
+        hammer2_inode_data_t    ipdata;
+       hammer2_indblock_data_t npdata;
+       char                    buf[HAMMER2_PBUFSIZE];
+};
+
+typedef union hammer2_media_data hammer2_media_data_t;
+
+/*
+ * Prototypes for user & kernel functions.  Kernel-only prototypes are
+ * elsewhere.
+ */
+uint32_t hammer2_icrc32(const void *buf, size_t size);
+uint32_t hammer2_icrc32c(const void *buf, size_t size, uint32_t crc);
+
+#endif
diff --git a/sys/vfs/hammer2/hammer2_freemap.c b/sys/vfs/hammer2/hammer2_freemap.c
new file mode 100644 (file)
index 0000000..e39b274
--- /dev/null
@@ -0,0 +1,156 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/buf.h>
+#include <sys/proc.h>
+#include <sys/namei.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+#include <sys/mountctl.h>
+
+#include "hammer2.h"
+
+/*
+ * Allocate media space, returning a combined data offset and radix.
+ *
+ * XXX when diving a new full block create a clean empty buffer and bqrelse()
+ *     it, so small data structures do not have to issue read-IO when they
+ *     do the read-modify-write on the backing store.
+ */
+hammer2_off_t
+hammer2_freemap_alloc(hammer2_mount_t *hmp, int type, size_t bytes)
+{
+       hammer2_off_t data_off;
+       hammer2_off_t data_next;
+       /*struct buf *bp;*/
+       int radix;
+       int fctype;
+
+       switch(type) {
+       case HAMMER2_BREF_TYPE_INODE:
+               fctype = HAMMER2_FREECACHE_INODE;
+               break;
+       case HAMMER2_BREF_TYPE_INDIRECT:
+               fctype = HAMMER2_FREECACHE_INODE;
+               break;
+       case HAMMER2_BREF_TYPE_DATA:
+               fctype = HAMMER2_FREECACHE_DATA;
+               break;
+       default:
+               fctype = HAMMER2_FREECACHE_DATA;
+               break;
+       }
+
+       /*
+        * Figure out the base 2 radix of the allocation (rounded up)
+        */
+       radix = hammer2_bytes_to_radix(bytes);
+       bytes = 1 << radix;
+
+       lockmgr(&hmp->alloclk, LK_EXCLUSIVE);
+       if (radix <= HAMMER2_MAX_RADIX && hmp->freecache[fctype][radix]) {
+               /*
+                * Allocate from our packing cache
+                */
+               data_off = hmp->freecache[fctype][radix];
+               hmp->freecache[fctype][radix] += bytes;
+               if ((hmp->freecache[fctype][radix] & HAMMER2_SEGMASK) == 0)
+                       hmp->freecache[fctype][radix] = 0;
+       } else {
+               /*
+                * Allocate from the allocation iterator using a SEGSIZE
+                * aligned block and reload the packing cache if possible.
+                */
+               data_off = hmp->voldata.allocator_beg;
+               data_off = (data_off + HAMMER2_SEGMASK64) & ~HAMMER2_SEGMASK64;
+               data_next = data_off + bytes;
+
+               if ((data_next & HAMMER2_SEGMASK) == 0) {
+                       hmp->voldata.allocator_beg = data_next;
+               } else {
+                       KKASSERT(radix <= HAMMER2_MAX_RADIX);
+                       hmp->voldata.allocator_beg =
+                                       (data_next + HAMMER2_SEGMASK64) &
+                                       ~HAMMER2_SEGMASK64;
+                       hmp->freecache[fctype][radix] = data_next;
+               }
+       }
+       lockmgr(&hmp->alloclk, LK_RELEASE);
+
+#if 0
+       /*
+        * Allocations on-media are always in multiples of 64K but
+        * partial-block allocations can be tracked in-memory.
+        *
+        * We can reduce the need for read-modify-write IOs by
+        * telling the kernel that the contents of a new 64K block is
+        * initially good (before we use any of it).
+        *
+        * Worst case is the kernel evicts the buffer and causes HAMMER2's
+        * bread later on to actually issue a read I/O.
+        *
+        * XXX Maybe do this in SEGSIZE increments? Needs a lot of work.
+        *     Also watch out for buffer size mismatches.
+        */
+       if (bytes < HAMMER2_MINIOSIZE &&
+           (data_off & (HAMMER2_MINIOSIZE - 1)) == 0) {
+               bp = getblk(hmp->devvp, data_off, HAMMER2_MINIOSIZE, 0, 0);
+               bp->b_flags |= B_CACHE;
+               bp->b_resid = 0;
+               bqrelse(bp);
+       }
+#endif
+
+       if (hammer2_debug & 0x0001) {
+               kprintf("hammer2: allocate %d %016jx: %zd\n",
+                       type, (intmax_t)data_off, bytes);
+       }
+       return (data_off | radix);
+}
+
+#if 0
+/*
+ * Allocate media space, returning a combined data offset and radix.
+ * Also return the related (device) buffer cache buffer.
+ *
+ * Not implemented: this is an empty placeholder that is compiled out by
+ * the surrounding #if 0.  It has no return statement and must be given
+ * a body before it is enabled.
+ */
+hammer2_off_t
+hammer2_freemap_alloc_bp(hammer2_mount_t *hmp, size_t bytes, struct buf **bpp)
+{
+}
+
+#endif
diff --git a/sys/vfs/hammer2/hammer2_icrc.c b/sys/vfs/hammer2/hammer2_icrc.c
new file mode 100644 (file)
index 0000000..d18a866
--- /dev/null
@@ -0,0 +1,147 @@
+/*-
+ * Copyright (c) 2005-2010 Daniel Braniss <danny@cs.huji.ac.il>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ | iSCSI
+ | $Id: isc_subr.c 560 2009-05-07 07:37:49Z danny $
+ */
+
+#include <sys/types.h>
+#include <sys/uuid.h>
+
+#include "hammer2_disk.h"
+
+/*****************************************************************/
+/*                                                               */
+/* CRC LOOKUP TABLE                                              */
+/* ================                                              */
+/* The following CRC lookup table was generated automagically    */
+/* by the Rocksoft^tm Model CRC Algorithm Table Generation       */
+/* Program V1.0 using the following model parameters:            */
+/*                                                               */
+/*    Width   : 4 bytes.                                         */
+/*    Poly    : 0x1EDC6F41L                                      */
+/*    Reverse : TRUE.                                            */
+/*                                                               */
+/* For more information on the Rocksoft^tm Model CRC Algorithm,  */
+/* see the document titled "A Painless Guide to CRC Error        */
+/* Detection Algorithms" by Ross Williams                        */
+/* (ross@guest.adelaide.edu.au.). This document is likely to be  */
+/* in the FTP archive "ftp.adelaide.edu.au/pub/rocksoft".        */
+/*                                                               */
+/*****************************************************************/
+
+/*
+ * Lookup table indexed by the low 8 bits of (crc ^ input byte).  It is
+ * only ever read, so it is declared const.
+ */
+static const uint32_t crc32Table[256] = {
+    0x00000000L, 0xF26B8303L, 0xE13B70F7L, 0x1350F3F4L,
+    0xC79A971FL, 0x35F1141CL, 0x26A1E7E8L, 0xD4CA64EBL,
+    0x8AD958CFL, 0x78B2DBCCL, 0x6BE22838L, 0x9989AB3BL,
+    0x4D43CFD0L, 0xBF284CD3L, 0xAC78BF27L, 0x5E133C24L,
+    0x105EC76FL, 0xE235446CL, 0xF165B798L, 0x030E349BL,
+    0xD7C45070L, 0x25AFD373L, 0x36FF2087L, 0xC494A384L,
+    0x9A879FA0L, 0x68EC1CA3L, 0x7BBCEF57L, 0x89D76C54L,
+    0x5D1D08BFL, 0xAF768BBCL, 0xBC267848L, 0x4E4DFB4BL,
+    0x20BD8EDEL, 0xD2D60DDDL, 0xC186FE29L, 0x33ED7D2AL,
+    0xE72719C1L, 0x154C9AC2L, 0x061C6936L, 0xF477EA35L,
+    0xAA64D611L, 0x580F5512L, 0x4B5FA6E6L, 0xB93425E5L,
+    0x6DFE410EL, 0x9F95C20DL, 0x8CC531F9L, 0x7EAEB2FAL,
+    0x30E349B1L, 0xC288CAB2L, 0xD1D83946L, 0x23B3BA45L,
+    0xF779DEAEL, 0x05125DADL, 0x1642AE59L, 0xE4292D5AL,
+    0xBA3A117EL, 0x4851927DL, 0x5B016189L, 0xA96AE28AL,
+    0x7DA08661L, 0x8FCB0562L, 0x9C9BF696L, 0x6EF07595L,
+    0x417B1DBCL, 0xB3109EBFL, 0xA0406D4BL, 0x522BEE48L,
+    0x86E18AA3L, 0x748A09A0L, 0x67DAFA54L, 0x95B17957L,
+    0xCBA24573L, 0x39C9C670L, 0x2A993584L, 0xD8F2B687L,
+    0x0C38D26CL, 0xFE53516FL, 0xED03A29BL, 0x1F682198L,
+    0x5125DAD3L, 0xA34E59D0L, 0xB01EAA24L, 0x42752927L,
+    0x96BF4DCCL, 0x64D4CECFL, 0x77843D3BL, 0x85EFBE38L,
+    0xDBFC821CL, 0x2997011FL, 0x3AC7F2EBL, 0xC8AC71E8L,
+    0x1C661503L, 0xEE0D9600L, 0xFD5D65F4L, 0x0F36E6F7L,
+    0x61C69362L, 0x93AD1061L, 0x80FDE395L, 0x72966096L,
+    0xA65C047DL, 0x5437877EL, 0x4767748AL, 0xB50CF789L,
+    0xEB1FCBADL, 0x197448AEL, 0x0A24BB5AL, 0xF84F3859L,
+    0x2C855CB2L, 0xDEEEDFB1L, 0xCDBE2C45L, 0x3FD5AF46L,
+    0x7198540DL, 0x83F3D70EL, 0x90A324FAL, 0x62C8A7F9L,
+    0xB602C312L, 0x44694011L, 0x5739B3E5L, 0xA55230E6L,
+    0xFB410CC2L, 0x092A8FC1L, 0x1A7A7C35L, 0xE811FF36L,
+    0x3CDB9BDDL, 0xCEB018DEL, 0xDDE0EB2AL, 0x2F8B6829L,
+    0x82F63B78L, 0x709DB87BL, 0x63CD4B8FL, 0x91A6C88CL,
+    0x456CAC67L, 0xB7072F64L, 0xA457DC90L, 0x563C5F93L,
+    0x082F63B7L, 0xFA44E0B4L, 0xE9141340L, 0x1B7F9043L,
+    0xCFB5F4A8L, 0x3DDE77ABL, 0x2E8E845FL, 0xDCE5075CL,
+    0x92A8FC17L, 0x60C37F14L, 0x73938CE0L, 0x81F80FE3L,
+    0x55326B08L, 0xA759E80BL, 0xB4091BFFL, 0x466298FCL,
+    0x1871A4D8L, 0xEA1A27DBL, 0xF94AD42FL, 0x0B21572CL,
+    0xDFEB33C7L, 0x2D80B0C4L, 0x3ED04330L, 0xCCBBC033L,
+    0xA24BB5A6L, 0x502036A5L, 0x4370C551L, 0xB11B4652L,
+    0x65D122B9L, 0x97BAA1BAL, 0x84EA524EL, 0x7681D14DL,
+    0x2892ED69L, 0xDAF96E6AL, 0xC9A99D9EL, 0x3BC21E9DL,
+    0xEF087A76L, 0x1D63F975L, 0x0E330A81L, 0xFC588982L,
+    0xB21572C9L, 0x407EF1CAL, 0x532E023EL, 0xA145813DL,
+    0x758FE5D6L, 0x87E466D5L, 0x94B49521L, 0x66DF1622L,
+    0x38CC2A06L, 0xCAA7A905L, 0xD9F75AF1L, 0x2B9CD9F2L,
+    0xFF56BD19L, 0x0D3D3E1AL, 0x1E6DCDEEL, 0xEC064EEDL,
+    0xC38D26C4L, 0x31E6A5C7L, 0x22B65633L, 0xD0DDD530L,
+    0x0417B1DBL, 0xF67C32D8L, 0xE52CC12CL, 0x1747422FL,
+    0x49547E0BL, 0xBB3FFD08L, 0xA86F0EFCL, 0x5A048DFFL,
+    0x8ECEE914L, 0x7CA56A17L, 0x6FF599E3L, 0x9D9E1AE0L,
+    0xD3D3E1ABL, 0x21B862A8L, 0x32E8915CL, 0xC083125FL,
+    0x144976B4L, 0xE622F5B7L, 0xF5720643L, 0x07198540L,
+    0x590AB964L, 0xAB613A67L, 0xB831C993L, 0x4A5A4A90L,
+    0x9E902E7BL, 0x6CFBAD78L, 0x7FAB5E8CL, 0x8DC0DD8FL,
+    0xE330A81AL, 0x115B2B19L, 0x020BD8EDL, 0xF0605BEEL,
+    0x24AA3F05L, 0xD6C1BC06L, 0xC5914FF2L, 0x37FACCF1L,
+    0x69E9F0D5L, 0x9B8273D6L, 0x88D28022L, 0x7AB90321L,
+    0xAE7367CAL, 0x5C18E4C9L, 0x4F48173DL, 0xBD23943EL,
+    0xF36E6F75L, 0x0105EC76L, 0x12551F82L, 0xE03E9C81L,
+    0x34F4F86AL, 0xC69F7B69L, 0xD5CF889DL, 0x27A40B9EL,
+    0x79B737BAL, 0x8BDCB4B9L, 0x988C474DL, 0x6AE7C44EL,
+    0xBE2DA0A5L, 0x4C4623A6L, 0x5F16D052L, 0xAD7D5351L
+};
+
+/*
+ * Compute the table-driven 32-bit CRC of the size bytes at buf, starting
+ * from an initial CRC value of 0.  The running value is inverted before
+ * the first byte and again before it is returned.
+ */
+uint32_t
+hammer2_icrc32(const void *buf, size_t size)
+{
+     const uint8_t *bytes = buf;
+     uint32_t accum = 0xffffffff;
+     size_t i;
+
+     for (i = 0; i < size; ++i)
+         accum = (accum >> 8) ^ crc32Table[(accum ^ bytes[i]) & 0xff];
+     return (accum ^ 0xffffffff);
+}
+
+/*
+ * Same computation as hammer2_icrc32() but seeded with the caller's crc
+ * instead of 0.  The seed is inverted on entry and the result is
+ * inverted on exit.
+ */
+uint32_t
+hammer2_icrc32c(const void *buf, size_t size, uint32_t crc)
+{
+     const uint8_t *src = buf;
+     uint32_t state = crc ^ 0xffffffff;
+
+     for (; size != 0; --size, ++src)
+         state = (state >> 8) ^ crc32Table[(state ^ *src) & 0xff];
+     return (state ^ 0xffffffff);
+}
diff --git a/sys/vfs/hammer2/hammer2_inode.c b/sys/vfs/hammer2/hammer2_inode.c
new file mode 100644 (file)
index 0000000..3a4b18d
--- /dev/null
@@ -0,0 +1,590 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#include <sys/cdefs.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/types.h>
+#include <sys/lock.h>
+#include <sys/uuid.h>
+
+#include "hammer2.h"
+
+/*
+ * Adding a ref to an inode is only legal if the inode already has at least
+ * one ref.
+ */
+void
+hammer2_inode_ref(hammer2_inode_t *ip)
+{
+       hammer2_chain_t *chain = &ip->chain;
+
+       /* An inode reference is a reference on the inode's own chain */
+       hammer2_chain_ref(ip->hmp, chain);
+}
+
+/*
+ * Drop an inode reference, freeing the inode when the last reference goes
+ * away.
+ */
+void
+hammer2_inode_drop(hammer2_inode_t *ip)
+{
+       hammer2_chain_t *chain = &ip->chain;
+
+       /* Release one reference on the inode's own chain */
+       hammer2_chain_drop(ip->hmp, chain);
+}
+
+/*
+ * Get the vnode associated with the given inode, allocating the vnode if
+ * necessary.
+ *
+ * Great care must be taken to avoid deadlocks and vnode acquisition/reclaim
+ * races.
+ *
+ * The vnode will be returned exclusively locked and referenced.  The
+ * reference on the vnode prevents it from being reclaimed.
+ *
+ * The inode (ip) must be referenced by the caller and not locked to avoid
+ * it getting ripped out from under us or deadlocked.
+ */
+struct vnode *
+hammer2_igetv(hammer2_inode_t *ip, int *errorp)
+{
+       struct vnode *vp;
+       hammer2_pfsmount_t *pmp;
+
+       pmp = ip->pmp;
+       KKASSERT(pmp != NULL);
+       *errorp = 0;
+
+       for (;;) {
+               /*
+                * Attempt to reuse an existing vnode assignment.  It is
+                * possible to race a reclaim so the vget() may fail.  The
+                * inode must be unlocked during the vget() to avoid a
+                * deadlock against a reclaim.
+                */
+               vp = ip->vp;
+               if (vp) {
+                       /*
+                        * Lock the inode and check for a reclaim race
+                        */
+                       hammer2_inode_lock_ex(ip);
+                       if (ip->vp != vp) {
+                               hammer2_inode_unlock_ex(ip);
+                               continue;
+                       }
+
+                       /*
+                        * Inode must be unlocked during the vget() to avoid
+                        * possible deadlocks, vnode is held to prevent
+                        * destruction during the vget().  The vget() can
+                        * still fail if we lost a reclaim race on the vnode.
+                        */
+                       vhold_interlocked(vp);
+                       hammer2_inode_unlock_ex(ip);
+                       if (vget(vp, LK_EXCLUSIVE)) {
+                               vdrop(vp);
+                               continue;
+                       }
+                       vdrop(vp);
+                       /* vp still locked and ref from vget */
+                       *errorp = 0;
+                       break;
+               }
+
+               /*
+                * No vnode exists, allocate a new vnode.  Beware of
+                * allocation races.  This function will return an
+                * exclusively locked and referenced vnode.
+                */
+               *errorp = getnewvnode(VT_HAMMER2, pmp->mp, &vp, 0, 0);
+               if (*errorp) {
+                       vp = NULL;
+                       break;
+               }
+
+               /*
+                * Lock the inode and check for an allocation race.
+                */
+               hammer2_inode_lock_ex(ip);
+               if (ip->vp != NULL) {
+                       vp->v_type = VBAD;
+                       vx_put(vp);
+                       hammer2_inode_unlock_ex(ip);
+                       continue;
+               }
+
+               /*
+                * Translate the on-media object type into a vnode type.
+                * Regular files and softlinks also get a VM/buffer-cache
+                * association sized from the inode's recorded size.
+                */
+               switch (ip->ip_data.type) {
+               case HAMMER2_OBJTYPE_DIRECTORY:
+                       vp->v_type = VDIR;
+                       break;
+               case HAMMER2_OBJTYPE_REGFILE:
+                       vp->v_type = VREG;
+                       vinitvmio(vp, ip->ip_data.size,
+                                 HAMMER2_LBUFSIZE,
+                                 (int)ip->ip_data.size & HAMMER2_LBUFMASK);
+                       break;
+               case HAMMER2_OBJTYPE_SOFTLINK:
+                       /*
+                        * XXX for now we are using the generic file_read
+                        * and file_write code so we need a buffer cache
+                        * association.
+                        */
+                       vp->v_type = VLNK;
+                       vinitvmio(vp, ip->ip_data.size,
+                                 HAMMER2_LBUFSIZE,
+                                 (int)ip->ip_data.size & HAMMER2_LBUFMASK);
+                       break;
+               /* XXX FIFO */
+               default:
+                       panic("hammer2: unhandled objtype %d",
+                             ip->ip_data.type);
+                       break;
+               }
+
+               if (ip == pmp->iroot)
+                       vsetflags(vp, VROOT);
+
+               vp->v_data = ip;
+               ip->vp = vp;
+               hammer2_chain_ref(ip->hmp, &ip->chain); /* vp association */
+               hammer2_inode_unlock_ex(ip);
+               break;
+       }
+
+       /*
+        * Return non-NULL vp and *errorp == 0, or NULL vp and *errorp != 0.
+        *
+        * The debug trace dereferences vp, so it must be skipped on the
+        * getnewvnode() failure path where vp is NULL.
+        */
+       if (vp != NULL && (hammer2_debug & 0x0002)) {
+               kprintf("igetv vp %p refs %d aux %d\n",
+                       vp, vp->v_sysref.refcnt, vp->v_auxrefs);
+       }
+       return (vp);
+}
+
+/*
+ * Create a new inode in the specified directory using the vattr to
+ * figure out the type of inode.
+ *
+ * If no error occurs the new inode with its chain locked is returned in
+ * *nipp, otherwise an error is returned and *nipp is set to NULL.
+ *
+ * If vap and/or cred are NULL the related fields are not set and the
+ * inode type defaults to a directory.  This is used when creating PFSs
+ * under the super-root, so the inode number is set to 1 in this case.
+ */
+int
+hammer2_inode_create(hammer2_inode_t *dip,
+                    struct vattr *vap, struct ucred *cred,
+                    const uint8_t *name, size_t name_len,
+                    hammer2_inode_t **nipp)
+{
+       /* NOTE: (cred) is accepted but never referenced in this function */
+       hammer2_mount_t *hmp = dip->hmp;
+       hammer2_chain_t *chain;
+       hammer2_chain_t *parent;
+       hammer2_inode_t *nip;
+       hammer2_key_t lhc;
+       int error;
+
+       /* Starting directory key, derived from the name by hammer2_dirhash() */
+       lhc = hammer2_dirhash(name, name_len);
+
+       /*
+        * Locate the inode or indirect block to create the new
+        * entry in.  At the same time check for key collisions
+        * and iterate until we don't get one.
+        */
+       parent = &dip->chain;
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
+
+       error = 0;
+       while (error == 0) {
+               chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
+               if (chain == NULL)
+                       break;
+               /*
+                * Something already occupies this key.  If the bits covered
+                * by HAMMER2_DIRHASH_LOMASK are already all set, give up
+                * with ENOSPC; otherwise the loop retries with lhc + 1.
+                */
+               if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
+                       error = ENOSPC;
+               hammer2_chain_unlock(hmp, chain);
+               chain = NULL;
+               ++lhc;
+       }
+       if (error == 0) {
+               /* A NULL return from hammer2_chain_create() is reported as EIO */
+               chain = hammer2_chain_create(hmp, parent, NULL, lhc, 0,
+                                            HAMMER2_BREF_TYPE_INODE,
+                                            HAMMER2_INODE_BYTES);
+               if (chain == NULL)
+                       error = EIO;
+       }
+       hammer2_chain_unlock(hmp, parent);
+
+       /*
+        * Handle the error case
+        */
+       if (error) {
+               KKASSERT(chain == NULL);
+               *nipp = NULL;
+               return (error);
+       }
+
+       /*
+        * Set up the new inode
+        */
+       /* The inode structure is reached through the new chain's u.ip */
+       nip = chain->u.ip;
+       *nipp = nip;
+
+       /*
+        * With a vattr the inode number is taken from voldata.alloc_tid
+        * (post-incremented under the voldata lock); without one the
+        * inode is a directory with inode number 1.
+        */
+       hammer2_voldata_lock(hmp);
+       if (vap) {
+               nip->ip_data.type = hammer2_get_obj_type(vap->va_type);
+               nip->ip_data.inum = hmp->voldata.alloc_tid++;
+               /* XXX modify/lock */
+       } else {
+               nip->ip_data.type = HAMMER2_OBJTYPE_DIRECTORY;
+               nip->ip_data.inum = 1;
+       }
+       hammer2_voldata_unlock(hmp);
+       nip->ip_data.version = HAMMER2_INODE_VERSION_ONE;
+       nip->ip_data.ctime = 0;
+       nip->ip_data.mtime = 0;
+       /* mode is only assigned here when a vattr was supplied */
+       if (vap)
+               nip->ip_data.mode = vap->va_mode;
+       nip->ip_data.nlinks = 1;
+       /* uid, gid, etc */
+
+       /*
+        * Regular files and softlinks allow a small amount of data to be
+        * directly embedded in the inode.  This flag will be cleared if
+        * the size is extended past the embedded limit.
+        */
+       if (nip->ip_data.type == HAMMER2_OBJTYPE_REGFILE ||
+           nip->ip_data.type == HAMMER2_OBJTYPE_SOFTLINK) {
+               nip->ip_data.op_flags |= HAMMER2_OPFLAG_DIRECTDATA;
+       }
+
+       /*
+        * Record the name.  Exactly name_len bytes are copied; no
+        * terminator is written here.  name_key is the final
+        * (collision-adjusted) key the chain was created under.
+        */
+       KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
+       bcopy(name, nip->ip_data.filename, name_len);
+       nip->ip_data.name_key = lhc;
+       nip->ip_data.name_len = name_len;
+
+       /* Success: the new chain is not unlocked here, the caller owns it */
+       return (0);
+}
+
+/*
+ * Connect inode (ip) to the specified directory using the specified name.
+ * (ip) must be locked.
+ */
+int
+hammer2_inode_connect(hammer2_inode_t *dip, hammer2_inode_t *ip,
+                     const uint8_t *name, size_t name_len)
+{
+       hammer2_mount_t *hmp = dip->hmp;
+       hammer2_chain_t *chain;
+       hammer2_chain_t *parent;
+       hammer2_key_t lhc;
+       int error;
+
+       /* Starting directory key, derived from the name by hammer2_dirhash() */
+       lhc = hammer2_dirhash(name, name_len);
+
+       /*
+        * Locate the inode or indirect block to create the new
+        * entry in.  At the same time check for key collisions
+        * and iterate until we don't get one.
+        */
+       parent = &dip->chain;
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
+
+       error = 0;
+       while (error == 0) {
+               chain = hammer2_chain_lookup(hmp, &parent, lhc, lhc, 0);
+               if (chain == NULL)
+                       break;
+               /*
+                * Key in use.  ENOSPC once the bits covered by
+                * HAMMER2_DIRHASH_LOMASK are all set, otherwise the loop
+                * retries with lhc + 1.
+                */
+               if ((lhc & HAMMER2_DIRHASH_LOMASK) == HAMMER2_DIRHASH_LOMASK)
+                       error = ENOSPC;
+               hammer2_chain_unlock(hmp, chain);
+               chain = NULL;
+               ++lhc;
+       }
+
+       /*
+        * Passing a non-NULL chain to hammer2_chain_create() reconnects the
+        * existing chain instead of creating a new one.  The chain's bref
+        * will be properly updated.
+        */
+       if (error == 0) {
+               chain = hammer2_chain_create(hmp, parent, &ip->chain, lhc, 0,
+                                            HAMMER2_BREF_TYPE_INODE /* n/a */,
+                                            HAMMER2_INODE_BYTES);   /* n/a */
+               /* A NULL return from hammer2_chain_create() is reported as EIO */
+               if (chain == NULL)
+                       error = EIO;
+       }
+       hammer2_chain_unlock(hmp, parent);
+
+       /*
+        * Handle the error case
+        */
+       if (error) {
+               KKASSERT(chain == NULL);
+               return (error);
+       }
+
+       /*
+        * Directory entries are inodes so if the name has changed we have
+        * to update the inode.
+        *
+        * NOTE(review): name_key is only rewritten when the name differs.
+        * If the name is unchanged but the collision loop above settled on
+        * a different lhc than the inode's stored name_key, the stored key
+        * is left as-is -- confirm whether that is intended.
+        */
+       if (ip->ip_data.name_len != name_len ||
+           bcmp(ip->ip_data.filename, name, name_len) != 0) {
+               hammer2_chain_modify(hmp, chain, 0);
+               KKASSERT(name_len < HAMMER2_INODE_MAXNAME);
+               bcopy(name, ip->ip_data.filename, name_len);
+               ip->ip_data.name_key = lhc;
+               ip->ip_data.name_len = name_len;
+       }
+       /*nip->ip_data.nlinks = 1;*/
+
+       return (0);
+}
+
+/*
+ * Create a hardlink forwarding entry (dip, name) to the specified (ip).
+ *
+ * This is one of the more complex implementations in HAMMER2.  The
+ * filesystem strictly updates its chains bottom-up in a copy-on-write
+ * fashion.  This makes hardlinks difficult to implement but we've come up
+ * with a dandy solution.
+ *
+ * When a file has more than one link the actual inode is created as a
+ * hidden directory entry (indexed by inode number) in a common parent of
+ * all hardlinks which reference the file.  The hardlinks in each directory
+ * are merely forwarding entries to the hidden inode.
+ *
+ * Implementation:
+ *
+ *     Most VOPs can be blissfully unaware of the forwarding entries.
+ *     nresolve, nlink, and remove code have to be forwarding-aware
+ *     in order to return the (ip/vp) for the actual file (and otherwise do
+ *     the right thing).
+ *
+ *     (1) If the ip we are linking to is a normal embedded inode (nlinks==1)
+ *         we have to replace the directory entry with a forwarding inode
+ *         and move the normal ip/vp to a hidden entry indexed by the inode
+ *         number in a common parent directory.
+ *
+ *     (2) If the ip we are linking to is already a hidden entry but is not
+ *         a common parent we have to move its entry to a common parent by
+ *         moving the entry upward.
+ *
+ *     (3) The trivial case is the entry is already hidden and already a
+ *         common parent.  We adjust nlinks for the entry and are done.
+ *         (this is the fall-through case).
+ */
+int
+hammer2_hardlink_create(hammer2_inode_t *ip, hammer2_inode_t *dip,
+                       const uint8_t *name, size_t name_len)
+{
+       /*
+        * Not implemented: every call fails with ENOTSUP and none of the
+        * arguments are examined.
+        */
+       return ENOTSUP;
+#if 0
+       /*
+        * Unfinished sketch, never compiled.  It references names (hmp,
+        * ap, error) that are not declared in this function and declares
+        * nip twice, so it cannot simply be enabled as-is.
+        */
+       hammer2_inode_t *nip;
+       hammer2_inode_t *xip;
+
+
+       hammer2_inode_t *nip;   /* hardlink forwarding inode */
+        error = hammer2_inode_create(hmp, NULL, ap->a_cred,
+                                     dip, name, name_len, &nip);
+        if (error) {
+                KKASSERT(nip == NULL);
+                return error;
+        }
+        KKASSERT(nip->ip_data.type == HAMMER2_OBJTYPE_HARDLINK);
+        hammer2_chain_modify(&nip->chain, 0);
+        nip->ip_data.inum = ip->ip_data.inum;
+       hammer2_chain_unlock(hmp, &nip->chain);
+       /
+#endif
+}
+
+/*
+ * Unlink the file from the specified directory inode.  The directory inode
+ * does not need to be locked.
+ *
+ * isdir determines whether a directory/non-directory check should be made.
+ * No check is made if isdir is set to -1.
+ *
+ * adjlinks tells unlink that we want to adjust the nlinks count of the
+ * inode.  When removing the last link for a NON forwarding entry we can
+ * just ignore the link count... no point updating the inode that we are
+ * about to dereference, it would just result in a lot of wasted I/O.
+ *
+ * However, if the entry is a forwarding entry (aka a hardlink), and adjlinks
+ * is non-zero, we have to locate the hardlink and adjust its nlinks field.
+ */
+int
+hammer2_unlink_file(hammer2_inode_t *dip, const uint8_t *name, size_t name_len,
+                   int isdir, int adjlinks)
+{
+       hammer2_mount_t *hmp;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
+       hammer2_chain_t *dparent;
+       hammer2_chain_t *dchain;
+       hammer2_key_t lhc;
+       int error;
+
+       error = 0;
+
+       hmp = dip->hmp;
+       lhc = hammer2_dirhash(name, name_len);
+
+       /*
+        * Search for the filename in the directory
+        */
+       parent = &dip->chain;
+       hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
+       chain = hammer2_chain_lookup(hmp, &parent,
+                                    lhc, lhc + HAMMER2_DIRHASH_LOMASK,
+                                    0);
+       while (chain) {
+               if (chain->bref.type == HAMMER2_BREF_TYPE_INODE &&
+                   chain->u.ip &&
+                   name_len == chain->data->ipdata.name_len &&
+                   bcmp(name, chain->data->ipdata.filename, name_len) == 0) {
+                       break;
+               }
+               chain = hammer2_chain_next(hmp, &parent, chain,
+                                          lhc, lhc + HAMMER2_DIRHASH_LOMASK,
+                                          0);
+       }
+
+       /*
+        * Not found or wrong type (isdir < 0 disables the type check).
+        */
+       if (chain == NULL) {
+               hammer2_chain_unlock(hmp, parent);
+               return ENOENT;
+       }
+       if (chain->data->ipdata.type == HAMMER2_OBJTYPE_DIRECTORY &&
+           isdir == 0) {
+               error = ENOTDIR;
+               goto done;
+       }
+       if (chain->data->ipdata.type != HAMMER2_OBJTYPE_DIRECTORY &&
+           isdir == 1) {
+               error = EISDIR;
+               goto done;
+       }
+
+       /*
+        * If this is a directory the directory must be empty.  However, if
+        * isdir < 0 we are doing a rename and the directory does not have
+        * to be empty.
+        */
+       if (chain->data->ipdata.type == HAMMER2_OBJTYPE_DIRECTORY &&
+           isdir >= 0) {
+               dparent = chain;
+               hammer2_chain_lock(hmp, dparent, HAMMER2_RESOLVE_ALWAYS);
+               dchain = hammer2_chain_lookup(hmp, &dparent,
+                                             0, (hammer2_key_t)-1,
+                                             HAMMER2_LOOKUP_NODATA);
+               if (dchain) {
+                       hammer2_chain_unlock(hmp, dchain);
+                       hammer2_chain_unlock(hmp, dparent);
+                       error = ENOTEMPTY;
+                       goto done;
+               }
+               hammer2_chain_unlock(hmp, dparent);
+               dparent = NULL;
+               /* dchain NULL */
+       }
+
+#if 0
+       /*
+        * If adjlinks is non-zero this is a real deletion (otherwise it is
+        * probably a rename).  XXX
+        */
+       if (adjlinks) {
+               if (chain->data->ipdata.type == HAMMER2_OBJTYPE_HARDLINK) {
+                       /*hammer2_adjust_hardlink(chain->u.ip, -1);*/
+                       /* error handling */
+               } else {
+                       waslastlink = 1;
+               }
+       } else {
+               waslastlink = 0;
+       }
+#endif
+
+       /*
+        * Found, the chain represents the inode.  Remove the parent reference
+        * to the chain.  The chain itself is no longer referenced and will
+        * be marked unmodified by hammer2_chain_delete(), avoiding unnecessary
+        * I/O.
+        */
+       hammer2_chain_delete(hmp, parent, chain);
+       /* XXX nlinks (hardlink special case) */
+       /* XXX nlinks (parent directory) */
+
+#if 0
+       /*
+        * Destroy any associated vnode, but only if this was the last
+        * link.  XXX this might not be needed.
+        */
+       if (chain->u.ip->vp) {
+               struct vnode *vp;
+               vp = hammer2_igetv(chain->u.ip, &error);
+               if (error == 0) {
+                       vn_unlock(vp);
+                       /* hammer2_knote(vp, NOTE_DELETE); */
+                       cache_inval_vp(vp, CINV_DESTROY);
+                       vrele(vp);
+               }
+       }
+#endif
+       error = 0;
+
+done:
+       hammer2_chain_unlock(hmp, chain);
+       hammer2_chain_unlock(hmp, parent);
+
+       return error;
+}
+
+/*
+ * Calculate the allocation size for the file fragment straddling EOF
+ */
+int
+hammer2_inode_calc_alloc(hammer2_key_t filesize)
+{
+       int frag = (int)filesize & HAMMER2_PBUFMASK;
+       int radix = 0;
+
+       /*
+        * A zero fragment yields radix 0.  Otherwise pick the smallest
+        * radix, starting at HAMMER2_MINALLOCRADIX, for which
+        * (1 << radix) is at least as large as the fragment.
+        */
+       if (frag != 0) {
+               radix = HAMMER2_MINALLOCRADIX;
+               while ((1 << radix) < frag)
+                       ++radix;
+       }
+       return (radix);
+}
diff --git a/sys/vfs/hammer2/hammer2_ioctl.c b/sys/vfs/hammer2/hammer2_ioctl.c
new file mode 100644 (file)
index 0000000..74ab131
--- /dev/null
@@ -0,0 +1,385 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * Ioctl Functions.
+ *
+ * WARNING! The ioctl functions which manipulate the connection state need
+ *         to be able to run without deadlock on the volume's chain lock.
+ *         Most of these functions use a separate lock.
+ */
+
+#include "hammer2.h"
+
+static int hammer2_ioctl_version_get(hammer2_inode_t *ip, void *data);
+static int hammer2_ioctl_remote_get(hammer2_inode_t *ip, void *data);
+static int hammer2_ioctl_remote_add(hammer2_inode_t *ip, void *data);
+static int hammer2_ioctl_remote_del(hammer2_inode_t *ip, void *data);
+static int hammer2_ioctl_remote_rep(hammer2_inode_t *ip, void *data);
+static int hammer2_ioctl_socket_get(hammer2_inode_t *ip, void *data);
+static int hammer2_ioctl_socket_set(hammer2_inode_t *ip, void *data);
+static int hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data);
+static int hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data);
+static int hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data);
+
+int
+hammer2_ioctl(hammer2_inode_t *ip, u_long com, void *data, int fflag,
+             struct ucred *cred)
+{
+       int (*handler)(hammer2_inode_t *, void *);
+       int priv_error;
+
+       /*
+        * The privilege check is always evaluated first.  Its result only
+        * gates the commands that are dispatched through (handler) below;
+        * VERSION_GET and unrecognized commands ignore it.
+        */
+       priv_error = priv_check_cred(cred, PRIV_HAMMER_IOCTL, 0);
+
+       switch(com) {
+       case HAMMER2IOC_VERSION_GET:
+               /* Runs regardless of the privilege check result */
+               return (hammer2_ioctl_version_get(ip, data));
+       case HAMMER2IOC_REMOTE_GET:
+               handler = hammer2_ioctl_remote_get;
+               break;
+       case HAMMER2IOC_REMOTE_ADD:
+               handler = hammer2_ioctl_remote_add;
+               break;
+       case HAMMER2IOC_REMOTE_DEL:
+               handler = hammer2_ioctl_remote_del;
+               break;
+       case HAMMER2IOC_REMOTE_REP:
+               handler = hammer2_ioctl_remote_rep;
+               break;
+       case HAMMER2IOC_SOCKET_GET:
+               handler = hammer2_ioctl_socket_get;
+               break;
+       case HAMMER2IOC_SOCKET_SET:
+               handler = hammer2_ioctl_socket_set;
+               break;
+       case HAMMER2IOC_PFS_GET:
+               handler = hammer2_ioctl_pfs_get;
+               break;
+       case HAMMER2IOC_PFS_CREATE:
+               handler = hammer2_ioctl_pfs_create;
+               break;
+       case HAMMER2IOC_PFS_DELETE:
+               handler = hammer2_ioctl_pfs_delete;
+               break;
+       default:
+               /* Unknown command, reported even if the priv check failed */
+               return (EOPNOTSUPP);
+       }
+
+       /* Privileged command: a failed check is returned without dispatch */
+       if (priv_error)
+               return (priv_error);
+       return (handler(ip, data));
+}
+
+/*
+ * Retrieve version and basic info
+ */
+static int
+hammer2_ioctl_version_get(hammer2_inode_t *ip, void *data)
+{
+       hammer2_ioc_version_t *out = data;
+
+       /* Copy the version field out of the mount's voldata; cannot fail */
+       out->version = ip->hmp->voldata.version;
+       return (0);
+}
+
+/*
+ * Retrieve information about a remote
+ */
+static int
+hammer2_ioctl_remote_get(hammer2_inode_t *ip, void *data)
+{
+       hammer2_mount_t *hmp = ip->hmp;
+       hammer2_ioc_remote_t *remote = data;
+       int copyid = remote->copyid;
+
+       /* The requested slot must index into voldata.copyinfo[] */
+       if (copyid < 0 || copyid >= HAMMER2_COPYID_COUNT)
+               return (EINVAL);
+
+       /* Snapshot the requested entry under the voldata lock */
+       hammer2_voldata_lock(hmp);
+       remote->copy1 = hmp->voldata.copyinfo[copyid];
+       hammer2_voldata_unlock(hmp);
+
+       /*
+        * Adjust nextid (GET only)
+        *
+        * Advance one slot at a time to the next entry whose copyid is
+        * non-zero.  copyid must be bumped exactly once per slot examined,
+        * otherwise slots are skipped and the index can step past
+        * HAMMER2_COPYID_COUNT, defeating the end-of-table test below.
+        */
+       for (++copyid; copyid < HAMMER2_COPYID_COUNT; ++copyid) {
+               if (hmp->voldata.copyinfo[copyid].copyid != 0)
+                       break;
+       }
+
+       /* nextid is -1 when no further in-use entry exists */
+       if (copyid == HAMMER2_COPYID_COUNT)
+               remote->nextid = -1;
+       else
+               remote->nextid = copyid;
+
+       return(0);
+}
+
+/*
+ * Add new remote entry
+ */
+static int
+hammer2_ioctl_remote_add(hammer2_inode_t *ip, void *data)
+{
+       hammer2_mount_t *hmp = ip->hmp;
+       hammer2_ioc_remote_t *remote = data;
+       int slot = remote->copyid;
+       int error = 0;
+
+       if (slot >= HAMMER2_COPYID_COUNT)
+               return (EINVAL);
+
+       hammer2_voldata_lock(hmp);
+
+       /*
+        * A negative copyid asks us to pick a slot: take the first entry
+        * at index 1 or above whose copyid is zero, ENOSPC if none.
+        */
+       if (slot < 0) {
+               slot = 1;
+               while (slot < HAMMER2_COPYID_COUNT &&
+                      hmp->voldata.copyinfo[slot].copyid != 0) {
+                       ++slot;
+               }
+               if (slot == HAMMER2_COPYID_COUNT)
+                       error = ENOSPC;
+       }
+
+       /*
+        * Store the caller's entry in the chosen slot, recording the slot
+        * number in the entry itself (and thus back in the ioctl data).
+        */
+       if (error == 0) {
+               hammer2_modify_volume(hmp);
+               kprintf("copyid %d\n", slot);
+               remote->copy1.copyid = slot;
+               hmp->voldata.copyinfo[slot] = remote->copy1;
+       }
+       hammer2_voldata_unlock(hmp);
+       return (error);
+}
+
+/*
+ * Delete existing remote entry
+ */
+static int
+hammer2_ioctl_remote_del(hammer2_inode_t *ip, void *data)
+{
+       hammer2_mount_t *hmp = ip->hmp;
+       hammer2_ioc_remote_t *remote = data;
+       int slot = remote->copyid;
+       int error = 0;
+
+       if (slot >= HAMMER2_COPYID_COUNT)
+               return (EINVAL);
+
+       /* Force termination of the caller's path before comparing it */
+       remote->copy1.path[sizeof(remote->copy1.path) - 1] = 0;
+
+       hammer2_voldata_lock(hmp);
+
+       /*
+        * A negative copyid means locate the entry by path: scan the
+        * in-use entries from index 1 for an exact match, ENOENT if
+        * there is none.
+        */
+       if (slot < 0) {
+               for (slot = 1; slot < HAMMER2_COPYID_COUNT; ++slot) {
+                       if (hmp->voldata.copyinfo[slot].copyid != 0 &&
+                           strcmp(remote->copy1.path,
+                           hmp->voldata.copyinfo[slot].path) == 0) {
+                               break;
+                       }
+               }
+               if (slot == HAMMER2_COPYID_COUNT)
+                       error = ENOENT;
+       }
+
+       /* Deleting an entry is just zeroing its copyid */
+       if (error == 0) {
+               hammer2_modify_volume(hmp);
+               hmp->voldata.copyinfo[slot].copyid = 0;
+       }
+       hammer2_voldata_unlock(hmp);
+       return (error);
+}
+
+/*
+ * Replace existing remote entry
+ */
+static int
+hammer2_ioctl_remote_rep(hammer2_inode_t *ip, void *data)
+{
+       hammer2_mount_t *hmp = ip->hmp;
+       hammer2_ioc_remote_t *remote = data;
+       int slot = remote->copyid;
+
+       /*
+        * Placeholder: only the copyid range is validated.  The voldata
+        * lock is taken and released with nothing done in between, so no
+        * entry is actually replaced.
+        */
+       if (slot >= 0 && slot < HAMMER2_COPYID_COUNT) {
+               hammer2_voldata_lock(hmp);
+               hammer2_voldata_unlock(hmp);
+               return (0);
+       }
+       return (EINVAL);
+}
+
+/*
+ * Retrieve communications socket
+ */
+static int
+hammer2_ioctl_socket_get(hammer2_inode_t *ip, void *data)
+{
+       int error = EOPNOTSUPP;
+
+       /* Not implemented: neither ip nor data is examined */
+       return (error);
+}
+
+/*
+ * Set communications socket for connection
+ */
+static int
+hammer2_ioctl_socket_set(hammer2_inode_t *ip, void *data)
+{
+       hammer2_mount_t *hmp = ip->hmp;
+       hammer2_ioc_remote_t *remote = data;
+       int slot = remote->copyid;
+
+       /*
+        * Placeholder: only the copyid range is validated.  The voldata
+        * lock is taken and released with nothing done in between, so no
+        * socket is actually associated with the entry.
+        */
+       if (slot >= 0 && slot < HAMMER2_COPYID_COUNT) {
+               hammer2_voldata_lock(hmp);
+               hammer2_voldata_unlock(hmp);
+               return (0);
+       }
+       return (EINVAL);
+}
+
+/*
+ * Used to scan PFSs, which are directories under the super-root.
+ */
+static int
+hammer2_ioctl_pfs_get(hammer2_inode_t *ip, void *data)
+{
+       hammer2_mount_t *hmp = ip->hmp;
+       hammer2_ioc_pfs_t *pfs = data;
+       hammer2_chain_t *parent;
+       hammer2_chain_t *chain;
+       hammer2_inode_t *xip;
+       int error = 0;
+
+       /*
+        * The scan is rooted at hmp->schain.
+        *
+        * NOTE(review): a failed hammer2_chain_lock() jumps to done, which
+        * still unlocks parent -- confirm that is correct for the failure
+        * case.
+        */
+       parent = hmp->schain;
+       error = hammer2_chain_lock(hmp, parent, HAMMER2_RESOLVE_ALWAYS);
+       if (error)
+               goto done;
+
+       /*
+        * Search for the first key or specific key.  Remember that keys
+        * can be returned in any order.
+        */
+       if (pfs->name_key == 0) {
+               chain = hammer2_chain_lookup(hmp, &parent,
+                                            0, (hammer2_key_t)-1, 0);
+       } else {
+               chain = hammer2_chain_lookup(hmp, &parent,
+                                            pfs->name_key, pfs->name_key, 0);
+       }
+       /*
+        * Skip anything that is not an inode.
+        *
+        * NOTE(review): the iteration below always uses the full key range
+        * (0 .. -1), even when the initial lookup was for one specific key.
+        */
+       while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE) {
+               chain = hammer2_chain_next(hmp, &parent, chain,
+                                    0, (hammer2_key_t)-1, 0);
+       }
+       if (chain) {
+               /*
+                * Load the data being returned by the ioctl.
+                */
+               xip = chain->u.ip;
+               pfs->name_key = xip->ip_data.name_key;
+               pfs->pfs_type = xip->ip_data.pfs_type;
+               pfs->pfs_id = xip->ip_data.pfs_id;
+               pfs->pfs_fsid = xip->ip_data.pfs_fsid;
+               /* The name is asserted to fit, copied, then 0-terminated */
+               KKASSERT(xip->ip_data.name_len < sizeof(pfs->name));
+               bcopy(xip->ip_data.filename, pfs->name,
+                     xip->ip_data.name_len);
+               pfs->name[xip->ip_data.name_len] = 0;
+
+               /*
+                * Calculate the next field
+                */
+               /*
+                * The reported chain is handed to hammer2_chain_next() and
+                * is not unlocked explicitly here; presumably chain_next
+                * disposes of the chain passed in -- TODO confirm.
+                */
+               do {
+                       chain = hammer2_chain_next(hmp, &parent, chain,
+                                            0, (hammer2_key_t)-1, 0);
+               } while (chain && chain->bref.type != HAMMER2_BREF_TYPE_INODE);
+               if (chain) {
+                       pfs->name_next = chain->u.ip->ip_data.name_key;
+                       hammer2_chain_unlock(hmp, chain);
+               } else {
+                       /* (hammer2_key_t)-1 marks the end of the scan */
+                       pfs->name_next = (hammer2_key_t)-1;
+               }
+       } else {
+               /* No inode found at all: end-of-scan marker plus ENOENT */
+               pfs->name_next = (hammer2_key_t)-1;
+               error = ENOENT;
+       }
+done:
+       hammer2_chain_unlock(hmp, parent);
+       return (error);
+}
+
+/*
+ * Create a new PFS under the super-root
+ */
+static int
+hammer2_ioctl_pfs_create(hammer2_inode_t *ip, void *data)
+{
+       hammer2_mount_t *hmp = ip->hmp;
+       hammer2_ioc_pfs_t *pfs = data;
+       hammer2_inode_t *nip = NULL;
+       size_t name_len;
+       int error;
+
+       pfs->name[sizeof(pfs->name) - 1] = 0;   /* ensure 0-termination */
+       name_len = strlen(pfs->name);
+
+       /*
+        * Create the inode under hmp->schain.  Passing NULL for both the
+        * vattr and the cred selects hammer2_inode_create()'s defaults.
+        */
+       error = hammer2_inode_create(hmp->schain->u.ip, NULL, NULL,
+                                    pfs->name, name_len,
+                                    &nip);
+       if (error)
+               return (error);
+
+       /*
+        * Fill in the PFS identification from the ioctl data, then release
+        * the new inode's chain.
+        */
+       hammer2_chain_modify(hmp, &nip->chain, 0);
+       nip->ip_data.pfs_type = pfs->pfs_type;
+       nip->ip_data.pfs_id = pfs->pfs_id;
+       nip->ip_data.pfs_fsid = pfs->pfs_fsid;
+       hammer2_chain_unlock(hmp, &nip->chain);
+
+       return (0);
+}
+
+/*
+ * Destroy an existing PFS under the super-root
+ */
+static int
+hammer2_ioctl_pfs_delete(hammer2_inode_t *ip, void *data)
+{
+       hammer2_mount_t *hmp = ip->hmp;
+       hammer2_ioc_pfs_t *pfs = data;
+       int error;
+
+       /*
+        * The name comes straight from the ioctl argument.  Terminate it
+        * before strlen() so an unterminated buffer cannot make the length
+        * scan run past the end of pfs->name.
+        */
+       pfs->name[sizeof(pfs->name) - 1] = 0;   /* ensure 0-termination */
+
+       /*
+        * Remove the named entry from the directory at hmp->schain.
+        *
+        * NOTE(review): isdir is passed as 0, and hammer2_unlink_file()
+        * rejects directory entries in that mode.  hammer2_inode_create()
+        * gives inodes created without a vattr (as the PFS create ioctl
+        * does) the directory type, so this call looks like it can never
+        * succeed for them -- confirm the intended isdir value.
+        */
+       error = hammer2_unlink_file(hmp->schain->u.ip,
+                                   pfs->name, strlen(pfs->name),
+                                   0, 1);
+       return (error);
+}
diff --git a/sys/vfs/hammer2/hammer2_ioctl.h b/sys/vfs/hammer2/hammer2_ioctl.h
new file mode 100644 (file)
index 0000000..ffa49ad
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2011-2012 The DragonFly Project.  All rights reserved.
+ *
+ * This code is derived from software contributed to The DragonFly Project
+ * by Matthew Dillon <dillon@dragonflybsd.org>
+ * by Venkatesh Srinivas <vsrinivas@dragonflybsd.org>
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in
+ *    the documentation and/or other materials provided with the
+ *    distribution.
+ * 3. Neither the name of The DragonFly Project nor the names of its
+ *    contributors may be used to endorse or promote products derived
+ *    from this software without specific, prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
+ * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
+ * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
+ * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+#ifndef VFS_HAMMER2_IOCTL_H_
+#define VFS_HAMMER2_IOCTL_H_
+
+#ifndef _SYS_IOCCOM_H_
+#include <sys/ioccom.h>
+#endif